## Create genre matrix

In [1]:
import pandas as pd

In [2]:
genres_data = pd.read_csv('../data/fin_genres.csv')

In [3]:
genres_data.loc[genres_data['Genre'] == 'other', 'NumberOfPeople'] += 1

In [4]:
genre_mat = genres_data.pivot(index='bookId', columns='Genre', values='NumberOfPeople').fillna(0)

In [5]:
genre_mat

Genre,adult,adventure,art,biography,bussiness,children,classics,comedy,contemporary,culture,...,religion,romance,school,science,science-fiction,self-help,short-stories,sociology,tragedy,young-adult
bookId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,132.0,174.0,0.0,0.0,0.0,0.0,921.0,2638.0,41.0,0.0,...,0.0,0.0,0.0,95.0,8313.0,0.0,0.0,0.0,0.0,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45536,303.0,308.0,0.0,613.0,0.0,0.0,6941.0,0.0,156.0,1177.0,...,0.0,0.0,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45557,0.0,3.0,0.0,109.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45592,9.0,0.0,0.0,0.0,0.0,0.0,28.0,14.0,21.0,8.0,...,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
45633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0


In [6]:
genre_mat = genre_mat.applymap(lambda x: 1 if x > 0 else 0)
genre_mat
genre_mat.to_csv('bin_genre_mat.csv')

  genre_mat = genre_mat.applymap(lambda x: 1 if x > 0 else 0)


In [7]:
genre_mat['bookId'] = genre_mat.index

In [8]:
# genre_mat.reset_index()

In [9]:
genre_mat['index'] = range(0, len(genre_mat))

In [10]:
genre_mat.set_index("index", inplace = True)

## Create feature matrix

In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import joblib

In [13]:
book_data = pd.read_csv('../data/book_data.csv')

In [14]:
# Replace NaN values with 0
book_data['Series'] = book_data['Series'].fillna(0)

# Convert all non-zero values to 1
book_data['Series'] = np.where(book_data['Series'] == 0, 0, 1)

In [15]:
# Calculate average rating
rating_columns = ['5 stars', '4 stars', '3 stars', '2 stars', '1 star']
book_data['average_rating'] = (5*book_data['5 stars'] + 4*book_data['4 stars'] + 3*book_data['3 stars'] +
                               2*book_data['2 stars'] + 1*book_data['1 star']) / book_data[rating_columns].sum(axis=1)
book_data['average_rating']

0       4.571836
1       4.499098
2       4.357352
3       4.428023
4       4.357350
          ...   
3415    3.616282
3416    4.135051
3417    3.929368
3418    4.077090
3419    3.966531
Name: average_rating, Length: 3420, dtype: float64

In [16]:
book_data['num_years'] = 2024-book_data['PublishYear']

In [17]:
# Select required columns
book_data_reduced = book_data[['BookID', 'average_rating', 'Pages', 'Book format', 'Series', 'num_years']]
book_data_reduced.head()

Unnamed: 0,BookID,average_rating,Pages,Book format,Series,num_years
0,5,4.571836,435,Mass Market Paperback,1,25
1,2,4.499098,870,Paperback,1,21
2,13,4.357352,815,Paperback,1,28
3,4,4.428023,352,Hardcover,1,26
4,18,4.35735,815,Hardcover,1,28


In [18]:
# Merge the genre matrix with book data
merged_data = pd.merge(genre_mat, book_data_reduced, left_on='bookId', right_on='BookID')
merged_data.head()

Unnamed: 0,adult,adventure,art,biography,bussiness,children,classics,comedy,contemporary,culture,...,sociology,tragedy,young-adult,bookId,BookID,average_rating,Pages,Book format,Series,num_years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,2,4.499098,870,Paperback,1,21
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,4,4.428023,352,Hardcover,1,26
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,5,5,4.571836,435,Mass Market Paperback,1,25
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,13,13,4.357352,815,Paperback,1,28
4,1,1,0,0,0,0,1,1,1,0,...,0,0,1,18,18,4.35735,815,Hardcover,1,28


In [19]:
# One-hot encode categorical variables
merged_data = pd.get_dummies(merged_data, columns=['Book format'])

In [20]:
merged_data.head()

Unnamed: 0,adult,adventure,art,biography,bussiness,children,classics,comedy,contemporary,culture,...,Book format_Novelty Book,Book format_Paperback,Book format_Poche,Book format_Taschenbuch,Book format_Trade Paperback,Book format_Unknown Binding,Book format_cloth,Book format_ebook,Book format_paper,Book format_softcover
0,0,0,0,0,0,0,0,0,0,0,...,False,True,False,False,False,False,False,False,False,False
1,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
3,0,0,0,0,0,0,0,0,0,0,...,False,True,False,False,False,False,False,False,False,False
4,1,1,0,0,0,0,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False


In [21]:
scaler = StandardScaler()

In [23]:
scaler = joblib.load('../models/scaler.save')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [24]:
X = merged_data.drop(columns=['bookId', 'BookID', 'average_rating']).values
y = merged_data['average_rating'].values

In [25]:
X_scaled = scaler.transform(X)

In [26]:
# Convert arrays to PyTorch tensors
X_torch = torch.tensor(X_scaled, dtype=torch.float32)
y_torch = torch.tensor(y, dtype=torch.float32).view(-1, 1)

In [27]:
# Create dataset and dataloader
dataset = TensorDataset(X_torch, y_torch)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [28]:
class ANN(nn.Module):
    def __init__(self,input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        # x = self.relu(x)
        return x

In [29]:
# Extract features
def extract_features(model, dataloader):
    model.eval()
    features = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            features.append(model(inputs))
    return torch.cat(features, dim=0)

In [36]:
input_size = X.shape[1]
# Instantiate the model
model = ANN(input_size)

model.load_state_dict(torch.load('../models/book_rating_model.pth')) 
features = extract_features(model, dataloader)

# Convert features to DataFrame and save to CSV
feature_mat = pd.DataFrame(features.numpy(), index=merged_data['BookID'])

In [37]:
feature_mat['bookId'] = feature_mat.index

In [38]:
feature_mat['index'] = range(0, len(feature_mat))

In [39]:
feature_mat.set_index("index", inplace = True)

## Final recommendation

In [40]:
import pandas as pd
import numpy as np

In [41]:
# genre_mat = pd.read_csv('data/bin_genre_mat.csv')
# feature_mat = pd.read_csv('data/book_features.csv')

In [42]:
genre_mat.head()

Genre,adult,adventure,art,biography,bussiness,children,classics,comedy,contemporary,culture,...,romance,school,science,science-fiction,self-help,short-stories,sociology,tragedy,young-adult,bookId
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13
4,1,1,0,0,0,0,1,1,1,0,...,0,0,1,1,0,0,0,0,1,18


In [43]:
feature_mat.head()

Unnamed: 0_level_0,0,bookId
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3.960832,2
1,4.15232,4
2,4.424885,5
3,4.177643,13
4,3.658966,18


In [44]:
def cosine_similarity(x, y):
    norm_x = np.linalg.norm(x)
    norm_y = np.linalg.norm(y,axis=1)
    cosim = x@y.T/(norm_x*norm_y)
    return cosim[0]

In [45]:
def cosine_bookid(book_id,data):
    genre_id = data[data.bookId==book_id]
    except_id = data[data.bookId!=book_id]
    genre_id = genre_id.to_numpy()[:, :-1]
    except_id = except_id.to_numpy()[:, :-1]
    cos_sim = cosine_similarity(genre_id, except_id)
    return cos_sim

In [46]:
def top_k_book(book_id, k, data):
    cos_sim = cosine_bookid(book_id, data)
    index = data.loc[data['bookId'] == book_id].index
    cos_sim = np.insert(cos_sim, index, 0)
    sort_sim = np.argsort(-cos_sim)
    top_id = sort_sim[:k]
    top_bookid = [int(data.iloc[index]['bookId']) for index in top_id]
    return top_bookid

In [47]:
top_k_book(18, 5, genre_mat)

[17707, 8694, 34921, 18878, 33342]

In [48]:
top_k_book(18, 5, feature_mat)

[2, 27999, 28001, 28003, 28012]