In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()
from surprise import Reader, Dataset, SVD
from sklearn.model_selection import GridSearchCV, cross_validate
svd = SVD()

  from tqdm.autonotebook import tqdm, trange


In [2]:
read = pd.read_feather('Feather/read.feather')
books = pd.read_feather('Feather/books.feather')    
reviews = pd.read_feather('Feather/reviews.feather')
interactions = pd.read_feather('Feather/interactions.feather')

In [3]:
interactions = interactions[interactions['is_read']== True]

In [4]:
reviews = reviews.dropna(subset=['review_text', 'rating'])
user_review_counts = reviews.groupby('user_id').size()

users_with_more_than_3_reviews = user_review_counts[user_review_counts > 3].index

valid_reviews = reviews[reviews['user_id'].isin(users_with_more_than_3_reviews)]
valid_reviews = valid_reviews.head(10000)

In [5]:
len(reviews['review_text'])
reviews = reviews.dropna(subset=['review_text', 'rating'])
user_review_counts = reviews.groupby('user_id').size()

users_with_more_than_3_reviews = user_review_counts[user_review_counts > 3].index

valid_reviews = reviews[reviews['user_id'].isin(users_with_more_than_3_reviews)]
valid_reviews = valid_reviews.head(10000)


In [6]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

# Prepare surprise dataset
reader = Reader()
data = Dataset.load_from_df(interactions[['user_id', 'book_id', 'rating']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Define the parameter grid
param_grid = {
    'n_epochs': [10, 20],
    'lr_all': [0.001, 0.002],
    'reg_all': [0.01, 0.02],
    'n_factors': [20, 50,]  # Number of latent factors
}

# Initialize GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

# Train SVD with the best hyperparameters
best_svd = gs.best_estimator['rmse']
cross_validate(best_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train on the full dataset
trainset = data.build_full_trainset()
best_svd.fit(trainset)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2037  1.2015  1.2002  1.2003  1.2109  1.2033  0.0040  
MAE (testset)     0.8900  0.8925  0.8923  0.8884  0.8954  0.8917  0.0024  
Fit time          1.24    1.22    1.27    1.17    1.32    1.24    0.05    
Test time         0.22    0.20    0.49    0.19    0.46    0.31    0.13    


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cf1f899ca0>

In [7]:
# Merge genres into reviews
reviews_with_genres = valid_reviews.merge(books[['book_id', 'genres']], on='book_id', how='left')

# Combine review text and genres
reviews_with_genres['combined_text'] = reviews_with_genres.apply(lambda row: f"{row['review_text']} {row['genres']}", axis=1)

# Calculate combined embeddings for reviews
model = SentenceTransformer('all-MiniLM-L6-v2')
reviews_with_genres['combined_embeddings'] = reviews_with_genres['combined_text'].progress_apply(lambda x: model.encode(x))

# Combine combined_embeddings by user
user_embeddings = reviews_with_genres.groupby('user_id')['combined_embeddings'].apply(lambda x: np.mean(np.vstack(x), axis=0)).reset_index()

# Normalize ratings in training data
interactions['rating_normalized'] = (interactions['rating'] - interactions['rating'].min()) / (interactions['rating'].max() - interactions['rating'].min())

# Merge user embeddings with normalized ratings
user_ratings = interactions.groupby('user_id')['rating_normalized'].mean().reset_index()
combined_features = pd.merge(user_embeddings, user_ratings, on='user_id')


100%|██████████| 10000/10000 [07:04<00:00, 23.57it/s]


In [8]:
combined_features.isnull().sum().sum()

0

In [9]:
# Calculate User-User Similarity
user_features = np.vstack(combined_features['combined_embeddings'])
user_similarity = cosine_similarity(user_features, user_features)

In [10]:

def recommend_books(user_id, user_similarity, interactions_df, books_df, combined_features, num_recommendations=5):
    # Check if user_id is in the combined_features dataframe
    if user_id not in combined_features['user_id'].values:
        return pd.DataFrame(columns=['book_id', 'title', 'authors', 'genres'])

    # Identify similar users
    user_index = combined_features[combined_features['user_id'] == user_id].index[0]
    similar_user_ids = combined_features['user_id'][user_similarity[user_index].argsort()[-num_recommendations-1:-1][::-1]].values

    # Get books read by similar users
    similar_users_books = interactions_df[interactions_df['user_id'].isin(similar_user_ids)]['book_id'].unique()

    # Filter out books already read by the user
    user_books = interactions_df[interactions_df['user_id'] == user_id]['book_id'].unique()
    recommended_books = [book_id for book_id in similar_users_books if book_id not in user_books]

    # Get details of the recommended books
    recommended_books_df = books_df[books_df['book_id'].isin(recommended_books)].head(num_recommendations)

    return recommended_books_df[['book_id', 'title']]


In [11]:
recommend_books(4, user_similarity, interactions, books, combined_features)

Unnamed: 0,book_id,title
2,229,Sideswipe
10,423,Where I Was From
14,427,Miami
16,461,The Inner Life of Martin Frost
33,759,Collected Stories


In [12]:
interactions[interactions['user_id']==4]

Unnamed: 0,review_id,is_read,rating,user_id,book_id,rating_normalized
1598,f0b78d1a04244153390d345e0295b9c5,True,0,4,1497,0.0
1599,eb9c9e0ade892983555b9e5ac4d7350f,True,3,4,1498,0.6
1600,41208bf93df8da40386ac6aaac7f62a1,True,1,4,1330,0.2
1601,253d4adc044fb40ac0006e21c9f4bf0a,True,0,4,1499,0.0
1602,d6bbdec866ca2d36742dd3128b3f275d,True,4,4,1500,0.8
...,...,...,...,...,...,...
1781,9b0d977104d1e21b2ed40c62987b0b26,True,4,4,1630,0.8
1782,2ad239102e4f8272c5bf2a5da8980809,True,4,4,996,0.8
1783,f8c7780b9b901efdcf8da98abd6f7c7d,True,5,4,1631,1.0
1784,34e675619eb8a75638fe7ac5e1b26292,True,3,4,1632,0.6
