In [1]:
import pandas as pd
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from tqdm import tqdm
tqdm.pandas()
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

  from tqdm.autonotebook import tqdm, trange


In [2]:
read = pd.read_feather('Feather/read.feather')
books = pd.read_feather('Feather/books.feather')    
reviews = pd.read_feather('Feather/reviews.feather')
interactions = pd.read_feather('Feather/interactions.feather')

In [3]:
interactions = interactions[interactions['is_read']== True]

In [4]:
len(reviews['review_text'])

200000

In [5]:
reviews = reviews.dropna(subset=['review_text', 'rating'])

In [6]:
user_review_counts = reviews.groupby('user_id').size()

users_with_more_than_3_reviews = user_review_counts[user_review_counts > 3].index

valid_reviews = reviews[reviews['user_id'].isin(users_with_more_than_3_reviews)]

In [7]:
valid_reviews = valid_reviews.head(1000)

In [8]:
len(valid_reviews['user_id'].unique())

9

In [9]:
train_interactions, test_interactions = train_test_split(interactions, test_size=0.2, random_state=42)

In [10]:
# Initialize Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Calculate text embeddings for reviews
valid_reviews['review_embeddings'] = valid_reviews['review_text'].progress_apply(lambda x: model.encode(x))

100%|██████████| 1000/1000 [00:38<00:00, 26.21it/s]


In [11]:
# Combine review embeddings by user
user_embeddings = valid_reviews.groupby('user_id')['review_embeddings'].apply(lambda x: np.mean(np.vstack(x), axis=0)).reset_index()

In [12]:
train_interactions['rating_normalized'] = (interactions['rating'] - interactions['rating'].min()) / (interactions['rating'].max() - interactions['rating'].min())

In [13]:
user_ratings = train_interactions.groupby('user_id')['rating_normalized'].mean().reset_index()
combined_features = pd.merge(user_embeddings, user_ratings, on='user_id')

In [14]:
# Calculate User-User Similarity
user_features = combined_features['review_embeddings'].apply(pd.Series).values
user_similarity = cosine_similarity(user_features, user_features)

In [15]:
books = books.drop_duplicates(subset='book_id')

In [18]:
combined_features['user_id'].unique()

array([ 0,  1,  4,  5,  7,  8,  9, 12, 14], dtype=int64)

In [16]:
# Function to recommend books based on user similarity
def recommend_books(user_id, user_similarity, interactions_df, books_df, combined_features, num_recommendations=5):
    if user_id not in combined_features['user_id'].values:
        return pd.DataFrame(columns=['book_id', 'title', 'authors'])

    # Identify similar users
    user_index = combined_features[combined_features['user_id'] == user_id].index[0]
    similar_user_ids = combined_features['user_id'].iloc[user_similarity[user_index].argsort()[-num_recommendations-1:-1][::-1]].values

    # Get books read by similar users
    similar_users_books = interactions_df[interactions_df['user_id'].isin(similar_user_ids)]['book_id'].unique()

    # Exclude books already read by the current user
    user_books = interactions_df[interactions_df['user_id'] == user_id]['book_id'].values
    recommended_books = [book for book in similar_users_books if book not in user_books]

    return books_df[books_df['book_id'].isin(recommended_books)][['book_id']].head(num_recommendations)


In [19]:
print(recommend_books(user_id=0, user_similarity=user_similarity, interactions_df=train_interactions, books_df=books, combined_features=combined_features, num_recommendations=5))

    book_id
45     1064
59     1366
61     1368
63     1402
72     1500


In [20]:
print(recommend_books(user_id=1, user_similarity=user_similarity, interactions_df=train_interactions, books_df=books, combined_features=combined_features, num_recommendations=5))

    book_id
4       231
5       234
7       420
8       421
13      426


In [21]:
print(recommend_books(user_id=4, user_similarity=user_similarity, interactions_df=train_interactions, books_df=books, combined_features=combined_features, num_recommendations=5))

    book_id
4       231
7       420
8       421
13      426
15      428


In [22]:
print(recommend_books(user_id=5, user_similarity=user_similarity, interactions_df=train_interactions, books_df=books, combined_features=combined_features, num_recommendations=5))

    book_id
4       231
5       234
7       420
8       421
13      426


In [23]:
print(recommend_books(user_id=7, user_similarity=user_similarity, interactions_df=train_interactions, books_df=books, combined_features=combined_features, num_recommendations=5))

    book_id
4       231
5       234
7       420
8       421
13      426


In [24]:
print(recommend_books(user_id=8, user_similarity=user_similarity, interactions_df=train_interactions, books_df=books, combined_features=combined_features, num_recommendations=5))

    book_id
4       231
5       234
7       420
8       421
13      426


In [25]:
print(recommend_books(user_id=9, user_similarity=user_similarity, interactions_df=train_interactions, books_df=books, combined_features=combined_features, num_recommendations=5))

    book_id
5       234
16      461
59     1366
61     1368
63     1402
