In [141]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()


In [142]:
read = pd.read_pickle('Pickle/read.pkl')
books = pd.read_pickle('Pickle/books.pkl')
reviews = pd.read_pickle('Pickle/reviews.pkl')
interactions = pd.read_pickle('Pickle/interactions.pkl')    

In [143]:
# Step 1: Group by user_id and count ratings
user_rating_counts = interactions.groupby('user_id').size().reset_index(name='rating_count')

# Step 2: Filter users with less than 3 ratings
users_with_enough_ratings = user_rating_counts[user_rating_counts['rating_count'] >= 5]['user_id']

# Step 3: Filter the interactions DataFrame to include only these users
interactions = interactions[interactions['user_id'].isin(users_with_enough_ratings)]

In [144]:
interactions = interactions[interactions['is_read']== True]

In [145]:
reviews = reviews.dropna(subset=['review_text'])

In [146]:
# Get unique user IDs from both DataFrames
interaction_user_ids = set(interactions['user_id'].unique())
review_user_ids = set(reviews['user_id'].unique())
read_user_ids = set(read['user_id'].unique())


# Find the common user IDs
common = interaction_user_ids.intersection(review_user_ids)
common_user_ids = common.intersection(read_user_ids)

In [147]:
len(common_user_ids)

1368

In [148]:
# Filter interactions DataFrame
interactions = interactions[interactions['user_id'].isin(common_user_ids)]
# Filter reviews DataFrame
reviews = reviews[reviews['user_id'].isin(common_user_ids)]
read = read[read['user_id'].isin(common_user_ids)]

In [149]:
len(reviews)

71108

In [150]:
import os

model = SentenceTransformer('all-MiniLM-L6-v2')

def save_embeddings_incrementally(reviews_df, model, interval=100):
    embeddings_file = 'Pickle/review_embeddings.pkl'
    
    if os.path.exists(embeddings_file):
        embeddings_df = pd.read_pickle(embeddings_file)
    else:
        embeddings_df = pd.DataFrame(columns=['index', 'embeddings'])
    
    reviews_df = reviews_df.dropna(subset=['review_text']).reset_index(drop=True)

    for i in tqdm(range(len(reviews_df))):
        if i in embeddings_df['index'].values:
            continue  # Skip if already processed
        
        embedding = model.encode(reviews_df.loc[i, 'review_text'])
        new_row = pd.DataFrame({'index': [i], 'embeddings': [embedding]})
        embeddings_df = pd.concat([embeddings_df, new_row], ignore_index=True)
        
        if i % interval == 0:
            embeddings_df.to_pickle(embeddings_file)
    
    # Save the final version
    embeddings_df.to_pickle(embeddings_file)

# Save embeddings incrementally
save_embeddings_incrementally(reviews, model, interval=100)


100%|██████████| 71108/71108 [14:08<00:00, 83.81it/s]  


In [151]:
# Load the incremental embeddings
embeddings_df = pd.read_pickle('Pickle/review_embeddings.pkl')

# Ensure the reviews DataFrame has a proper index
reviews.reset_index(drop=True, inplace=True)

# Initialize the embeddings column in reviews DataFrame
reviews['embeddings'] = None

# Merge embeddings back into the reviews DataFrame
for i in tqdm(range(len(reviews))):
    if i in embeddings_df['index'].values:
        embedding = embeddings_df.loc[embeddings_df['index'] == i, 'embeddings'].values[0]
        reviews.at[i, 'embeddings'] = embedding

# Ensure all embeddings are numpy arrays
def convert_to_array(x):
    if isinstance(x, list):
        return np.array(x)
    return x

reviews['embeddings'] = reviews['embeddings'].apply(convert_to_array)

100%|██████████| 71108/71108 [04:48<00:00, 246.68it/s]


In [152]:
def calculate_weighted_embeddings(reviews_df, base_weight=0.1):
    # Normalize n_votes and n_comments
    reviews_df.loc[:, 'n_votes_normalized'] = reviews_df['n_votes'] / reviews_df['n_votes'].max()
    reviews_df.loc[:, 'n_comments_normalized'] = reviews_df['n_comments'] / reviews_df['n_comments'].max()

    # Calculate weights with a base weight
    reviews_df.loc[:, 'weight'] = base_weight + reviews_df['n_votes_normalized'] + reviews_df['n_comments_normalized']

    # Apply weights to embeddings
    reviews_df.loc[:, 'weighted_embeddings'] = reviews_df.apply(lambda row: row['embeddings'] * row['weight'], axis=1)
    return reviews_df

# Apply the function
reviews = calculate_weighted_embeddings(reviews)


In [153]:
# Aggregate weighted embeddings by user
user_embeddings = reviews.groupby('user_id')['weighted_embeddings'].apply(lambda x: np.mean(np.vstack(x.dropna()), axis=0)).reset_index()

# Calculate User-User Similarity
def calculate_user_similarity(user_embeddings):
    user_features = np.vstack(user_embeddings['weighted_embeddings'])
    user_similarity = cosine_similarity(user_features, user_features)
    return user_similarity

user_similarity = calculate_user_similarity(user_embeddings)

In [154]:
books['average_rating'] = books['average_rating'].astype(float)

In [155]:
def recommend_books(user_id, num_recommendations=5):
    # Check if user_id is in the user_embeddings dataframe
    if user_id not in user_embeddings['user_id'].values:
        print("User not found in user embeddings.")
        return pd.DataFrame(columns=['book_id', 'title'])

    # Identify similar users
    user_index = user_embeddings[user_embeddings['user_id'] == user_id].index[0]
    similar_user_indices = user_similarity[user_index].argsort()[-(num_recommendations+10):-1][::-1]  # Consider more similar users
    similar_user_ids = user_embeddings.iloc[similar_user_indices]['user_id'].values

    # Ensure similar_user_ids are in the read DataFrame
    valid_similar_user_ids = [uid for uid in similar_user_ids if uid in read['user_id'].unique()]

    if len(valid_similar_user_ids) == 0:
        print("No valid similar users found.")
        return pd.DataFrame(columns=['book_id', 'title'])

    # Get books read by similar users
    similar_users_books = read[read['user_id'].isin(valid_similar_user_ids) & (read['is_read'] == 1)]['book_id'].unique()

    # Get books the user has read
    user_books = read[(read['user_id'] == user_id) & (read['is_read'] == 1)]['book_id'].unique()

    # Filter out books already read by the user
    recommended_books = [book_id for book_id in similar_users_books if book_id not in user_books]

    # Filter books with an average rating of 3 or above
    recommended_books = books[(books['book_id'].isin(recommended_books)) & (books['average_rating'] >= 3)]

    if len(recommended_books) == 0:
        print("No new books to recommend.")
        return pd.DataFrame(columns=['book_id', 'title'])

    # Get details of the recommended books
    recommended_books_df = recommended_books.head(num_recommendations)
    return recommended_books_df[['book_id', 'title']]
