In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

tqdm.pandas()

# Load data
read = pd.read_feather('Feather/read.feather')
books = pd.read_feather('Feather/books.feather')
reviews = pd.read_feather('Feather/reviews.feather')
interactions = pd.read_feather('Feather/interactions.feather')
# Keep only the rows whose is_read flag is set.
interactions = interactions[interactions['is_read'] == True]

# Clean reviews: drop rows without text or rating, then keep users with more
# than 3 reviews.
reviews = reviews.dropna(subset=['review_text', 'rating'])
user_review_counts = reviews.groupby('user_id').size()
users_with_more_than_3_reviews = user_review_counts[user_review_counts > 3].index
# .copy() makes valid_reviews an independent frame, so adding the embedding
# column below neither triggers SettingWithCopyWarning nor depends on whether
# pandas handed back a view or a copy of `reviews`.
# NOTE(review): head(1000) is applied after the per-user filter, so a user may
# end up with fewer than 4 reviews inside this sample - confirm this is intended.
valid_reviews = reviews[reviews['user_id'].isin(users_with_more_than_3_reviews)].head(1000).copy()

# Train-test split
train_interactions, test_interactions = train_test_split(interactions, test_size=0.2, random_state=42)

# Initialize Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Calculate text embeddings for reviews.
# All texts are encoded in one batched call instead of one encode() call per
# row; list() splits the resulting 2-D array into one vector per review.
review_vectors = model.encode(valid_reviews['review_text'].tolist(), show_progress_bar=True)
valid_reviews['review_embeddings'] = list(review_vectors)

In [None]:
# Collapse each user's review vectors into a single profile vector by stacking
# them row-wise and averaging over the rows.
embeddings_by_user = valid_reviews.groupby('user_id')['review_embeddings']
user_embeddings = embeddings_by_user.apply(
    lambda vectors: np.mean(np.vstack(vectors), axis=0)
).reset_index()

In [None]:
# Normalize ratings to [0, 1] with min-max scaling over the training split.
rating_min = train_interactions['rating'].min()
rating_range = train_interactions['rating'].max() - rating_min
if rating_range == 0:
    # Every rating is identical: avoid 0/0 (NaN) and map them all to 0.0.
    normalized_rating = train_interactions['rating'] * 0.0
else:
    normalized_rating = (train_interactions['rating'] - rating_min) / rating_range
# assign() returns a new frame, so we never write into the object handed back by
# train_test_split (which would raise SettingWithCopyWarning), and re-running
# this cell gives the same result.
train_interactions = train_interactions.assign(rating_normalized=normalized_rating)
user_ratings = train_interactions.groupby('user_id')['rating_normalized'].mean().reset_index()

# Merge features: pd.merge defaults to an inner join, so only users that have
# both an embedding and a mean rating are kept.
combined_features = pd.merge(user_embeddings, user_ratings, on='user_id')

In [None]:
# Calculate User-User Similarity
# Stack the per-user vectors into one (n_users, dim) matrix in a single call;
# .apply(pd.Series) would build a separate Series for every row first.
user_features = np.vstack(combined_features['review_embeddings'].tolist())
# Row/column i of the result corresponds to row i of combined_features.
user_similarity = cosine_similarity(user_features, user_features)
# Keep one row per book_id (the first occurrence).
books = books.drop_duplicates(subset='book_id')

In [None]:

# Function to recommend books based on user similarity
def recommend_books(user_id, user_similarity, interactions_df, books_df, combined_features, num_recommendations=5):
    """Recommend books read by the users most similar to ``user_id``.

    Parameters
    ----------
    user_id :
        Identifier looked up in ``combined_features['user_id']``.
    user_similarity :
        Square similarity matrix; row/column ``i`` must correspond to the
        ``i``-th row (by position) of ``combined_features``.
    interactions_df : pandas.DataFrame
        Needs ``user_id`` and ``book_id`` columns; defines what each user read.
    books_df : pandas.DataFrame
        Needs a ``book_id`` column; only books present here can be returned.
    combined_features : pandas.DataFrame
        Needs a ``user_id`` column, row-aligned with ``user_similarity``.
    num_recommendations : int, default 5
        Number of neighbours consulted and maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        A ``book_id`` column with at most ``num_recommendations`` rows, in
        ``books_df`` order. For a user that is absent from
        ``combined_features`` an empty frame with columns
        ``book_id``, ``title``, ``authors`` is returned.
    """
    user_ids = combined_features['user_id'].to_numpy()
    # Work with row *positions*, not index labels: the similarity matrix is
    # positional, so a filtered or re-indexed frame must still line up with it.
    positions = np.flatnonzero((combined_features['user_id'] == user_id).to_numpy())
    if positions.size == 0:
        return pd.DataFrame(columns=['book_id', 'title', 'authors'])
    user_pos = positions[0]

    # Identify similar users: rank by descending similarity, then drop the
    # user explicitly. Relying on "self is always last in argsort" fails when
    # another user ties with the self-similarity (e.g. identical embeddings).
    scores = np.asarray(user_similarity[user_pos])
    ranked = np.argsort(-scores, kind='stable')
    neighbours = ranked[ranked != user_pos][:num_recommendations]
    similar_user_ids = user_ids[neighbours]

    # Get books read by similar users
    similar_users_books = interactions_df.loc[
        interactions_df['user_id'].isin(similar_user_ids), 'book_id'
    ].unique()

    # Exclude books already read by the current user (a set gives O(1) lookups)
    user_books = set(interactions_df.loc[interactions_df['user_id'] == user_id, 'book_id'])
    recommended_books = [book for book in similar_users_books if book not in user_books]

    return books_df[books_df['book_id'].isin(recommended_books)][['book_id']].head(num_recommendations)

In [None]:

# RMSE function
def calculate_rmse(predictions, targets):
    """Return the root of the mean squared error between targets and predictions."""
    squared_error_mean = mean_squared_error(targets, predictions)
    return np.sqrt(squared_error_mean)

In [None]:
# Precision@k function
def precision_at_k(recommended_books, relevant_books, k):
    """Fraction of the top-``k`` recommendations that are relevant.

    Parameters
    ----------
    recommended_books : sequence
        Recommended item ids; only the first ``k`` are considered.
    relevant_books : iterable
        Item ids counted as hits.
    k : int
        Cut-off and denominator of the metric.

    Returns
    -------
    float
        ``|top_k & relevant| / k``. Duplicates in the top-``k`` count once.
        Returns 0.0 when ``k <= 0``: dividing by zero would raise, and a
        negative ``k`` would slice from the wrong end and give a negative score.
    """
    if k <= 0:
        return 0.0
    top_k = set(recommended_books[:k])
    hits = top_k & set(relevant_books)
    return len(hits) / k

# Recall@k function
def recall_at_k(recommended_books, relevant_books, k):
    """Share of the relevant items that appear in the first ``k`` recommendations.

    Returns 0 when there are no relevant items at all.
    """
    relevant = set(relevant_books)
    if not relevant:
        return 0
    found = relevant.intersection(recommended_books[:k])
    return len(found) / len(relevant)


In [None]:
# Evaluation function
def evaluate_model(test_df, user_similarity, interactions_df, books_df, combined_features, k=5, relevance_threshold=3):
    """Evaluate the recommender on held-out interactions.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Held-out interactions with ``user_id``, ``book_id`` and ``rating``.
    user_similarity :
        Similarity matrix passed through to ``recommend_books``.
    interactions_df : pandas.DataFrame
        Training interactions with ``user_id``, ``book_id`` and ``rating``.
        Assumed to be the frame whose ``rating`` min/max produced
        ``combined_features['rating_normalized']`` - confirm if called with
        differently scaled features.
    books_df : pandas.DataFrame
        Book catalogue passed through to ``recommend_books``.
    combined_features : pandas.DataFrame
        Per-user features with ``user_id`` and ``rating_normalized``.
    k : int, default 5
        Cut-off for recommendations, Precision@k and Recall@k.
    relevance_threshold : default 3
        A held-out book is relevant when its rating is >= this value.

    Returns
    -------
    tuple
        ``(rmse, mean_precision, mean_recall)``. RMSE is computed over every
        held-out row of the evaluated users; precision and recall are averaged
        once per evaluated user. All three are NaN when no user could be
        evaluated.
    """
    # Predictions are stored min-max normalised; map them back to the raw
    # rating scale so they are comparable with the held-out ratings.
    rating_min = interactions_df['rating'].min()
    rating_span = interactions_df['rating'].max() - rating_min
    predicted_by_user = (
        combined_features.drop_duplicates('user_id').set_index('user_id')['rating_normalized']
        * rating_span
        + rating_min
    )

    precisions = []
    recalls = []
    actuals = []
    predictions = []

    # One pass per user: recommendations depend only on the user, so there is
    # no need to recompute them for every held-out row.
    for user_id, user_rows in test_df.groupby('user_id', sort=False):
        recommended_books = recommend_books(user_id, user_similarity, interactions_df, books_df, combined_features, k)
        if len(recommended_books) == 0:
            continue

        predicted_rating = predicted_by_user[user_id]
        actuals.extend(user_rows['rating'].tolist())
        predictions.extend([predicted_rating] * len(user_rows))

        # Relevance must come from the held-out rows: recommend_books removes
        # everything the user read in interactions_df, so judging against
        # interactions_df could never produce a hit.
        user_relevant_books = user_rows.loc[user_rows['rating'] >= relevance_threshold, 'book_id'].values
        recommended_ids = recommended_books['book_id'].values
        precisions.append(precision_at_k(recommended_ids, user_relevant_books, k))
        recalls.append(recall_at_k(recommended_ids, user_relevant_books, k))

    if not actuals:
        # Nothing to score; mean_squared_error would raise on empty input.
        return float('nan'), float('nan'), float('nan')

    rmse = calculate_rmse(predictions, actuals)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)

    return rmse, mean_precision, mean_recall

In [None]:

# Run the evaluation on the held-out split and report the three metrics.
k = 5
rmse, mean_precision, mean_recall = evaluate_model(
    test_df=test_interactions,
    user_similarity=user_similarity,
    interactions_df=train_interactions,
    books_df=books,
    combined_features=combined_features,
    k=k,
)
print(f"RMSE: {rmse}")
print(f"Precision@{k}: {mean_precision}")
print(f"Recall@{k}: {mean_recall}")