In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
tqdm.pandas()

In [24]:
# Load data
read = pd.read_pickle('Pickle/read.pkl')
books = pd.read_pickle('Pickle/books.pkl')
reviews = pd.read_pickle('Pickle/reviews.pkl')

In [25]:
embeddings_df = pd.read_pickle('Pickle/review_embeddings.pkl')

In [26]:
# Prepare embeddings
embeddings_df.set_index('review_id', inplace=True)
def get_review_embedding(review_id): 
    try: return embeddings_df.at[review_id, 'embeddings'] 
    except KeyError: return None # Return None if the review_id is not found 
reviews['embeddings'] = reviews['review_id'].progress_apply(get_review_embedding) 
reviews = reviews.dropna(subset=['embeddings'])

100%|██████████| 1000000/1000000 [00:07<00:00, 141515.25it/s]


In [27]:
review_counts = reviews['user_id'].value_counts()
valid_users = review_counts[review_counts >= 3].index
reviews = reviews[reviews['user_id'].isin(valid_users)]

In [28]:
# Get unique user IDs from all relevant DataFrames
interaction_user_ids = set(read['user_id'].unique())
review_user_ids = set(reviews['user_id'].unique())
read_user_ids = set(read['user_id'].unique())

In [29]:
# Find common user IDs
common_user_ids = interaction_user_ids.intersection(review_user_ids).intersection(read_user_ids)

In [30]:
user_review_embeddings = reviews.groupby('user_id')['embeddings'].apply(lambda x: np.mean(np.vstack(x.dropna()), axis=0)).reset_index()

In [31]:
user_review_embeddings.set_index('user_id', inplace=True)

In [32]:
def calculate_weighted_embeddings(reviews_df, base_weight=0.6, min_weight=0.6):
    # Create a copy of the DataFrame
    reviews_df = reviews_df.copy()

    # Normalize the number of votes
    if reviews_df['n_votes'].max() > 0:  # Ensure no division by zero
        reviews_df['n_votes_normalized'] = reviews_df['n_votes'] / reviews_df['n_votes'].max()
    else:
        reviews_df['n_votes_normalized'] = 0

    # Calculate weights with a base weight and ensure min weight is applied
    reviews_df['weight'] = base_weight + reviews_df['n_votes_normalized']
    reviews_df['weight'] = reviews_df['weight'].apply(lambda x: max(x, min_weight))

    # Apply weights to embeddings
    reviews_df['embeddings'] = reviews_df.apply(lambda row: row['embeddings'] * row['weight'], axis=1)

    return reviews_df

# Apply the function
reviews = calculate_weighted_embeddings(reviews)


In [33]:
# Train-test split
train_reviews, test_reviews = train_test_split(reviews[reviews['user_id'].isin(common_user_ids)], test_size=0.2, random_state=42)
train_users = train_reviews['user_id'].unique()
test_users = test_reviews['user_id'].unique()
train_read = read[read['user_id'].isin(train_users)]
test_read = read[read['user_id'].isin(test_users)]

In [34]:
# Reading history
user_books_read = read.groupby('user_id')['book_id'].apply(list).reset_index()
user_books_read.set_index('user_id', inplace=True)

In [35]:
# Most commonly read genres using filtered_genres
user_genres = read.merge(books[['book_id', 'filtered_genres']], on='book_id', how='left')
def most_common_genres(books_read, n=3):
    genres = books_read['filtered_genres'].value_counts().index.tolist()[:n]
    return genres

user_most_common_genres = user_genres.groupby('user_id').progress_apply(most_common_genres).reset_index()
user_most_common_genres.columns = ['user_id', 'most_common_genres']
user_most_common_genres.set_index('user_id', inplace=True)

100%|██████████| 533907/533907 [01:34<00:00, 5666.56it/s]


In [36]:
# Combine user features
user_features = pd.concat([user_review_embeddings, user_books_read, user_most_common_genres], axis=1)
user_features = user_features.dropna(subset=['embeddings', 'book_id', 'most_common_genres'])

In [37]:
# Combine all features into a single vector
mlb = MultiLabelBinarizer()
user_genre_features = pd.DataFrame(mlb.fit_transform(user_features['most_common_genres']), index=user_features.index)
combined_features = user_features.apply(lambda row: np.concatenate([row['embeddings'], user_genre_features.loc[row.name]]), axis=1)
user_features['combined'] = combined_features

In [38]:
# Calculate Jaccard similarity between sets of books read
def jaccard_similarity(list1, list2):
    set1, set2 = set(list1), set(list2)
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0

# Create a matrix to store Jaccard similarities
num_users = len(user_features)
jaccard_similarity_matrix = np.zeros((num_users, num_users))

for i, user1 in enumerate(user_features.index):
    for j, user2 in enumerate(user_features.index):
        if i <= j:  # Calculate only once for each pair
            jaccard_sim = jaccard_similarity(user_features.at[user1, 'book_id'], user_features.at[user2, 'book_id'])
            jaccard_similarity_matrix[i, j] = jaccard_similarity_matrix[j, i] = jaccard_sim


In [39]:
# Calculate user similarity using the combined features
cosine_sim = cosine_similarity(np.vstack(user_features['combined']))
combined_similarity = (cosine_sim + jaccard_similarity_matrix) / 2
user_ids = user_features.index.tolist()

In [None]:
def recommend_books(user_id, user_similarity, user_ids, books, train_read, num_recommendations=20, include_read=False):
    if user_id not in user_ids:
        print(f"User {user_id} not found in user ids.")
        return pd.DataFrame(columns=['book_id', 'title'])
    
    try:
        # Find the index of the user_id in user_ids list without np.where
        user_index = user_ids.index(user_id)
    except ValueError:
        print(f"User ID {user_id} not found in user IDs list.")
        return pd.DataFrame(columns=['book_id', 'title'])
    
    similar_user_indices = user_similarity[user_index].argsort()[-(num_recommendations + 20):-1][::-1]
    similar_user_ids = [user_ids[i] for i in similar_user_indices]
    valid_similar_user_ids = [uid for uid in similar_user_ids if uid in train_read['user_id'].unique()]

    if len(valid_similar_user_ids) == 0:
        print(f"No valid similar users found for user {user_id}.")
        return pd.DataFrame(columns=['book_id', 'title'])

    similar_users_books = train_read[train_read['user_id'].isin(valid_similar_user_ids) & (train_read['is_read'] == 1)]['book_id'].unique()

    if not include_read:
        user_books = train_read[(train_read['user_id'] == user_id) & (train_read['is_read'] == 1)]['book_id'].unique()
        recommended_books = [book_id for book_id in similar_users_books if book_id not in user_books]
    else:
        recommended_books = similar_users_books

    if len(recommended_books) == 0:
        print(f"No new books to recommend for user {user_id}.")
        return pd.DataFrame(columns=['book_id', 'title'])

    recommended_books_df = books[books['book_id'].isin(recommended_books)].head(num_recommendations)
    return recommended_books_df[['book_id', 'title']]


In [52]:
from tqdm import tqdm

def evaluate_model(test_users, user_similarity, user_ids, books, test_read, k=5):
    def precision_at_k(y_true, y_pred, k):
        y_true_set = set(y_true)
        y_pred_k = y_pred[:k]
        return len(set(y_pred_k) & y_true_set) / k

    def recall_at_k(y_true, y_pred, k):
        y_true_set = set(y_true)
        y_pred_k = y_pred[:k]
        return len(set(y_pred_k) & y_true_set) / len(y_true_set)

    def ndcg_at_k(y_true, y_pred, k):
        def dcg(relevance_scores):
            return sum((2**rel - 1) / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores))

        y_true_set = set(y_true)
        y_pred_k = y_pred[:k]
        relevance_scores = [1 if item in y_true_set else 0 for item in y_pred_k]
        ideal_relevance_scores = [1] * min(len(y_true), k) + [0] * (k - min(len(y_true), k))
        return dcg(relevance_scores) / dcg(ideal_relevance_scores)

    precision_scores = []
    recall_scores = []
    ndcg_scores = []

    for user_id in tqdm(test_users):
        if user_id not in user_ids:
            continue
        
        # Actual books read by the user, filtered to common user IDs
        actual_books = set(read[(read['user_id'] == user_id) & (read['is_read'] == 1)]['book_id'])        

        # Get top K recommended books
        recommended_books = recommend_books(user_id, user_similarity, user_ids, books, test_read, num_recommendations=k, include_read=True)['book_id'].tolist()

        if len(recommended_books) == 0:
            continue

        precision_scores.append(precision_at_k(actual_books, recommended_books, k))
        recall_scores.append(recall_at_k(actual_books, recommended_books, k))
        ndcg_scores.append(ndcg_at_k(actual_books, recommended_books, k))

    precision_avg = np.mean(precision_scores)
    recall_avg = np.mean(recall_scores)
    ndcg_avg = np.mean(ndcg_scores)

    return precision_avg, recall_avg, ndcg_avg


In [53]:
# Example usage for evaluation:
precision, recall, ndcg = evaluate_model(test_users, combined_similarity, user_ids, books, test_read, k=10)
print(f"Precision@K: {precision:.4f}, Recall@K: {recall:.4f}, NDCG@K: {ndcg:.4f}")

100%|██████████| 10605/10605 [08:56<00:00, 19.76it/s]

Precision@K: 0.1308, Recall@K: 0.2739, NDCG@K: 0.2314



