In [None]:
import os
from itertools import islice

import nltk
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): 'all' requests every NLTK package, and no nltk function is
# called in the visible cells -- confirm this download is needed, or narrow it
# to the specific resources that are actually used.
nltk.download('all')

In [2]:
# Read the interactions table; the first CSV row is used as the header.
# NOTE(review): the next cell reads the same file into the same name again
# (via os.path.join(DIR, ...)), so one of the two reads is redundant.
interactions_csv = pd.read_csv('Data/goodreads_interactions.csv', header = 0)

In [None]:
# Directory (relative to the working directory) that holds the input files
# joined with os.path.join further down in this cell.
DIR = 'Data'

def load_data(file_path, chunk_size, num_chunks):
    """Load the first ``num_chunks`` chunks of a gzip-compressed JSON-lines file.

    Parameters
    ----------
    file_path : str or path-like
        Path to a gzip-compressed file with one JSON document per line.
    chunk_size : int
        Number of lines parsed per chunk.
    num_chunks : int or None
        Maximum number of chunks to read; ``None`` reads every chunk.

    Returns
    -------
    pandas.DataFrame
        The chunks concatenated with a fresh 0..n-1 index, or an empty
        DataFrame when no chunk was read.
    """
    with pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip') as reader:
        # islice stops pulling from the reader once num_chunks chunks have been
        # produced, so the remainder of the file is never decompressed or parsed.
        frames = list(islice(reader, num_chunks))
    if not frames:
        # pd.concat raises on an empty list; an empty frame is easier to handle.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Load interactions data (first CSV row is the header).
# NOTE(review): this repeats the read done in the previous cell from the
# literal path 'Data/goodreads_interactions.csv'.
interactions_csv = pd.read_csv(os.path.join(DIR, 'goodreads_interactions.csv'), header=0)

# Load books data: at most 100 chunks of 1000 lines each.
books_file_path = os.path.join(DIR, 'goodreads_books.json.gz')
books = load_data(books_file_path, chunk_size=1000, num_chunks=100)

# Load genres data: only the first chunk of 1000 lines is kept.
# NOTE(review): books are later left-merged with this frame, so any book whose
# id is not among these rows gets no genre data -- confirm num_chunks=1 is
# intended.
genres_file_path = os.path.join(DIR, 'goodreads_book_genres_initial.json.gz')
genres = load_data(genres_file_path, chunk_size=1000, num_chunks=1)

In [4]:
def remove_blank_rows(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` text is not blank.

    A value is blank when it equals the empty string after surrounding
    whitespace is stripped. The input frame is not modified.

    Note: in an object-dtype column, missing values (NaN/None) compare
    unequal to '' and are therefore kept.
    """
    stripped_text = df[column_name].str.strip()
    has_text = stripped_text.ne('')
    return df[has_text]

# Drop books whose description is empty or whitespace-only.
books = remove_blank_rows(books, 'description')

In [6]:
# Keep only the interactions flagged as read (is_read == 1).
interactions_csv = interactions_csv[interactions_csv['is_read'] == 1]

In [7]:
# Sampling parameters for the working subset.
MIN_BOOKS_PER_USER = 3     # a user needs at least this many interaction rows
NUM_SAMPLED_USERS = 10     # number of users drawn at random
TARGET_NUM_BOOKS = 10000   # size the book table is padded up to
SAMPLING_SEED = 42         # fixed seed so the subset is reproducible

# Users with at least MIN_BOOKS_PER_USER rows, in order of first appearance.
# A vectorised group-size mask replaces groupby().filter(lambda ...), which
# calls Python once per user; the resulting Series is the same.
rows_per_user = interactions_csv.groupby('user_id')['user_id'].transform('size')
eligible_users = interactions_csv.loc[rows_per_user >= MIN_BOOKS_PER_USER, 'user_id'].drop_duplicates()

# Sample NUM_SAMPLED_USERS random users (all of them if fewer are eligible;
# Series.sample raises when asked for more rows than exist).
sampled_users = eligible_users.sample(
    n=min(NUM_SAMPLED_USERS, len(eligible_users)),
    random_state=SAMPLING_SEED,
)

# Get all books read by these sampled users
# NOTE(review): this assumes interactions_csv['book_id'] and books['book_id']
# share one ID space and dtype -- confirm, otherwise isin() matches nothing.
sampled_books = interactions_csv[interactions_csv['user_id'].isin(sampled_users)]['book_id'].unique()

# Ensure the working book table contains these books
filtered_books_df = books[books['book_id'].isin(sampled_books)]

# Pad with random other books up to TARGET_NUM_BOOKS. `books` is now always
# rebuilt: previously it was left as the full table whenever the sampled users'
# books already reached the target, so the unfiltered table flowed downstream.
additional_books_needed = max(TARGET_NUM_BOOKS - len(filtered_books_df), 0)
remaining_books = books[~books['book_id'].isin(filtered_books_df['book_id'])]
additional_books = remaining_books.sample(
    n=min(additional_books_needed, len(remaining_books)),  # never ask for more rows than exist
    random_state=SAMPLING_SEED,
)
books = pd.concat([filtered_books_df, additional_books], ignore_index=True)

In [8]:
def extract_genres(genre_dict):
    """Return the keys of ``genre_dict`` as a list, in the mapping's own order."""
    return [genre_name for genre_name in genre_dict.keys()]

# Add a 'genre_names' column holding the list of keys of each row's 'genres'
# mapping.
genres['genre_names'] = genres['genres'].apply(extract_genres)

In [9]:
# Order both tables by book_id (ascending, the sort_values default).
genres = genres.sort_values(by='book_id')
books = books.sort_values(by='book_id')

In [10]:
# Work on an independent copy, keep only the four columns listed below, and
# attach the genre columns. how='left' keeps every book; a book with no
# matching book_id in `genres` gets missing values in the genre columns.
books = books.copy()
books = books[['description', 'title', 'authors', 'book_id']]
books = pd.merge(books, genres, on='book_id', how='left')

In [11]:
# Give both frames a fresh 0..n-1 index (the old index is discarded), so that
# index labels and row positions coincide. The similarity matrix built later is
# stacked from `books` in row order, so row i of `books` must match row i there.
books.reset_index(drop=True, inplace=True)
interactions_csv.reset_index(drop=True, inplace=True)

In [12]:
def combine_description_and_genres(row):
    """Build the text to embed for one book: description followed by genres.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'description' and 'genre_names'. 'genre_names' is an
        iterable of strings, or missing (None / NaN) when the book has no
        genre data.

    Returns
    -------
    str
        ``"<description> <genre names joined by single spaces>"``. When there
        are no genres the result is the description followed by one space, the
        same as for an empty genre list.
    """
    genre_names = row['genre_names']
    # Books without a match in the left merge carry NaN (a float) here, and
    # ' '.join(NaN) raises TypeError. A float or None can never be a valid
    # list of names, so treat both as "no genres".
    if genre_names is None or isinstance(genre_names, float):
        genre_names = []
    genres_str = ' '.join(genre_names)
    return f"{row['description']} {genres_str}"

# Build, row by row, the text that is embedded further down.
books['combined_text'] = books.apply(combine_description_and_genres, axis=1)

In [None]:
# Instantiate the SentenceTransformer identified by 'all-MiniLM-L6-v2'; it is
# used below to encode each book's combined text.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
from tqdm import tqdm
tqdm.pandas()  # kept so progress_apply stays registered for any other cell that relies on it

# Encode every text in one batched call instead of one model.encode() call per
# row: the model then processes the texts in batches, and show_progress_bar
# keeps the progress display. encode() returns one row per input text, in input
# order; list() splits that result into one vector per book.
books['embeddings'] = list(
    model.encode(books['combined_text'].tolist(), show_progress_bar=True)
)

In [20]:
# Stack the per-book vectors into one 2-D array with one row per book, in the
# row order of `books`.
embedding_matrix = np.vstack(books['embeddings'].values)

In [None]:
# Project the book embeddings onto 50 principal components.
# - random_state is fixed so the result is reproducible when scikit-learn
#   selects a randomized SVD solver.
# - The input is rebuilt from books['embeddings'] rather than read from
#   `embedding_matrix`, so re-running this cell does not apply PCA to its own
#   previous output.
pca = PCA(n_components=50, random_state=42)
embedding_matrix = pca.fit_transform(np.vstack(books['embeddings'].values))

In [21]:
# Pairwise cosine similarity between all rows of embedding_matrix: entry
# [i, j] compares row i with row j, giving a dense n x n array for n rows.
cosine_sim = cosine_similarity(embedding_matrix, embedding_matrix)

Given a book, get the top 5 most similar books.

In [22]:
def get_recommendations(book_id, books_df, cosine_sim, top_n=5):
    """Return the books most similar to ``book_id``.

    Parameters
    ----------
    book_id : scalar
        Identifier to look up in ``books_df['book_id']``.
    books_df : pandas.DataFrame
        Needs 'book_id', 'title' and 'authors' columns. Row *position* i must
        correspond to row/column i of ``cosine_sim``; the index labels are not
        used.
    cosine_sim : 2-D array-like
        Square matrix of pairwise similarities between the rows of
        ``books_df``.
    top_n : int, default 5
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        'title' and 'authors' of up to ``top_n`` other books, most similar
        first. Empty (with the same two columns) when ``book_id`` is unknown,
        in which case a message is printed.
    """
    # Locate the book by row position rather than index label, so the lookup
    # stays correct even if books_df does not carry a 0..n-1 index.
    matches = np.flatnonzero(books_df['book_id'].to_numpy() == book_id)
    if matches.size == 0:
        print(f"Book ID {book_id} not found in the books dataframe.")
        return pd.DataFrame(columns=['title', 'authors'])
    book_pos = matches[0]

    # Rank every book by similarity, highest first; a stable sort keeps ties in
    # row order.
    scores = np.asarray(cosine_sim[book_pos])
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query book explicitly. Skipping "the first result" is wrong
    # when another book has an identical score and sorts ahead of it.
    ranked = ranked[ranked != book_pos][:top_n]

    return books_df.iloc[ranked][['title', 'authors']]



In [None]:
# Example: print the books most similar to book 1402.
book_id = 1402	
recommendations = get_recommendations(book_id, books, cosine_sim)
print(recommendations)


In [24]:
def get_top_rated_books(books_df, interactions_df, top_n=5):
    """Return the best-rated books that exist in ``books_df``.

    Parameters
    ----------
    books_df : pandas.DataFrame
        Needs 'book_id', 'title' and 'authors' columns.
    interactions_df : pandas.DataFrame
        Needs 'book_id' and 'rating' columns.
    top_n : int, default 5
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        'title' and 'authors' of up to ``top_n`` books, ordered by mean rating
        over all interactions, highest first.
    """
    # Restrict to catalogue books *before* taking the top N. Otherwise the N
    # best-rated books overall may be missing from books_df and the result
    # shrinks, possibly to nothing.
    catalog_rows = interactions_df[interactions_df['book_id'].isin(books_df['book_id'])]
    mean_ratings = catalog_rows.groupby('book_id')['rating'].mean()
    top_rated = mean_ratings.sort_values(ascending=False, kind='mergesort').head(top_n)

    # The ranking is the left operand: an inner merge preserves the order of
    # the left keys, so the rating order survives the join.
    ranked_books = pd.merge(top_rated.reset_index(), books_df, on='book_id', how='inner')
    return ranked_books[['title', 'authors']]

Given a user, get the top 5 most similar books based on all the books that user has read.

In [27]:
def get_user_recommendations(user_id, books_df, interactions_df, cosine_sim, top_n=5):
    """Recommend unread books that are, on average, closest to a user's read books.

    Parameters
    ----------
    user_id : scalar
        Identifier to look up in ``interactions_df['user_id']``.
    books_df : pandas.DataFrame
        Needs 'book_id', 'title' and 'authors' columns. Row *position* i must
        correspond to row/column i of ``cosine_sim``; the index labels are not
        used.
    interactions_df : pandas.DataFrame
        Needs 'user_id', 'book_id' and 'is_read' columns.
    cosine_sim : 2-D array-like
        Square matrix of pairwise similarities between the rows of
        ``books_df``.
    top_n : int, default 5
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        'book_id', 'title' and 'authors' of up to ``top_n`` unread books, most
        similar first. An empty frame with those three columns is returned
        (and a message printed) when the user is unknown or none of their
        books is in ``books_df``. When the user has no read books, or has read
        every book in ``books_df``, the result of ``get_top_rated_books`` is
        returned unchanged.
    """
    # One column set for every empty result, matching the success path, so
    # callers can always select 'book_id'.
    result_columns = ['book_id', 'title', 'authors']

    user_rows = interactions_df[interactions_df['user_id'] == user_id]
    if user_rows.empty:
        print(f"User {user_id} not found in the interactions dataframe.")
        return pd.DataFrame(columns=result_columns)

    # Books read by user
    user_books = user_rows.loc[user_rows['is_read'] == 1, 'book_id']
    if user_books.empty:
        return get_top_rated_books(books_df, interactions_df)

    # Row positions (not index labels) of read and unread books, so the lookups
    # into cosine_sim and iloc stay aligned whatever index books_df carries.
    read_mask = books_df['book_id'].isin(user_books).to_numpy()
    read_positions = np.flatnonzero(read_mask)
    if read_positions.size == 0:
        print(f"No valid book indices found for user {user_id}.")
        return pd.DataFrame(columns=result_columns)

    unread_positions = np.flatnonzero(~read_mask)
    if unread_positions.size == 0:
        return get_top_rated_books(books_df, interactions_df)

    # Mean similarity of every book to the books the user has read.
    sim_scores = np.asarray(cosine_sim)[read_positions].mean(axis=0)

    # Rank unread books by that score, highest first; a stable sort keeps ties
    # in row order.
    order = np.argsort(-sim_scores[unread_positions], kind='stable')
    top_positions = unread_positions[order[:top_n]]

    return books_df.iloc[top_positions][result_columns]


In [None]:
# Example: print the recommendations for user 382087.
user_id = 382087            

recommendations = get_user_recommendations(user_id, books, interactions_csv, cosine_sim)
print(recommendations)

In [None]:
# Same call as the previous example, for user 190085.
user_id = 190085                      

recommendations = get_user_recommendations(user_id, books, interactions_csv, cosine_sim)
print(recommendations)

In [None]:
# Same call again, for user 722257.
user_id = 722257                                            

recommendations = get_user_recommendations(user_id, books, interactions_csv, cosine_sim)
print(recommendations)

In [None]:
# Same call again, for user 69996.
user_id = 69996                              

recommendations = get_user_recommendations(user_id, books, interactions_csv, cosine_sim)
print(recommendations)

In [48]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import numpy as np

def evaluate_recommendations(user_id, books_df, interactions_df, cosine_sim, k=5):
    """Score the recommendations produced for one user.

    Parameters
    ----------
    user_id : scalar
        User to evaluate.
    books_df, interactions_df, cosine_sim
        Passed through to ``get_user_recommendations``. ``interactions_df``
        must also provide 'is_read' and 'rating' columns.
    k : int, default 5
        Cut-off for NDCG.

    Returns
    -------
    dict
        ``{'ndcg_at_k': ..., 'mae': ..., 'rmse': ...}``.

        * ``ndcg_at_k`` ranks the recommended books, in recommendation order,
          by the user's own rating of them. When the user has rated none of
          them, each book's mean rating over all interactions is used instead
          (0 where a book has no rating).
        * ``mae`` / ``rmse`` compare the user's own rating of each recommended
          book with that book's mean rating over all interactions, which
          serves as the predicted rating. They are NaN when the user has rated
          none of the recommended books.

        All three are NaN when the user has no read books or when the
        recommendations carry no 'book_id' column.

    Notes
    -----
    The recommender does not predict ratings, so the per-book mean rating is
    used as the prediction baseline. The previous code compared the ratings
    with themselves, which always gave MAE = RMSE = 0.
    Since ``get_user_recommendations`` excludes books the user has read, own
    ratings are usually absent and the mean-rating fallback is what normally
    feeds NDCG.
    """
    nan = float('nan')
    no_metrics = {'ndcg_at_k': nan, 'mae': nan, 'rmse': nan}

    def dcg(ratings):
        # Discounted cumulative gain: rating at rank i (0-based) / log2(i + 2).
        ratings = np.asarray(ratings, dtype=float)
        discounts = np.log2(np.arange(2, ratings.size + 2))
        return float(np.sum(ratings / discounts))

    def ndcg(ratings):
        # DCG normalised by the DCG of the ideal (descending) ordering.
        ideal = dcg(sorted(ratings, reverse=True))
        return dcg(ratings) / ideal if ideal > 0 else 0

    # Get recommendations
    recommendations = get_user_recommendations(user_id, books_df, interactions_df, cosine_sim)

    # A user without any read book cannot be evaluated.
    is_user = interactions_df['user_id'] == user_id
    if not (is_user & (interactions_df['is_read'] == 1)).any():
        return no_metrics

    # Some return paths of get_user_recommendations can yield a frame without a
    # 'book_id' column (e.g. the top-rated fallback); indexing it would raise
    # KeyError.
    if 'book_id' not in recommendations.columns or recommendations.empty:
        return no_metrics

    # Recommended ids in rank order, de-duplicated so they can serve as an
    # index for the lookups below.
    recommended_books = list(dict.fromkeys(recommendations['book_id'].tolist()))
    rec_rows = interactions_df[interactions_df['book_id'].isin(recommended_books)]

    # Mean rating per recommended book over all interactions, in rank order
    # (0 where a book has no rating at all).
    mean_ratings = (
        rec_rows.groupby('book_id')['rating'].mean()
        .reindex(recommended_books)
        .fillna(0)
    )

    # The user's own ratings of recommended books, in rank order. A missing
    # rating on an existing row counts as 0; books the user has no row for are
    # dropped.
    own_ratings = (
        rec_rows[rec_rows['user_id'] == user_id]
        .groupby('book_id')['rating'].mean()
        .fillna(0)
        .reindex(recommended_books)
        .dropna()
    )

    if own_ratings.empty:
        # Use average rating from overall interactions as fallback for ranking;
        # there is nothing to compare a prediction against.
        relevance = mean_ratings.tolist()
        mae = rmse = nan
    else:
        relevance = own_ratings.tolist()
        predicted = mean_ratings.reindex(own_ratings.index)
        mae = mean_absolute_error(own_ratings, predicted)
        rmse = root_mean_squared_error(own_ratings, predicted)

    return {
        'ndcg_at_k': ndcg(relevance[:k]),
        'mae': mae,
        'rmse': rmse
    }


In [None]:
# Example: print the evaluation metrics for user 382087 (default k).
user_id = 382087
evaluation_metrics = evaluate_recommendations(user_id, books, interactions_csv, cosine_sim)
print(evaluation_metrics)

In [None]:
def evaluate_recommendations_all_users(books_df, interactions_df, cosine_sim, k=5):
    """Average the per-user evaluation metrics over every user.

    ``evaluate_recommendations`` is called once for each distinct value of
    ``interactions_df['user_id']``, with ``k`` passed through.

    Returns
    -------
    dict
        Maps 'ndcg_at_k', 'mae' and 'rmse' to the mean of that metric across
        users, computed with ``np.nanmean`` so NaN entries are skipped.
    """
    metric_names = ('ndcg_at_k', 'mae', 'rmse')
    collected = {name: [] for name in metric_names}

    for uid in interactions_df['user_id'].unique():
        user_metrics = evaluate_recommendations(uid, books_df, interactions_df, cosine_sim, k)
        for name in metric_names:
            collected[name].append(user_metrics[name])

    # Aggregate: NaN-aware mean of each metric over all users.
    return {name: np.nanmean(values) for name, values in collected.items()}

# Example usage: evaluate every distinct user in interactions_csv and print the
# averaged metrics. This calls evaluate_recommendations once per user.
aggregated_evaluation_metrics = evaluate_recommendations_all_users(books, interactions_csv, cosine_sim)
print(aggregated_evaluation_metrics)
