In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm


  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load data
read = pd.read_feather('Feather/read.feather')
books = pd.read_feather('Feather/books.feather')

In [3]:
read = read[read['is_read'] == 1]

In [4]:
read = read.sample(n=100000, random_state=42)

In [5]:
# Function to drop empty rows
def drop_empty_rows(df, column_name):
    df_cleaned = df.dropna(subset=[column_name])
    return df_cleaned

books = drop_empty_rows(books, 'description')
books = books[books['language_code'] == 'eng'].sample(n=50000, random_state=42)
books = books.reset_index(drop=True)

In [6]:
model = SentenceTransformer('all-MiniLM-L6-v2')
tqdm.pandas()
books['embeddings'] = books['combined_text'].progress_apply(lambda x: model.encode(x))
embedding_matrix = np.vstack(books['embeddings'].values)
print(embedding_matrix.shape)


100%|██████████| 50000/50000 [27:31<00:00, 30.28it/s]


(50000, 384)


In [7]:
# Reduce dimensionality
pca = PCA(n_components=50)
embedding_matrix = pca.fit_transform(embedding_matrix)
cosine_sim = cosine_similarity(embedding_matrix, embedding_matrix)

In [8]:
# Define function to get recommendations
def get_recommendations(book_id, books_df, cosine_sim):
    if book_id not in books_df['book_id'].values:
        print(f"Book ID {book_id} not found in the books dataframe.")
        return pd.DataFrame(columns=['title', 'authors'])

    book_idx = books_df[books_df['book_id'] == book_id].index[0]
    sim_scores = list(enumerate(cosine_sim[book_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_book_ids = [books_df['book_id'].iloc[i[0]] for i in sim_scores[1:6]]
    top_books = books_df[books_df['book_id'].isin(top_book_ids)]

    return top_books[['title', 'authors']]

In [9]:
#Function to filter high-rated books
def filter_high_rated_books(ratings_df, min_rating=3):
    read_ratings = ratings_df[ratings_df['is_read'] == 1]
    average_ratings = read_ratings.groupby('book_id')['rating'].mean().reset_index()
    high_rated_books = average_ratings[average_ratings['rating'] >= min_rating]['book_id'].values
    return high_rated_books

# Split the read DataFrame into train and test sets
train_read, test_read = train_test_split(read, test_size=0.2, random_state=42)

# Filter high-rated books
high_rated_books = filter_high_rated_books(train_read, min_rating=3)

In [10]:
def predict_ratings(user_id, book_id, train_df, cosine_sim, books_df):
    user_books = train_df[train_df['user_id'] == user_id]['book_id'].values
    if len(user_books) == 0:
        return np.mean(train_df['rating']), []
    
    sim_scores = []
    for user_book in user_books:
        if user_book in books_df['book_id'].values:
            book_idx = books_df[books_df['book_id'] == user_book].index[0]
            sim_scores.append(cosine_sim[book_idx])

    if len(sim_scores) == 0:
        return np.mean(train_df['rating']), []

    avg_sim_scores = np.mean(sim_scores, axis=0)
    similar_books = sorted(zip(books_df['book_id'].values, avg_sim_scores), key=lambda x: x[1], reverse=True)
    recommended_books = [book for book, score in similar_books if book in high_rated_books]

    if len(recommended_books) == 0:
        return np.mean(train_df['rating']), []

    return np.mean(train_df[train_df['book_id'].isin(recommended_books)]['rating'].values), recommended_books[:5]

In [11]:
read['user_id']

224651417    768517
129149545    265659
29523005      58105
152786040    317211
215104984    511587
              ...  
76803624     154460
112029135    229036
140360662    289563
174734593    365740
219047790    613287
Name: user_id, Length: 100000, dtype: int64

In [12]:
print(predict_ratings(user_id = 0, book_id = 21523717, train_df = train_read, cosine_sim = cosine_sim, books_df = books))

(3.673525, [])


In [13]:
# Precision@k function
def precision_at_k(recommended_books, relevant_books, k):
    recommended_books = recommended_books[:k]
    relevant_set = set(relevant_books)
    recommended_set = set(recommended_books)
    return len(recommended_set & relevant_set) / k

# Recall@k function
def recall_at_k(recommended_books, relevant_books, k):
    relevant_set = set(relevant_books)
    if len(relevant_set) == 0:
        return 0
    recommended_set = set(recommended_books[:k])
    return len(recommended_set & relevant_set) / len(relevant_set)


In [14]:
# Evaluation function
def evaluate_model(test_df, train_df, cosine_sim, books_df, k=5):
    predictions = []
    actuals = []
    precisions = []
    recalls = []

    for index, row in test_df.iterrows():
        user_id = row['user_id']
        book_id = row['book_id']
        actual_rating = row['rating']
        predicted_rating, recommended_books = predict_ratings(user_id, book_id, train_df, cosine_sim, books_df)
        predictions.append(predicted_rating)
        actuals.append(actual_rating)
        
        # Precision@k and Recall@k
        user_relevant_books = train_df[(train_df['user_id'] == user_id) & (train_df['rating'] >= 3)]['book_id'].values
        precisions.append(precision_at_k(recommended_books, user_relevant_books, k))
        recalls.append(recall_at_k(recommended_books, user_relevant_books, k))

    mse = mean_squared_error(actuals, predictions)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)

    return mse, mean_precision, mean_recall

# Example usage:
k = 5
mse, mean_precision, mean_recall = evaluate_model(test_read, train_read, cosine_sim, books, k)
print(f"Mean Squared Error: {mse}")
print(f"Precision@{k}: {mean_precision}")
print(f"Recall@{k}: {mean_recall}")


Mean Squared Error: 1.859807564987517
Precision@5: 0.00064
Recall@5: 0.002058611111111111
