In [None]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import resample
from collections import defaultdict
from surprise import accuracy

In [None]:
def load_data():
    """
    Read the three pickled DataFrames the notebook works with.

    The files are read from the relative ``../Pickle`` directory, in this order:
    'reviews.pkl', 'interactions.pkl' and 'read.pkl'.

    Returns:
        tuple: ``(reviews, interactions, read)`` where
            - reviews (DataFrame): contents of 'reviews.pkl'.
            - interactions (DataFrame): contents of 'interactions.pkl'.
            - read (DataFrame): contents of 'read.pkl'.
    """
    pickle_paths = (
        "../Pickle/reviews.pkl",
        "../Pickle/interactions.pkl",
        "../Pickle/read.pkl",
    )
    # Unpacking keeps the return order tied to the order of the paths above.
    reviews, interactions, read = (pd.read_pickle(path) for path in pickle_paths)
    return reviews, interactions, read


In [None]:
def preprocess_data(df):
    """
    Keep only interactions that were read and carry a non-zero rating.

    Steps:
    1. Drop every user whose ratings are all zero.
    2. Of the remaining rows, keep those with ``is_read != 0`` and ``rating != 0``.

    Args:
        df (DataFrame): Interaction data with at least the columns
                        'user_id', 'rating' and 'is_read'.

    Returns:
        DataFrame: The surviving rows in their original order, with a fresh
                   0..n-1 index.
    """
    # True for a user only when every one of that user's ratings equals zero.
    all_zero_by_user = df.groupby('user_id')['rating'].apply(
        lambda ratings: (ratings == 0).all()
    )
    zero_only_users = all_zero_by_user[all_zero_by_user].index
    kept = df[~df['user_id'].isin(zero_only_users)]

    # Both row conditions are applied in a single mask; the selected rows and
    # their order are the same as filtering on each condition in turn.
    read_and_rated = (kept['is_read'] != 0) & (kept['rating'] != 0)
    return kept[read_and_rated].reset_index(drop=True)


In [None]:
def split_data(df):
    """
    Split interactions into train and test sets, keeping singletons in train.

    A row is a "single interaction" when its user appears only once in ``df``
    or its book appears only once in ``df``. Those rows are never sent to the
    test set. All other rows are split 80/20 with a fixed seed, stratified on
    'rating', and the single interactions are then appended to the train part.

    Args:
        df (DataFrame): Interaction data with at least the columns
                        'user_id', 'book_id' and 'rating'.

    Returns:
        tuple: ``(train_df, test_df)``
            - train_df: 80% of the non-singleton rows plus every singleton row
              (index rebuilt).
            - test_df: the remaining 20% of the non-singleton rows.
    """
    users_seen_once = df['user_id'].value_counts().loc[lambda counts: counts == 1].index
    books_seen_once = df['book_id'].value_counts().loc[lambda counts: counts == 1].index

    is_single = df['user_id'].isin(users_seen_once) | df['book_id'].isin(books_seen_once)
    single_interactions = df[is_single]
    # Selected by index label, so rows sharing a label with a singleton are excluded too.
    remaining_interactions = df.loc[~df.index.isin(single_interactions.index)]

    train_part, test_df = train_test_split(
        remaining_interactions,
        test_size=0.2,
        random_state=42,
        stratify=remaining_interactions['rating'],
    )
    train_df = pd.concat([train_part, single_interactions], ignore_index=True)

    return train_df, test_df


In [None]:
def upsample_ratings(train_df):
    """
    Upsamples the ratings in the training set to balance the distribution of ratings.

    Every rating class with fewer rows than the largest class is resampled with
    replacement up to the size of the largest class. The resampled rows get a
    small uniform noise in [-0.1, 0.1] added to their rating, and the result is
    clipped to the valid rating range (1 to 5). Classes that already have the
    target size are copied unchanged. The balanced rows are shuffled and
    returned exactly once, so the result has
    ``number_of_classes * size_of_largest_class`` rows.

    Sampling, noise and shuffling all use fixed seeds, so repeated calls on the
    same input return the same frame.

    Args:
        train_df (DataFrame): Training interactions with a 'rating' column.

    Returns:
        DataFrame: The shuffled, class-balanced training data with a fresh
                   0..n-1 index. An empty input is returned as an empty copy.
    """
    # value_counts().max() is NaN for an empty frame, which int() cannot convert.
    if train_df.empty:
        return train_df.copy()

    rating_counts = train_df['rating'].value_counts()
    target_size = int(rating_counts.max())
    # One seeded generator for the whole call keeps the noise reproducible
    # while still giving each class different noise values.
    rng = np.random.default_rng(42)

    balanced_parts = []
    for rating, count in rating_counts.items():
        class_df = train_df[train_df['rating'] == rating]
        if count >= target_size:
            balanced_parts.append(class_df.copy())
            continue

        upsampled = class_df.sample(n=target_size, replace=True, random_state=42)
        noise = rng.uniform(-0.1, 0.1, size=len(upsampled))
        # assign() builds a new frame, so the sampled slice is never mutated in place.
        upsampled = upsampled.assign(rating=(upsampled['rating'] + noise).clip(1, 5))
        balanced_parts.append(upsampled)

    balanced_train = pd.concat(balanced_parts, ignore_index=True)
    # Return only the shuffled frame; concatenating the per-class parts onto it
    # again would duplicate every row.
    return balanced_train.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
def normalize_ratings(train_df, test_df):
    """
    Log-scale the 'rating' column of the training and test frames.

    The minimum rating is taken from the training frame only. Each rating in
    both frames is replaced by ``log1p(rating - min_rating)``. The inputs are
    not modified; new frames are returned.

    Args:
        train_df (DataFrame): Training interactions with a 'rating' column.
        test_df (DataFrame): Test interactions with a 'rating' column.

    Returns:
        tuple: ``(train_df, test_df, min_rating)`` — the two transformed frames
               and the training minimum used as the shift.
    """
    min_rating = train_df['rating'].min()

    def _log_scaled(frame):
        # assign() returns a new frame, leaving the caller's frame untouched.
        return frame.assign(rating=np.log1p(frame['rating'] - min_rating))

    return _log_scaled(train_df), _log_scaled(test_df), min_rating

def denormalize_rating(log_scaled_ratings, min_rating):
    """
    Map log-scaled ratings back onto the original rating scale.

    Computes ``expm1(value) + min_rating`` for every value and clips the result
    to the range [1, 5].

    Args:
        log_scaled_ratings (array-like): Log-scaled rating(s); converted to a float array.
        min_rating (float): The shift that was subtracted before the log scaling.

    Returns:
        array: The denormalized ratings, clipped to [1, 5].
    """
    log_values = np.asarray(log_scaled_ratings, dtype=float)
    shifted_back = np.expm1(log_values) + min_rating
    return np.clip(shifted_back, 1, 5)


In [None]:
# Load the three source frames; load_data() returns them in this order.
reviews, interactions, read = load_data()

In [None]:
# Drop from `read` every user whose ratings are all zero.
all_zero_by_user = read.groupby('user_id')['rating'].apply(
    lambda ratings: (ratings == 0).all()
)
users_with_only_zeros = all_zero_by_user[all_zero_by_user].index
read = read[~read['user_id'].isin(users_with_only_zeros)]

In [None]:
# Keep only the rows of `read` that are marked as read (is_read == 1).
read_df = read[read['is_read'] == 1].reset_index(drop=True)
# Read-but-unrated rows (rating == 0); not referenced again in the visible cells.
zero_rating_df = read_df[read_df['rating'] == 0].reset_index(drop=True)

# Zero-rated rows of reviews and interactions: these are the rows whose rating
# is predicted later, so together they form the inference frame.
zero_rating_reviews = reviews[reviews['rating'] == 0]
zero_rating_interactions = interactions[interactions['rating'] == 0]
zero_rating_review_users = zero_rating_reviews['user_id'].unique()
zero_rating_interaction_users = zero_rating_interactions['user_id'].unique()
inference = pd.concat([zero_rating_reviews, zero_rating_interactions], ignore_index=True)

# Ordered union of the two user lists (review users first); dict.fromkeys
# removes repeats while keeping first-seen order.
unique_user_ids = list(
    dict.fromkeys([*zero_rating_review_users, *zero_rating_interaction_users])
)
unique_users = set(unique_user_ids)

# Training candidates: everything read by a user who needs imputation, plus a
# 2% seeded sample of all non-zero ratings.
filtered_df = read_df[read_df['user_id'].isin(unique_user_ids)]
non_zero_rating_df = read_df[read_df['rating'] != 0]
non_zero_rating_sample = non_zero_rating_df.sample(frac=0.02, random_state=42)

# The sample is drawn from all of read_df, so it can contain rows that are
# already in filtered_df. Both frames still carry read_df's unique index, so
# dropping repeated index labels removes exactly those repeated rows. Without
# this, the same interaction could land in both the train and the test split.
combined_df = pd.concat([filtered_df, non_zero_rating_sample])
final_df = combined_df[~combined_df.index.duplicated(keep='first')].reset_index(drop=True)
df = final_df.reset_index(drop=True)

In [None]:
# Cast every column to int32, then drop zero-only users and unread/unrated rows.
df = preprocess_data(df.astype(np.int32))
# 80/20 split; single-interaction users/books stay in the training part.
train_df, test_df = split_data(df)
# Balance the rating classes of the training part only.
train_df = upsample_ratings(train_df)
# Log-scale both parts; min_rating is kept to undo the scaling later.
train_df, test_df, min_rating = normalize_ratings(train_df, test_df)

In [None]:
# Distribution of the training ratings after upsampling and log normalisation.
fig, ax = plt.subplots()
ax.hist(train_df['rating'], bins=10, edgecolor='black')
ax.set_title('Histogram of Balanced and Log Normalised Train Set Ratings')
# The plotted values are the normalised training ratings, not model predictions.
ax.set_xlabel('Log-Normalised Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# train_df['rating'] was log-scaled by normalize_ratings, so it no longer lies
# in 1..5. Surprise clips every prediction into the Reader's rating_scale, so
# the scale must describe the transformed values. A (1, 5) scale would force
# every estimate to be at least 1 on the log scale.
rating_scale = (float(train_df['rating'].min()), float(train_df['rating'].max()))
reader = Reader(rating_scale=rating_scale)
train_data = Dataset.load_from_df(train_df[['user_id', 'book_id', 'rating']], reader)

In [None]:
# Hyper-parameter grid for SVD: 3 x 3 x 2 x 2 = 36 combinations.
param_grid = dict(
    n_factors=[100, 200, 300],
    n_epochs=[20, 40, 60],
    lr_all=[0.001, 0.1],
    reg_all=[0.001, 0.1],
)

# 2-fold cross-validation on the training data, using all available cores.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2, n_jobs=-1)
gs.fit(train_data)

# Keep the RMSE-optimal configuration and its estimator.
best_params = gs.best_params['rmse']
best_model = gs.best_estimator['rmse']

print(f"Best RMSE: {gs.best_score['rmse']:.4f}")
print(f"Best Parameters: {best_params}")

In [None]:
# Build a Surprise trainset from every row of train_data (no held-out fold).
trainset = train_data.build_full_trainset()

In [None]:
# Train a fresh SVD with the selected hyper-parameters on the full trainset.
best_model = SVD(**best_params, verbose=True)
best_model.fit(trainset)

# itertuples keeps each column's own dtype. Going through `.values` would
# upcast the integer user/book ids to float64 because 'rating' is a float column.
testset = list(
    test_df[['user_id', 'book_id', 'rating']].itertuples(index=False, name=None)
)
predictions = best_model.test(testset)

# test_df ratings are log-scaled, so these errors are on the log scale.
test_rmse = accuracy.rmse(predictions)
test_mae = accuracy.mae(predictions)

In [None]:
# NOTE(review): same call as in the previous cell (best_model.test(testset));
# this recomputes `predictions` with the same model and test set.
predictions = best_model.test(testset)

In [None]:
def evaluate_model(predictions, min_rating, k, threshold, item_popularity, num_users):
    """
    Evaluates the performance of a recommendation model using multiple ranking metrics.

    True and estimated ratings are first mapped back to the original rating
    scale with the inverse of the log scaling used by ``normalize_ratings``
    (``expm1(value) + min_rating``, clipped to [1, 5]). For every user the
    predictions are ranked by estimated rating and the top ``k`` are treated as
    the recommendations. An item is relevant when its true rating is
    ``>= threshold``.

    Per-user metrics, averaged over users:
        - precision@k: relevant items in the top k, divided by ``k``.
        - recall@k: relevant items in the top k, divided by the user's relevant items.
        - nDCG@k: binary-relevance DCG divided by the ideal DCG.
        - MAP: average precision over the top k, normalised by ``min(n_relevant, k)``.
        - MRR: reciprocal rank of the first relevant item in the top k (0 if none).
        - novelty: mean of ``1 - popularity / num_users`` over the top-k items
          (items missing from ``item_popularity`` count as popularity 1).
        - hit rate: 1 if the top k contains at least one relevant item, else 0.
    User coverage is the share of users that have at least one relevant item
    among all of their predictions.

    Args:
        predictions (list): Tuples of the form
            (user_id, book_id, true_rating, estimated_rating, details), with both
            ratings on the log-normalised scale.
        min_rating (float): The minimum rating value used for normalization during model training.
        k (int): The number of top recommendations to consider for each user.
        threshold (float): Minimum true rating (original scale) for an item to be relevant.
        item_popularity (dict): Maps book IDs to their popularity count.
        num_users (int): Denominator used to turn popularity into a fraction.

    Returns:
        tuple: (precision, recall, nDCG, MAP, MRR, novelty, hit rate, user coverage).
               All eight values are 0.0 when ``predictions`` is empty.
    """

    def _to_rating_scale(value):
        # Inverse of log1p(rating - min_rating), limited to the valid rating range.
        return float(np.clip(np.expm1(value) + min_rating, 1, 5))

    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        # Rounding absorbs the float error of the log1p/expm1 round trip, so an
        # integer rating equal to the threshold still compares as relevant.
        true_rating = round(_to_rating_scale(true_r), 6)
        user_est_true[uid].append((iid, _to_rating_scale(est), true_rating))

    if not user_est_true:
        return (0.0,) * 8

    precisions = []
    recalls = []
    ndcgs = []
    map_scores = []
    mrr_scores = []
    novelty_scores = []
    hit_rate_scores = []
    users_with_relevant = 0

    for user_ratings in user_est_true.values():
        ranked = sorted(user_ratings, key=lambda entry: entry[1], reverse=True)
        top_k = ranked[:k]
        top_k_items = [iid for iid, _, _ in top_k]
        # Binary relevance of each recommended item, in rank order.
        relevance_at_k = [true_r >= threshold for _, _, true_r in top_k]

        n_rel = sum(true_r >= threshold for _, _, true_r in user_ratings)
        n_rel_and_rec_k = sum(relevance_at_k)
        if n_rel > 0:
            users_with_relevant += 1

        precisions.append(n_rel_and_rec_k / k if k != 0 else 0)
        recalls.append(n_rel_and_rec_k / n_rel if n_rel != 0 else 0)

        # The best achievable list puts min(n_rel, k) relevant items on top.
        ideal_hits = min(n_rel, k)
        dcg = sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevance_at_k))
        idcg = sum(1.0 / np.log2(idx + 2) for idx in range(ideal_hits))
        ndcgs.append(dcg / idcg if idcg != 0 else 0)

        hits = 0
        sum_precisions = 0.0
        first_relevant_rank = None
        for rank, rel in enumerate(relevance_at_k, start=1):
            if rel:
                hits += 1
                sum_precisions += hits / rank
                if first_relevant_rank is None:
                    first_relevant_rank = rank
        # ideal_hits is 0 when the user has no relevant items or k is 0.
        map_scores.append(sum_precisions / ideal_hits if ideal_hits > 0 else 0)
        mrr_scores.append(1 / first_relevant_rank if first_relevant_rank is not None else 0)

        if top_k_items:
            novelty = np.mean(
                [1 - (item_popularity.get(iid, 1) / num_users) for iid in top_k_items]
            )
        else:
            novelty = 0.0
        novelty_scores.append(novelty)
        hit_rate_scores.append(1 if n_rel_and_rec_k > 0 else 0)

    user_coverage = users_with_relevant / len(user_est_true)
    return (
        np.mean(precisions),
        np.mean(recalls),
        np.mean(ndcgs),
        np.mean(map_scores),
        np.mean(mrr_scores),
        np.mean(novelty_scores),
        np.mean(hit_rate_scores),
        user_coverage
    )


In [None]:
import pickle

# Persist the fitted model, then read it straight back from the same file.
MODEL_PICKLE_PATH = "../Pickle/best_svd_model.pkl"

with open(MODEL_PICKLE_PATH, "wb") as model_file:
    pickle.dump(best_model, model_file)

with open(MODEL_PICKLE_PATH, "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
def predict_with_fallback(user_id, book_id):
    """
    Predicts the rating for a given user and book, returning NaN for unknown users or books.

    ``user_id`` and ``book_id`` are raw ids, i.e. the values stored in the
    DataFrame the trainset was built from. Membership is checked with
    ``trainset.to_inner_uid`` / ``trainset.to_inner_iid``, which accept raw ids
    and raise ``ValueError`` for ids that are not in the trainset.
    ``trainset.knows_user`` / ``knows_item`` are not suitable here because they
    expect Surprise's inner ids, not raw ids.

    Reads the module-level ``trainset`` and ``loaded_model``.

    Args:
        user_id (int): The raw ID of the user for whom the rating is predicted.
        book_id (int): The raw ID of the book for which the rating is predicted.

    Returns:
        float: The model's estimate for the pair, or NaN if either the user or
               the book is not part of the trainset.
    """
    try:
        # Only the lookups matter here; the inner ids themselves are not needed.
        trainset.to_inner_uid(user_id)
        trainset.to_inner_iid(book_id)
    except ValueError:
        return np.nan

    return loaded_model.predict(user_id, book_id).est

def _predict_row(row):
    """Return the fallback-aware prediction for one row of the inference frame."""
    return predict_with_fallback(row["user_id"], row["book_id"])


# One prediction per inference row; NaN marks rows the model cannot score.
inference["predicted_rating"] = inference.apply(_predict_row, axis=1)


In [None]:
# Map the available predictions back to the original rating scale; NaN rows
# are left untouched. Re-running this cell would denormalise the values again.
has_prediction = inference["predicted_rating"].notna()
inference.loc[has_prediction, "predicted_rating"] = (
    inference.loc[has_prediction, "predicted_rating"]
    .map(lambda value: denormalize_rating(value, min_rating))
)

In [None]:
# Keep only scored rows, reduce to the id columns plus the prediction, and
# expose the prediction under the name 'rating'.
inference = (
    inference
    .dropna(subset=['predicted_rating'])
    .reset_index(drop = True)
    .loc[:, ['user_id', 'book_id', 'predicted_rating']]
    .rename(columns={"predicted_rating": "rating"})
)
# Round each imputed rating to the nearest whole number.
inference['rating'] = inference['rating'].map(lambda value: round(value, 0))

In [None]:
# Persist the imputed ratings; relies on `pickle` imported in an earlier cell.
with open('../Pickle/imputed_ratings.pkl', 'wb') as imputed_file:
    pickle.dump(inference, imputed_file)