In [None]:
import pandas as pd
import numpy as np
import pickle
from surprise import Reader, Dataset, SVD
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import resample
from collections import defaultdict
from math import log2

In [None]:
def load_data():
    """Read the three pickled input frames stored under ``../Pickle``.

    Returns:
        tuple: ``(reviews, interactions, read)`` in that order, each being
        whatever object ``pd.read_pickle`` yields for the corresponding file.
    """
    pickle_paths = (
        "../Pickle/reviews.pkl",
        "../Pickle/interactions.pkl",
        "../Pickle/read.pkl",
    )
    # Files are read in the same order as they are returned.
    reviews, interactions, read = [pd.read_pickle(path) for path in pickle_paths]
    return reviews, interactions, read

In [None]:
def preprocess_data(df):
    """Keep only interactions that were read and carry a non-zero rating.

    Users whose every rating is 0 are removed first; from what is left, rows
    with ``is_read == 0`` or ``rating == 0`` are discarded.

    Parameters:
        df (pd.DataFrame): Must contain ``user_id``, ``is_read`` and ``rating``.

    Returns:
        pd.DataFrame: The surviving rows with a fresh 0..n-1 index. The input
        frame is not modified.
    """
    # True for users that never gave a non-zero rating.
    all_zero_flag = df.groupby('user_id')['rating'].apply(lambda ratings: (ratings == 0).all())
    zero_only_users = all_zero_flag[all_zero_flag].index

    kept = df[~df['user_id'].isin(zero_only_users)]

    # Both row-level conditions applied at once; row order is unchanged.
    keep_mask = (kept['is_read'] != 0) & (kept['rating'] != 0)
    return kept[keep_mask].reset_index(drop=True)

In [None]:
def split_data(df):
    """Split interactions 80/20 into train and test, stratified by rating.

    Rows whose user or whose book occurs exactly once in ``df`` are never
    placed in the test set: they are held out of the split and appended to
    the training part afterwards.

    Parameters:
        df (pd.DataFrame): Must contain ``user_id``, ``book_id`` and ``rating``.

    Returns:
        tuple: ``(train_df, test_df)``. ``train_df`` has a fresh index;
        ``test_df`` keeps the index labels it had in ``df``.
    """
    per_user = df['user_id'].value_counts()
    per_book = df['book_id'].value_counts()
    lone_users = per_user[per_user == 1].index
    lone_books = per_book[per_book == 1].index

    # Interactions tied to a user or a book that appears only once.
    is_single = df['user_id'].isin(lone_users) | df['book_id'].isin(lone_books)
    single_interactions = df[is_single]
    remaining_interactions = df[~df.index.isin(single_interactions.index)]

    train_part, test_df = train_test_split(
        remaining_interactions,
        test_size=0.2,
        random_state=42,
        stratify=remaining_interactions['rating'],
    )
    train_df = pd.concat([train_part, single_interactions], ignore_index=True)

    return train_df, test_df

In [None]:
def upsample_ratings(train_df, random_state=42):
    """Upsample under-represented rating classes and shuffle the result.

    Every rating class with fewer rows than 60% of the largest class is
    resampled with replacement up to that size. The resampled rows get a
    small uniform jitter (+/- 0.1) on the rating, clipped to [1, 5], so the
    duplicated rows are not exact copies. Classes that already meet the
    target are copied unchanged (no jitter).

    Parameters:
        train_df (pd.DataFrame): Training interactions with a ``rating`` column.
        random_state (int | None): Seed used for resampling, for the jitter
            and for the final shuffle. Defaults to 42.

    Returns:
        pd.DataFrame: The balanced, shuffled training frame with a fresh
        index. Each balanced row appears exactly once.
    """
    if train_df.empty:
        # value_counts().max() would be NaN here and int(NaN) raises.
        return train_df.copy()

    rating_counts = train_df['rating'].value_counts()

    # Target size for minority classes: 60% of the majority class.
    majority_count = rating_counts.max()
    target_size = int(0.6 * majority_count)

    # Dedicated generator so the jitter is reproducible and independent of
    # whatever else touched the global NumPy random state.
    rng = np.random.default_rng(random_state)

    balanced_parts = []
    for rating, count in rating_counts.items():
        class_df = train_df[train_df['rating'] == rating]

        if count >= target_size:
            balanced_df = class_df.copy()
        else:
            balanced_df = resample(class_df,
                                   replace=True,
                                   n_samples=target_size,
                                   random_state=random_state)
            # Jitter only the upsampled rows, then keep them in the valid range.
            jitter = rng.uniform(-0.1, 0.1, size=balanced_df.shape[0])
            balanced_df['rating'] = (balanced_df['rating'] + jitter).clip(1, 5)

        balanced_parts.append(balanced_df)

    balanced_train = pd.concat(balanced_parts, ignore_index=True)

    # Shuffle once and return. The previous version appended the per-class
    # frames a second time here, which doubled every row.
    return balanced_train.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
def normalize_ratings(train_df, test_df):
    """Log-scale the ``rating`` column of both frames.

    Ratings are shifted by the minimum rating found in ``train_df`` and then
    passed through ``log1p``. The same shift is applied to ``test_df``.

    Parameters:
        train_df (pd.DataFrame): Training frame with a ``rating`` column.
        test_df (pd.DataFrame): Test frame with a ``rating`` column.

    Returns:
        tuple: ``(train_df, test_df, min_rating)`` where the frames are
        transformed copies (the inputs are left untouched) and ``min_rating``
        is the shift needed to invert the transform.
    """
    min_rating = train_df['rating'].min()

    def _log_scaled(frame):
        # Work on a copy so the caller's frame keeps its original ratings.
        scaled = frame.copy()
        scaled['rating'] = np.log1p(scaled['rating'] - min_rating)
        return scaled

    return _log_scaled(train_df), _log_scaled(test_df), min_rating

def denormalize_rating(log_scaled_ratings, min_rating):
    """Invert the ``log1p(rating - min_rating)`` scaling.

    Parameters:
        log_scaled_ratings: Scalar or array-like of log-scaled ratings.
        min_rating: The shift that was subtracted before ``log1p``.

    Returns:
        The ratings on the original scale, clipped to the range [1, 5].
    """
    scaled = np.asarray(log_scaled_ratings, dtype=float)
    restored = np.expm1(scaled) + min_rating
    return np.clip(restored, 1, 5)


In [None]:
# Load the three pickled input frames (paths are defined in load_data).
reviews, interactions, read = load_data()

In [None]:
# Flag users whose every rating in `read` is 0, then drop all of their rows.
# NOTE: this rebinds `read`, so the unfiltered frame is no longer available
# after this cell. preprocess_data() repeats the same step on its input.
users_with_only_zeros = read.groupby('user_id')['rating'].apply(lambda x: (x == 0).all())
users_with_only_zeros = users_with_only_zeros[users_with_only_zeros].index
read = read[~read['user_id'].isin(users_with_only_zeros)]

In [None]:
# Keep only the rows of `read` where 'is_read' is 1
read_df = read[read['is_read'] == 1].reset_index(drop=True)

# Read-but-unrated rows ('rating' is 0)
zero_rating_df = read_df[read_df['rating'] == 0].reset_index(drop=True)

# Zero-rated rows of the reviews and interactions frames
zero_rating_reviews = reviews[reviews['rating'] == 0]
zero_rating_interactions = interactions[interactions['rating'] == 0]

# Unique user IDs appearing in each of the zero-rated frames
zero_rating_review_users = zero_rating_reviews['user_id'].unique()
zero_rating_interaction_users = zero_rating_interactions['user_id'].unique()

# Zero-rated reviews and interactions together form the frame used for inference
inference = pd.concat([zero_rating_reviews, zero_rating_interactions], ignore_index=True)

# Order-preserving de-duplication of the user IDs from both sources
unique_user_ids = list(dict.fromkeys(
    list(zero_rating_review_users) + list(zero_rating_interaction_users)
))
unique_users = set(unique_user_ids)

# All read rows (rated or not) belonging to users that need imputation
filtered_df = read_df[read_df['user_id'].isin(unique_user_ids)]

# Read rows that carry a real (non-zero) rating
non_zero_rating_df = read_df[read_df['rating'] != 0]

# Sample 1% of the non-zero rated rows for more variety in the dataset
non_zero_rating_sample = non_zero_rating_df.sample(frac=0.01, random_state=42)

# filtered_df and the sample are both slices of read_df and keep its index
# labels, so a sampled row that is already in filtered_df can be recognised by
# its label. Dropping those prevents duplicated (user, book) rows, which could
# otherwise end up on both sides of the later train/test split.
extra_sample_rows = non_zero_rating_sample[
    ~non_zero_rating_sample.index.isin(filtered_df.index)
]

# Combine the filtered rows with the sampled rows that are not already present
final_df = pd.concat([filtered_df, extra_sample_rows], ignore_index=True)

# Final modelling frame with a clean 0..n-1 index
df = final_df.reset_index(drop=True)

In [None]:
# End-to-end preparation of the modelling data. Each step rebinds its input
# name, so this cell is order-dependent and should be run exactly once after
# the cell that builds `df`.
# Cast every column to int32 (raises if a column cannot be converted).
df = df.astype(np.int32)
# Drop zero-only users, unread rows and zero ratings.
df = preprocess_data(df)
# 80/20 split stratified by rating; single-occurrence users/books stay in train.
train_df, test_df = split_data(df)
# Balance the rating classes of the training set only.
train_df = upsample_ratings(train_df)
# log1p-scale ratings; min_rating is needed later to undo the scaling.
train_df, test_df, min_rating = normalize_ratings(train_df, test_df)

In [None]:
# Distribution of the training targets after upsampling and log1p scaling.
plt.hist(train_df['rating'], bins=10, edgecolor='black')
plt.title('Histogram of Balanced and Log Normalised Train Set Ratings')
# These are training targets, not model output, so label the axis accordingly.
plt.xlabel('Log-normalised rating')
plt.ylabel('Frequency')
plt.show()

In [None]:
# train_df['rating'] holds log1p(rating - min_rating), not raw 1-5 stars, so
# the Reader must describe that transformed range. Surprise clips every
# estimate to the rating scale; with (1, 5) no prediction could fall below 1.0
# in log space (about 2.7 stars after denormalisation).
normalized_rating_scale = (0.0, float(np.log1p(5 - min_rating)))
reader = Reader(rating_scale=normalized_rating_scale)
train_data = Dataset.load_from_df(train_df[['user_id', 'book_id', 'rating']], reader)

In [None]:
# Hyper-parameter grid for SVD. Every list holds a single value, so the
# "search" below evaluates exactly one configuration.
param_grid = {
    'n_factors': [100],
    'n_epochs': [100],
    'lr_all': [0.001],
    'reg_all': [0.0001]
}

# Cross-validated evaluation (cv=2) scored on RMSE and MAE; n_jobs=-1 asks
# for all available cores.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2, n_jobs=-1)

gs.fit(train_data)

# Parameters / estimator selected by RMSE.
# NOTE: `best_model` is rebound to a freshly constructed SVD in a later cell,
# so the object assigned here is not the one that gets evaluated or pickled.
best_params = gs.best_params['rmse']  
best_model = gs.best_estimator['rmse']  

print(f"Best RMSE: {gs.best_score['rmse']:.4f}")
print(f"Best Parameters: {best_params}")

In [None]:
# Turn the whole training dataset (no folds held out) into a Surprise trainset.
trainset = train_data.build_full_trainset()

In [None]:
# NOTE(review): this import would normally live in the imports cell at the top.
from surprise import accuracy

# Refit an SVD with the selected parameters on the full training set.
best_model = SVD(**best_params, verbose=True)
best_model.fit(trainset)

# Surprise's test() expects an iterable of (user, item, true_rating) tuples.
testset = [tuple(x) for x in test_df[['user_id', 'book_id', 'rating']].values]
predictions = best_model.test(testset)

# test_df['rating'] was log1p-scaled by normalize_ratings, so these errors are
# measured on the log scale, not in 1-5 star units.
test_rmse = accuracy.rmse(predictions)
test_mae = accuracy.mae(predictions)

In [None]:
# Re-runs the same call as the previous cell (same model, same testset);
# kept so `predictions` can be refreshed without refitting the model.
predictions = best_model.test(testset)

In [None]:
def precision_recall_ndcg_mrr_at_k(predictions, min_rating, k, threshold, item_popularity, num_users):
    """
    Compute Precision@k, Recall@k, nDCG@k, MRR@k, Novelty, Hit Rate, and User Coverage.

    True and estimated ratings are first mapped back to the original scale
    with ``denormalize_rating``. All metrics are averaged over every user
    that appears in ``predictions``:

    * Precision / Recall / nDCG / MRR are 0 for a user with no relevant item.
    * Hit Rate is the fraction of users with at least one item in their top-k
      that is both relevant (true rating >= threshold) and recommended
      (estimated rating >= threshold).
    * User Coverage is the fraction of users with at least one recommended
      item in their top-k, whether or not they have relevant items.
    * Novelty is the mean self-information of the top-k items, normalised by
      ``log2(num_users)``; it is 0 when ``num_users <= 1``.

    Parameters:
        predictions (list): List of tuples (user_id, item_id, true_rating, predicted_rating, _).
        min_rating (float): Minimum rating value for denormalization.
        k (int): Number of recommendations to consider.
        threshold (float): Threshold for considering an item as relevant.
        item_popularity (dict): Dictionary of item popularity counts.
        num_users (int): Total number of users.

    Returns:
        Tuple of (precision, recall, ndcg, mrr, novelty, hit_rate, user_coverage).
        All seven values are 0.0 when ``predictions`` is empty.
    """
    if len(predictions) == 0:
        # np.mean of an empty list would yield NaN (with a warning).
        return (0.0,) * 7

    # Denormalize all ratings in two vectorised calls instead of one call per prediction.
    est_ratings = np.atleast_1d(
        denormalize_rating([est for _, _, _, est, _ in predictions], min_rating))
    true_ratings = np.atleast_1d(
        denormalize_rating([true_r for _, _, true_r, _, _ in predictions], min_rating))

    # Organize predictions by user as (item_id, estimated, true) triples
    user_est_true = defaultdict(list)
    for (uid, iid, _, _, _), est, true_r in zip(predictions, est_ratings, true_ratings):
        user_est_true[uid].append((iid, est, true_r))

    precisions, recalls, ndcgs, mrrs = [], [], [], []
    novelty_scores = []
    users_with_hit = 0   # users with >= 1 relevant AND recommended item in top-k
    users_with_recs = 0  # users with >= 1 recommended item in top-k

    # log2(1) == 0 would make the normalisation below divide by zero, so
    # novelty is only normalised when there is more than one user.
    max_self_info = log2(num_users) if num_users > 1 else 0.0

    for user_ratings in user_est_true.values():
        # Sort by estimated rating, best first, and take the top-k
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_k = user_ratings[:k]

        n_rel = sum(true_r >= threshold for _, _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for _, est, _ in top_k)
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold) for _, est, true_r in top_k)

        # Metrics that do not depend on the user having relevant items are
        # recorded for every user, so their denominators match.
        if n_rec_k > 0:
            users_with_recs += 1
        if n_rel_and_rec_k > 0:
            users_with_hit += 1
        if top_k:
            if max_self_info > 0:
                normalized_self_info = [
                    -log2(item_popularity.get(iid, 1) / num_users) / max_self_info
                    for iid, _, _ in top_k
                ]
                novelty_scores.append(np.mean(normalized_self_info))
            else:
                novelty_scores.append(0.0)

        if n_rel == 0:
            # No relevant item for this user: relevance metrics are all 0.
            precisions.append(0)
            recalls.append(0)
            ndcgs.append(0)
            mrrs.append(0)
            continue

        # Precision and recall at k
        precision = n_rel_and_rec_k / n_rec_k if n_rec_k > 0 else 0
        recall = n_rel_and_rec_k / n_rel

        # NDCG at k: gain is 2^true_rating - 1, discounted by log2(rank + 1)
        actual_dcg = sum((2 ** rel - 1) / log2(idx + 2) for idx, (_, _, rel) in enumerate(top_k))
        ideal_ratings_sorted = sorted(user_ratings, key=lambda x: x[2], reverse=True)[:k]
        ideal_dcg = sum((2 ** rel - 1) / log2(idx + 2) for idx, (_, _, rel) in enumerate(ideal_ratings_sorted))
        ndcg = actual_dcg / ideal_dcg if ideal_dcg > 0 else 0

        # MRR at k: reciprocal rank of the first relevant item in the top-k
        mrr = 0
        for rank, (_, _, true_r) in enumerate(top_k, start=1):
            if true_r >= threshold:
                mrr = 1 / rank
                break

        precisions.append(precision)
        recalls.append(recall)
        ndcgs.append(ndcg)
        mrrs.append(mrr)

    # Compute overall metrics (n_users > 0 because predictions is non-empty)
    n_users = len(user_est_true)
    precision = np.mean(precisions)
    recall = np.mean(recalls)
    ndcg = np.mean(ndcgs)
    mrr = np.mean(mrrs)
    novelty = np.mean(novelty_scores) if novelty_scores else 0
    hit_rate = users_with_hit / n_users
    user_coverage = users_with_recs / n_users

    return (
        precision,
        recall,
        ndcg,
        mrr,
        novelty,
        hit_rate,
        user_coverage
    )

threshold = 4  # Relevance threshold
k = 5  # Ranking cut-off; also used in the printed metric names below
item_popularity = test_df['book_id'].value_counts().to_dict()
num_users = test_df['user_id'].nunique()

# Compute metrics. min_rating is the shift returned by normalize_ratings, so
# the ratings are denormalised with the same value they were scaled with.
precision, recall, ndcg, mrr, novelty, hit_rate, user_coverage = precision_recall_ndcg_mrr_at_k(
    predictions, min_rating=min_rating, k=k, threshold=threshold, item_popularity=item_popularity, num_users=num_users
)

# Print results
print(f'Precision@{k}: {precision:.4f}')
print(f'Recall@{k}: {recall:.4f}')
print(f'nDCG@{k}: {ndcg:.4f}')
print(f'MRR@{k}: {mrr:.4f}')
print(f'Novelty (0-1): {novelty:.4f}')
print(f'Hit Rate: {hit_rate:.4f}')
print(f'User Coverage: {user_coverage:.4f}')


In [None]:
# NOTE(review): pickle is already imported in the first cell; this repeat is harmless.
import pickle
# Persist the fitted model ...
with open("../Pickle/best_svd_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# ... and read it straight back, so the inference cells below run against
# the stored artefact. Only unpickle files you produced yourself: pickle.load
# can execute arbitrary code from an untrusted file.
with open("../Pickle/best_svd_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

In [None]:
# Rebind `trainset` to the one stored on the unpickled model.
trainset = loaded_model.trainset

def predict_with_fallback(user_id, book_id):
    """Predict a rating, or return NaN when the user or the book is unseen.

    Reads the notebook-level ``trainset`` and ``loaded_model``.

    Parameters:
        user_id: Raw user id, as it appears in the data frames.
        book_id: Raw book id, as it appears in the data frames.

    Returns:
        float: The model's estimated rating, or ``np.nan`` if either id was
        not part of the training set.
    """
    # Trainset.knows_user / knows_item expect *inner* ids, so passing raw ids
    # to them tests the wrong thing. The raw -> inner conversion raises
    # ValueError for ids the trainset has never seen, which is the check
    # needed here.
    try:
        trainset.to_inner_uid(user_id)
        trainset.to_inner_iid(book_id)
    except ValueError:
        return np.nan  # User or book not in training

    pred = loaded_model.predict(user_id, book_id)
    return pred.est  # Return estimated rating

# Row-wise prediction for every zero-rated (user, book) pair; rows whose user
# or book is rejected by predict_with_fallback get NaN.
inference["predicted_rating"] = inference.apply(
    lambda row: predict_with_fallback(row["user_id"], row["book_id"]), axis=1
)

In [None]:
# Map the non-missing predictions from the log scale back to the rating scale
# (denormalize_rating also clips to [1, 5]); NaN entries are left as they are.
inference.loc[inference["predicted_rating"].notna(), "predicted_rating"] = inference["predicted_rating"].apply(
    lambda x: denormalize_rating(x, min_rating) if pd.notna(x) else x)

In [None]:
# Drop pairs that could not be predicted, keep only the id columns and the
# prediction, expose the prediction under the name 'rating', and round it
# to zero decimal places.
# NOTE: `inference` is rebound at every step, so this cell cannot be re-run
# on its own (the 'predicted_rating' column no longer exists afterwards).
inference = inference.dropna(subset=['predicted_rating'])
inference = inference.reset_index(drop = True)
inference = inference[['user_id', 'book_id', 'predicted_rating']]
inference = inference.rename(columns={"predicted_rating": "rating"})
inference['rating'] = inference['rating'].apply(lambda x: round(x, 0))

In [None]:
# Persist the imputed (user_id, book_id, rating) frame.
with open('../Pickle/imputed_ratings.pkl', 'wb') as f:
    pickle.dump(inference, f)