In [None]:
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, accuracy
from tqdm import tqdm
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pickle
from collections import defaultdict
from math import log2
from sklearn.utils import resample
tqdm.pandas()
import surprise
from surprise import NMF


# Load and filter data
interactions = pd.read_pickle('../Pickle/interactions.pkl')
interactions = interactions[['user_id', 'book_id', 'rating', 'is_read']]  

# Filter user IDs that appear more than 5 times
# Filter book IDs that appear more than 5 times
book_counts = interactions['book_id'].value_counts()
interactions = interactions[interactions['book_id'].isin(book_counts[book_counts >= 5].index)]
user_counts = interactions['user_id'].value_counts()
interactions = interactions[interactions['user_id'].isin(user_counts[user_counts >= 5].index)]

# Shuffle user interactions
# Initialize empty lists for train and test splits
train_list, test_list = [], []

for user_id, user_data in interactions.groupby('user_id'):
    # Shuffle the ratings for the user
    user_data_shuffled = user_data.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Split the data into train and test (80% for train, 20% for test)
    train, test = train_test_split(user_data_shuffled, test_size=0.2, random_state=42, stratify=user_data_shuffled['user_id'])
    
    # Add the train and test data for this user to the overall list
    train_list.append(train)
    test_list.append(test)

# Concatenate all train and test data
train_df = pd.concat(train_list, ignore_index=True)
test_df = pd.concat(test_list, ignore_index=True)


In [16]:
train_df['rating'].value_counts()

rating
4    78926
5    69973
3    46464
2    12181
0    10234
1     3680
Name: count, dtype: int64

In [17]:
# Get rating counts
rating_counts = train_df['rating'].value_counts()

# Find the majority class size (highest count)
majority_count = rating_counts.max()

# Dictionary to store oversampled dataframes
oversampled_dfs = []

# Perform upsampling for each class to match a proportion of the majority count
for rating, count in rating_counts.items():
    class_df = train_df[train_df['rating'] == rating]
    
    # Upsample based on a percentage of the majority class size
    if count < majority_count:
        upsampled_df = resample(class_df, replace=True, n_samples=int(majority_count * 0.4  ), random_state=42)  # 75% of the majority class
    else:
        upsampled_df = class_df  # Keep majority class as is
    
    oversampled_dfs.append(upsampled_df)

# Combine all upsampled data
balanced_train = pd.concat(oversampled_dfs)

# Shuffle dataset
train_df = balanced_train.sample(frac=1, random_state=42).reset_index(drop=True)


In [18]:
lambda_reg = 0.1
global_mean = train_df['rating'].mean()
# Calculate user bias with regularization
user_sum_ratings = train_df.groupby('user_id')['rating'].sum()
user_count_ratings = train_df.groupby('user_id')['rating'].count()

# Recalculate user bias with the new lambda_reg value
user_bias = (user_sum_ratings - user_count_ratings * global_mean) / (user_count_ratings + lambda_reg)

# Map user bias back to the original dataframe
train_df['user_bias'] = train_df['user_id'].map(user_bias)

# Calculate item bias with regularization
item_sum_ratings = train_df.groupby('book_id')['rating'].sum()
item_count_ratings = train_df.groupby('book_id')['rating'].count()
item_bias = (item_sum_ratings - item_count_ratings * global_mean) / (item_count_ratings + lambda_reg)

# Map item bias back to the original dataframe
train_df['item_bias'] = train_df['book_id'].map(item_bias)

# Calculate the global mean
global_mean = train_df['rating'].mean()

# Normalize ratings by Z-score per user and item
train_df['user_mean'] = train_df.groupby('user_id')['rating'].transform('mean')
train_df['user_std'] = train_df.groupby('user_id')['rating'].transform('std')

train_df['item_mean'] = train_df.groupby('book_id')['rating'].transform('mean')
train_df['item_std'] = train_df.groupby('book_id')['rating'].transform('std')

# Normalize ratings using Z-score for user and item
train_df['normalised_rating'] = (train_df['rating'] - train_df['user_mean']) / (train_df['user_std'] + 1e-5)  # avoid divide by zero


In [19]:
train_df['rating'].value_counts()

rating
4    78926
1    31570
3    31570
0    31570
2    31570
5    31570
Name: count, dtype: int64

In [20]:
# Convert standardized train_df to surprise dataset
reader = Reader(rating_scale=(train_df['normalised_rating'].min(), train_df['normalised_rating'].max()))
train_data = Dataset.load_from_df(train_df[['user_id', 'book_id', 'normalised_rating']], reader)

# Convert test_df to surprise dataset without normalization
test_data = Dataset.load_from_df(test_df[['user_id', 'book_id', 'rating']], reader)

# Build full trainset and testset
trainset = train_data.build_full_trainset()
testset = test_data.construct_testset([(uid, iid, r, {}) for uid, iid, r in test_df[['user_id', 'book_id', 'rating']].values])

In [None]:
param_grid = { 
    'n_factors': [190, 150, 100],  # Expanded n_factors range
    'n_epochs': [80, 60, 40],  # Expanded n_epochs range
    'reg_pu': [0.01, 0.05, 0.1],  # Regularization for user factors
    'reg_qi': [0.01, 0.05, 0.1],  # Regularization for item factors
    'lr_bu': [0.002, 0.005, 0.01],  # Learning rate for user bias
    'lr_bi': [0.002, 0.005, 0.01],  # Learning rate for item bias
    'lr_all': [0.002, 0.005, 0.01],  # Learning rate for all parameters
    'bias_type': ['bias', 'none'],  # Bias type (with or without biases)
    'random_state': [42]
}


gs = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=4)
gs.fit(train_data)

best_params = gs.best_params['rmse']
best_nmf = NMF(**best_params)
best_nmf.fit(train_data.build_full_trainset())
predictions = best_nmf.test(testset)

TypeError: __init__() got an unexpected keyword argument 'lr_all'

In [None]:
best_params

In [None]:
def reverse_bias_terms(uid, iid, est, user_bias, item_bias, global_mean, bias_scale=0.05):
    # Retrieve user and item bias with the updated user/item bias calculations
    user_b = user_bias.get(uid, 0)  # Default to 0 if the user/item is not in the training data
    item_b = item_bias.get(iid, 0)
    
    # Recalculate the unbiased prediction
    unbiased_prediction = est - user_b - item_b + global_mean

    # Scale the unbiased prediction to reduce the effect of bias
    unbiased_prediction = unbiased_prediction * bias_scale + global_mean  # Scale towards global mean
    
    return unbiased_prediction

# Rescale predictions by reversing bias terms
def unbiased_predictions(predictions, user_bias, item_bias, global_mean):
    adjusted_predictions = []
    
    for uid, iid, true_r, est, _ in predictions:
        # Calculate the unbiased prediction using the reverse_bias_terms function
        unbiased_prediction = reverse_bias_terms(uid, iid, est, user_bias, item_bias, global_mean)
        
        # Clip the rating to the original scale (e.g., 1 to 5) to avoid out-of-bound values
        unbiased_prediction = min(5, max(1, unbiased_prediction))
        
        # Append the adjusted predictions
        adjusted_predictions.append((uid, iid, true_r, unbiased_prediction, _))
    
    return adjusted_predictions

In [None]:
# Rescale predictions
adjusted_predictions = unbiased_predictions(predictions, user_bias, item_bias, global_mean)

# Evaluation metrics: Precision, Recall, and nDCG at k
def precision_recall_ndcg_at_k(predictions, k, threshold):
    """Return precision, recall, and nDCG at k metrics for each user."""
    
    # Helper function to calculate DCG and nDCG
    def dcg_at_k(scores, k):
        return sum([rel / log2(idx + 2) for idx, rel in enumerate(scores[:k])])

    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions = dict()
    recalls = dict()
    ndcgs = dict()
    
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        
        # Precision@K
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        
        # Recall@K
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        
        # nDCG@K
        actual = [true_r for (_, true_r) in user_ratings]
        ideal = sorted(actual, reverse=True)
        
        idcg = dcg_at_k(ideal, k)
        dcg = dcg_at_k([rel for (est, rel) in user_ratings], k)
        
        ndcgs[uid] = dcg / idcg if idcg > 0 else 0
    
    precision = sum(prec for prec in precisions.values()) / len(precisions)
    recall = sum(rec for rec in recalls.values()) / len(recalls)
    ndcg = sum(ndcg for ndcg in ndcgs.values()) / len(ndcgs)
    
    return precision, recall, ndcg

# Compute the evaluation metrics
precision, recall, ndcg = precision_recall_ndcg_at_k(adjusted_predictions, k=10, threshold=3.5)
print(f'Adjusted Precision: {precision}, Adjusted Recall: {recall}, Adjusted nDCG: {ndcg}')


In [None]:
adjusted_predictions

In [None]:
import matplotlib.pyplot as plt

# Extract the adjusted predictions
adjusted_rating_values = [pred[3] for pred in adjusted_predictions] 

# Plot the distribution of adjusted ratings
plt.figure(figsize=(10, 6))
plt.hist(adjusted_rating_values, bins=20, edgecolor='black', alpha=0.7)
plt.title('Distribution of Adjusted Predictions')
plt.xlabel('Adjusted Rating')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()


In [None]:
# Check some of the predictions before and after bias adjustment
for uid, iid, true_r, est, _ in predictions[:10]:
    print(f"User: {uid}, Book: {iid}, True Rating: {true_r}, Predicted: {est}")


for uid, iid, true_r, adjusted_est, _ in adjusted_predictions[:10]:
    print(f"User: {uid}, Book: {iid}, True Rating: {true_r}, Adjusted Prediction: {adjusted_est}")


In [None]:
print("Sample user biases:", {uid: user_bias.get(uid, 0) for uid in list(train_df['user_id'].head())})
print("Sample item biases:", {iid: item_bias.get(iid, 0) for iid in list(train_df['book_id'].head())})


In [None]:
for uid, iid, true_r, est, _ in predictions[:10]:
    print(f"Raw Prediction (before bias): {est} for User: {uid}, Book: {iid}")
