In [1]:
import numpy as np
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, Trainset, AlgoBase, accuracy
from tqdm import tqdm
from surprise import AlgoBase, Trainset
from surprise.model_selection import GridSearchCV
from sklearn.utils import resample
from collections import defaultdict
from math import log2
from sklearn.utils.extmath import randomized_svd
from sklearn.model_selection import train_test_split
import pickle
from custom_svd import RandomizedSVD

tqdm.pandas()

In [2]:
interactions = pd.read_pickle('../Pickle/interactions.pkl')

In [3]:
interactions = interactions[['user_id', 'book_id', 'rating', 'is_read']]  

In [4]:
from sklearn.utils import resample

# Separate lower ratings by their respective values
ratings_0 = interactions[interactions['rating'] == 0]
ratings_1 = interactions[interactions['rating'] == 1]
ratings_2 = interactions[interactions['rating'] == 2]
higher_ratings = interactions[interactions['rating'] > 3]

# Calculate the number of samples needed for each lower rating
higher_count = len(higher_ratings)

# Define sampling factors
factor_0 = 0.2
factor_1 = 0.2
factor_2 = 0.2

# Ensure n_samples are within valid limits
n_samples_0 = min(int(higher_count * factor_0), len(ratings_0))
n_samples_1 = min(int(higher_count * factor_1), len(ratings_1))
n_samples_2 = min(int(higher_count * factor_2), len(ratings_2))

# Apply oversampling
ratings_0_oversampled = resample(ratings_0, replace=True, n_samples=n_samples_0, random_state=42)
ratings_1_oversampled = resample(ratings_1, replace=True, n_samples=n_samples_1, random_state=42)
ratings_2_oversampled = resample(ratings_2, replace=True, n_samples=n_samples_2, random_state=42)

# Combine the datasets
interactions = pd.concat([higher_ratings, ratings_0_oversampled, ratings_1_oversampled, ratings_2_oversampled])


In [5]:
train_df, test_df = train_test_split(interactions, test_size=0.2, random_state=42)

In [6]:
# Calculate the global mean rating
global_mean = train_df['rating'].mean()

# Calculate user bias with regularization
lambda_reg = 10
user_sum_ratings = train_df.groupby('user_id')['rating'].sum()
user_count_ratings = train_df.groupby('user_id')['rating'].count()
user_bias = (user_sum_ratings - user_count_ratings * global_mean) / (user_count_ratings + lambda_reg)

# Map user bias back to the original dataframe
train_df['user_bias'] = train_df['user_id'].map(user_bias)
# Calculate item bias with regularization
item_sum_ratings = train_df.groupby('book_id')['rating'].sum()
item_count_ratings = train_df.groupby('book_id')['rating'].count()
item_bias = (item_sum_ratings - item_count_ratings * global_mean) / (item_count_ratings + lambda_reg)

# Map item bias back to the original dataframe
train_df['item_bias'] = train_df['book_id'].map(item_bias)
# Normalize ratings
train_df['normalised_rating'] = train_df['rating'] - train_df['user_bias'] - train_df['item_bias']

In [7]:
# Convert standardized train_df to surprise dataset
reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))

In [8]:
train_data = Dataset.load_from_df(train_df[['user_id', 'book_id', 'normalised_rating']], reader)
# Convert test_df to surprise dataset without normalization
test_data = Dataset.load_from_df(test_df[['user_id', 'book_id', 'rating']], reader)

In [9]:
# Build full trainset and testset
trainset = train_data.build_full_trainset()
testset = test_data.construct_testset([(uid, iid, r, {}) for uid, iid, r in test_df[['user_id', 'book_id', 'rating']].values])

In [10]:
# Define a parameter grid
param_grid = { 
    'n_factors':[60], 
    'n_iter': [18], 
    'random_state': [42], 
}


In [11]:
gs = GridSearchCV(RandomizedSVD, param_grid, measures=['rmse'], cv=2)
gs.fit(train_data)

In [12]:
best_params = gs.best_params['rmse']

In [13]:
best_params

{'n_factors': 60, 'n_iter': 18, 'random_state': 42}

In [14]:
best_randomized_svd = RandomizedSVD(**best_params)
best_randomized_svd.fit(train_data.build_full_trainset())

<custom_svd.RandomizedSVD at 0x1aa52b3e120>

In [15]:
predictions = best_randomized_svd.test(testset)

In [16]:
# Reverse bias terms
def reverse_bias_terms(uid, iid, est, user_bias, item_bias, global_mean):
    user_b = user_bias.get(uid, 0)  # Default to 0 if the user/item is not in the training data
    item_b = item_bias.get(iid, 0)
    unbiased_prediction = est - user_b - item_b + global_mean
    return unbiased_prediction

# Rescale predictions by reversing bias terms
def unbiased_predictions(predictions, user_bias, item_bias, global_mean):
    adjusted_predictions = []
    for uid, iid, true_r, est, _ in predictions:
        # Calculate the unbiased prediction
        unbiased_prediction = reverse_bias_terms(uid, iid, est, user_bias, item_bias, global_mean)
        # Clip the rating to the original scale (e.g., 1 to 5)
        unbiased_prediction = min(5, max(1, unbiased_prediction))
        adjusted_predictions.append((uid, iid, true_r, unbiased_prediction, _))
    return adjusted_predictions

# Rescale predictions
adjusted_predictions = unbiased_predictions(predictions, user_bias.to_dict(), item_bias.to_dict(), train_df['rating'].mean())

In [17]:
adjusted_predictions_df = pd.DataFrame(adjusted_predictions, columns=['user_id', 'book_id', 'rating', 'adjusted_rating', 'details'])

In [18]:
adjusted_predictions_df

Unnamed: 0,user_id,book_id,rating,adjusted_rating,details
0,1594,25945,5,3.437699,{'was_impossible': False}
1,1264,21740,2,5.000000,{'was_impossible': False}
2,1688,968,4,5.000000,{'was_impossible': False}
3,1379,6868,1,3.907819,{'was_impossible': False}
4,1442,221416,0,5.000000,{'was_impossible': False}
...,...,...,...,...,...
77897,222,1621,0,4.761635,{'was_impossible': False}
77898,680,1368,5,4.673668,{'was_impossible': False}
77899,1310,40168,4,3.665991,{'was_impossible': False}
77900,297,33758,4,5.000000,{'was_impossible': False}


In [19]:
def precision_recall_ndcg_at_k(predictions, k, threshold):
    """Return precision, recall, and nDCG at k metrics for each user."""
    
    # Helper function to calculate DCG and nDCG
    def dcg_at_k(scores, k):
        return sum([rel / log2(idx + 2) for idx, rel in enumerate(scores[:k])])

    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions = dict()
    recalls = dict()
    ndcgs = dict()
    
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        
        # Precision@K
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        
        # Recall@K
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        
        # nDCG@K
        actual = [true_r for (_, true_r) in user_ratings]
        ideal = sorted(actual, reverse=True)
        
        idcg = dcg_at_k(ideal, k)
        dcg = dcg_at_k([rel for (est, rel) in user_ratings], k)
        
        ndcgs[uid] = dcg / idcg if idcg > 0 else 0
    
    precision = sum(prec for prec in precisions.values()) / len(precisions)
    recall = sum(rec for rec in recalls.values()) / len(recalls)
    ndcg = sum(ndcg for ndcg in ndcgs.values()) / len(ndcgs)
    
    return precision, recall, ndcg

In [20]:
precision, recall, ndcg = precision_recall_ndcg_at_k(adjusted_predictions, k=10, threshold=2)
print(f'Adjusted Precision: {precision}, Adjusted Recall: {recall}, Adjusted nDCG: {ndcg}')

Adjusted Precision: 0.886157343196182, Adjusted Recall: 0.45937581866160715, Adjusted nDCG: 0.8130718590717932


In [21]:
# Save the trained model
model_filename = '../Pickle/svd_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(best_randomized_svd, model_file)


In [22]:
biases_filename = '../Pickle/biases.pkl'
# Save biases
biases = {
    'user_bias': user_bias,
    'item_bias': item_bias,
    'global_mean': global_mean
}

with open(biases_filename, 'wb') as biases_file:
    pickle.dump(biases, biases_file)