In [None]:
import numpy as np
import pandas as pd
from surprise import Reader, Dataset
from tqdm import tqdm
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from collections import defaultdict
from math import log2
from sklearn.utils import resample
tqdm.pandas()
from surprise import NMF

In [None]:
# Read the pickled interaction log and book catalogue from disk and keep only
# four interaction columns.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
interactions = pd.read_pickle('../Pickle/interactions.pkl')[
    ['user_id', 'book_id', 'rating', 'is_read']
]
books = pd.read_pickle('../Pickle/books.pkl')

In [None]:
# Show how many interactions carry each rating value. Rows with rating 0 are
# removed in the next cell (presumably 0 marks "no rating given" -- TODO confirm).
interactions['rating'].value_counts()

rating
4    178755
5    143523
3    111691
0     29983
2     29237
1      8360
Name: count, dtype: int64

In [None]:
# Keep only interactions whose rating is non-zero.
interactions = interactions[interactions['rating'].ne(0)]

In [None]:
# Keep books with at least 5 interactions, then users with at least 5
# interactions among the remaining rows. The filter is a single pass: books are
# not re-checked after the user filter has removed rows.
book_counts = interactions['book_id'].value_counts()
interactions = interactions[interactions['book_id'].isin(book_counts.index[book_counts >= 5])]
user_counts = interactions['user_id'].value_counts()
interactions = interactions[interactions['user_id'].isin(user_counts.index[user_counts >= 5])]

# Per-user hold-out: every user contributes 80% of their rows to the training
# set and 20% to the test set.
train_list, test_list = [], []

for _, user_rows in interactions.groupby('user_id'):
    # Deterministic shuffle of this user's rows before splitting
    shuffled_rows = user_rows.sample(frac=1, random_state=42).reset_index(drop=True)

    # NOTE(review): every row in a group shares the same user_id, so the
    # stratify column has a single class; it is kept so the split stays
    # identical to earlier runs.
    train_part, test_part = train_test_split(
        shuffled_rows,
        test_size=0.2,
        random_state=42,
        stratify=shuffled_rows['user_id'],
    )

    train_list.append(train_part)
    test_list.append(test_part)

# Stack the per-user pieces into one training frame and one test frame
train_df = pd.concat(train_list, ignore_index=True)
test_df = pd.concat(test_list, ignore_index=True)


In [None]:
# Roughly balance the rating classes of the training set by replicating the
# rows of each class a whole number of times and jittering the ratings.
#
# NOTE: this cell overwrites `train_df`. Re-running it without re-running the
# split cell would operate on the already replicated, already jittered data.

# Number of training rows per rating value
rating_counts = train_df['rating'].value_counts()

# Size of the largest rating class
majority_count = rating_counts.max()

# Seeded generator so the jitter (and everything trained on it) is reproducible,
# consistent with the random_state=42 used elsewhere in this notebook.
rng = np.random.default_rng(42)

# List collecting one replicated-and-jittered frame per rating class
modified_dfs = []

for rating, count in rating_counts.items():
    class_df = train_df[train_df['rating'] == rating]

    # Whole-number replication factor (floor). A class holding more than half
    # as many rows as the majority class gets a factor of 1, i.e. no extra copies.
    num_duplicates = int(majority_count // count)

    duplicated_df = pd.concat([class_df] * num_duplicates, ignore_index=True)

    # Jitter every rating (including those of un-replicated classes) by up to
    # +/-0.1, then clamp back into the 1..5 range.
    noise = rng.uniform(-0.1, 0.1, size=duplicated_df.shape[0])
    duplicated_df['rating'] = (duplicated_df['rating'] + noise).clip(1, 5)
    modified_dfs.append(duplicated_df)

# Combine all classes again
balanced_train = pd.concat(modified_dfs)

# Shuffle the rows deterministically and replace the training frame
train_df = balanced_train.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Wrap the pandas frames in Surprise containers.
rating_columns = ['user_id', 'book_id', 'rating']

# The rating scale is taken from the smallest and largest rating in train_df.
reader = Reader(
    rating_scale=(train_df['rating'].min(), train_df['rating'].max())
)

train_data = Dataset.load_from_df(train_df[rating_columns], reader)
test_data = Dataset.load_from_df(test_df[rating_columns], reader)

# Train on every row of the training frame
trainset = train_data.build_full_trainset()

# Hand the held-out rows to Surprise as (user, item, rating, {}) tuples
raw_testset = [(uid, iid, r, {}) for uid, iid, r in test_df[rating_columns].values]
testset = test_data.construct_testset(raw_testset)

In [None]:
# Hyper-parameter grid for Surprise's NMF.
#
# The bias learning rates (lr_bu / lr_bi) are intentionally not part of the
# grid: NMF only uses bias terms when biased=True, and this grid leaves
# `biased` at its default (False). Searching over them multiplied the number of
# fits by four without changing any score. To actually tune them, add
# 'biased': [True] together with 'lr_bu' / 'lr_bi' (and consider 'reg_bu' /
# 'reg_bi') to the grid.
param_grid = {
    'n_factors': [400, 300],
    'n_epochs': [250, 150],
    'reg_pu': [0.1, 0.01],  # Regularization for user factors
    'reg_qi': [0.1, 0.01],  # Regularization for item factors
    'random_state': [42],
}

# 2-fold cross-validated grid search scored by RMSE.
# NOTE(review): train_data is built from the upsampled frame, which contains
# replicated (user, book) rows; copies of one interaction can land in both
# folds, so the cross-validated RMSE is likely optimistic -- confirm this is
# acceptable.
gs = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=2)
gs.fit(train_data)

# Refit a fresh model with the selected parameters on the full training set
best_params = gs.best_params['rmse']
best_nmf = NMF(**best_params)
best_nmf.fit(trainset)

# Predict the held-out interactions
predictions = best_nmf.test(testset)

In [None]:
# import joblib
# joblib.dump(best_nmf, '../Pickle/best_nmf_model.pkl')

In [None]:
# Display the parameter combination the grid search selected for RMSE.
best_params

In [None]:
# Evaluation metrics: Precision, Recall, and nDCG at k
def precision_recall_ndcg_at_k(predictions, k, threshold):
    """Return mean Precision@k, Recall@k and nDCG@k over all users.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Records unpacked positionally as ``(uid, iid, true_r, est, details)``.
    k : int
        Number of top-ranked items (ranked by ``est``, highest first)
        considered for each user.
    threshold : float
        An item is *relevant* when ``true_r >= threshold`` and *recommended*
        when ``est >= threshold``.

    Returns
    -------
    tuple
        ``(precision, recall, ndcg)``, each the unweighted mean of the
        per-user values. ``(0.0, 0.0, 0.0)`` when ``predictions`` is empty.

    Notes
    -----
    * Precision divides by the number of *recommended* items among the top k
      (not by k); recall divides by all relevant items of the user.
    * Undefined ratios count as 1: precision when nothing in the top k is
      recommended, recall when the user has no relevant item. This is an
      optimistic convention that raises the averages.
    * nDCG uses the true rating itself as the gain and is 0 when the ideal
      DCG is not positive.
    """

    def dcg_at_k(scores, k):
        """Discounted cumulative gain of the first ``k`` entries of ``scores``."""
        return sum(rel / log2(idx + 2) for idx, rel in enumerate(scores[:k]))

    # Group (estimated, true) rating pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    # Without any prediction there is no user to average over; return zeros
    # instead of dividing by zero below.
    if not user_est_true:
        return 0.0, 0.0, 0.0

    precisions = dict()
    recalls = dict()
    ndcgs = dict()

    for uid, user_ratings in user_est_true.items():
        # Rank this user's items by estimated rating, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in top_k)
        n_rel_and_rec_k = sum(
            (true_r >= threshold) and (est >= threshold) for est, true_r in top_k
        )

        # Precision@K (counted as 1 when nothing in the top k is recommended)
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K (counted as 1 when the user has no relevant item)
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

        # nDCG@K: true ratings in predicted order versus their best possible order
        actual = [true_r for _, true_r in user_ratings]
        idcg = dcg_at_k(sorted(actual, reverse=True), k)
        dcg = dcg_at_k(actual, k)
        ndcgs[uid] = dcg / idcg if idcg > 0 else 0

    n_users = len(user_est_true)
    precision = sum(precisions.values()) / n_users
    recall = sum(recalls.values()) / n_users
    ndcg = sum(ndcgs.values()) / n_users

    return precision, recall, ndcg

# Score the held-out predictions: top-10 list per user, ratings of 4 or more
# count as relevant / recommended.
precision, recall, ndcg = precision_recall_ndcg_at_k(
    predictions,
    k=10,
    threshold=4,
)
print(f'Adjusted Precision: {precision}, Adjusted Recall: {recall}, Adjusted nDCG: {ndcg}')


In [None]:
import matplotlib.pyplot as plt

# Estimated rating of every test prediction (position 3 of each prediction tuple)
rating_values = [pred[3] for pred in predictions]

# Histogram of the estimated ratings
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(rating_values, bins=20, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Predictions')
ax.set_xlabel('Adjusted Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# import numpy as np
# import pandas as pd
# from surprise import Reader, Dataset, NMF
# from surprise.model_selection import GridSearchCV
# from sklearn.model_selection import train_test_split
# from collections import defaultdict
# from math import log2
# import matplotlib.pyplot as plt
# from sklearn.utils import resample
# from tqdm import tqdm
# tqdm.pandas()

# # Function to load and filter data
# def load_and_filter_data():
#     interactions = pd.read_pickle('../Pickle/interactions.pkl')
#     books = pd.read_pickle('../Pickle/books.pkl')
#     interactions = interactions[['user_id', 'book_id', 'rating', 'is_read']]
    
#     # Step 1: Compute the average rating per user
#     user_avg_rating = interactions[interactions['rating'] > 0].groupby('user_id')['rating'].mean().round().astype(int)

#     # Step 2: Round the book's average rating column
#     books['average_rating'] = pd.to_numeric(books['average_rating'], errors='coerce').round().astype('Int64')
#     book_avg = books[['average_rating', 'book_id']]
#     interactions = pd.merge(interactions, book_avg, on='book_id')

#     # Step 3: Impute missing ratings
#     def impute_rating(row):
#         if row['rating'] == 0:
#             return user_avg_rating.get(row['user_id'], row['average_rating'])  # Use user avg or book avg
#         return row['rating']
    
#     interactions['rating'] = interactions.apply(impute_rating, axis=1)
    
#     # Filter user IDs and book IDs with fewer than 5 occurrences
#     book_counts = interactions['book_id'].value_counts()
#     interactions = interactions[interactions['book_id'].isin(book_counts[book_counts >= 5].index)]
    
#     user_counts = interactions['user_id'].value_counts()
#     interactions = interactions[interactions['user_id'].isin(user_counts[user_counts >= 5].index)]
    
#     return interactions

# # Function to split data into train and test
# def split_data(interactions):
#     train_list, test_list = [], []
#     for user_id, user_data in interactions.groupby('user_id'):
#         user_data_shuffled = user_data.sample(frac=1, random_state=42).reset_index(drop=True)
#         train, test = train_test_split(user_data_shuffled, test_size=0.2, random_state=42, stratify=user_data_shuffled['user_id'])
#         train_list.append(train)
#         test_list.append(test)
    
#     train_df = pd.concat(train_list, ignore_index=True)
#     test_df = pd.concat(test_list, ignore_index=True)
    
#     return train_df, test_df

# # Function to apply random noise to balance the dataset
# def apply_random_noise_upsampling(train_df):
#     rating_counts = train_df['rating'].value_counts()
#     majority_count = rating_counts.max()
    
#     modified_dfs = []
    
#     for rating, count in rating_counts.items():
#         class_df = train_df[train_df['rating'] == rating]
#         num_duplicates = int(majority_count / count)
#         duplicated_df = pd.concat([class_df] * num_duplicates, ignore_index=True)
        
#         # Add random noise to ratings
#         duplicated_df['rating'] = duplicated_df['rating'] + np.random.uniform(-0.1, 0.1, size=duplicated_df.shape[0])
#         duplicated_df['rating'] = duplicated_df['rating'].clip(1, 5)
        
#         modified_dfs.append(duplicated_df)
    
#     balanced_train = pd.concat(modified_dfs)
#     train_df = balanced_train.sample(frac=1, random_state=42).reset_index(drop=True)
    
#     return train_df

# # Function to convert data to Surprise dataset
# def convert_to_surprise(train_df, test_df):
#     reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
#     train_data = Dataset.load_from_df(train_df[['user_id', 'book_id', 'rating']], reader)
#     test_data = Dataset.load_from_df(test_df[['user_id', 'book_id', 'rating']], reader)

#     trainset = train_data.build_full_trainset()
#     testset = test_data.construct_testset([(uid, iid, r, {}) for uid, iid, r in test_df[['user_id', 'book_id', 'rating']].values])
    
#     return train_data, test_data, trainset, testset

# # Function to perform GridSearchCV and find the best parameters
# def grid_search(train_data):
#     param_grid = { 
#         'n_factors': [400, 300],  
#         'n_epochs': [250, 150],  
#         'reg_pu': [0.1, 0.01],
#         'reg_qi': [0.1, 0.01],
#         'lr_bu': [0.01, 0.1],
#         'lr_bi': [0.01, 0.1],
#         'random_state': [42]
#     }

#     gs = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=2)
#     gs.fit(train_data)
    
#     best_params = gs.best_params['rmse']
#     best_nmf = NMF(**best_params)
#     best_nmf.fit(train_data.build_full_trainset())
    
#     return best_nmf, gs.best_params

# # Function to compute evaluation metrics
# def precision_recall_ndcg_at_k(predictions, k, threshold):
#     def dcg_at_k(scores, k):
#         return sum([rel / log2(idx + 2) for idx, rel in enumerate(scores[:k])])

#     user_est_true = defaultdict(list)
#     for uid, _, true_r, est, _ in predictions:
#         user_est_true[uid].append((est, true_r))

#     precisions, recalls, ndcgs = dict(), dict(), dict()
    
#     for uid, user_ratings in user_est_true.items():
#         user_ratings.sort(key=lambda x: x[0], reverse=True)
        
#         n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
#         n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
#         n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        
#         # Precision@K
#         precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        
#         # Recall@K
#         recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        
#         # nDCG@K
#         actual = [true_r for (_, true_r) in user_ratings]
#         ideal = sorted(actual, reverse=True)
        
#         idcg = dcg_at_k(ideal, k)
#         dcg = dcg_at_k([rel for (est, rel) in user_ratings], k)
#         ndcgs[uid] = dcg / idcg if idcg > 0 else 0

#     precision = sum(prec for prec in precisions.values()) / len(precisions)
#     recall = sum(rec for rec in recalls.values()) / len(recalls)
#     ndcg = sum(ndcg for ndcg in ndcgs.values()) / len(ndcgs)
    
#     return precision, recall, ndcg

# # Function to plot the results
# def plot_results(predictions):
#     rating_values = [pred[3] for pred in predictions]
#     plt.figure(figsize=(10, 6))
#     plt.hist(rating_values, bins=20, edgecolor='black', alpha=0.7)
#     plt.title('Distribution of Predictions')
#     plt.xlabel('Adjusted Rating')
#     plt.ylabel('Frequency')
#     plt.show()

# # Main function to run the entire pipeline
# def main():
#     interactions = load_and_filter_data()
#     train_df, test_df = split_data(interactions)
#     train_df = apply_random_noise_upsampling(train_df)
    
#     train_data, test_data, trainset, testset = convert_to_surprise(train_df, test_df)
#     best_nmf, best_params = grid_search(train_data)
    
#     # Test the model
#     predictions = best_nmf.test(testset)
    
#     # Compute metrics
#     precision, recall, ndcg = precision_recall_ndcg_at_k(predictions, k=10, threshold=3.5)
#     print(f'Adjusted Precision: {precision}, Adjusted Recall: {recall}, Adjusted nDCG: {ndcg}')
    
#     # Plot the distribution of predictions
#     plot_results(predictions)

# if __name__ == "__main__":
#     main()
