In [None]:
import numpy as np
import pandas as pd
from surprise import Reader, Dataset
from tqdm import tqdm
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from collections import defaultdict
from math import log2
tqdm.pandas()
from surprise import NMF
from sklearn.utils import resample

In [None]:
# Seed NumPy's global RNG so the np.random calls further down are repeatable.
np.random.seed(42)

In [None]:
# Load and filter data
# Read the pickled interaction, book and imputed-rating tables from ../Pickle.
interactions = pd.read_pickle('../Pickle/interactions.pkl')
books = pd.read_pickle('../Pickle/books.pkl')
imputed = pd.read_pickle('../Pickle/imputed_ratings.pkl')
# Restrict the interactions table to these four columns.
interactions = interactions[['user_id', 'book_id', 'rating', 'is_read']]  

In [None]:
# Drop rows whose rating is 0, then append the imputed ratings.
# NOTE(review): presumably a rating of 0 means "not rated" - confirm against the data source.
interactions = interactions[interactions['rating'] != 0]
frames = [interactions, imputed]
# ignore_index=True relabels the rows 0..n-1. Without it the two frames keep
# their own labels, which can repeat, and any later selection by index label
# would then match rows it was never meant to.
interactions = pd.concat(frames, ignore_index=True)

In [None]:
def split_data_with_single_interactions(df):
    """Split interactions into train/test, forcing one-off users and books into train.

    A row is a "single interaction" when its ``user_id`` or its ``book_id``
    occurs exactly once in ``df``. Those rows are held out of the random split
    and appended to the training part; the remaining rows are split 80/20 with
    ``train_test_split`` (``random_state=42``), stratified on ``rating``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``book_id`` and ``rating`` columns. The index
        may contain repeated labels.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``; ``train_df`` is re-indexed via
        ``ignore_index=True``.
    """
    # Identify users and books that appear only once in the dataset
    user_counts = df['user_id'].value_counts()
    book_counts = df['book_id'].value_counts()

    # Positional boolean mask: True where the user or the book appears only once
    is_single = (
        df['user_id'].isin(user_counts[user_counts == 1].index)
        | df['book_id'].isin(book_counts[book_counts == 1].index)
    ).to_numpy()

    single_interactions = df[is_single]

    # Take the complement with the same positional mask rather than by index
    # label: if labels repeat, a label-based filter would also drop unrelated
    # rows that merely share a label with a single interaction.
    remaining_interactions = df[~is_single]

    # Split the remaining interactions into train and test using sklearn's train_test_split
    train_df, test_df = train_test_split(
        remaining_interactions,
        test_size=0.2,
        random_state=42,
        stratify=remaining_interactions['rating'],
    )

    # Add the single interactions to the training set
    train_df = pd.concat([train_df, single_interactions], ignore_index=True)

    return train_df, test_df

# Build the train/test frames from the combined interactions.
train_df, test_df = split_data_with_single_interactions(interactions)

In [None]:
# Number of training rows per distinct rating value.
rating_counts = train_df['rating'].value_counts()

# The largest class sets the reference size; smaller classes are grown to 75% of it.
majority_count = rating_counts.max()
MAX_TARGET_SIZE = int(0.75 * majority_count)

modified_dfs = []

# Walk the classes in value_counts order so the global RNG is consumed in a fixed sequence.
for rating, count in rating_counts.items():
    class_df = train_df[train_df['rating'] == rating]

    if count < MAX_TARGET_SIZE:
        # Upsample with replacement up to the target size.
        balanced_df = resample(
            class_df,
            replace=True,
            n_samples=MAX_TARGET_SIZE,
            random_state=42,
        )
        # Jitter the duplicated ratings by at most +/-0.1, then keep them inside [1, 5].
        jitter = np.random.uniform(-0.1, 0.1, size=balanced_df.shape[0])
        balanced_df['rating'] = (balanced_df['rating'] + jitter).clip(1, 5)
    else:
        # Class is already at or above the target size: keep it unchanged.
        balanced_df = class_df.copy()

    modified_dfs.append(balanced_df)


balanced_train = pd.concat(modified_dfs, ignore_index=True)

# Shuffle the balanced rows and replace train_df with the result.
train_df = balanced_train.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Log scale ratings
def normalise_ratings(train_data, test_data):
    """Log-scale the ``rating`` column of both splits.

    If the smallest training rating is negative, both splits are first shifted
    by that minimum so the training ratings start at 0; ``log1p`` is then
    applied to the ``rating`` column of each split.

    The inputs are not modified: the function works on copies, so it is safe to
    pass slices of other frames and to re-run the call.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames with a numeric ``rating`` column.

    Returns
    -------
    tuple
        ``(train_data, test_data, min_rating)`` where the frames are the
        transformed copies and ``min_rating`` is the minimum training rating
        before any transformation.
    """
    # Work on copies so the caller's frames stay untouched.
    train_data = train_data.copy()
    test_data = test_data.copy()

    min_rating = train_data['rating'].min()

    # Only a negative minimum triggers the shift; the test split uses the
    # training minimum as well so both stay on the same scale.
    if min_rating < 0:
        train_data['rating'] = train_data['rating'] - min_rating
        test_data['rating'] = test_data['rating'] - min_rating

    train_data['rating'] = np.log1p(train_data['rating'])
    test_data['rating'] = np.log1p(test_data['rating'])

    return train_data, test_data, min_rating


def denormalize_rating(log_scaled_ratings, min_rating):
    """Map log1p-scaled ratings back to the original rating scale.

    This is the inverse of ``normalise_ratings``: ``expm1`` undoes ``log1p``,
    and the training minimum is added back only when it is negative, because
    that is the only case in which ``normalise_ratings`` subtracts it.

    Parameters
    ----------
    log_scaled_ratings : array-like of float
        Ratings on the log1p scale.
    min_rating : float or None
        Minimum training rating as returned by ``normalise_ratings``. ``None``
        and non-negative values mean no shift was applied.

    Returns
    -------
    numpy.ndarray
        Ratings on the original scale, clipped to ``[0, 5]``.
    """
    log_scaled_ratings = np.asarray(log_scaled_ratings, dtype=float)

    # Reverse log1p transformation
    original_ratings = np.expm1(log_scaled_ratings)

    # Undo the shift only if one was applied (negative training minimum).
    # Adding a positive minimum here would inflate every rating by that amount.
    if min_rating is not None and min_rating < 0:
        original_ratings = original_ratings + min_rating

    # Clip values between 0 and 5
    return np.clip(original_ratings, 0, 5)

In [None]:
# Log-scale both splits; min_rating is the training minimum reported by normalise_ratings.
train_df, test_df, min_rating = normalise_ratings(train_df, test_df)

In [None]:
# Give Surprise the observed range of the training ratings, then wrap the
# (user, book, rating) triples as a Surprise dataset.
rating_column = train_df['rating']
reader = Reader(rating_scale=(rating_column.min(), rating_column.max()))
train_data = Dataset.load_from_df(
    train_df.loc[:, ['user_id', 'book_id', 'rating']],
    reader,
)

In [None]:
# Define the parameter grid
# Every hyper-parameter below has a single candidate value, so the grid holds
# exactly one configuration.
param_grid = { 
    'n_factors': [50],  
    'n_epochs': [200],  
    'reg_pu': [0.01],  # Regularization for user factors
    'reg_qi': [0.01],  # Regularization for item factors
    # NOTE(review): in Surprise's NMF the bias learning rates appear to apply
    # only when biased=True, which is not set here - confirm that lr_bu / lr_bi
    # have any effect.
    'lr_bu': [0.01],  # Learning rate for user bias
    'lr_bi': [0.01],  # Learning rate for item bias
    'random_state': [42]
}

# 2-fold cross-validation over the grid, scored with RMSE and MAE.
gs = GridSearchCV(NMF, param_grid, measures=['rmse', 'mae'], cv=2)

gs.fit(train_data)

# Create a new, not-yet-fitted NMF from the RMSE-best parameters; it is fitted
# on the full training set in a later cell.
best_params = gs.best_params['rmse']
best_nmf = NMF(**best_params, verbose = True)

print(f"Best RMSE: {gs.best_score['rmse']:.4f}")
print(f"Best MAE: {gs.best_score['mae']:.4f}")


print(f"Best Parameters: {best_params}")

In [None]:
# Fit on every training rating, then score the held-out rows.
trainset = train_data.build_full_trainset()
# itertuples keeps each column's own dtype. `.values` on a frame with mixed
# dtypes would coerce all three columns to one common dtype (for example
# integer ids to float), so the ids handed to the model could differ in type
# from the ids it was trained on.
testset = list(
    test_df[['user_id', 'book_id', 'rating']].itertuples(index=False, name=None)
)
best_nmf.fit(trainset)
predictions = best_nmf.test(testset)

In [None]:
from surprise import accuracy
# RMSE and MAE of the held-out predictions. The ratings in `predictions` are
# still on the log1p scale at this point, so these errors are too.
test_rmse = accuracy.rmse(predictions)
test_mae = accuracy.mae(predictions)

In [None]:
import joblib
# Persist the fitted model alongside the other pickles.
joblib.dump(best_nmf, '../Pickle/best_nmf_model.pkl')

In [None]:
# Show only a few predictions; displaying the full list floods the notebook output.
predictions[:5]

In [None]:
def evaluate_model(predictions, min_rating, k, threshold, item_popularity, num_users):
    """Compute top-k ranking metrics from rating predictions.

    Ratings in ``predictions`` are expected on the log1p scale produced by
    ``normalise_ratings``. They are mapped back with ``expm1`` (plus the
    training minimum when that minimum is negative) before being compared with
    ``threshold``, so ``threshold`` is expressed on the original rating scale.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        ``(uid, iid, true_r, est, details)`` per test interaction.
    min_rating : float or None
        Training minimum returned by ``normalise_ratings``. Only a negative
        value is added back, mirroring the shift applied there.
    k : int
        Length of each user's recommendation list.
    threshold : float
        A true rating at or above this value counts as relevant.
    item_popularity : dict
        Maps item id to its interaction count; missing items count as 1.
    num_users : int
        Divisor used to turn popularity counts into a fraction for novelty.

    Returns
    -------
    tuple of 8 floats
        ``(precision@k, recall@k, nDCG@k, MAP@k, MRR@k, novelty, hit rate,
        user coverage)``, each averaged over users. User coverage is the share
        of users with at least one relevant item anywhere in their test
        ratings (hit rate is the top-k counterpart). All values are 0.0 when
        ``predictions`` is empty.
    """
    # Shift to add back after expm1; zero unless normalisation shifted the data.
    shift = min_rating if (min_rating is not None and min_rating < 0) else 0.0

    def _to_rating(value):
        # Inverse of the log1p scaling. Monotonic, so rankings are unchanged.
        return np.expm1(value) + shift

    def _mean(values):
        # Average that degrades to 0.0 instead of NaN for an empty list.
        return np.mean(values) if values else 0.0

    # Group (item, estimated rating, true rating) triples per user.
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((iid, _to_rating(est), _to_rating(true_r)))

    precisions = []
    recalls = []
    ndcgs = []
    map_scores = []
    mrr_scores = []
    novelty_scores = []
    hit_rate_scores = []
    users_with_relevant = 0

    for user_ratings in user_est_true.values():
        # Rank this user's test items by estimated rating, best first.
        top_k = sorted(user_ratings, key=lambda x: x[1], reverse=True)[:k]
        top_k_items = [iid for iid, _, _ in top_k]
        relevant_at_rank = [true_r >= threshold for _, _, true_r in top_k]

        n_rel = sum((true_r >= threshold) for (_, _, true_r) in user_ratings)
        n_rel_and_rec_k = sum(relevant_at_rank)
        if n_rel > 0:
            users_with_relevant += 1

        # Precision and Recall
        precisions.append(n_rel_and_rec_k / k if k != 0 else 0)
        recalls.append(n_rel_and_rec_k / n_rel if n_rel != 0 else 0)

        # NDCG@k with binary gains; the ideal list puts all relevant items first.
        dcg = sum(
            is_relevant / np.log2(idx + 2)
            for idx, is_relevant in enumerate(relevant_at_rank)
        )
        idcg = sum(1.0 / np.log2(idx + 2) for idx in range(min(n_rel, k)))
        ndcgs.append(dcg / idcg if idcg != 0 else 0)

        # MAP@k: average of precision values taken at each relevant rank.
        hits = 0
        sum_precisions = 0
        for idx, is_relevant in enumerate(relevant_at_rank):
            if is_relevant:
                hits += 1
                sum_precisions += hits / (idx + 1)
        ap_denominator = min(n_rel, k)
        map_scores.append(sum_precisions / ap_denominator if ap_denominator != 0 else 0)

        # MRR: reciprocal of the 1-based rank of the first relevant item, else 0.
        first_relevant_rank = next(
            (idx + 1 for idx, is_relevant in enumerate(relevant_at_rank) if is_relevant),
            None,
        )
        mrr_scores.append(1 / first_relevant_rank if first_relevant_rank is not None else 0)

        # Novelty (1 - average popularity of the recommended items)
        novelty = (
            np.mean([1 - (item_popularity.get(iid, 1) / num_users) for iid in top_k_items])
            if top_k_items
            else 0
        )
        novelty_scores.append(novelty)

        # Hit Rate: 1 when the top-k holds at least one relevant item.
        hit_rate_scores.append(1 if n_rel_and_rec_k > 0 else 0)

    # User coverage: share of users that have any relevant item in their test ratings.
    user_coverage = users_with_relevant / len(user_est_true) if user_est_true else 0.0

    # Return results
    return (
        _mean(precisions),
        _mean(recalls),
        _mean(ndcgs),
        _mean(map_scores),
        _mean(mrr_scores),
        _mean(novelty_scores),
        _mean(hit_rate_scores),
        user_coverage
    )


k = 5
threshold = 4  # Relevance threshold

# Popularity counts and the user total both come from the test split.
num_users = test_df['user_id'].nunique()
item_popularity = test_df['book_id'].value_counts().to_dict()

# Compute metrics
metrics = evaluate_model(predictions, min_rating, k, threshold, item_popularity, num_users)
precision, recall, ndcg, map_score, mrr, novelty, hit_rate, user_coverage = metrics

# Print results
print(f'Precision@{k}: {precision:.4f}')
print(f'Recall@{k}: {recall:.4f}')
print(f'nDCG@{k}: {ndcg:.4f}')
print(f'MAP@{k}: {map_score:.4f}')
print(f'MRR@{k}: {mrr:.4f}')
print(f'Novelty: {novelty:.4f}')
print(f'Hit Rate: {hit_rate:.4f}')
print(f'User Coverage: {user_coverage:.4f}')


In [None]:
# Collect estimates and ground truth from the predictions and pass each array
# through denormalize_rating.
est_ratings = denormalize_rating(
    np.array([p.est for p in predictions], dtype=float),
    min_rating,
)
true_ratings = denormalize_rating(
    np.array([p.r_ui for p in predictions], dtype=float),
    min_rating,
)

In [None]:
# Mean of the denormalized true ratings in the test predictions.
true_ratings.mean()

In [None]:
# Mean of the denormalized estimated ratings, for comparison with the true mean.
est_ratings.mean()

In [None]:
import matplotlib.pyplot as plt
# Histogram of the denormalized predicted ratings.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(est_ratings, bins=20, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Predictions')
ax.set_xlabel('Adjusted Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Histogram of the rating column of the combined interactions frame.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(interactions['rating'], bins=20, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of True Ratings')
ax.set_xlabel('Adjusted Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# After fitting your best_nmf model
# Grab the learned factor matrices from the fitted model for inspection below.
pu = best_nmf.pu  # user latent features (users x factors)
qi = best_nmf.qi  # item latent features (items x factors)


In [None]:

# Explain one user/item pair by splitting the factor dot product into
# per-factor terms.
# NOTE(review): both indices are hard-coded inner ids; they are only meaningful
# for the trainset this model was fitted on.
user_idx = 1611  # Internal user id (depends on Surprise's inner mapping)
item_idx = 39335 # Internal item id (depends on Surprise's inner mapping)

user_factors = pu[user_idx]
item_factors = qi[item_idx]

# Compute element-wise contributions
contributions = user_factors * item_factors
# Sum of the per-factor products (not used again in this cell).
# NOTE(review): presumably equals the model's raw estimate when no bias terms are used - confirm.
predicted_rating = contributions.sum()

# Get indices of the top 5 absolute contributions
top_5_indices = np.argsort(np.abs(contributions))[::-1][:5]

# Print the top 5 contributions
print(f"Top 5 contributing latent factors for prediction (user {user_idx} and item {item_idx}):\n")

# Factor numbers are printed 1-based (i+1); the arrays themselves are 0-based.
for rank, i in enumerate(top_5_indices, 1):
    print(f"{rank}. Latent Factor {i+1}: User affinity {user_factors[i]:.3f} * "
          f"Item relevance {item_factors[i]:.3f} = {contributions[i]:.3f}")

In [None]:
import pickle
# Re-read books.pkl as a sequence of pickled objects: keep unpickling from the
# same file handle until EOF, then stitch the pieces into one frame.
# NOTE(review): pickle.load can execute arbitrary code - only use on trusted files.
books_list = []
with open('../Pickle/books.pkl', 'rb') as file:
    while True:
        try:
            chunk = pickle.load(file)
            books_list.append(chunk)
        except EOFError:
            # End of file reached: no more pickled objects to read.
            break  
books = pd.concat(books_list, ignore_index=True)
# Keep only the first row for each title.
books = books.drop_duplicates(subset='title', keep='first')

In [None]:
top_n = 5

# Factors with the largest contribution to this user/item pair, largest first.
top_factors_idx = contributions.argsort()[-top_n:][::-1]

# item_idx is a Surprise inner id, not a row position in `books`, and a latent
# factor index is not a book at all. Translate the inner id back to the raw
# book_id through the trainset, then look the title up by book_id.
book_id_from_idx = trainset.to_raw_iid(item_idx)
title_matches = books.loc[books['book_id'] == book_id_from_idx, 'title']
# NOTE(review): assumes books['book_id'] uses the same dtype as the ids the
# model was trained on; the row may also be absent because `books` was
# de-duplicated by title.
book_title = title_matches.iloc[0] if not title_matches.empty else 'Unknown title'

# The book is the same for every factor, so report it once.
print(f"Book: {book_title}\n")
for i in top_factors_idx:
    print(f"Latent Factor {i+1}: User affinity {user_factors[i]:.3f} * Item relevance {item_factors[i]:.3f} = {contributions[i]:.3f}")
