In [5]:
#!pip install cornac --upgrade
import pandas as pd
import cornac
import os
from cornac.eval_methods import RatioSplit
from cornac.data import Reader, Dataset
from cornac.models import MostPop, MF, PMF, BPR, NeuMF, WMF, HPF, VAECF, NMF, UserKNN,LightGCN
from cornac.models import NMF as CornacNMF 
from cornac.metrics import MAE, MSE, RMSE, Precision, Recall, NDCG, AUC, MAP, FMeasure, MRR
import numpy as np
import pickle
from collections import defaultdict
from scipy import stats
from numpy.linalg import norm
import time

from cornac.hyperopt import Discrete, Continuous, GridSearch, RandomSearch


In [6]:
df = pd.read_csv('preprocessed_book_ratings.csv')

# Rename columns to match Cornac expectations
df_cornac = df[['User-ID', 'ISBN', 'Book-Rating']].copy()
df_cornac.columns = ['user', 'item', 'rating']
df_cornac['rating'] = df_cornac['rating'].astype(np.float32)
df_cornac['user'] = df_cornac['user'].astype('object')

# Convert to list of tuples for Cornac
data = [(str(row['user']), row['item'], row['rating'])  for _, row in df_cornac.iterrows()]
# data = list(zip(df_cornac['user'], df_cornac['item'], df_cornac['rating']))
dataset = Dataset.from_uir(data)

# Create evaluation method with validation split
rs = RatioSplit(data=data,
                        test_size=0.2,   # 20% for testing
                        val_size=0.1,    # 10% for validation
                        rating_threshold=1,
                        verbose=True,
                        seed=123)


print(f"Train set size: {rs.train_size}")
print(f"Validation set size: {rs.val_size}")
print(f"Test set size: {rs.test_size}")

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 5417
Number of items = 5227
Number of ratings = 49434
Max rating = 10.0
Min rating = 1.0
Global mean = 7.8
---
Test data:
Number of users = 5417
Number of items = 5227
Number of ratings = 14098
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 5417
Number of items = 5227
Number of ratings = 7048
---
Total users = 5417
Total items = 5227
Train set size: 49434
Validation set size: 7063
Test set size: 14125


In [8]:

#Changing default vals to finetuned recommendations
'''
Best parameters for UserKNN:
{'amplify': 1.3695414455339152, 'k': np.int64(40), 'mean_centered': np.False_, 'similarity': np.str_('cosine')}
Successfully completed UserKNN evaluation

Best parameters for BPR:
{'k': 10, 'lambda_reg': 0.03438348381347186, 'learning_rate': 0.0074259722201483955, 'max_iter': 300}

Best parameters for MF:
{'k': np.int64(30), 'lambda_reg': 0.00190666813148965, 'learning_rate': 0.009597136051227133, 'max_iter': np.int64(100)}
Successfully completed MF evaluation

Best parameters for PMF:
{'k': np.int64(20), 'lambda_reg': 0.0006908111764347267, 'learning_rate': 0.0022911991489869415, 'max_iter': np.int64(100)}
Successfully completed PMF evaluation

Best parameters for NMF:
{'k': 25, 'lambda_u': 0.038555693363828886, 'lambda_v': 0.047334359075826865, 'learning_rate': 0.008796782420950295, 'max_iter': 100, 'use_bias': False}

Best parameters for WMF:
{'k': np.int64(70), 'lambda_u': 0.035934810705638136, 'lambda_v': 0.021995075383154462, 'learning_rate': 0.003608981847620878, 'max_iter': np.int64(100)}
Successfully completed WMF evaluation

Best parameters for HPF:
{'k': np.int64(30), 'max_iter': np.int64(150)}

best parameters for NeuMF_medium:
{'act_fn': np.str_('tanh'), 'batch_size': np.int64(256), 'lr': 0.0015020827412568593, 'num_epochs': np.int64(5), 'num_factors': np.int64(8), 'num_neg': np.int64(3)}
Successfully completed NeuMF_medium evaluation

Best parameters for VAECF_medium:
{'act_fn': np.str_('tanh'), 'batch_size': np.int64(100), 'beta': 1.2800277619120792, 'k': np.int64(10), 'learning_rate': 0.00345566571906583, 'likelihood': np.str_('mult'), 'n_epochs': np.int64(100)}
Successfully completed VAECF_medium evaluation


'''
               
models = [
    MostPop(),
    UserKNN(k=40, similarity='cosine', mean_centered=False, weighting=None, amplify=1.369, num_threads=0, trainable=True, verbose=True, seed=123),
    BPR(k=10, max_iter=300, learning_rate=0.007, lambda_reg=0.0343, seed=123),
    MF(k=30, max_iter=100, learning_rate=0.01, lambda_reg=0.002, seed=123),
    PMF(k=20, max_iter=100, learning_rate=0.002, lambda_reg=0.0006, seed=123),
    NMF(k=25, max_iter=100, learning_rate=0.008, lambda_u=0.06, lambda_v=0.0473, lambda_bu=0.02, lambda_bi=0.02, use_bias=False, verbose=True, seed=123),
    WMF(k=70, max_iter=100, learning_rate=0.003, lambda_u=0.0359, lambda_v=0.0219, verbose=True, seed=123),
    HPF(k=30, seed=123, hierarchical=False, name="PF"),
    NeuMF(num_factors=8, layers=[32, 16, 8], act_fn="tanh", num_epochs=5, num_neg=3, batch_size=256, lr=0.0015, seed=123, verbose=True, backend='pytorch'),
    VAECF(k=10, autoencoder_structure=[20], act_fn='tanh', likelihood='mult',n_epochs=100, batch_size=100, learning_rate=0.0034, beta=1.28, trainable=True, verbose=False, seed=123, use_gpu=True),
 
  ]


metrics = [
    MAE(), MSE(), RMSE(), AUC(), MAP(), MRR(),
    Precision(k=5), Precision(k=10), Precision(k=20), Precision(k=50),
    Recall(k=5), Recall(k=10), Recall(k=20), Recall(k=50),
    NDCG(k=5), NDCG(k=10), NDCG(k=20), NDCG(k=50),
    FMeasure(k=5), FMeasure(k=10), FMeasure(k=20), FMeasure(k=50)
]

In [9]:
exp = cornac.Experiment(eval_method=rs, models=models, metrics=metrics, user_based=True)
exp.run()


[MostPop] Training started!

[MostPop] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


[UserKNN] Training started!


  0%|          | 0/5417 [00:00<?, ?it/s]


[UserKNN] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


[BPR] Training started!

[BPR] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


[MF] Training started!

[MF] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


[PMF] Training started!

[PMF] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


[NMF] Training started!


  0%|          | 0/100 [00:00<?, ?it/s]

Optimization finished!

[NMF] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


[WMF] Training started!


E0000 00:00:1746018923.858353   57080 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746018923.892579   57080 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746018924.224954   57080 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746018924.224983   57080 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746018924.224985   57080 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746018924.224986   57080 computation_placer.cc:177] computation placer already registered. Please check linka

  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

[WMF] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


[PF] Training started!
Learning...
Learning completed!

[PF] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


[NeuMF] Training started!


  0%|          | 0/5 [00:00<?, ?it/s]


[NeuMF] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


[VAECF] Training started!

[VAECF] Evaluation started!


Rating:   0%|          | 0/14098 [00:00<?, ?it/s]

Ranking:   0%|          | 0/4577 [00:00<?, ?it/s]

Rating:   0%|          | 0/7048 [00:00<?, ?it/s]

Ranking:   0%|          | 0/3440 [00:00<?, ?it/s]


VALIDATION:
...
        |    MAE |     MSE |   RMSE |    AUC |  F1@10 |  F1@20 |   F1@5 |  F1@50 |    MAP |    MRR | NDCG@10 | NDCG@20 | NDCG@5 | NDCG@50 | Precision@10 | Precision@20 | Precision@5 | Precision@50 | Recall@10 | Recall@20 | Recall@5 | Recall@50 | Time (s)
------- + ------ + ------- + ------ + ------ + ------ + ------ + ------ + ------ + ------ + ------ + ------- + ------- + ------ + ------- + ------------ + ------------ + ----------- + ------------ + --------- + --------- + -------- + --------- + --------
MostPop | 2.6042 | 10.7763 | 2.7985 | 0.6548 | 0.0079 | 0.0079 | 0.0079 | 0.0069 | 0.0144 | 0.0257 |  0.0156 |  0.0214 | 0.0117 |  0.0323 |       0.0050 |       0.0045 |      0.0059 |       0.0037 |    0.0265 |    0.0456 |   0.0155 |    0.0924 |   6.8630
UserKNN | 1.3489 |  3.2279 | 1.4515 | 0.3861 | 0.0006 | 0.0010 | 0.0006 | 0.0010 | 0.0024 | 0.0043 |  0.0012 |  0.0024 | 0.0008 |  0.0044 |       0.0004 |       0.0005 |      0.0005 |       0.0005 |    0.0020 |    0.00

In [10]:
import pandas as pd
from collections import OrderedDict
from IPython.display import display

# Initialize list to store all model results
all_results = []

# Process each model's results
for model_result in exp.result:
    # Create a dictionary for the current model
    model_data = OrderedDict()
    model_data['Model'] = model_result.model_name
    
    # Add all metrics from metric_avg_results
    for metric_name, metric_value in model_result.metric_avg_results.items():
        model_data[metric_name] = metric_value
    
    all_results.append(model_data)

# Create DataFrame with consistent column order
metrics_order = [
    'MAE', 'MSE', 'RMSE', 'AUC',
    'F1@5', 'F1@10', 'F1@20', 'F1@50',
    'Precision@5', 'Precision@10', 'Precision@20', 'Precision@50',
    'Recall@5', 'Recall@10', 'Recall@20', 'Recall@50',
    'MAP', 'MRR', 
    'NDCG@5', 'NDCG@10', 'NDCG@20', 'NDCG@50',
    'Train (s)', 'Test (s)'
]

# Convert to DataFrame and reorder columns
results_df = pd.DataFrame(all_results)
results_df = results_df[['Model'] + metrics_order]

# Custom formatting for display
def format_metric(x):
    if isinstance(x, (int, float)):
        return f"{x:.4f}" if abs(x) < 100 else f"{x:.1f}"
    return x

styled_df = (results_df.style
             .format(format_metric)
             .background_gradient(subset=metrics_order, cmap='YlGnBu')
             .set_properties(**{'text-align': 'center'})
             .set_table_styles([{
                 'selector': 'th',
                 'props': [('background-color', '#40466e'), 
                           ('color', 'white'),
                           ('font-weight', 'bold')]
             }]))

display(styled_df)

Unnamed: 0,Model,MAE,MSE,RMSE,AUC,F1@5,F1@10,F1@20,F1@50,Precision@5,Precision@10,Precision@20,Precision@50,Recall@5,Recall@10,Recall@20,Recall@50,MAP,MRR,NDCG@5,NDCG@10,NDCG@20,NDCG@50,Train (s),Test (s)
0,MostPop,2.6249,10.9153,2.912,0.6617,0.0099,0.0112,0.0113,0.0098,0.0092,0.0081,0.007,0.0054,0.0152,0.0276,0.0474,0.0886,0.0158,0.0373,0.0145,0.0189,0.0252,0.0361,0.0442,9.544
1,UserKNN,1.3628,3.3024,1.5193,0.3922,0.0009,0.001,0.0011,0.0013,0.0007,0.0006,0.0007,0.0007,0.0016,0.0034,0.0056,0.0152,0.003,0.0053,0.0012,0.0018,0.0026,0.0049,0.7674,47.1325
2,BPR,6.6266,47.2126,6.7179,0.7206,0.0135,0.0139,0.0141,0.0123,0.0126,0.0104,0.0088,0.0068,0.0206,0.0326,0.0577,0.1097,0.0198,0.0438,0.0187,0.0229,0.0308,0.0448,1.6858,11.4104
3,MF,1.3035,2.9515,1.4465,0.5144,0.0011,0.0013,0.0016,0.0014,0.0012,0.001,0.001,0.0008,0.0012,0.0023,0.0054,0.0101,0.0025,0.0062,0.0014,0.0017,0.0027,0.0041,0.2245,12.5324
4,PMF,1.539,3.8373,1.6935,0.5587,0.0026,0.0029,0.0027,0.0024,0.0021,0.002,0.0016,0.0013,0.0046,0.0084,0.0128,0.0236,0.0049,0.0096,0.0034,0.0049,0.0063,0.0091,2.2045,13.335
5,NMF,1.5507,3.8338,1.6988,0.5389,0.0006,0.0007,0.0008,0.0011,0.0005,0.0005,0.0005,0.0006,0.0008,0.0015,0.0032,0.0109,0.0021,0.0039,0.0005,0.0008,0.0014,0.0033,0.6738,11.7512
6,WMF,5.9878,40.1785,6.1364,0.6612,0.0289,0.0257,0.0212,0.0146,0.0248,0.0181,0.013,0.008,0.0464,0.064,0.0892,0.1348,0.0375,0.076,0.0421,0.0483,0.0565,0.0687,54.0552,13.3873
7,PF,6.7559,48.9375,6.8452,0.6526,0.0113,0.0109,0.0103,0.0083,0.0103,0.008,0.0065,0.0046,0.0175,0.0269,0.0425,0.0738,0.0153,0.034,0.0156,0.0188,0.0239,0.0323,31.932,13.3132
8,NeuMF,6.7637,49.0436,6.8523,0.7057,0.0115,0.013,0.0125,0.0112,0.0105,0.0094,0.0078,0.0062,0.0177,0.0315,0.051,0.1029,0.0183,0.0411,0.0163,0.0215,0.0279,0.0415,28.4499,18.9168
9,VAECF,6.7637,49.0436,6.8523,0.7704,0.0174,0.0185,0.0175,0.0143,0.0153,0.0132,0.0109,0.0079,0.0282,0.047,0.074,0.1336,0.0259,0.0538,0.0245,0.0314,0.0402,0.0559,20.6193,19.5055


In [11]:
results_df.to_csv('finetuned_results.csv', index=False)

In [None]:
# Create directory for saved models
os.makedirs("models_finetuned", exist_ok=True)

# Save each model
for model in exp.models:
    model_name = model.name if hasattr(model, 'name') else model.__class__.__name__
    
    # Method 1: Using Cornac's native save (if available)
    if hasattr(model, 'save'):
        model.save(f"models_finetuned/{model_name}")
        print(f"Saved {model_name} using Cornac's native save()")
    
    

In [18]:
#NEXT: GET TOP N, SAVE THEM, THEN WE WILL ANALYSE THEM!!!!1 MANEL'S DIVERSITY ACCURACY TRADEOFF PAPER

In [16]:
#NOTE: we saw above that cornac makes sure that all users are present in train, valid and test set. So it doesn't matter where you generate recs from
def get_top_n(algo_name, n=10):
    top_n = defaultdict(list)
    
    for model in exp.models:
        if model.name == algo_name:
            print(f"{model.name} model is selected:")
            for uid in model.train_set.uid_map.values():
                user_id = list(model.train_set.user_ids)[uid]
                try:
                    item_rank = model.rank(user_idx=uid)[0]  # model.rank: item rank, item_score
                except:
                    item_rank = model.rank(user_idx=int(uid))[0]
                
                # Collect top N items
                item_rank_top = item_rank[:n]
                for iid in item_rank_top:
                    item_id = list(model.train_set.item_ids)[iid]
                    
                    # Check if item_id is an integer (it might be an ISBN as string)
                    try:
                        item_id = str(item_id)  # Treat ISBNs or other IDs as strings
                        top_n[int(user_id)].append((item_id, model.score(uid, iid)))  # Use item_id as string
                    except ValueError:
                        # Handle case where item_id cannot be converted to int
                        top_n[int(user_id)].append((item_id, model.score(uid, iid)))
    
    return top_n

In [17]:
# Generate and save recommendations for each model
model_names = ["MostPop", "UserKNN", "BPR", "MF", "PMF","NMF", "WMF", "PF", "NeuMF", "VAECF"]     

for name in model_names:
    recommendations = get_top_n(name, n=10)
    print('Recommendations generated for:', name)
    # Save to CSV
    data = []
    for user_id, items in recommendations.items():
        for item_id, score in items:
            data.append([user_id, item_id, float(score)])
    
    df = pd.DataFrame(data, columns=["user_id", "item_id", "score"])
    df.to_csv(f"recommendations_finetuned_{name}.csv", index=False)
    print(f"Saved recommendations for {name} to recommendations_{name}.csv")

MostPop model is selected:
Recommendations generated for: MostPop
Saved recommendations for MostPop to recommendations_MostPop.csv
UserKNN model is selected:
Recommendations generated for: UserKNN
Saved recommendations for UserKNN to recommendations_UserKNN.csv
BPR model is selected:
Recommendations generated for: BPR
Saved recommendations for BPR to recommendations_BPR.csv
MF model is selected:
Recommendations generated for: MF
Saved recommendations for MF to recommendations_MF.csv
PMF model is selected:
Recommendations generated for: PMF
Saved recommendations for PMF to recommendations_PMF.csv
NMF model is selected:
Recommendations generated for: NMF
Saved recommendations for NMF to recommendations_NMF.csv
WMF model is selected:
Recommendations generated for: WMF
Saved recommendations for WMF to recommendations_WMF.csv
PF model is selected:
Recommendations generated for: PF
Saved recommendations for PF to recommendations_PF.csv
NeuMF model is selected:
Recommendations generated for: 

  data.append([user_id, item_id, float(score)])


Saved recommendations for NeuMF to recommendations_NeuMF.csv
VAECF model is selected:
Recommendations generated for: VAECF
Saved recommendations for VAECF to recommendations_VAECF.csv


In [24]:
pd.read_csv('recommendations_finetuned_PMF.csv')

Unnamed: 0,user_id,item_id,score
0,232052,039480001X,9.606973
1,232052,0553148001,9.571980
2,232052,0064401871,9.524368
3,232052,0060256672,9.495006
4,232052,0140434003,9.466053
...,...,...,...
54165,196085,0061054143,5.719382
54166,196085,1400033543,5.713003
54167,196085,1573225126,5.712014
54168,196085,042516876X,5.711779


In [25]:
import random as rd

def get_top_n_random(n=10):
    print("Random model is selected:")
    top_n = defaultdict(list)

    # Get list of internal item IDs
    all_iids = list(rs.train_set.iid_map.values())

    for uid in rs.train_set.uid_map.values():
        user_id = list(rs.train_set.user_ids)[uid]

        # Get n unique random items
        random_items = rd.sample(all_iids, n)

        for iid in random_items:
            item_id = list(rs.train_set.item_ids)[iid]
            top_n[user_id].append((item_id, 1.0))

    return top_n
n=get_top_n_random(n=10)

Random model is selected:


In [26]:
data=[]
for user_id, items in n.items():
    for item_id, score in items:
        data.append([user_id, item_id, float(score)])
    
df = pd.DataFrame(data, columns=["user_id", "item_id", "score"])
# df.to_csv("recommendations_random.csv", index=False)
print(f"Saved recommendations to recommendations_finetuned_random.csv")

Saved recommendations to recommendations_finetuned_random.csv


In [28]:
pd.read_csv('recommendations_finetuned_random.csv')

Unnamed: 0,user_id,item_id,score
0,232052,0671640127,1.0
1,232052,0553274465,1.0
2,232052,0451147960,1.0
3,232052,0441135560,1.0
4,232052,0312924801,1.0
...,...,...,...
54165,196085,0449223604,1.0
54166,196085,0385425473,1.0
54167,196085,0425119653,1.0
54168,196085,0312982518,1.0
