In [1]:
# Change the working directory two levels up from the notebook's location
# (presumably to the repository root, so that the `src.*` imports in the next
# cell resolve — confirm if the notebook is ever moved).
%cd ../..

/mnt/nfs/scratch1/rbialik/adversarial-recommendation-systems


In [49]:
import surprise
import numpy as np
import pandas as pd
from scipy import sparse
import src.cf.experiment_core as cf_core

from src.cf.utils.timer import Timer

from src.cf.utils.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_PREDICTION_COL
)

from src.cf.evaluation.eval_metrics_numba import (
    rmse,
    mae,
    precision_at_k,
    recall_at_k
)

from src.models.surprise_cf import * 
from src.models.cf_utils import refine_ratings

In [3]:
# Directory holding the pre-generated neighbour arrays (.npy files).
# NOTE(review): absolute, user-specific path; consider making it configurable.
_MODEL_PARAMS_DIR = '/mnt/nfs/scratch1/neerajsharma/model_params'

# Neighbours from the "without_reviews" variant.
generated_users_file = f'{_MODEL_PARAMS_DIR}/generated_1000_user_neighbors_without_reviews_more_sparse.npy'
generated_items_file = f'{_MODEL_PARAMS_DIR}/generated_1000_item_neighbors_without_reviews_more_sparse.npy'

# Neighbours from the variant whose file names carry no "without_reviews" tag.
reviews_generated_users_file = f'{_MODEL_PARAMS_DIR}/generated_1000_user_neighbors_more_sparse.npy'
reviews_generated_items_file = f'{_MODEL_PARAMS_DIR}/generated_1000_item_neighbors_more_sparse.npy'

In [18]:
# Switches for the three experiment variants; the guarded cells below only
# build and train a variant when its flag is True.
vanilla, aug, reviews_aug = (
    True,  # original ratings only
    True,  # ratings augmented with the "without_reviews" generated neighbours
    True,  # ratings augmented with the "(+review)" generated neighbours
)

In [4]:
# Load the masked and unmasked rating matrices plus the kept item indices.
masked_R_coo, unmasked_R_coo, keep_item_idxs = cf_core.get_data_from_dataloader()

# Entries on which the masked and unmasked matrices differ (logical XOR),
# stored as a COO matrix.
held_out_entries = cf_core.logical_xor(masked_R_coo, unmasked_R_coo)
mask_coo = sparse.coo_matrix(held_out_entries)

# Stored entries per row of the masked matrix; a row counts as "warm" when it
# has more than two of them.
nnzs = masked_R_coo.getnnz(axis=1)
warm_users = nnzs > 2

loading the data... took 6.917295959079638 seconds for loading the dataset.


In [7]:
# Positional arguments shared by both augmentation calls; only the two
# generated-neighbour file paths differ between the variants.
_aug_base_args = (masked_R_coo, unmasked_R_coo, keep_item_idxs, mask_coo, warm_users)

# Augmentation with the "without_reviews" generated neighbours.
(
    aug_masked_R_coo,
    aug_unmasked_R_coo,
    aug_mask_coo,
    generated_users,
    generated_items,
) = cf_core.make_aug_data(*_aug_base_args, generated_users_file, generated_items_file)

# Augmentation with the second (reviews) set of generated neighbours.
(
    reviews_aug_masked_R_coo,
    reviews_aug_unmasked_R_coo,
    reviews_aug_mask_coo,
    reviews_generated_users,
    reviews_generated_items,
) = cf_core.make_aug_data(*_aug_base_args, reviews_generated_users_file, reviews_generated_items_file)

In [14]:
def _prepare_eval_targets(label, masked, unmasked, mask):
    """Print `label`, then derive the evaluation targets for one data variant.

    Returns a 4-tuple:
      * the mask converted to CSR,
      * the element-wise product of `unmasked` and `mask`,
      * that product wrapped as a COO matrix,
      * the result of `cf_core.only_cold_start` on (`masked`, COO product,
        `warm_users`).

    NOTE(review): `warm_users` is the module-level array computed from the
    un-augmented masked matrix; it is reused unchanged for the augmented
    variants, which report more users — confirm `only_cold_start` handles the
    extra rows as intended.
    """
    print(label)
    mask_as_csr = mask.tocsr()
    vals_csr = unmasked.multiply(mask)
    vals_coo = sparse.coo_matrix(vals_csr)
    cold_coo = cf_core.only_cold_start(masked, vals_coo, warm_users)
    return mask_as_csr, vals_csr, vals_coo, cold_coo


(mask_csr,
 unmasked_vals_csr,
 unmasked_vals_coo,
 unmasked_cold_coo) = _prepare_eval_targets(
    "\n vanilla data", masked_R_coo, unmasked_R_coo, mask_coo)

(aug_mask_csr,
 aug_unmasked_vals_csr,
 aug_unmasked_vals_coo,
 aug_unmasked_cold_coo) = _prepare_eval_targets(
    "\n augmented data", aug_masked_R_coo, aug_unmasked_R_coo, aug_mask_coo)

(reviews_aug_mask_csr,
 reviews_aug_unmasked_vals_csr,
 reviews_aug_unmasked_vals_coo,
 reviews_aug_unmasked_cold_coo) = _prepare_eval_targets(
    "\n augmented (+review) data",
    reviews_aug_masked_R_coo, reviews_aug_unmasked_R_coo, reviews_aug_mask_coo)


 vanilla data
num users total =  62926
num cold start users =  35946

 augmented data
num users total =  63926
num cold start users =  36946

 augmented (+review) data
num users total =  63926
num cold start users =  36946


In [19]:
# Build the train / test / cold-start test sets for each enabled variant.
if vanilla:
    _vanilla_splits = setup(masked_R_coo, unmasked_vals_coo, unmasked_cold_coo)
    trainset, testset, cold_testset = _vanilla_splits
if aug:
    _aug_splits = setup(aug_masked_R_coo, aug_unmasked_vals_coo, aug_unmasked_cold_coo)
    aug_trainset, aug_testset, aug_cold_testset = _aug_splits
if reviews_aug:
    _reviews_aug_splits = setup(
        reviews_aug_masked_R_coo,
        reviews_aug_unmasked_vals_coo,
        reviews_aug_unmasked_cold_coo,
    )
    reviews_aug_trainset, reviews_aug_testset, reviews_aug_cold_testset = _reviews_aug_splits


make train and test sets...done in 4 seconds
make train and test sets...done in 8 seconds
make train and test sets...done in 23 seconds


In [21]:
if vanilla:
    # SVD model on the un-augmented data.
    SVD_vanilla = Model(
        name='SVD',
        algo=SVD(verbose=False),
        ks=5,
        ground_truth=unmasked_vals_coo,
        mask=mask_coo,
        ground_truth_cold=unmasked_cold_coo,
    )

    SVD_vanilla.train(trainset)
    SVD_vanilla.predict(testset, cold_testset)
    # The training set's global mean is handed to the prediction-matrix builder.
    SVD_vanilla.get_diy_predictions(trainset.global_mean)

training  SVD ... done in  13 seconds


In [26]:
if aug:
    # SVD model on the data augmented with the "without_reviews" neighbours.
    SVD_aug = Model(name='SVD_aug', algo=SVD(verbose=False), ks=5,
                    ground_truth=aug_unmasked_vals_coo, mask=aug_mask_coo,
                    ground_truth_cold=aug_unmasked_cold_coo)

    SVD_aug.train(aug_trainset)
    SVD_aug.predict(aug_testset, aug_cold_testset)
    SVD_aug.get_diy_predictions(aug_trainset.global_mean)

    # Post-process the prediction matrix using the generated users / items.
    # The trailing 0.5 is passed positionally; its meaning is defined by
    # refine_ratings (TODO: name it in a config constant).
    _aug_user_ratings, _aug_item_ratings = aug_trainset.ur, aug_trainset.ir
    SVD_aug.refined_predictions = refine_ratings(
        _aug_user_ratings, _aug_item_ratings,
        SVD_aug.full_prediction_matrix,
        generated_users, generated_items,
        0.5,
    )

training  SVD_aug ... done in  301 seconds
refining...done in 17 seconds


In [28]:
if reviews_aug:
    # SVD model on the data augmented with the "(+review)" neighbours.
    SVD_aug_reviews = Model(name='SVD_reviews_aug', algo=SVD(verbose=False), ks=5,
                            ground_truth=reviews_aug_unmasked_vals_coo,
                            mask=reviews_aug_mask_coo,
                            ground_truth_cold=reviews_aug_unmasked_cold_coo)

    SVD_aug_reviews.train(reviews_aug_trainset)
    SVD_aug_reviews.predict(reviews_aug_testset, reviews_aug_cold_testset)
    SVD_aug_reviews.get_diy_predictions(reviews_aug_trainset.global_mean)

    # Post-process the prediction matrix using the generated users / items.
    # The trailing 0.5 is passed positionally; its meaning is defined by
    # refine_ratings (TODO: name it in a config constant).
    _reviews_user_ratings = reviews_aug_trainset.ur
    _reviews_item_ratings = reviews_aug_trainset.ir
    SVD_aug_reviews.refined_predictions = refine_ratings(
        _reviews_user_ratings, _reviews_item_ratings,
        SVD_aug_reviews.full_prediction_matrix,
        reviews_generated_users, reviews_generated_items,
        0.5,
    )

training  SVD_reviews_aug ... done in  781 seconds
refining...done in 18 seconds


In [43]:
# Collect the models that were actually trained (one per enabled flag).
models = []
if vanilla:
    models.append(SVD_vanilla)
if aug:
    models.append(SVD_aug)
if reviews_aug:
    models.append(SVD_aug_reviews)

# Attach RMSE / MAE (and, for augmented models, their "refined" counterparts)
# to each model object.
for model in models:
    print(model.name)
    # Densify the sparse ground truth once per model instead of once per
    # metric call: the conversion allocates a full users x items array, so
    # repeating it up to four times per model is pure overhead.
    ground_truth_dense = model.ground_truth.toarray()
    # Only the augmented models carry `refined_predictions`.
    # NOTE(review): in the recorded run the refined and unrefined metrics are
    # identical to every printed digit — verify that refine_ratings does not
    # simply return (or mutate in place) full_prediction_matrix.
    if 'aug' in model.name:
        model.refined_rmse = rmse(ground_truth_dense, model.refined_predictions)
        model.refined_mae = mae(ground_truth_dense, model.refined_predictions)
    model.rmse = rmse(ground_truth_dense, model.full_prediction_matrix)
    model.mae = mae(ground_truth_dense, model.full_prediction_matrix)
    # Release the dense copy before the next model's copy is allocated.
    del ground_truth_dense

SVD
SVD_aug
SVD_reviews_aug


In [53]:
# Build one (label, [MAE, RMSE]) row per model, followed by an extra "refined"
# row for every augmented model, then print a GitHub-flavoured table.
_table_rows = []
for model in models:
    _table_rows.append((model.name, [model.mae, model.rmse]))
    if 'aug' in model.name:
        _table_rows.append(
            ('refined ' + model.name, [model.refined_mae, model.refined_rmse])
        )

labels = [row_label for row_label, _ in _table_rows]
errors = [row_scores for _, row_scores in _table_rows]
error_labels = ['all_users', 'MAE', 'RMSE']
tab_data = [[row_label, *row_scores] for row_label, row_scores in zip(labels, errors)]
print(tabulate(tab_data, headers=error_labels, tablefmt="github"))

| all_users               |      MAE |    RMSE |
|-------------------------|----------|---------|
| SVD                     | 0.739702 | 1.02192 |
| SVD_aug                 | 0.806632 | 1.09564 |
| refined SVD_aug         | 0.806632 | 1.09564 |
| SVD_reviews_aug         | 0.75122  | 1.10723 |
| refined SVD_reviews_aug | 0.75122  | 1.10723 |


In [44]:
# Convert each Surprise trainset to a DataFrame and pull out its user / item
# ID columns as NumPy arrays.
#
# Each conversion is guarded by the same flag that guards creation of the
# corresponding trainset, so disabling a variant no longer raises NameError
# here on the trainset that was never built.
if vanilla:
    trainset_df = cf_core.surprise_trainset_to_df(trainset)
    trainset_uid, trainset_iid = trainset_df['userID'].to_numpy(), trainset_df['itemID'].to_numpy()

if aug:
    aug_trainset_df = cf_core.surprise_trainset_to_df(aug_trainset)
    aug_trainset_uid, aug_trainset_iid = aug_trainset_df['userID'].to_numpy(), aug_trainset_df['itemID'].to_numpy()

if reviews_aug:
    reviews_aug_trainset_df = cf_core.surprise_trainset_to_df(reviews_aug_trainset)
    reviews_aug_trainset_uid, reviews_aug_trainset_iid = reviews_aug_trainset_df['userID'].to_numpy(), reviews_aug_trainset_df['itemID'].to_numpy()

In [12]:
# Report Precision@k and Recall@k for k = 1..15.
#
# NOTE(review): `groundtruth` and `refined_predictions` are not assigned in any
# cell visible in this notebook (this cell's execution count predates the
# cells above), so a fresh Restart & Run All will likely fail here. Confirm
# which model's ground truth / predictions are intended, and whether the
# vanilla `trainset_uid` / `trainset_iid` are the right IDs to pair with them.
_top_k_options = dict(relevancy_method='top_k', threshold=0)

for k in range(1, 16):
    _ranking_args = (groundtruth, refined_predictions, trainset_uid, trainset_iid)
    eval_precision = precision_at_k(*_ranking_args, k=k, **_top_k_options)
    eval_recall = recall_at_k(*_ranking_args, k=k, **_top_k_options)

    _report_lines = (
        f"Precision@{k}:{eval_precision:.6f}",
        f"Recall@{k}:{eval_recall:.6f}",
    )
    print("\n".join(_report_lines))
    print()

Precision@1:0.000344
Recall@1:0.000105

Precision@2:0.000336
Recall@2:0.000210

Precision@3:0.000365
Recall@3:0.000306

Precision@4:0.000438
Recall@4:0.000473

Precision@5:0.000451
Recall@5:0.000626

Precision@6:0.000433
Recall@6:0.000750

Precision@7:0.000445
Recall@7:0.000878

Precision@8:0.000438
Recall@8:0.000995

Precision@9:0.000429
Recall@9:0.001125

Precision@10:0.000430
Recall@10:0.001256

Precision@11:0.000420
Recall@11:0.001319

Precision@12:0.000405
Recall@12:0.001377

Precision@13:0.000400
Recall@13:0.001466

Precision@14:0.000398
Recall@14:0.001576

Precision@15:0.000393
Recall@15:0.001666

