In [1]:
# Change the working directory two levels up from the notebook's location so
# the `src.*` imports in the next cell can be resolved (presumably this lands
# on the repository root -- confirm against your checkout).
# NOTE(review): the path is relative, so running this cell a second time in
# the same kernel moves up two more levels; restart the kernel before re-running.
%cd ../..

/home/tianyiyang/git/adversarial-recommendation-systems


In [2]:
import surprise
import numpy as np
import pandas as pd
from scipy import sparse
import src.cf.experiment_core as cf_core

from src.cf.utils.timer import Timer

from src.cf.utils.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_PREDICTION_COL
)

from src.cf.evaluation.eval_metrics_numba import (
    rmse,
    mae,
    precision_at_k,
    recall_at_k
)

from src.models.surprise_cf import * 
from src.models.cf_utils import refine_ratings

In [3]:
# Augmentation switch: the string 'yes' enables loading of the generated
# neighbour arrays further down; any other value skips augmentation.
aug = 'yes'

# Locations of the pre-generated user / item neighbour arrays (.npy files).
# NOTE(review): absolute, machine-specific paths -- adjust for your environment.
generated_users_file = '/mnt/nfs/scratch1/neerajsharma/model_params/generated_1000_user_neighbors_without_reviews_more_sparse.npy'
generated_items_file = '/mnt/nfs/scratch1/neerajsharma/model_params/generated_1000_item_neighbors_without_reviews_more_sparse.npy'

In [4]:
# Pull the masked / unmasked rating matrices and the kept item indices from
# the project's data loader.
masked_R_coo, unmasked_R_coo, keep_item_idxs = cf_core.get_data_from_dataloader()

# Sparse mask built from cf_core.logical_xor of the two matrices
# (presumably marks the held-out entries -- confirm in cf_core).
xor_of_ratings = cf_core.logical_xor(masked_R_coo, unmasked_R_coo)
mask_coo = sparse.coo_matrix(xor_of_ratings)

# Number of stored entries per row of the masked matrix; rows holding more
# than two entries are flagged as "warm".
nnzs = masked_R_coo.getnnz(axis=1)
warm_users = nnzs > 2

loading the data... took 7.8991184970363975 seconds for loading the dataset.


In [5]:
# Optionally augment the rating matrices with the generated user/item vectors.
#
# `aug` arrives as the string flag from the config cell ('yes' enables
# augmentation) and is rebound to a bool at the end of this cell.  The bool
# form is accepted here too, so re-running the cell no longer silently drops
# into the `else` branch and replaces the generated data with None.
if aug == 'yes' or aug is True:
    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects and
    # can execute code on load -- only point these paths at trusted files.
    generated_users = np.load(generated_users_file, allow_pickle=True).item()
    generated_items = np.load(generated_items_file, allow_pickle=True).item()
    if not generated_users or not generated_items:
        raise ValueError("generated user/item files must each contain at least one entry")

    # Restrict every generated user block to the item columns kept by the loader.
    for key, value in generated_users.items():
        generated_users[key] = value[:, keep_item_idxs]

    num_user_ids = len(generated_users)
    num_item_ids = len(generated_items)
    # Every entry is a 2-D block (neighbours per id, vector length); the first
    # entry's shape is used for all of them.
    user_neighbor_per_id, user_neighbor_dim = next(iter(generated_users.values())).shape
    item_neighbor_per_id, item_neighbor_dim = next(iter(generated_items.values())).shape
    num_generated_users = num_user_ids * user_neighbor_per_id
    num_generated_items = num_item_ids * item_neighbor_per_id

    # Validate both stacking operations before touching any matrix, so a
    # mismatch (e.g. this cell being run twice on already-augmented matrices)
    # fails loudly without leaving half-augmented state behind.
    base_num_users, base_num_items = masked_R_coo.shape
    if user_neighbor_dim != base_num_items or item_neighbor_dim != base_num_users:
        raise ValueError(
            "generated vectors do not match the rating matrix: "
            f"user vectors have {user_neighbor_dim} items (expected {base_num_items}), "
            f"item vectors have {item_neighbor_dim} users (expected {base_num_users}). "
            "If this cell was already executed, re-run the data-loading cell first."
        )

    # --- generated users: appended as extra rows ---------------------------
    generated_users_vectors = np.array(list(generated_users.values())).reshape(
        num_generated_users, user_neighbor_dim
    )
    generated_users_coo = sparse.coo_matrix(generated_users_vectors)
    # All-False mask rows for the generated users, built directly as an empty
    # sparse matrix instead of allocating a dense zero array first.
    false_coo = sparse.coo_matrix(generated_users_vectors.shape, dtype=bool)
    masked_R_coo = sparse.vstack([masked_R_coo, generated_users_coo])
    unmasked_R_coo = sparse.vstack([unmasked_R_coo, generated_users_coo])
    mask_coo = sparse.vstack([mask_coo, false_coo])

    # --- generated items: appended as extra columns ------------------------
    generated_items_vectors = np.array(list(generated_items.values())).reshape(
        num_generated_items, item_neighbor_dim
    )
    # Zero-pad so each generated item also has an entry for every generated
    # user row that was just appended above.
    filler = np.zeros((num_generated_items, num_generated_users))
    generated_items_vectors = np.concatenate((generated_items_vectors, filler), axis=1)
    generated_items_coo = sparse.coo_matrix(generated_items_vectors.T)
    # All-False mask columns for the generated items (empty sparse matrix).
    false_coo = sparse.coo_matrix(generated_items_coo.shape, dtype=bool)

    masked_R_coo = sparse.hstack([masked_R_coo, generated_items_coo])
    unmasked_R_coo = sparse.hstack([unmasked_R_coo, generated_items_coo])
    mask_coo = sparse.hstack([mask_coo, false_coo])
    aug = True

else:
    aug = False
    generated_users, generated_items = None, None

In [6]:
# Keep only the unmasked ratings at positions selected by the mask
# (element-wise product), then convert the result to COO format.
unmasked_vals_csr = unmasked_R_coo.multiply(mask_coo)
unmasked_vals_coo = sparse.coo_matrix(unmasked_vals_csr)

# Restrict those values via cf_core.only_cold_start using the warm-user flags.
unmasked_cold_coo = cf_core.only_cold_start(
    masked_R_coo,
    unmasked_vals_coo,
    warm_users,
)

# CSR view of the mask.
mask_csr = mask_coo.tocsr()

num users total =  63926
num cold start users =  36946


In [7]:
# Build the train / test / cold-start test sets from the sparse matrices.
trainset, testset, cold_testset = setup(
    masked_R_coo,
    unmasked_vals_coo,
    unmasked_cold_coo,
)

# Wrap an SVD algorithm in the project's Model helper together with the
# ground-truth matrices and the mask.
model = Model(
    name='SVD',
    algo=SVD(verbose=False),
    ks=5,
    ground_truth=unmasked_vals_coo,
    mask=mask_coo,
    ground_truth_cold=unmasked_cold_coo,
)

make train and test sets...done in 27 seconds


In [8]:
# Fit on the training set, then score the regular and cold-start test sets.
model.train(trainset)
model.predict(testset, cold_testset)

# Produce the model's own predictions, seeded with the trainset global mean
# (presumably this fills model.full_prediction_matrix -- confirm in Model).
model.get_diy_predictions(trainset.global_mean)

# Refine the full prediction matrix with the generated users/items; the
# trailing 0.5 is forwarded positionally to refine_ratings.
model.refined_predictions = refine_ratings(
    trainset.ur,
    trainset.ir,
    model.full_prediction_matrix,
    generated_users,
    generated_items,
    .5,
)

training  SVD ... done in  800 seconds
refining...done in 30 seconds


In [9]:
# Inputs for the metric functions: the refined predictions as stored on the
# model, and the ground truth densified from its sparse form.
refined_predictions = model.refined_predictions
groundtruth = model.ground_truth.toarray()

In [10]:
# Flatten the Surprise trainset into a DataFrame and pull out the user / item
# id columns as NumPy arrays for the metric functions.
trainset_df = cf_core.surprise_trainset_to_df(trainset)
trainset_uid = trainset_df['userID'].to_numpy()
trainset_iid = trainset_df['itemID'].to_numpy()

In [12]:
# Report precision@k and recall@k on the refined predictions for k = 1..15.
# Both metrics take the same positional inputs, so bundle them once.
metric_inputs = (groundtruth, refined_predictions, trainset_uid, trainset_iid)

for k in range(1, 16):
    eval_precision = precision_at_k(
        *metric_inputs, relevancy_method='top_k', k=k, threshold=0
    )
    eval_recall = recall_at_k(
        *metric_inputs, relevancy_method='top_k', k=k, threshold=0
    )

    # One metric per line, followed by a blank separator line.
    print(f"Precision@{k}:{eval_precision:.6f}",
          f"Recall@{k}:{eval_recall:.6f}", sep='\n')
    print()

Precision@1:0.000360
Recall@1:0.000122

Precision@2:0.000321
Recall@2:0.000211

Precision@3:0.000339
Recall@3:0.000297

Precision@4:0.000422
Recall@4:0.000490

Precision@5:0.000454
Recall@5:0.000612

Precision@6:0.000467
Recall@6:0.000760

Precision@7:0.000476
Recall@7:0.000894

Precision@8:0.000448
Recall@8:0.000969

Precision@9:0.000440
Recall@9:0.001057

Precision@10:0.000449
Recall@10:0.001223

Precision@11:0.000448
Recall@11:0.001382

Precision@12:0.000450
Recall@12:0.001530

Precision@13:0.000445
Recall@13:0.001633

Precision@14:0.000445
Recall@14:0.001765

Precision@15:0.000438
Recall@15:0.001859

