In [1]:
# Change the working directory two levels up (the repository root, per the cell output)
# so that the `src.*` imports in the next cell resolve.
%cd ../..

/home/tianyiyang/git/adversarial-recommendation-systems


In [2]:
import surprise
import numpy as np
import pandas as pd
from scipy import sparse
import src.cf.experiment_core as cf_core

from src.cf.utils.timer import Timer

from src.cf.utils.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_PREDICTION_COL
)

from src.cf.evaluation.eval_metrics_numba import (
    rmse,
    mae,
    precision_at_k,
    recall_at_k
)

from src.cf.utils.numba_utils import (
    getitem_by_row_col
)

from numba import vectorize, guvectorize

# Data Loading

In [4]:
# Load the two rating matrices from the project dataloader.
# presumably masked_R has the held-out ratings removed and unmasked_R is the full matrix — confirm in cf_core
masked_R, unmasked_R = cf_core.get_data_from_dataloader()
# Positions where the two matrices differ (looks like an element-wise XOR of their entries — confirm
# against cf_core.logical_xor); used below to select the held-out ratings.
mask = sparse.coo_matrix(cf_core.logical_xor(masked_R, unmasked_R))

# Number of stored entries per row of masked_R (rows are presumably users).
nnzs = masked_R.getnnz(axis=1)
# Rows with more than 2 stored entries are flagged as "warm"; the threshold 2 is hard-coded here.
warm_users = nnzs > 2

mask_csr = mask.tocsr()  # CSR copy of the mask; not referenced again in the visible cells
# Values of unmasked_R at the mask positions only.
unmasked_vals = sparse.coo_matrix(unmasked_R.multiply(mask))
# Held-out values restricted to cold-start users (semantics defined by cf_core.only_cold_start).
unmasked_cold = cf_core.only_cold_start(masked_R, unmasked_vals, warm_users)

# Build the Surprise train set and the full / cold-start test sets.
trainset, testset, cold_testset = cf_core.setup(masked_R, unmasked_vals, unmasked_cold)

loading the data... took 6.858623347710818 seconds for loading the dataset.
num users total =  62926
num cold start users =  35946
make train and test sets... `Setup` took 1.815666910726577 seconds.


In [5]:
# Convert the Surprise train set and both test sets to pandas DataFrames
trainset_df = cf_core.surprise_trainset_to_df(trainset)
testset_df = cf_core.surprise_testset_to_df(testset)
cold_testset_df = cf_core.surprise_testset_to_df(cold_testset)

# Training

In [6]:
# Train a Surprise SVD model: 200 latent factors, 30 epochs, fixed random_state for reproducibility
svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=False)
# Timer is the project's timing context manager; its `interval` attribute is reported below as seconds.
with Timer() as train_time:
    svd.fit(trainset)

print("Took {} seconds for training.".format(train_time.interval))

Took 31.745008911006153 seconds for training.


# Prediction

In [7]:
def predict(algo: surprise.prediction_algorithms.algo_base.AlgoBase, data: pd.DataFrame,
            usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, predcol=DEFAULT_PREDICTION_COL) -> pd.DataFrame:
    """Computes predictions of an algorithm from Surprise on the data. Can be used for computing rating metrics like RMSE.

    One prediction is made per row of ``data`` by calling ``algo.predict`` with the
    row's user and item ids.

    Args:
        algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
        data (pd.DataFrame): the data on which to predict
        usercol (str): name of the user column
        itemcol (str): name of the item column
        predcol (str): name of the prediction column in the returned dataframe

    Returns:
        pd.DataFrame: dataframe with usercol, itemcol, predcol (one row per row of ``data``;
        the index holds the row positions as strings). Empty when ``data`` has no rows.
    """
    # Read the id columns directly rather than via ``itertuples`` + ``getattr``:
    # itertuples renames fields whose column label is not a valid Python identifier,
    # which would make the attribute lookup fail for such column names.
    predictions = [
        algo.predict(uid, iid)
        for uid, iid in zip(data[usercol], data[itemcol])
    ]
    if not predictions:
        # An empty list yields a frame without the "details"/"r_ui" columns, so the
        # drop below would raise; return an empty frame with the expected columns.
        return pd.DataFrame(columns=[usercol, itemcol, predcol])

    predictions = pd.DataFrame(predictions)
    predictions = predictions.rename(
        index=str, columns={"uid": usercol, "iid": itemcol, "est": predcol}
    )
    # Keep only the ids and the estimate.
    return predictions.drop(["details", "r_ui"], axis="columns")

def compute_ranking_predictions(algo: surprise.prediction_algorithms.algo_base.AlgoBase, data: pd.DataFrame,
                                usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, predcol=DEFAULT_PREDICTION_COL,
                                remove_seen=False) -> np.ndarray:
    """Computes the scores of a fitted Surprise matrix-factorization model for every
    combination of the users and items found in data. It can be used for computing
    ranking metrics like NDCG.

    The score of user u for item i is ``global_mean + bu[u] + bi[i] + pu[u] . qi[i]``,
    read from the model's ``pu``, ``qi``, ``bu`` and ``bi`` attributes. Scores are not
    clipped to the rating scale.

    Args:
        algo (surprise.prediction_algorithms.algo_base.AlgoBase): a fitted algorithm from
            Surprise exposing ``pu``, ``qi``, ``bu``, ``bi`` and ``trainset`` (e.g. SVD)
        data (pd.DataFrame): the data from which to get the users and items
        usercol (str): name of the user column
        itemcol (str): name of the item column
        predcol (str): unused; kept so existing calls keep working
        remove_seen (bool): currently ignored; seen (user, item) pairs are NOT removed
            and must be masked by the caller

    Returns:
        np.ndarray: dense score matrix of shape (n_unique_users, n_unique_items); rows and
        columns follow the order of first appearance of the ids in ``data[usercol]`` and
        ``data[itemcol]``.

    Raises:
        ValueError: raised by Surprise if a user or item of ``data`` is not in the trainset.
    """
    trainset = algo.trainset

    # Explicit integer dtype so that empty inputs still produce valid index arrays.
    iuids = np.array([trainset.to_inner_uid(uid) for uid in data[usercol].unique()], dtype=np.intp)
    iiids = np.array([trainset.to_inner_iid(iid) for iid in data[itemcol].unique()], dtype=np.intp)

    # Select the requested users/items first, so only the needed block is multiplied
    # instead of the full users x items matrix.
    user_factors = np.asarray(algo.pu)[iuids]
    item_factors = np.asarray(algo.qi)[iiids]

    scores = user_factors @ item_factors.T
    scores += np.asarray(algo.bu)[iuids].reshape(-1, 1)
    scores += np.asarray(algo.bi)[iiids]
    # Use the mean the model was trained with, rather than the mean of a rating
    # column of ``data`` (which need not be the training data).
    scores += trainset.global_mean

    return scores

In [8]:
# Predict a rating for every (user, item) row of the test set with the fitted SVD model
predictions = predict(svd, testset_df, usercol='userID', itemcol='itemID')

In [8]:
# Preview the first rows of the rating predictions
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,0,240,4.414653
1,0,3258,4.391092
2,1,2954,4.414337
3,1,4178,4.273617
4,1,6860,4.849997


In [9]:
# Score every combination of the users and items present in the training data.
# NOTE(review): `remove_seen=True` has no effect here — compute_ranking_predictions never reads
# that argument and returns a dense score matrix; seen pairs are zeroed manually in a later cell.
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(
        svd, trainset_df, usercol='userID', itemcol='itemID', remove_seen=True
    )
    
print("Took {} seconds for prediction.".format(test_time.interval))

Took 27.25401992024854 seconds for prediction.


In [10]:
# Densify the masked rating matrix and keep only the columns whose entries do not sum to zero
# (presumably the items with at least one training rating, assuming ratings are positive).
# NOTE(review): later cells index all_predictions with positions taken from this array, which
# assumes both share the same row/column order — confirm.
all_array = masked_R.toarray()
all_array = all_array[:, np.sum(all_array, axis=0) != 0]

# Evaluation

In [9]:
# Display the rating predictions (the notebook truncates long frames when rendering).
# The previous `predictions[prediction]` referenced an undefined name and would raise
# NameError on a fresh run; the recorded output is the whole frame.
predictions

Unnamed: 0,userID,itemID,prediction
0,0,240,4.414653
1,0,3258,4.391092
2,1,2954,4.414337
3,1,4178,4.273617
4,1,6860,4.849997
...,...,...,...
187704,62925,3930,4.162117
187705,62925,3948,4.980985
187706,62925,4031,4.930572
187707,62925,4470,4.896207


In [None]:
# RMSE and MAE of the predicted ratings against the test set
# NOTE(review): this cell has no execution count; rmse/mae come from eval_metrics_numba —
# confirm they accept these DataFrames before relying on it.
eval_rmse = rmse(testset_df, predictions)
eval_mae = mae(testset_df, predictions)

print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae, sep='\n')

In [30]:
# NOTE(review): stale cell (execution count 30 is out of sequence with its neighbours).
#   * `map_at_k` is not imported anywhere in this notebook, so this fails on a fresh kernel.
#   * The `col_prediction=` keyword differs from how precision_at_k / recall_at_k are called
#     in the final evaluation cell (ground-truth matrix, relevancy_method, threshold).
#   * The metrics are computed against trainset_df, i.e. the data the model was fitted on.
# k = 5

for k in range(1,10):
      eval_map = map_at_k(trainset_df, all_predictions, col_prediction='prediction', k=k)
      # eval_ndcg = ndcg_at_k(testset_df, predictions, col_prediction='prediction', k=k)
      eval_precision = precision_at_k(trainset_df, all_predictions, col_prediction='prediction', k=k)
      eval_recall = recall_at_k(trainset_df, all_predictions, col_prediction='prediction', k=k)

      print("MAP:\t%f" % eval_map,
            # "NDCG:\t%f" % eval_ndcg,
            "Precision@K:\t%f" % eval_precision,
            "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.430784
Precision@K:	1.000000
Recall@K:	0.430784
MAP:	0.704941
Precision@K:	0.921687
Recall@K:	0.704941
MAP:	0.851562
Precision@K:	0.810558
Recall@K:	0.851562
MAP:	0.919797
Precision@K:	0.696204
Recall@K:	0.919797
MAP:	0.952551
Precision@K:	0.599207
Recall@K:	0.952551
MAP:	0.969857
Precision@K:	0.521669
Recall@K:	0.969857
MAP:	0.979925
Precision@K:	0.460081
Recall@K:	0.979925
MAP:	0.986045
Precision@K:	0.410435
Recall@K:	0.986045
MAP:	0.989967
Precision@K:	0.369868
Recall@K:	0.989967


In [11]:
# Zero the scores at every position that is non-zero in the dense training matrix,
# so already-rated (user, item) pairs do not count in the ranking evaluation below.
tmp = sparse.coo_matrix(all_array)
uid, iid = tmp.row, tmp.col  # row / column positions of the non-zero training entries
all_predictions[uid, iid] = 0  # modifies the score matrix in place

In [15]:
# Dense copy of the unmasked ratings, restricted to the same columns as all_array
# (columns of masked_R whose entries do not sum to zero).
ground_truth = unmasked_R.toarray()
ground_truth = ground_truth[:, np.sum(masked_R.toarray(), axis=0) != 0]
# Blank out the training positions so only the remaining (presumably held-out) ratings are left.
ground_truth[uid, iid] = 0

In [16]:
# Sparse matrix holding the predicted scores at the non-zero ground-truth positions.
tmp_coo = sparse.coo_matrix(ground_truth)
# Take the values with the same (row, col) indices that address them, so values and
# positions cannot drift apart, and pass the shape explicitly: without it the shape is
# inferred from the largest index and can be smaller than ground_truth.
partial_all_predictions_coo = sparse.coo_matrix(
    (all_predictions[tmp_coo.row, tmp_coo.col], (tmp_coo.row, tmp_coo.col)),
    shape=ground_truth.shape,
)

In [18]:
# Precision@k and Recall@k for k = 1..5, comparing the dense score matrix (seen pairs zeroed)
# with the dense ground-truth matrix.
# NOTE(review): `threshold=3.5` is passed together with relevancy_method='top_k'; whether the
# threshold is used in that mode depends on eval_metrics_numba — confirm.
# k = 5

for k in range(1,6):
    eval_precision = precision_at_k(ground_truth, all_predictions, relevancy_method='top_k', k=k, threshold=3.5)
    eval_recall = recall_at_k(ground_truth, all_predictions, relevancy_method='top_k', k=k, threshold=3.5)

    print("Precision@K:\t%f" % eval_precision,
        "Recall@K:\t%f" % eval_recall, sep='\n')
    print()

Precision@K:	0.000397
Recall@K:	0.000131

Precision@K:	0.000373
Recall@K:	0.000219

Precision@K:	0.000350
Recall@K:	0.000324

Precision@K:	0.000334
Recall@K:	0.000430

Precision@K:	0.000308
Recall@K:	0.000528

