In [1]:
# Extend the import search path with the parent and current directories
# before importing `recoengi` (presumably so the local package resolves when
# the notebook is run from inside the repository -- confirm the layout).
import sys
sys.path.append('../')
sys.path.append('.')
import recoengi
import recoengi.cf as cf

import pandas as pd
from scipy import sparse
import numpy as np
from sklearn import metrics
import logging
# Emit DEBUG-level records (level, timestamp, message) on the root logger.
logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(asctime)s %(message)s")

In [2]:
# --- Tunable constants for the data preparation -----------------------------
RATINGS_PATH = "sampledata/ml-latest-small/ratings.csv"
MIN_RATINGS_PER_MOVIE = 3   # keep movies with strictly more ratings than this
LIKE_THRESHOLD = 3          # ratings >= this value become the positive class (1.0)
TRAIN_FRACTION = 0.8        # expected share of rows assigned to the training set
SPLIT_SEED = 42             # fixed seed so the split survives "Restart & Run All"

# Load the raw (user, movie, rating) triples.
dtf_ratings = pd.read_csv(RATINGS_PATH, usecols=["userId", "movieId", "rating"])

# Keep only movies that have more than MIN_RATINGS_PER_MOVIE ratings.
ratings_per_movie = dtf_ratings.groupby("movieId")["rating"].transform("count")
dtf_ratings = dtf_ratings.loc[ratings_per_movie > MIN_RATINGS_PER_MOVIE, :].reset_index(drop=True)

# Re-index movies to contiguous 0-based ids in order of first appearance
# (same numbering as enumerating `movieId.unique()`).
movie_codes, movie_ids_original = pd.factorize(dtf_ratings["movieId"])
dtf_ratings["movieId"] = movie_codes

# Shift user ids to 0-based; assumes the file's userId values start at 1 -- TODO confirm.
dtf_ratings["userId"] = dtf_ratings["userId"] - 1

# Binarize ratings: 1.0 if rating >= LIKE_THRESHOLD, else 0.0.
dtf_ratings["rating"] = (dtf_ratings["rating"] >= LIKE_THRESHOLD).astype(float)

# Random row-wise train/test split, seeded for reproducibility.
split_rng = np.random.default_rng(SPLIT_SEED)
is_train_row = split_rng.random(dtf_ratings.shape[0]) < TRAIN_FRACTION
dtf_train = dtf_ratings.loc[is_train_row, ["userId", "movieId", "rating"]]
dtf_test = dtf_ratings.loc[~is_train_row, ["userId", "movieId", "rating"]]

# Build the user x movie training matrix. The shape is given explicitly so the
# matrix always covers every user/movie, even when the highest index happens to
# fall only in the test split (otherwise scipy infers a smaller shape).
n_users = int(dtf_ratings["userId"].max()) + 1
n_movies = len(movie_ids_original)
M = sparse.csr_matrix(
    (
        dtf_train["rating"].to_numpy(),
        (dtf_train["userId"].to_numpy(), dtf_train["movieId"].to_numpy()),
    ),
    shape=(n_users, n_movies),
)

In [3]:
# Wrap the training matrix in the recoengi collaborative-filtering model and
# run its complete computation in one call. The keyword meanings are defined
# by `recoengi.cf` and are not visible here -- check that module before tuning.
cfm = cf.CFM(M)
cfm.computeEverything(
    bln_bin=False,
    bln_norm=True,
    flt_ths=0.01,
    ntop=64,
    flt_lb=-1,
)

DEBUG 2019-10-13 13:07:11,394 M matrix has shape 610x4180.
DEBUG 2019-10-13 13:07:11,396 B matrix has shape 610x4180.
DEBUG 2019-10-13 13:07:11,410 B matrix has sparsity 2.37665699270531%.
DEBUG 2019-10-13 13:07:11,412 Computing the similarity matrix S ...
DEBUG 2019-10-13 13:07:11,435 S matrix has shape 610x610.
DEBUG 2019-10-13 13:07:11,436 Computing the matrix SNORMALIZED ...
DEBUG 2019-10-13 13:07:11,438 Computing the matrix SCORES ...
DEBUG 2019-10-13 13:07:11,480 SCORES matrix has shape 610x4180.
DEBUG 2019-10-13 13:07:11,481 Computing the matrix AMOUNTS ...
DEBUG 2019-10-13 13:07:11,521 AMOUNTS matrix has shape 610x4180.
DEBUG 2019-10-13 13:07:11,521 Computing the performances ...
DEBUG 2019-10-13 13:07:11,648 Average global scores difference: 0.05077722274294556.
DEBUG 2019-10-13 13:07:11,649 Average positive scores difference: 0.7474164850980024.
DEBUG 2019-10-13 13:07:11,803 Average global amounts difference: 0.05077722274294556.
DEBUG 2019-10-13 13:07:11,803 Average positive

In [4]:
# Flatten the sparse SCORES matrix into one row per stored (user, movie) entry.
# Reading the COO row/col/data arrays avoids materialising a Python dict of
# tuples; `copy=True` keeps `cfm.SCORES` untouched by `sum_duplicates()`.
scores_coo = cfm.SCORES.tocoo(copy=True)
scores_coo.sum_duplicates()
dtf_pred = pd.DataFrame({
    "userId": scores_coo.row.astype(np.int64),
    "movieId": scores_coo.col.astype(np.int64),
    "predicted_score": scores_coo.data,
})

# Attach predictions to the observed ratings. The inner join keeps only pairs
# that have a stored score. Any `predicted_score` column left over from a
# previous execution is dropped first, so re-running this cell is idempotent
# instead of producing `predicted_score_x` / `predicted_score_y`.
dtf_train = pd.merge(
    dtf_train.drop(columns=["predicted_score"], errors="ignore"),
    dtf_pred,
    on=["userId", "movieId"],
    how="inner",
)
dtf_test = pd.merge(
    dtf_test.drop(columns=["predicted_score"], errors="ignore"),
    dtf_pred,
    on=["userId", "movieId"],
    how="inner",
)

In [5]:
# Report ROC AUC for the training and test splits. Labels are shifted by +1 so
# the classes are {1, 2}, with 2 (originally rating == 1.0) as the positive
# class. After the loop, `fpr`, `tpr` and `thresholds` hold the test-set curve.
for split_label, split_frame in (("training", dtf_train), ("test", dtf_test)):
    fpr, tpr, thresholds = metrics.roc_curve(
        y_true=split_frame.rating + 1,
        y_score=split_frame.predicted_score,
        pos_label=2,
    )
    split_auc = metrics.auc(fpr, tpr)
    print("AUC on " + split_label + " set: " + str(split_auc) + ".")

AUC on training set: 0.6746369175263351.
AUC on test set: 0.6596552122483741.
