In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os

# Move the working directory one level up so that relative paths resolve from
# the parent folder. Note that this is not idempotent: every execution of the
# cell climbs one more directory.
try:
    os.chdir(os.path.join(os.getcwd(), '..'))
    print(os.getcwd())
except OSError as chdir_error:
    # Stay best-effort (the notebook keeps running from the current directory),
    # but report the failure instead of hiding it, and no longer swallow
    # unrelated exceptions such as KeyboardInterrupt.
    print("Could not change working directory: " + str(chdir_error))


In [7]:
import recoengi
import recoengi.cv as cv

import pandas as pd
import numpy as np
from scipy import sparse
from sklearn import metrics
import pickle
import pkg_resources
import logging
# Configure the root logger: show records from DEBUG upwards, each rendered
# as "<level name> <timestamp> <message>".
logging.basicConfig(
    format="%(levelname)s %(asctime)s %(message)s",
    level=logging.DEBUG,
)



In [8]:
# Load the train / test rating tables shipped as pickles inside the recoengi
# package (resolved through pkg_resources, so no hard-coded local path).
#
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# This is acceptable only because the files come from the installed package;
# never point these paths at untrusted data.
train_pickle_path = pkg_resources.resource_filename('recoengi', 'sampledata/movie_ratings_train.pickle')
test_pickle_path = pkg_resources.resource_filename('recoengi', 'sampledata/movie_ratings_test.pickle')

# Context managers make sure both file handles are closed after loading.
with open(train_pickle_path, "rb") as train_file:
    dtf_train_orig = pickle.load(train_file)
with open(test_pickle_path, "rb") as test_file:
    dtf_test = pickle.load(test_file)



In [9]:
# Pivot the long (userId, movieId, rating) table into a wide table with one row
# per userId and one column per movieId; pairs without a rating are filled
# with 0.0.
ratings_by_pair = dtf_train_orig.set_index(["userId", "movieId"])["rating"]
dtf_train = ratings_by_pair.unstack(fill_value=0.0)

# Replace the movieId column labels by "film_<movieId>" strings. Assigning a
# plain list also clears the "movieId" name of the column axis; the row index
# keeps its "userId" name.
dtf_train.columns = ["film_" + str(movie_id) for movie_id in dtf_train.columns]



In [10]:
# The sparse matrix carries no labels, so the row and column labels of the
# wide table are kept alongside it as Series (position -> label).
rownames = pd.Series(dtf_train.index)
colnames = pd.Series(dtf_train.columns)

# Compressed-sparse-column version of the wide rating table.
M = sparse.csc_matrix(dtf_train)



In [11]:
def _build_conf(target):
    """Return the configuration dict for one target column.

    Every entry of ``colnames`` other than ``target`` is listed under
    "features"; all remaining settings are the same for every target.
    """
    return {
        "target": target,
        "target_type": "classification",
        "threshold": 2.9,
        "features": np.setdiff1d(colnames, [target]),
        "nfolds": 2,
        "n_estimators": 100,
        "max_depth": 10,
    }


# One configuration per target column; only the first 10 columns are used as
# targets here.
array_dict_conf = [_build_conf(target) for target in colnames[0:10]]



In [12]:
# Run every configuration in array_dict_conf against the sparse matrix and its
# row / column labels, asking for a pool size of 8.
output = cv.cvmMultiRun(
    array_dict_conf,
    M,
    colnames,
    rownames,
    npool=8,
)



DEBUG 2019-10-13 23:43:13,593 Target film_0 | Fold 1.
DEBUG 2019-10-13 23:43:13,612 Target film_1 | Fold 1.
DEBUG 2019-10-13 23:43:13,616 Target film_2 | Fold 1.
DEBUG 2019-10-13 23:43:13,623 Target film_4 | Fold 1.
DEBUG 2019-10-13 23:43:13,619 Target film_3 | Fold 1.
DEBUG 2019-10-13 23:43:13,627 Target film_6 | Fold 1.
DEBUG 2019-10-13 23:43:13,626 Target film_5 | Fold 1.
DEBUG 2019-10-13 23:43:13,645 Target film_7 | Fold 1.
DEBUG 2019-10-13 23:43:13,867 Target film_0 | Fold 2.
DEBUG 2019-10-13 23:43:13,865 Target film_2 | Fold 2.
DEBUG 2019-10-13 23:43:13,855 Target film_6 | Fold 2.
DEBUG 2019-10-13 23:43:13,892 Target film_4 | Fold 2.
DEBUG 2019-10-13 23:43:13,905 Target film_3 | Fold 2.
DEBUG 2019-10-13 23:43:13,916 Target film_5 | Fold 2.
DEBUG 2019-10-13 23:43:13,948 Target film_1 | Fold 2.
DEBUG 2019-10-13 23:43:14,090 Target film_6 | AUC on training set: 0.6854096989966556.
DEBUG 2019-10-13 23:43:14,119 Target film_8 | Fold 1.
DEBUG 2019-10-13 23:43:14,119 Target film_0 | AUC

In [13]:
# Turn the raw output into a long (movieId, userId, predicted_score) table.
#
# Assumption (TODO confirm against cv.cvmMultiRun): np.array(output) has one
# row of scores per entry of array_dict_conf, in the same order, and one value
# per row of M, i.e. per entry of rownames.
scores = np.array(output).transpose()

# A DataFrame built from a bare array is labelled 0..n-1 on both axes, so the
# "movieId" / "userId" columns used to hold positions rather than ids and the
# later merge on those keys was only right for 0-based contiguous ids.
# Label the table with the real ids instead.
# The "film_<movieId>" naming mirrors how the target column names were built.
film_to_movie_id = {
    "film_" + str(movie_id): movie_id
    for movie_id in dtf_train_orig["movieId"].unique().tolist()
}
target_movie_ids = [film_to_movie_id[conf["target"]] for conf in array_dict_conf]

dtf_pred = (
    pd.DataFrame(scores, index=rownames.values, columns=target_movie_ids)
    .unstack()
    .reset_index(drop=False)
)
# After unstack() the outer level is the column label (movie), the inner level
# is the row label (user).
dtf_pred.columns = ["movieId", "userId", "predicted_score"]



In [14]:
# Attach the predicted scores to the observed ratings. The inner join keeps
# only the (userId, movieId) pairs for which a prediction exists; the shapes
# are printed before and after the join.
print("Training set shape: " + str(dtf_train_orig.shape))
print("Test set shape: " + str(dtf_test.shape))

merge_keys = ["userId", "movieId"]
dtf_train = pd.merge(dtf_train_orig, dtf_pred, on=merge_keys, how="inner")

# dtf_test is overwritten with its own merge result. Without dropping a
# predicted_score column left by a previous run, re-executing this cell would
# produce predicted_score_x / predicted_score_y and break every later use of
# dtf_test.predicted_score. On the first run the drop is a no-op.
dtf_test = pd.merge(
    dtf_test.drop(columns=["predicted_score"], errors="ignore"),
    dtf_pred,
    on=merge_keys,
    how="inner",
)

print("Training set shape: " + str(dtf_train.shape))
print("Test set shape: " + str(dtf_test.shape))



Training set shape: (73749, 3)
Test set shape: (18641, 3)
Training set shape: (922, 4)
Test set shape: (224, 4)


In [15]:
# Area under the ROC curve of the predicted scores, first for the training
# rows and then for the test rows. The true ratings are binarised at 2.9 and
# shifted to the labels 1 / 2, with 2 (rating above 2.9) as the positive class.
# fpr, tpr and thresholds are left holding the test-set curve afterwards.
for auc_prefix, scored in (
    ("AUC on training set: ", dtf_train),
    ("AUC on test set: ", dtf_test),
):
    fpr, tpr, thresholds = metrics.roc_curve(
        y_true=(scored.rating > 2.9) + 1,
        y_score=scored.predicted_score,
        pos_label=2,
    )
    print(auc_prefix + str(metrics.auc(fpr, tpr)) + ".")


AUC on training set: 0.6317526904559615.
AUC on test set: 0.6939583333333333.
