# Data loading 
The next cells set up the environment and load the data we need.

In [4]:
import os

# Cap each numerical backend at a single thread (shell equivalent:
# `export <NAME>=1` for every name listed below). The assignments are placed
# above the numpy/scipy imports, presumably so the libraries pick the limits
# up when they are first loaded.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        "1",
    )
)
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import sys
import matplotlib.pyplot as plt
# Make the directory two levels up importable; presumably this is the project
# root that holds the packages imported by later cells (Data_manager, Base,
# MatrixFactorization) -- TODO confirm.
sys.path.append('../../')


In [5]:
# Training interactions stored as triplets: user index (`row`), item index
# (`col`) and the implicit interaction value (`data`).
train_csv_path = '../data_train.csv'
dataset = pd.read_csv(train_csv_path)
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of the data to check for missing users, items, etc.

In [6]:
# Assemble the user-item interaction matrix from the (row, col, data) triplets:
# rows are users, columns are items.
users, items, data = dataset["row"], dataset["col"], dataset["data"]

# Build in COO form, then convert to CSR for fast row (i.e. per-user) access.
URM_all = sps.coo_matrix((data, (users, items))).tocsr()
URM_all.shape

(7947, 25975)

In [7]:
# Item content data (title/abstract) stored as triplets: item index (`row`),
# feature index (`col`) and the feature value (`data`).
icm_csv_path = '../data_ICM_title_abstract.csv'
ICM_df = pd.read_csv(icm_csv_path)
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [8]:
# Assemble the item-feature matrix from the (row, col, data) triplets:
# rows are items, columns are features.
# Note: this rebinds `items` and `data`, which the URM cell also assigned.
items, features, data = ICM_df["row"], ICM_df["col"], ICM_df["data"]

# Build in COO form, then convert to CSR for fast row (i.e. per-item) access.
ICM_all = sps.coo_matrix((data, (items, features))).tocsr()
ICM_all.shape

(25975, 20000)

In [9]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Base.Evaluation.Evaluator import EvaluatorHoldout

# Hold-out configuration: URM_all is split once per seed, keeping
# TRAIN_PERCENTAGE of the interactions for training and the rest for validation.
FOLD_SEEDS = (1, 2, 3, 4, 5)
TRAIN_PERCENTAGE = 0.8
CUTOFF_LIST = [10]

# One (train, validation) pair per seed, in seed order.
fold_splits = [
    split_train_in_two_percentage_global_sample(URM_all, train_percentage=TRAIN_PERCENTAGE, seed=fold_seed)
    for fold_seed in FOLD_SEEDS
]
training_list = [URM_train_fold for URM_train_fold, _ in fold_splits]
validation_list = [URM_validation_fold for _, URM_validation_fold in fold_splits]

# One evaluator per validation matrix; each receives its own copy of the
# cutoff list so that no list object is shared between evaluators.
evaluator_list = [
    EvaluatorHoldout(URM_validation_fold, cutoff_list=list(CUTOFF_LIST))
    for URM_validation_fold in validation_list
]

# Numbered aliases kept so that any cell referring to the per-fold names keeps
# working. These unpackings assume exactly five seeds in FOLD_SEEDS.
URM_train_1, URM_train_2, URM_train_3, URM_train_4, URM_train_5 = training_list
URM_validation_1, URM_validation_2, URM_validation_3, URM_validation_4, URM_validation_5 = validation_list
(evaluator_validation_1, evaluator_validation_2, evaluator_validation_3,
 evaluator_validation_4, evaluator_validation_5) = evaluator_list




In [7]:
from MatrixFactorization.IALSRecommender_implicit import IALSRecommender_implicit

# Hyper-parameters of this run, named once instead of repeated inline.
ICM_WEIGHT = 0.55124  # weight of the content rows; interactions get 1 - ICM_WEIGHT
IALS_FIT_PARAMS = dict(n_factors=867, regularization=0.74517, iterations=74)
MODEL_FOLDER = 'MF_saved/'

# Train, evaluate and save one model per fold; fold numbers start at 1 and are
# used only to build the saved file name.
for fold_number, (URM_train, evaluator) in enumerate(zip(training_list, evaluator_list), start=1):

    # Stack the weighted interactions on top of the weighted, transposed ICM,
    # so that the extra rows appended below the users are the content features.
    URM_train_ICM_all_IALS = sps.vstack([URM_train * (1 - ICM_WEIGHT), ICM_all.T * ICM_WEIGHT])
    recommender = IALSRecommender_implicit(URM_train_ICM_all_IALS)
    recommender.fit(**IALS_FIT_PARAMS)

    # NOTE(review): the recommender is evaluated as constructed from the stacked
    # matrix. An earlier revision carried a disabled line that reset
    # `recommender.URM_train` to `URM_train.tocsr()` before evaluation --
    # confirm which behaviour is intended.
    result_dict, _ = evaluator.evaluateRecommender(recommender)
    print(result_dict)

    filename = 'MF_IALS_fold_number' + str(fold_number)
    recommender.save_model(MODEL_FOLDER, file_name = filename)

Recommender_Base_Class: URM Detected 73 (0.26 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=74.0), HTML(value='')))


EvaluatorHoldout: Processed 5536 ( 100.00% ) in 8.65 sec. Users per second: 640
{10: {'ROC_AUC': 0.21895115090834164, 'PRECISION': 0.04674855491329367, 'PRECISION_RECALL_MIN_DEN': 0.1700523700568867, 'RECALL': 0.16502445247336553, 'MAP': 0.08143844825850469, 'MRR': 0.16332318331957096, 'NDCG': 0.11966137665241816, 'F1': 0.07285777138161698, 'HIT_RATE': 0.467485549132948, 'ARHR': 0.19011778603541624, 'NOVELTY': 0.005414006217056048, 'AVERAGE_POPULARITY': 0.10864052879178178, 'DIVERSITY_MEAN_INTER_LIST': 0.996379496478009, 'DIVERSITY_HERFINDAHL': 0.9996199514632547, 'COVERAGE_ITEM': 0.48635226179018287, 'COVERAGE_ITEM_CORRECT': 0.07160731472569778, 'COVERAGE_USER': 0.6966150748710205, 'COVERAGE_USER_CORRECT': 0.2354347552535548, 'DIVERSITY_GINI': 0.20276493135726886, 'SHANNON_ENTROPY': 12.539387675658178}}
Recommender_Base_Class: Saving model in file 'MF_saved/MF_IALS_fold_number1'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 59 (0.21 %) cold users.
Recom

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=74.0), HTML(value='')))


EvaluatorHoldout: Processed 5627 ( 100.00% ) in 8.84 sec. Users per second: 637
{10: {'ROC_AUC': 0.20607185170140638, 'PRECISION': 0.04496179136306982, 'PRECISION_RECALL_MIN_DEN': 0.16071209954273807, 'RECALL': 0.1560130785660015, 'MAP': 0.07390058208020818, 'MRR': 0.15122721797681832, 'NDCG': 0.11123289158513168, 'F1': 0.06980601595483459, 'HIT_RATE': 0.4496179136307091, 'ARHR': 0.17669287251658017, 'NOVELTY': 0.0054154609213438966, 'AVERAGE_POPULARITY': 0.10817293947659223, 'DIVERSITY_MEAN_INTER_LIST': 0.9964511539792369, 'DIVERSITY_HERFINDAHL': 0.9996274070070585, 'COVERAGE_ITEM': 0.48481231953801734, 'COVERAGE_ITEM_CORRECT': 0.0695283926852743, 'COVERAGE_USER': 0.7080659368315088, 'COVERAGE_USER_CORRECT': 0.22939474015351705, 'DIVERSITY_GINI': 0.20285792646206374, 'SHANNON_ENTROPY': 12.548657862074872}}
Recommender_Base_Class: Saving model in file 'MF_saved/MF_IALS_fold_number2'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 73 (0.26 %) cold users.
Re

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=74.0), HTML(value='')))


EvaluatorHoldout: Processed 5658 ( 100.00% ) in 8.96 sec. Users per second: 632
{10: {'ROC_AUC': 0.20368144276680952, 'PRECISION': 0.046129374337220565, 'PRECISION_RECALL_MIN_DEN': 0.16392416693318168, 'RECALL': 0.15908490544144424, 'MAP': 0.07430343894525972, 'MRR': 0.15006382310199948, 'NDCG': 0.11206593602789074, 'F1': 0.0715202388686079, 'HIT_RATE': 0.46129374337221635, 'ARHR': 0.17722665477172417, 'NOVELTY': 0.005414294765614372, 'AVERAGE_POPULARITY': 0.1085728435934056, 'DIVERSITY_MEAN_INTER_LIST': 0.9963358428228855, 'DIVERSITY_HERFINDAHL': 0.9996159749531471, 'COVERAGE_ITEM': 0.48854667949951874, 'COVERAGE_ITEM_CORRECT': 0.07025986525505294, 'COVERAGE_USER': 0.7119667799169498, 'COVERAGE_USER_CORRECT': 0.23316974959104064, 'DIVERSITY_GINI': 0.2027890660797584, 'SHANNON_ENTROPY': 12.533453244007625}}
Recommender_Base_Class: Saving model in file 'MF_saved/MF_IALS_fold_number3'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 76 (0.27 %) cold users.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=74.0), HTML(value='')))


EvaluatorHoldout: Processed 5674 ( 100.00% ) in 8.84 sec. Users per second: 642
{10: {'ROC_AUC': 0.19976584923712218, 'PRECISION': 0.04510045823052413, 'PRECISION_RECALL_MIN_DEN': 0.1577792884278614, 'RECALL': 0.15325653097445352, 'MAP': 0.07368391630239207, 'MRR': 0.15061733834645397, 'NDCG': 0.11024750660380081, 'F1': 0.06969192062726588, 'HIT_RATE': 0.45100458230525203, 'ARHR': 0.17826531211709246, 'NOVELTY': 0.0054126371092368935, 'AVERAGE_POPULARITY': 0.10846940525297959, 'DIVERSITY_MEAN_INTER_LIST': 0.9962735691348136, 'DIVERSITY_HERFINDAHL': 0.9996097983380648, 'COVERAGE_ITEM': 0.4865832531280077, 'COVERAGE_ITEM_CORRECT': 0.07037536092396535, 'COVERAGE_USER': 0.7139801182836291, 'COVERAGE_USER_CORRECT': 0.22549389706807602, 'DIVERSITY_GINI': 0.20069995741303567, 'SHANNON_ENTROPY': 12.518775808798425}}
Recommender_Base_Class: Saving model in file 'MF_saved/MF_IALS_fold_number4'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 66 (0.24 %) cold users.
R

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=74.0), HTML(value='')))


EvaluatorHoldout: Processed 5589 ( 100.00% ) in 8.68 sec. Users per second: 644
{10: {'ROC_AUC': 0.20300651506502362, 'PRECISION': 0.04482018250134083, 'PRECISION_RECALL_MIN_DEN': 0.15543222656749303, 'RECALL': 0.15086827698124933, 'MAP': 0.07241026182246814, 'MRR': 0.15219152132732414, 'NDCG': 0.10921747877588638, 'F1': 0.06910927426013104, 'HIT_RATE': 0.4482018250134192, 'ARHR': 0.17732372545845493, 'NOVELTY': 0.005410945687113123, 'AVERAGE_POPULARITY': 0.10943507044116145, 'DIVERSITY_MEAN_INTER_LIST': 0.9962825344753148, 'DIVERSITY_HERFINDAHL': 0.9996104276730732, 'COVERAGE_ITEM': 0.4859672762271415, 'COVERAGE_ITEM_CORRECT': 0.06895091434071222, 'COVERAGE_USER': 0.7032842582106456, 'COVERAGE_USER_CORRECT': 0.22700390084308544, 'DIVERSITY_GINI': 0.200530176829481, 'SHANNON_ENTROPY': 12.51749304317315}}
Recommender_Base_Class: Saving model in file 'MF_saved/MF_IALS_fold_number5'
Recommender_Base_Class: Saving complete
