# Data loading 
The next cells load the data we need.

In [1]:
import os

# Cap each numerical backend (OpenMP, OpenBLAS, MKL, vecLib, numexpr) at one
# thread; same effect as running `export <VAR>=1` in the shell for each name.
# This runs ahead of the numpy / pandas / scipy imports that follow, presumably
# so the limits are already in place when those libraries load their backends.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        "1",
    )
)
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import sys
import matplotlib.pyplot as plt
sys.path.append('../../../')


In [2]:
# Training interactions as (row, col, data) triplets:
#   row  -> user index
#   col  -> item index
#   data -> implicit interaction value
dataset = pd.read_csv(filepath_or_buffer='../../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of the data to check for missing users, items, etc.

In [3]:
# Build the user-rating matrix (URM): one row per user, one column per item,
# values taken from the `data` column of the interaction triplets.
users, items, data = dataset["row"], dataset["col"], dataset["data"]

# No explicit shape is passed, so the matrix size is inferred from the largest
# row / column index present. CSR is used afterwards for fast per-user (row) access.
URM_all = sps.coo_matrix((data, (users, items))).tocsr()
URM_all.shape

(7947, 25975)

In [4]:
# Item content data as (row, col, data) triplets; judging by the file name these
# are title/abstract features. The next cell uses row as the item index and col
# as the feature index.
ICM_df = pd.read_csv(filepath_or_buffer='../../data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [5]:
# Build the item-content matrix (ICM): one row per item, one column per feature,
# values taken from the `data` column of the ICM triplets.
# Note: `items` and `data` are rebound here and now refer to the ICM columns.
items, features, data = ICM_df["row"], ICM_df["col"], ICM_df["data"]

# Shape is inferred from the largest indices; CSR gives fast per-item (row) access.
ICM_all = sps.coo_matrix((data, (items, features))).tocsr()
ICM_all.shape

(25975, 20000)

In [6]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Base.Evaluation.Evaluator import EvaluatorHoldout

# Five independent train/validation splits of URM_all, one per seed, each keeping
# 80% of the data for training (train_percentage=0.8). Driving the splits from a
# seed tuple replaces five copy-pasted statements, so adding or removing a fold
# is a one-line change.
SPLIT_SEEDS = (1, 2, 3, 4, 5)
TRAIN_PERCENTAGE = 0.8
EVALUATION_CUTOFFS = [10]

# Each entry is the (URM_train, URM_validation) pair returned by the splitter.
fold_splits = [
    split_train_in_two_percentage_global_sample(URM_all, train_percentage=TRAIN_PERCENTAGE, seed=seed)
    for seed in SPLIT_SEEDS
]

training_list = [train_part for train_part, _ in fold_splits]
validation_list = [validation_part for _, validation_part in fold_splits]

# One hold-out evaluator per validation split, all using the same cutoff list.
evaluator_list = [
    EvaluatorHoldout(validation_part, cutoff_list=list(EVALUATION_CUTOFFS))
    for validation_part in validation_list
]

# Keep the numbered per-fold names available for any cell that still refers to
# them directly; they alias the very same objects stored in the lists above.
URM_train_1, URM_train_2, URM_train_3, URM_train_4, URM_train_5 = training_list
URM_validation_1, URM_validation_2, URM_validation_3, URM_validation_4, URM_validation_5 = validation_list
(evaluator_validation_1, evaluator_validation_2, evaluator_validation_3,
 evaluator_validation_4, evaluator_validation_5) = evaluator_list




In [8]:
from MatrixFactorization.IALSRecommender_implicit import IALSRecommender_implicit

# Settings for this experiment, named once instead of being repeated inline.
# The specific values are presumably the result of an earlier tuning run
# (not shown in this notebook) -- TODO confirm their provenance.
ICM_WEIGHT = 0.5293971458364172          # weight of the content rows; interactions get 1 - ICM_WEIGHT
IALS_N_FACTORS = 768
IALS_REGULARIZATION = 0.4489004525533907
IALS_ITERATIONS = 76
MODEL_FOLDER = 'MF_saved_2/'
MODEL_FILE_PREFIX = 'MF_IALS_fold_number'

# The weighted, transposed ICM (features x items) does not depend on the fold,
# so it is computed once here rather than on every loop iteration.
ICM_rows_weighted = ICM_all.T * ICM_WEIGHT

# Train, evaluate and save one IALS model per train/validation fold.
# Fold numbers start at 1 and are used only to build the saved file name.
for fold_number, (URM_train, evaluator) in enumerate(zip(training_list, evaluator_list), start=1):

    # Stack the down-weighted interactions on top of the weighted feature rows:
    # the first URM_train.shape[0] rows are the users, the remaining rows are
    # the ICM features, all sharing the item columns.
    URM_train_ICM_all_IALS = sps.vstack([URM_train * (1 - ICM_WEIGHT), ICM_rows_weighted])

    # NOTE(review): the recommender is constructed on the stacked matrix, so the
    # matrix it holds internally includes the feature rows as well -- confirm
    # this is what the evaluator is expected to see.
    recommender = IALSRecommender_implicit(URM_train_ICM_all_IALS)
    recommender.fit(n_factors=IALS_N_FACTORS, regularization=IALS_REGULARIZATION, iterations=IALS_ITERATIONS)

    result_dict, _ = evaluator.evaluateRecommender(recommender)
    print(result_dict)

    recommender.save_model(MODEL_FOLDER, file_name=MODEL_FILE_PREFIX + str(fold_number))

Recommender_Base_Class: URM Detected 73 (0.26 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=76.0), HTML(value='')))


EvaluatorHoldout: Processed 5536 ( 100.00% ) in 8.04 sec. Users per second: 689
{10: {'ROC_AUC': 0.21842795210569887, 'PRECISION': 0.047272398843929635, 'PRECISION_RECALL_MIN_DEN': 0.17142133165886853, 'RECALL': 0.16627039343083283, 'MAP': 0.08150103193171922, 'MRR': 0.16329529945407867, 'NDCG': 0.12004833510199378, 'F1': 0.07361522503729441, 'HIT_RATE': 0.47272398843930635, 'ARHR': 0.19039841671254282, 'NOVELTY': 0.005404758255382906, 'AVERAGE_POPULARITY': 0.11072255316976594, 'DIVERSITY_MEAN_INTER_LIST': 0.9961539480760896, 'DIVERSITY_HERFINDAHL': 0.9995974006972752, 'COVERAGE_ITEM': 0.46725697786333015, 'COVERAGE_ITEM_CORRECT': 0.07168431183830606, 'COVERAGE_USER': 0.6966150748710205, 'COVERAGE_USER_CORRECT': 0.23769976091606895, 'DIVERSITY_GINI': 0.19082020853842854, 'SHANNON_ENTROPY': 12.453686786607655}}
Recommender_Base_Class: Saving model in file 'MF_saved_2/MF_IALS_fold_number1'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 59 (0.21 %) cold user

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=76.0), HTML(value='')))


EvaluatorHoldout: Processed 5627 ( 100.00% ) in 8.18 sec. Users per second: 688
{10: {'ROC_AUC': 0.2067826959585452, 'PRECISION': 0.045157277412474446, 'PRECISION_RECALL_MIN_DEN': 0.16140109618872783, 'RECALL': 0.15680492278785405, 'MAP': 0.07434359009912551, 'MRR': 0.15175105288842664, 'NDCG': 0.11191465519385199, 'F1': 0.07012087797567225, 'HIT_RATE': 0.45157277412475566, 'ARHR': 0.17765415330281206, 'NOVELTY': 0.005407241517571836, 'AVERAGE_POPULARITY': 0.10988976866196372, 'DIVERSITY_MEAN_INTER_LIST': 0.996266458421135, 'DIVERSITY_HERFINDAHL': 0.999608940733558, 'COVERAGE_ITEM': 0.46814244465832533, 'COVERAGE_ITEM_CORRECT': 0.06895091434071222, 'COVERAGE_USER': 0.7080659368315088, 'COVERAGE_USER_CORRECT': 0.2309047439285265, 'DIVERSITY_GINI': 0.19196510302821396, 'SHANNON_ENTROPY': 12.473412406192908}}
Recommender_Base_Class: Saving model in file 'MF_saved_2/MF_IALS_fold_number2'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 73 (0.26 %) cold users.
R

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=76.0), HTML(value='')))


EvaluatorHoldout: Processed 5658 ( 100.00% ) in 8.23 sec. Users per second: 687
{10: {'ROC_AUC': 0.20551145449342761, 'PRECISION': 0.04625309296571125, 'PRECISION_RECALL_MIN_DEN': 0.16427414196502296, 'RECALL': 0.15941622288980895, 'MAP': 0.0747033641265049, 'MRR': 0.15124027223709136, 'NDCG': 0.11263645258980691, 'F1': 0.0717024155683452, 'HIT_RATE': 0.46253092965712267, 'ARHR': 0.17805256779275935, 'NOVELTY': 0.005406191259758185, 'AVERAGE_POPULARITY': 0.11052140001781768, 'DIVERSITY_MEAN_INTER_LIST': 0.9961559776383554, 'DIVERSITY_HERFINDAHL': 0.9995979916136475, 'COVERAGE_ITEM': 0.4752261790182868, 'COVERAGE_ITEM_CORRECT': 0.07029836381135707, 'COVERAGE_USER': 0.7119667799169498, 'COVERAGE_USER_CORRECT': 0.23392475147854536, 'DIVERSITY_GINI': 0.19359450597625916, 'SHANNON_ENTROPY': 12.467354524272878}}
Recommender_Base_Class: Saving model in file 'MF_saved_2/MF_IALS_fold_number3'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 76 (0.27 %) cold users.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=76.0), HTML(value='')))


EvaluatorHoldout: Processed 5674 ( 100.00% ) in 8.24 sec. Users per second: 688
{10: {'ROC_AUC': 0.20129766240887226, 'PRECISION': 0.04511808248149347, 'PRECISION_RECALL_MIN_DEN': 0.15698053219642988, 'RECALL': 0.15235624317380642, 'MAP': 0.07400303796860454, 'MRR': 0.15106836530876053, 'NDCG': 0.11039579942987252, 'F1': 0.06961939506085647, 'HIT_RATE': 0.45118082481494537, 'ARHR': 0.17925597685907912, 'NOVELTY': 0.005403850988850792, 'AVERAGE_POPULARITY': 0.11054030237498727, 'DIVERSITY_MEAN_INTER_LIST': 0.9960471163053307, 'DIVERSITY_HERFINDAHL': 0.9995871570461781, 'COVERAGE_ITEM': 0.471106833493744, 'COVERAGE_ITEM_CORRECT': 0.06960538979788258, 'COVERAGE_USER': 0.7139801182836291, 'COVERAGE_USER_CORRECT': 0.2249905624764062, 'DIVERSITY_GINI': 0.19059621505776492, 'SHANNON_ENTROPY': 12.444020605526354}}
Recommender_Base_Class: Saving model in file 'MF_saved_2/MF_IALS_fold_number4'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 66 (0.24 %) cold users.
R

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=76.0), HTML(value='')))


EvaluatorHoldout: Processed 5589 ( 100.00% ) in 8.08 sec. Users per second: 691
{10: {'ROC_AUC': 0.20335057241122806, 'PRECISION': 0.045106459116119824, 'PRECISION_RECALL_MIN_DEN': 0.15566447131127836, 'RECALL': 0.15107955106839538, 'MAP': 0.07292047925907795, 'MRR': 0.15265132473935508, 'NDCG': 0.10975184249109277, 'F1': 0.0694714530066547, 'HIT_RATE': 0.4510645911612095, 'ARHR': 0.17800178638879705, 'NOVELTY': 0.005402740216560256, 'AVERAGE_POPULARITY': 0.11143623538472297, 'DIVERSITY_MEAN_INTER_LIST': 0.9960652078496044, 'DIVERSITY_HERFINDAHL': 0.9995886988989728, 'COVERAGE_ITEM': 0.47091434071222327, 'COVERAGE_ITEM_CORRECT': 0.06871992300288739, 'COVERAGE_USER': 0.7032842582106456, 'COVERAGE_USER_CORRECT': 0.22750723543475526, 'DIVERSITY_GINI': 0.1907656395104489, 'SHANNON_ENTROPY': 12.4446526278001}}
Recommender_Base_Class: Saving model in file 'MF_saved_2/MF_IALS_fold_number5'
Recommender_Base_Class: Saving complete
