# Data loading 
The next cells load the data we need.

In [1]:
# Cap the thread pools of the numeric backends (one variable per backend) at a
# single thread. The variables are assigned before numpy/scipy are imported
# below. NOTE(review): presumably the native libraries read them only when they
# are loaded, so keep these assignments ahead of the imports -- confirm.
import os
os.environ["OMP_NUM_THREADS"] = "1" # export OMP_NUM_THREADS=1
os.environ["OPENBLAS_NUM_THREADS"] = "1" # export OPENBLAS_NUM_THREADS=1
os.environ["MKL_NUM_THREADS"] = "1" # export MKL_NUM_THREADS=1
os.environ["VECLIB_MAXIMUM_THREADS"] = "1" # export VECLIB_MAXIMUM_THREADS=1
os.environ["NUMEXPR_NUM_THREADS"] = "1" # export NUMEXPR_NUM_THREADS=1 
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import sys
import matplotlib.pyplot as plt
# Add the directory three levels above the working directory to the module
# search path. NOTE(review): presumably this is where the project packages
# imported by later cells (Data_manager, Base, MatrixFactorization) live -- confirm.
sys.path.append('../../../')


In [2]:
# Training interactions in triplet form, one record per interaction:
#   row  -> user index
#   col  -> item index
#   data -> implicit interaction value
dataset = pd.read_csv(
    filepath_or_buffer='../../data_train.csv',
)
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-process the data and check for missing users, items, etc.

In [3]:
# Build the user-item interaction matrix (URM) from the triplets:
# rows are users, columns are items, stored values are the interactions.
users, items, data = dataset['row'], dataset['col'], dataset['data']
# Assemble in COO form, then convert to CSR for fast row (i.e. per-user) access.
URM_all = sps.coo_matrix((data, (users, items))).tocsr()
URM_all.shape

(7947, 25975)

In [4]:
# Item content data in triplet form (row, col, data); the next cell turns it
# into a sparse matrix with items on the rows and features on the columns.
ICM_df = pd.read_csv(
    filepath_or_buffer='../../data_ICM_title_abstract.csv',
)
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [5]:
# Build the item-feature matrix (ICM) from the triplets:
# rows are items, columns are features, stored values are the feature weights.
# Note: this rebinds `items` and `data`, which the URM cell also assigns.
items, features, data = ICM_df['row'], ICM_df['col'], ICM_df['data']
# Assemble in COO form, then convert to CSR for fast row (i.e. per-item) access.
ICM_all = sps.coo_matrix((data, (items, features))).tocsr()
ICM_all.shape

(25975, 20000)

In [6]:
# Rank the users by number of interactions, fewest first.
grouped_users = dataset.groupby(['row']).count()
sorted_users = grouped_users.sort_values(by=['col'], ascending=True).index.to_numpy()

# Each group spans 75% of the ranked users: group 0 is the first 75% of the
# ranking, group 1 is whatever remains after it (the slice is clipped at the end).
block_size = int(len(sorted_users)*0.75)


def _slice_user_group(ranked_users, size, group_id):
    """Return (start, end, members, outsider_flag, outsiders) for one group.

    `members` is the slice ranked_users[start:end]; `outsider_flag` marks the
    entries of `ranked_users` that are not members; `outsiders` are those entries.
    """
    start = group_id*size
    end = min((group_id+1)*size, len(ranked_users))
    members = ranked_users[start:end]
    outsider_flag = np.isin(ranked_users, members, invert = True)
    return start, end, members, outsider_flag, ranked_users[outsider_flag]


(start_pos_group_0,
 end_pos_group_0,
 users_in_group_0,
 users_not_in_group_0_flag,
 users_not_in_group_0) = _slice_user_group(sorted_users, block_size, 0)

(start_pos_group_1,
 end_pos_group_1,
 users_in_group_1,
 users_not_in_group_1_flag,
 users_not_in_group_1) = _slice_user_group(sorted_users, block_size, 1)

In [7]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Base.Evaluation.Evaluator import EvaluatorHoldout

# Five train/validation splits of the full URM (train_percentage 0.8),
# one per seed 1..5, so each fold can be reproduced from its seed.
fold_splits = [
    split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8, seed = seed)
    for seed in range(1, 6)
]

# Keep the per-fold names available for any cell that refers to them directly.
((URM_train_1, URM_validation_1),
 (URM_train_2, URM_validation_2),
 (URM_train_3, URM_validation_3),
 (URM_train_4, URM_validation_4),
 (URM_train_5, URM_validation_5)) = fold_splits

URM_train_vector = [URM_train_1, URM_train_2, URM_train_3, URM_train_4, URM_train_5]

# One evaluator per fold, cutoff 10. Every user outside group 0 is passed as
# `ignore_users`, so only group-0 users are evaluated.
Evaluator_vector = [
    EvaluatorHoldout(URM_validation, cutoff_list=[10], ignore_users = users_not_in_group_0)
    for URM_validation in (URM_validation_1, URM_validation_2, URM_validation_3, URM_validation_4, URM_validation_5)
]

(evaluator_validation_1,
 evaluator_validation_2,
 evaluator_validation_3,
 evaluator_validation_4,
 evaluator_validation_5) = Evaluator_vector


EvaluatorHoldout: Ignoring 1987 Users
EvaluatorHoldout: Ignoring 1987 Users
EvaluatorHoldout: Ignoring 1987 Users
EvaluatorHoldout: Ignoring 1987 Users
EvaluatorHoldout: Ignoring 1987 Users


In [9]:
from MatrixFactorization.IALSRecommender_implicit import IALSRecommender_implicit

# Weight applied to the (transposed) ICM rows when they are stacked under the
# URM; the interaction rows receive the complementary weight 1 - ICM_WEIGHT.
ICM_WEIGHT = 0.5299554043024671

# Fit, evaluate and save one IALS model per fold. Folds are numbered from 1 and
# the number is used only to build the saved model's file name.
for fold_number, (URM_train, evaluator) in enumerate(zip(URM_train_vector, Evaluator_vector), start=1):

    # Stack the weighted interactions (users x items) on top of the weighted,
    # transposed ICM (features x items): feature rows are appended below the user rows.
    URM_train_ICM_all_IALS = sps.vstack([URM_train*(1-ICM_WEIGHT), ICM_all.T*ICM_WEIGHT])
    recommender = IALSRecommender_implicit(URM_train_ICM_all_IALS)
    recommender.fit(n_factors = 864, regularization = 0.7981462652421099, iterations=100)

    # NOTE(review): the original cell carried a disabled
    # `recommender.URM_train = URM_train.tocsr()` override at this point, so the
    # recommender is evaluated exactly as constructed on the stacked matrix --
    # confirm that this is intended.
    result_dict, _ = evaluator.evaluateRecommender(recommender)
    print(result_dict)

    filename = 'MF_IALS_fold_number' + str(fold_number)
    recommender.save_model('MF_saved_group_0/', file_name = filename)

Recommender_Base_Class: URM Detected 73 (0.26 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


EvaluatorHoldout: Processed 3585 ( 100.00% ) in 5.52 sec. Users per second: 650
{10: {'ROC_AUC': 0.1694648447012468, 'PRECISION': 0.027921896792189416, 'PRECISION_RECALL_MIN_DEN': 0.1820502092050209, 'RECALL': 0.1820502092050209, 'MAP': 0.08737535328713843, 'MRR': 0.1210423723185229, 'NDCG': 0.11926240185314642, 'F1': 0.048417737473058636, 'HIT_RATE': 0.27921896792189677, 'ARHR': 0.12774623098890883, 'NOVELTY': 0.005419615918464619, 'AVERAGE_POPULARITY': 0.10854450912619014, 'DIVERSITY_MEAN_INTER_LIST': 0.9961104521568042, 'DIVERSITY_HERFINDAHL': 0.9995832597079494, 'COVERAGE_ITEM': 0.40311838306063524, 'COVERAGE_ITEM_CORRECT': 0.0312223291626564, 'COVERAGE_USER': 0.6015100671140939, 'COVERAGE_USER_CORRECT': 0.14966442953020134, 'DIVERSITY_GINI': 0.18395726513348445, 'SHANNON_ENTROPY': 12.392011183127021}}
Recommender_Base_Class: Saving model in file 'MF_saved_group_0/MF_IALS_fold_number1'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 59 (0.21 %) cold us

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


EvaluatorHoldout: Processed 3680 ( 100.00% ) in 5.60 sec. Users per second: 657
{10: {'ROC_AUC': 0.1544982531055903, 'PRECISION': 0.026358695652173712, 'PRECISION_RECALL_MIN_DEN': 0.16716744306418221, 'RECALL': 0.16716744306418221, 'MAP': 0.0766791123907292, 'MRR': 0.1057733781918564, 'NDCG': 0.10683748672675827, 'F1': 0.04553716396046148, 'HIT_RATE': 0.26358695652173914, 'ARHR': 0.1125736499309868, 'NOVELTY': 0.005424219033056702, 'AVERAGE_POPULARITY': 0.10721888291587885, 'DIVERSITY_MEAN_INTER_LIST': 0.996398758523701, 'DIVERSITY_HERFINDAHL': 0.9996127997991493, 'COVERAGE_ITEM': 0.4068912415784408, 'COVERAGE_ITEM_CORRECT': 0.029412897016361888, 'COVERAGE_USER': 0.6174496644295302, 'COVERAGE_USER_CORRECT': 0.14395973154362415, 'DIVERSITY_GINI': 0.1872725450583147, 'SHANNON_ENTROPY': 12.437751585720031}}
Recommender_Base_Class: Saving model in file 'MF_saved_group_0/MF_IALS_fold_number2'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 73 (0.26 %) cold user

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


EvaluatorHoldout: Processed 3697 ( 100.00% ) in 5.66 sec. Users per second: 654
{10: {'ROC_AUC': 0.15318565889975166, 'PRECISION': 0.026913713822017574, 'PRECISION_RECALL_MIN_DEN': 0.16971482669345794, 'RECALL': 0.16971482669345794, 'MAP': 0.07830645074728115, 'MRR': 0.10826592561107004, 'NDCG': 0.10877258648397085, 'F1': 0.04645974856962885, 'HIT_RATE': 0.26913713822017854, 'ARHR': 0.11562828719983148, 'NOVELTY': 0.005423224660252447, 'AVERAGE_POPULARITY': 0.10765346252326498, 'DIVERSITY_MEAN_INTER_LIST': 0.996229041448138, 'DIVERSITY_HERFINDAHL': 0.9995959571867005, 'COVERAGE_ITEM': 0.4117805582290664, 'COVERAGE_ITEM_CORRECT': 0.03068334937439846, 'COVERAGE_USER': 0.6203020134228188, 'COVERAGE_USER_CORRECT': 0.14597315436241612, 'DIVERSITY_GINI': 0.18768302302560308, 'SHANNON_ENTROPY': 12.422974410020533}}
Recommender_Base_Class: Saving model in file 'MF_saved_group_0/MF_IALS_fold_number3'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 76 (0.27 %) cold 

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


EvaluatorHoldout: Processed 3717 ( 100.00% ) in 5.68 sec. Users per second: 655
{10: {'ROC_AUC': 0.14671490064952553, 'PRECISION': 0.02520850147968774, 'PRECISION_RECALL_MIN_DEN': 0.16106947487092768, 'RECALL': 0.16106947487092768, 'MAP': 0.07476045639013085, 'MRR': 0.10420995768049834, 'NDCG': 0.10362772063675638, 'F1': 0.04359420448044709, 'HIT_RATE': 0.2520850147968792, 'ARHR': 0.1112284399007562, 'NOVELTY': 0.005419405546082996, 'AVERAGE_POPULARITY': 0.10838681296308361, 'DIVERSITY_MEAN_INTER_LIST': 0.9960528430598308, 'DIVERSITY_HERFINDAHL': 0.9995784870812572, 'COVERAGE_ITEM': 0.4089701636188643, 'COVERAGE_ITEM_CORRECT': 0.02875842155919153, 'COVERAGE_USER': 0.6236577181208054, 'COVERAGE_USER_CORRECT': 0.13691275167785236, 'DIVERSITY_GINI': 0.18479429367511194, 'SHANNON_ENTROPY': 12.394606071258693}}
Recommender_Base_Class: Saving model in file 'MF_saved_group_0/MF_IALS_fold_number4'
Recommender_Base_Class: Saving complete
Recommender_Base_Class: URM Detected 66 (0.24 %) cold us

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


EvaluatorHoldout: Processed 3639 ( 100.00% ) in 5.54 sec. Users per second: 657
{10: {'ROC_AUC': 0.14888640259621333, 'PRECISION': 0.025611431712008605, 'PRECISION_RECALL_MIN_DEN': 0.15801960245488691, 'RECALL': 0.15801960245488691, 'MAP': 0.07373175628225086, 'MRR': 0.10457608709875814, 'NDCG': 0.10264408447751307, 'F1': 0.044078695910995246, 'HIT_RATE': 0.2561143171200879, 'ARHR': 0.11125069245431982, 'NOVELTY': 0.005416121937301518, 'AVERAGE_POPULARITY': 0.10962706249925296, 'DIVERSITY_MEAN_INTER_LIST': 0.9960223834970883, 'DIVERSITY_HERFINDAHL': 0.9995748675779722, 'COVERAGE_ITEM': 0.4028873917228104, 'COVERAGE_ITEM_CORRECT': 0.029220404234841194, 'COVERAGE_USER': 0.6105704697986577, 'COVERAGE_USER_CORRECT': 0.13758389261744966, 'DIVERSITY_GINI': 0.18197912284305304, 'SHANNON_ENTROPY': 12.371159292233639}}
Recommender_Base_Class: Saving model in file 'MF_saved_group_0/MF_IALS_fold_number5'
Recommender_Base_Class: Saving complete
