# Data loading 
The next cells load the data we need.

In [1]:
import os
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import sys
import matplotlib.pyplot as plt
sys.path.append('../../../')


In [2]:
# Training interactions, one record per line:
#   row  -> user index
#   col  -> item index
#   data -> implicit interaction value
train_csv_path = '../../data_train.csv'
dataset = pd.read_csv(train_csv_path)
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of the data to check for missing users, items, etc.

In [3]:
# Build the user-item interaction matrix (URM) from the triplets in `dataset`:
# rows are users, columns are items, values are the implicit interactions.
users = dataset['row']
items = dataset['col']
data = dataset['data']

# Assemble in COO form, then convert to CSR for fast row (i.e. per-user) access.
interaction_triplets = (data, (users, items))
URM_all = sps.coo_matrix(interaction_triplets).tocsr()
URM_all.shape

(7947, 25975)

In [4]:
# Item content data (title/abstract features), stored with the same
# row / col / data column layout as the interaction file.
icm_csv_path = '../../data_ICM_title_abstract.csv'
ICM_df = pd.read_csv(icm_csv_path)
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [5]:
# Build the item content matrix (ICM): rows are items, columns are features.
# NOTE: `items` and `data` are rebound here; from this cell on they refer to
# the ICM columns, no longer to the interaction columns.
items = ICM_df['row']
features = ICM_df['col']
data = ICM_df['data']

# Assemble in COO form, then convert to CSR for fast row (i.e. per-item) access.
content_triplets = (data, (items, features))
ICM_all = sps.coo_matrix(content_triplets).tocsr()
ICM_all.shape

(25975, 20000)

In [6]:
# Rank users by their number of interactions (fewest first) and cut the
# ranking into contiguous blocks, so models can be evaluated per activity level.

# Fraction of the ranked users that makes up one block: group 0 is the first
# 75% of the ranking, group 1 is whatever remains after it.
GROUP_FRACTION = 0.75


def _user_group_split(ranked_users, group_id, block_size):
    """Locate one block of users inside the ranking.

    Parameters
    ----------
    ranked_users : numpy array of user ids, already sorted.
    group_id : zero-based index of the block to extract.
    block_size : number of users per block; the last block may be shorter.

    Returns
    -------
    (start, end, users_in_group, users_not_in_group_flag) where `start`/`end`
    delimit the block, `users_in_group` is `ranked_users[start:end]` and
    `users_not_in_group_flag` is a boolean mask over `ranked_users` that is
    True for every user outside the block.
    """
    start = group_id * block_size
    # Clamp so the final block never runs past the end of the ranking.
    end = min((group_id + 1) * block_size, len(ranked_users))
    users_in_group = ranked_users[start:end]
    users_not_in_group_flag = np.isin(ranked_users, users_in_group, invert = True)
    return start, end, users_in_group, users_not_in_group_flag


# count() per user: the 'col' column then holds each user's interaction count.
grouped_users = dataset.groupby(['row']).count()
sorted_users = grouped_users.sort_values(by=['col'], ascending=True)
sorted_users = sorted_users.index.to_numpy()
block_size = int(len(sorted_users)*GROUP_FRACTION)

(start_pos_group_0,
 end_pos_group_0,
 users_in_group_0,
 users_not_in_group_0_flag) = _user_group_split(sorted_users, 0, block_size)
users_not_in_group_0 = sorted_users[users_not_in_group_0_flag]

(start_pos_group_1,
 end_pos_group_1,
 users_in_group_1,
 users_not_in_group_1_flag) = _user_group_split(sorted_users, 1, block_size)
users_not_in_group_1 = sorted_users[users_not_in_group_1_flag]

In [7]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Base.Evaluation.Evaluator import EvaluatorHoldout

# Five random hold-out splits of URM_all (80% train, the rest validation),
# one per seed. All splits are computed first, then the evaluators are built.
SPLIT_SEEDS = (1, 2, 3, 4, 5)

URM_splits = [
    split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8, seed = seed)
    for seed in SPLIT_SEEDS
]
URM_train_vector = [train_part for train_part, validation_part in URM_splits]
URM_validation_vector = [validation_part for train_part, validation_part in URM_splits]

# Keep the numbered names available for any cell that refers to a single fold.
URM_train_1, URM_train_2, URM_train_3, URM_train_4, URM_train_5 = URM_train_vector
(URM_validation_1, URM_validation_2, URM_validation_3,
 URM_validation_4, URM_validation_5) = URM_validation_vector

# One evaluator per validation split, at cutoff 10. Users outside group 0 are
# passed as ignore_users, so only group-0 users are scored.
Evaluator_vector = [
    EvaluatorHoldout(URM_validation_fold, cutoff_list=[10], ignore_users = users_not_in_group_0)
    for URM_validation_fold in URM_validation_vector
]
(evaluator_validation_1, evaluator_validation_2, evaluator_validation_3,
 evaluator_validation_4, evaluator_validation_5) = Evaluator_vector


EvaluatorHoldout: Ignoring 1987 Users
EvaluatorHoldout: Ignoring 1987 Users
EvaluatorHoldout: Ignoring 1987 Users
EvaluatorHoldout: Ignoring 1987 Users
EvaluatorHoldout: Ignoring 1987 Users


In [9]:
from GraphBased.RP3betaRecommender import RP3betaRecommender
# Fit RP3beta on every training fold, evaluate it with the matching evaluator
# and save the fitted model to disk, one file per fold.
RP3_HYPERPARAMETERS = dict(topK = 196, alpha = 0.35248383404238176, beta = 0.14410506792619115, implicit = False)
RP3_OUTPUT_FOLDER = 'RP3_saved_group_0/'

# Folds are numbered from 1 in the saved file names.
for fold_number, (URM_train, evaluator) in enumerate(zip(URM_train_vector, Evaluator_vector), start=1):

    # Stack the transposed ICM (features x items) below the URM (users x items).
    # URM_train comes first, so user rows keep their original indices.
    URM_train_ICM_all_RP3 = sps.vstack([URM_train, ICM_all.T])
    recommender = RP3betaRecommender(URM_train_ICM_all_RP3)
    recommender.fit(**RP3_HYPERPARAMETERS)

    # The second return value is not used; it is bound to a named throwaway so
    # that IPython's `_` (last output) is not overwritten.
    result_dict, _unused = evaluator.evaluateRecommender(recommender)
    print(result_dict)

    filename = 'RP3_fold_number' + str(fold_number)
    recommender.save_model(RP3_OUTPUT_FOLDER, file_name = filename)

RP3betaRecommender: URM Detected 73 (0.26 %) cold users.
RP3betaRecommender: URM Detected 1 (0.00 %) cold items.
EvaluatorHoldout: Processed 3585 ( 100.00% ) in 2.39 sec. Users per second: 1500
{10: {'ROC_AUC': 0.16956509707555759, 'PRECISION': 0.0273919107391908, 'PRECISION_RECALL_MIN_DEN': 0.17933984193398428, 'RECALL': 0.17933984193398428, 'MAP': 0.08708749714049566, 'MRR': 0.11961236191361699, 'NDCG': 0.11816657501242982, 'F1': 0.04752497745232643, 'HIT_RATE': 0.2739191073919107, 'ARHR': 0.1258364880122202, 'NOVELTY': 0.005422064263675152, 'AVERAGE_POPULARITY': 0.11677475592747563, 'DIVERSITY_MEAN_INTER_LIST': 0.9944368586869894, 'DIVERSITY_HERFINDAHL': 0.9994159470441889, 'COVERAGE_ITEM': 0.47237728585178057, 'COVERAGE_ITEM_CORRECT': 0.030259865255052934, 'COVERAGE_USER': 0.6015100671140939, 'COVERAGE_USER_CORRECT': 0.14865771812080536, 'DIVERSITY_GINI': 0.22806996824447756, 'SHANNON_ENTROPY': 12.535727666691354}}
RP3betaRecommender: Saving model in file 'RP3_saved_group_0/RP3_fol