# Data loading 
Next cells are used to load the data we need.

In [1]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt

sys.path.append('../../')

from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from MatrixFactorization.IALSRecommender_implicit import IALSRecommender_implicit

In [2]:
dataset = pd.read_csv('../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of data to check for missing users, items ecc...

In [3]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

test_users = pd.read_csv('../data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [4]:
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])



In [17]:
recommender = IALSRecommender_implicit(URM_train)
recommender.fit(iterations=10)

Recommender_Base_Class: URM Detected 67 (0.84 %) cold users.
Recommender_Base_Class: URM Detected 2395 (9.22 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [18]:
evaluator_validation.evaluateRecommender(recommender)

EvaluatorHoldout: Processed 5663 ( 100.00% ) in 9.25 sec. Users per second: 612


({10: {'ROC_AUC': 0.1703081966202226,
   'PRECISION': 0.035546530107715954,
   'PRECISION_RECALL_MIN_DEN': 0.12459890012865511,
   'RECALL': 0.12101877269695065,
   'MAP': 0.0578548216495249,
   'MRR': 0.12451929679989042,
   'NDCG': 0.08790249882653058,
   'F1': 0.05495211736201835,
   'HIT_RATE': 0.35546530107716756,
   'ARHR': 0.14281699082599697,
   'NOVELTY': 0.005091809888995,
   'AVERAGE_POPULARITY': 0.05483423376704649,
   'DIVERSITY_MEAN_INTER_LIST': 0.9966530091499145,
   'DIVERSITY_HERFINDAHL': 0.9996477015328769,
   'COVERAGE_ITEM': 0.40485081809432144,
   'COVERAGE_ITEM_CORRECT': 0.05235803657362849,
   'COVERAGE_USER': 0.712595948156537,
   'COVERAGE_USER_CORRECT': 0.1906379765949415,
   'DIVERSITY_GINI': 0.16417014692544676,
   'SHANNON_ENTROPY': 12.349282771948545}},
 'CUTOFF: 10 - ROC_AUC: 0.1703082, PRECISION: 0.0355465, PRECISION_RECALL_MIN_DEN: 0.1245989, RECALL: 0.1210188, MAP: 0.0578548, MRR: 0.1245193, NDCG: 0.0879025, F1: 0.0549521, HIT_RATE: 0.3554653, ARHR: 0.

In [10]:
user_id = test_users['user_id']
recommendations = []
for user in user_id:
    print(user)
    recommendations.append(recommender.recommend(user,at = 10))

3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145


In [11]:
for index in range(len(recommendations)):
    recommendations[index]=np.array(recommendations[index])
    
test_users['item_list']= recommendations
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])
test_users.to_csv('submission.csv', index=False)
