# Data loading 
Next cells are used to load the data we need.

In [1]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt

sys.path.append('../..')

from MF.ALS import AlternatingLeastSquare

In [2]:
#data which contains users (row), items(col) and implicit interaction (data)
dataset = pd.read_csv('data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


In [3]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

(7947, 25975)

In [4]:
items_per_user = np.ediff1d(URM_all.indptr) #tells in which position each of the rows begin
users_per_item = np.ediff1d(sps.csc_matrix(URM_all).indptr)
URM_all = sps.csr_matrix(URM_all)

In [5]:
items_per_user = np.sort(items_per_user) #sorting based on the single value. Losing reference to the user. 
users_per_item = np.sort(users_per_item)

In [6]:
ICM_df = pd.read_csv('data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [7]:
items = ICM_df.row
features = ICM_df.col
data = ICM_df.data
ICM_all = sps.coo_matrix((data, (items, features)))
ICM_all = ICM_all.tocsr() #fast row access -> fast access to users 
ICM_all.shape

(25975, 20000)

In [8]:
ICM_all = sps.csr_matrix(ICM_all)
features_per_item = np.ediff1d(ICM_all.indptr)

ICM_all = sps.csc_matrix(ICM_all)
items_per_feature = np.ediff1d(ICM_all.indptr)

ICM_all = sps.csr_matrix(ICM_all)

In [9]:
features_per_item = np.sort(features_per_item)
items_per_feature = np.sort(items_per_feature)

In [10]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])




In [11]:
from GraphBased.P3alphaRecommender import P3alphaRecommender
from MatrixFactorization.PureSVDRecommender import PureSVDRecommender
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender
from KNN.ItemKNNScoresHybridRecommender_Normalized import ItemKNNScoresHybridRecommender_Normalized
from MatrixFactorization.Cython.MatrixFactorization_Cython import MatrixFactorization_AsySVD_Cython

recommender_alpha = P3alphaRecommender(URM_train)
recommender_alpha.fit(topK=475, alpha = 0.45, implicit = True)

recommender_asy_svd = MatrixFactorization_AsySVD_Cython(URM_train)
recommender_asy_svd.fit()

P3alphaRecommender: URM Detected 44 (0.55 %) cold users.
P3alphaRecommender: URM Detected 1976 (7.61 %) cold items.
MatrixFactorization_AsySVD_Cython_Recommender: URM Detected 44 (0.55 %) cold users.
MatrixFactorization_AsySVD_Cython_Recommender: URM Detected 1976 (7.61 %) cold items.
ASY_SVD: Estimating user factors... 
ASY_SVD: Estimating user factors... done!
ASY_SVD: Processed 96278 ( 100.00% ) in 0.94 seconds. MSE loss 6.18E-03. Sample per second: 102125
ASY_SVD: Epoch 1 of 300. Elapsed time 0.35 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 1.31 seconds. MSE loss 9.66E-04. Sample per second: 73746
ASY_SVD: Epoch 2 of 300. Elapsed time 0.72 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.66 seconds. MSE loss 9.46E-04. Sample per second: 144919
ASY_SVD: Epoch 3 of 300. Elapsed time 1.08 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 1.01 seconds. MSE loss 9.17E-04. Sample per second: 94937
ASY_SVD: Epoch 4 of 300. Elapsed time 1.43 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.37 seconds.

ASY_SVD: Epoch 54 of 300. Elapsed time 20.04 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 1.01 seconds. MSE loss 5.13E-04. Sample per second: 95178
ASY_SVD: Epoch 55 of 300. Elapsed time 20.42 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.38 seconds. MSE loss 5.09E-04. Sample per second: 250490
ASY_SVD: Epoch 56 of 300. Elapsed time 20.80 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.73 seconds. MSE loss 5.05E-04. Sample per second: 132563
ASY_SVD: Epoch 57 of 300. Elapsed time 21.14 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 1.09 seconds. MSE loss 5.06E-04. Sample per second: 88272
ASY_SVD: Epoch 58 of 300. Elapsed time 21.50 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.50 seconds. MSE loss 5.01E-04. Sample per second: 194444
ASY_SVD: Epoch 59 of 300. Elapsed time 21.91 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.89 seconds. MSE loss 5.00E-04. Sample per second: 108556
ASY_SVD: Epoch 60 of 300. Elapsed time 22.30 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 1.28 seconds. MSE loss 4.

ASY_SVD: Epoch 110 of 300. Elapsed time 40.89 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.83 seconds. MSE loss 3.78E-04. Sample per second: 116526
ASY_SVD: Epoch 111 of 300. Elapsed time 41.24 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 1.19 seconds. MSE loss 3.80E-04. Sample per second: 80904
ASY_SVD: Epoch 112 of 300. Elapsed time 41.60 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.54 seconds. MSE loss 3.81E-04. Sample per second: 179681
ASY_SVD: Epoch 113 of 300. Elapsed time 41.95 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.89 seconds. MSE loss 3.77E-04. Sample per second: 107617
ASY_SVD: Epoch 114 of 300. Elapsed time 42.31 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 1.26 seconds. MSE loss 3.76E-04. Sample per second: 76691
ASY_SVD: Epoch 115 of 300. Elapsed time 42.67 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.62 seconds. MSE loss 3.76E-04. Sample per second: 156497
ASY_SVD: Epoch 116 of 300. Elapsed time 43.03 sec
ASY_SVD: Processed 96278 ( 100.00% ) in 0.97 seconds. MSE 

ASY_SVD: Processed 96278 ( 100.00% ) in 0.57 seconds. MSE loss 3.11E-04. Sample per second: 169729
ASY_SVD: Epoch 166 of 300. Elapsed time 1.03 min
ASY_SVD: Processed 96278 ( 100.00% ) in 1.03 seconds. MSE loss 3.13E-04. Sample per second: 93384
ASY_SVD: Epoch 167 of 300. Elapsed time 1.04 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.44 seconds. MSE loss 3.10E-04. Sample per second: 218437
ASY_SVD: Epoch 168 of 300. Elapsed time 1.05 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.84 seconds. MSE loss 3.12E-04. Sample per second: 114140
ASY_SVD: Epoch 169 of 300. Elapsed time 1.05 min
ASY_SVD: Processed 96278 ( 100.00% ) in 1.20 seconds. MSE loss 3.07E-04. Sample per second: 80011
ASY_SVD: Epoch 170 of 300. Elapsed time 1.06 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.58 seconds. MSE loss 3.08E-04. Sample per second: 164831
ASY_SVD: Epoch 171 of 300. Elapsed time 1.07 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.94 seconds. MSE loss 3.07E-04. Sample per second: 102219
ASY_SVD: Epoch 

ASY_SVD: Epoch 221 of 300. Elapsed time 1.39 min
ASY_SVD: Processed 96278 ( 100.00% ) in 1.30 seconds. MSE loss 2.65E-04. Sample per second: 73838
ASY_SVD: Epoch 222 of 300. Elapsed time 1.40 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.68 seconds. MSE loss 2.65E-04. Sample per second: 140614
ASY_SVD: Epoch 223 of 300. Elapsed time 1.40 min
ASY_SVD: Processed 96278 ( 100.00% ) in 1.08 seconds. MSE loss 2.64E-04. Sample per second: 89190
ASY_SVD: Epoch 224 of 300. Elapsed time 1.41 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.47 seconds. MSE loss 2.66E-04. Sample per second: 206052
ASY_SVD: Epoch 225 of 300. Elapsed time 1.41 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.84 seconds. MSE loss 2.64E-04. Sample per second: 114885
ASY_SVD: Epoch 226 of 300. Elapsed time 1.42 min
ASY_SVD: Processed 96278 ( 100.00% ) in 1.25 seconds. MSE loss 2.63E-04. Sample per second: 76912
ASY_SVD: Epoch 227 of 300. Elapsed time 1.43 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.65 seconds. MSE loss 2.6

ASY_SVD: Epoch 277 of 300. Elapsed time 1.76 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.81 seconds. MSE loss 2.29E-04. Sample per second: 118825
ASY_SVD: Epoch 278 of 300. Elapsed time 1.77 min
ASY_SVD: Processed 96278 ( 100.00% ) in 1.19 seconds. MSE loss 2.31E-04. Sample per second: 80702
ASY_SVD: Epoch 279 of 300. Elapsed time 1.78 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.56 seconds. MSE loss 2.33E-04. Sample per second: 171064
ASY_SVD: Epoch 280 of 300. Elapsed time 1.78 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.94 seconds. MSE loss 2.29E-04. Sample per second: 102033
ASY_SVD: Epoch 281 of 300. Elapsed time 1.79 min
ASY_SVD: Processed 96278 ( 100.00% ) in 1.31 seconds. MSE loss 2.26E-04. Sample per second: 73481
ASY_SVD: Epoch 282 of 300. Elapsed time 1.80 min
ASY_SVD: Processed 96278 ( 100.00% ) in 0.68 seconds. MSE loss 2.28E-04. Sample per second: 142122
ASY_SVD: Epoch 283 of 300. Elapsed time 1.80 min
ASY_SVD: Processed 96278 ( 100.00% ) in 1.05 seconds. MSE loss 2.

In [12]:
hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_train, recommender_alpha_ICM.W_sparse, recommender_alpha.W_sparse)
hybridrecommender.fit(topK=600, alpha = 0.45)

hybridrecommender_final = ItemKNNScoresHybridRecommender_Normalized(URM_train, hybridrecommender, recommender_SVD)
hybridrecommender_final.fit(alpha = 0.6)



NameError: name 'recommender_alpha_ICM' is not defined

In [None]:
evaluator_validation.evaluateRecommender(hybridrecommender_final)

In [None]:
test_users = pd.read_csv('data_target_users_test.csv')
test_users

In [None]:
recommender_alpha = P3alphaRecommender(URM_all)
recommender_alpha.fit(topK=475, alpha = 0.45, implicit = True)

recommender_alpha_ICM = P3alphaRecommender(ICM_all.T)
recommender_alpha_ICM.fit(topK=175, alpha = 0.45)
recommender_alpha_ICM.URM_train = URM_train

hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha_ICM.W_sparse, recommender_alpha.W_sparse)
hybridrecommender.fit(topK=600, alpha = 0.45)

hybridrecommender_final = ItemKNNScoresHybridRecommender_Normalized(URM_all, hybridrecommender, recommender_rp3)
hybridrecommender_final.fit(alpha = 0.45)

user_id = test_users['user_id']
recommendations = hybridrecommender.recommend(user_id,cutoff = 10)


In [None]:
for index in range(len(recommendations)):
    #print(element)
    recommendations[index]=np.array(recommendations[index])
    #print(type(element))
print(len(recommendations))

In [None]:
test_users['item_list']= recommendations
#test_users['item_list'] =  test_users['item_list'].apply(lambda x: x.replace('[','').replace(']','')) 
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])

#convert the string columns to int
#test_users['item_list'] = test_users['item_list'].astype(int)
test_users
test_users.to_csv('submission.csv', index=False)
