# Data loading 
The following cells load the data we need.

In [3]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
# Add the directory two levels up to the module search path.
# NOTE(review): presumably this is what makes the project-local packages
# imported in later cells (Base, Data_manager, KNN, GraphBased) resolvable — confirm.
sys.path.append('../../')


In [4]:
# Training interactions in coordinate form: one record per (user, item) pair,
# with the user index in `row`, the item index in `col` and the implicit
# interaction value in `data`.
train_csv_path = '../data_train.csv'
dataset = pd.read_csv(train_csv_path)
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


In [5]:
# Summarise the id space on each axis: number of distinct ids plus the
# smallest and largest id that occur in the interactions.
user_ids = dataset['row']
item_ids = dataset['col']

unique_users = len(user_ids.unique())
min_users = user_ids.min()
max_users = user_ids.max()

unique_items = len(item_ids.unique())
min_items = item_ids.min()
max_items = item_ids.max()

print('users stats:',unique_users, min_users, max_users)
print('items stats:',unique_items, min_items, max_items)

users stats: 7947 0 7946
items stats: 24896 0 25974


In [6]:
# Split the interaction table into its three coordinate arrays.
users, items, data = dataset.row, dataset.col, dataset.data

# Assemble the User Rating Matrix (users on rows, items on columns) from the
# coordinates and keep it as CSR, the layout that makes row (user) access fast.
URM_all = sps.coo_matrix((data, (users, items))).tocsr()
URM_all.shape

(7947, 25975)

In [7]:
# `indptr` marks where each row starts in the CSR storage, so the difference
# between consecutive entries is the number of stored interactions of each user.
items_per_user = np.ediff1d(URM_all.indptr)

# Same computation on the column-oriented (CSC) layout: stored interactions per item.
URM_csc = sps.csc_matrix(URM_all)
users_per_item = np.ediff1d(URM_csc.indptr)

# Keep working with the row-oriented layout.
URM_all = sps.csr_matrix(URM_all)

In [8]:
# Sort both count vectors in ascending order. After sorting, a position no
# longer identifies the user (or item) the count belonged to.
items_per_user, users_per_item = (
    np.sort(counts) for counts in (items_per_user, users_per_item)
)

In [11]:
# Item content table in coordinate form (columns: row, col, data).
icm_csv_path = '../data_ICM_title_abstract.csv'
ICM_df = pd.read_csv(icm_csv_path)
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [12]:
# Coordinate arrays of the content table: `row` is used as the item index,
# `col` as the feature index and `data` as the stored value.
items, features, data = ICM_df.row, ICM_df.col, ICM_df.data

# Item Content Matrix (items on rows, features on columns), kept as CSR so that
# row access — here, access to an item — is fast.
ICM_all = sps.coo_matrix((data, (items, features))).tocsr()
ICM_all.shape

(25975, 20000)

In [13]:
# Row-oriented layout: consecutive differences of `indptr` give the number of
# stored features of each item.
ICM_all = ICM_all.tocsr()
features_per_item = np.ediff1d(ICM_all.indptr)

# Column-oriented layout: the same differences now give the number of items
# that store each feature.
ICM_all = ICM_all.tocsc()
items_per_feature = np.ediff1d(ICM_all.indptr)

# Restore the CSR layout.
ICM_all = ICM_all.tocsr()

In [14]:
# Sort both count vectors in ascending order (the link between a position and
# its item/feature is lost).
features_per_item, items_per_feature = (
    np.sort(counts) for counts in (features_per_item, items_per_feature)
)

In [15]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Hold-out configuration: share of interactions requested for training and the
# recommendation-list lengths the evaluator is asked to score.
TRAIN_PERCENTAGE = 0.85
EVALUATION_CUTOFFS = [10]

# NOTE(review): the module name suggests a random hold-out; no seed is set in
# this notebook, so the metrics below are presumably not reproducible run to
# run — confirm how the splitter draws its sample.
URM_train, URM_validation = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage=TRAIN_PERCENTAGE,
)
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=EVALUATION_CUTOFFS)




In [26]:
# Fit every component on URM_train, stack them into a two-level hybrid and
# score the result on the validation split.
# NOTE(review): ItemKNNCBFRecommender is imported but not used in this cell;
# RP3betaRecommender_user is only used by the later submission cell.
from GraphBased.P3alphaRecommender import P3alphaRecommender
from GraphBased.RP3betaRecommender import RP3betaRecommender
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender
from KNN.UserKNNCFRecommender import UserKNNCFRecommender
from KNN.ItemKNNScoresHybridRecommender import ItemKNNScoresHybridRecommender
from GraphBased.RP3betaRecommender_user import RP3betaRecommender_user

# User-based collaborative KNN on the training interactions.
recommender_userKNN = UserKNNCFRecommender(URM_train)
recommender_userKNN.fit(topK= 75, shrink = 200)

# Disabled alternative to the user KNN above: a user-side RP3beta fitted on the
# transposed URM. The submission cell further down uses this model instead.
#recommender_rp3Beta_user = RP3betaRecommender_user(URM_train.T)
#recommender_rp3Beta_user.fit(topK=100, alpha = 0.4, beta=0.4, implicit = True)
#recommender_rp3Beta_user.URM_train = URM_train

# Collaborative P3alpha model on the training interactions.
recommender_alpha = P3alphaRecommender(URM_train)
recommender_alpha.fit(topK=350, alpha = 0.45, implicit = True)

# RP3beta fitted on the transposed ICM instead of a URM.
# NOTE(review): presumably this yields an item-item similarity driven by
# content features — confirm against RP3betaRecommender.
recommender_beta_ICM = RP3betaRecommender(ICM_all.T)
recommender_beta_ICM.fit(topK=70, alpha=0.35, beta=0.4, implicit=False)
# Swap in the real interactions after fitting, since the model was built on ICM_all.T.
recommender_beta_ICM.URM_train = URM_train

# First level: combine the two W_sparse similarity matrices (P3alpha and the
# ICM-based RP3beta). NOTE(review): how `alpha` weights the two — confirm in
# ItemKNNSimilarityHybridRecommender.
hybridrecommender_p3_rp3 = ItemKNNSimilarityHybridRecommender(URM_train, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender_p3_rp3.fit(topK=450, alpha = 0.1)

# Second level: combine the first-level hybrid with the user KNN.
# NOTE(review): presumably a weighted sum of the two models' scores — confirm.
hybridrecommender = ItemKNNScoresHybridRecommender(URM_train, hybridrecommender_p3_rp3, recommender_userKNN)
hybridrecommender.fit(alpha = 0.95)

# Last expression of the cell: the evaluation result is displayed as cell output.
evaluator_validation.evaluateRecommender(hybridrecommender)

UserKNNCFRecommender: URM Detected 39 (0.49 %) cold users.
UserKNNCFRecommender: URM Detected 2007 (7.73 %) cold items.
Similarity column 7947 ( 100 % ), 10658.96 column/sec, elapsed time 0.01 min
P3alphaRecommender: URM Detected 39 (0.49 %) cold users.
P3alphaRecommender: URM Detected 2007 (7.73 %) cold items.
RP3betaRecommender: URM Detected 2 (0.01 %) cold users.
RP3betaRecommender: URM Detected 25 (0.10 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 39 (0.49 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2007 (7.73 %) cold items.
ItemKNNScoresHybridRecommender: URM Detected 39 (0.49 %) cold users.
ItemKNNScoresHybridRecommender: URM Detected 2007 (7.73 %) cold items.
shape of item_weights: (1000, 25975)
type of item_weights: <class 'numpy.ndarray'>
shape of item_weights: (1000, 25975)
type of item_weights: <class 'numpy.ndarray'>
shape of item_weights: (1000, 25975)
type of item_weights: <class 'numpy.ndarray'>
shape of item_weights: (1000, 25975)


({10: {'ROC_AUC': 0.19988667819793696,
   'PRECISION': 0.042163355408387546,
   'PRECISION_RECALL_MIN_DEN': 0.17226426994638952,
   'RECALL': 0.16886396712111626,
   'MAP': 0.07845773984406636,
   'MRR': 0.14705419059723726,
   'NDCG': 0.116143805673673,
   'F1': 0.06747819548724514,
   'HIT_RATE': 0.4216335540838852,
   'ARHR': 0.16893111181190687,
   'NOVELTY': 0.0049240298054829025,
   'AVERAGE_POPULARITY': 0.09172460174115801,
   'DIVERSITY_MEAN_INTER_LIST': 0.9885985050899271,
   'DIVERSITY_HERFINDAHL': 0.9988400110848489,
   'COVERAGE_ITEM': 0.489162656400385,
   'COVERAGE_ITEM_CORRECT': 0.05066410009624639,
   'COVERAGE_USER': 0.627029067572669,
   'COVERAGE_USER_CORRECT': 0.19818799546998866,
   'DIVERSITY_GINI': 0.18511689667876635,
   'SHANNON_ENTROPY': 11.959519989454767}},
 'CUTOFF: 10 - ROC_AUC: 0.1998867, PRECISION: 0.0421634, PRECISION_RECALL_MIN_DEN: 0.1722643, RECALL: 0.1688640, MAP: 0.0784577, MRR: 0.1470542, NDCG: 0.1161438, F1: 0.0674782, HIT_RATE: 0.4216336, ARHR: 

In [61]:
# Users for which recommendations have to be produced.
# NOTE(review): besides `user_id` the file carries a leftover index column
# ('Unnamed: 0'), and it is read from the notebook directory while the other
# CSVs are read from '../' — confirm both are intended.
target_users_csv_path = 'data_target_users_test.csv'
test_users = pd.read_csv(target_users_csv_path)
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [62]:
# Refit the pipeline on the full interaction matrix (URM_all) and produce the
# recommendations for the target users. Relies on the recommender classes
# imported in the validation cell and on `test_users` from the previous cell.
# NOTE(review): this is not the configuration scored in the validation cell —
# the second-level partner is RP3betaRecommender_user rather than
# UserKNNCFRecommender, P3alpha uses topK=270 (350 there) and the similarity
# hybrid uses topK=300 (450 there). Confirm these settings were validated.

# User-side RP3beta fitted on the transposed URM; the real URM is restored
# afterwards.
recommender_rp3Beta_user = RP3betaRecommender_user(URM_all.T)
recommender_rp3Beta_user.fit(topK=100, alpha = 0.4, beta=0.4, implicit = True)
recommender_rp3Beta_user.URM_train = URM_all

# Collaborative P3alpha model on all interactions.
recommender_alpha = P3alphaRecommender(URM_all)
recommender_alpha.fit(topK=270, alpha = 0.45, implicit = True)

# RP3beta fitted on the transposed ICM, then pointed at the real interactions.
recommender_beta_ICM = RP3betaRecommender(ICM_all.T)
recommender_beta_ICM.fit(topK=70, alpha=0.35, beta=0.4, implicit=False)
recommender_beta_ICM.URM_train = URM_all

# First level: combine the two W_sparse similarity matrices.
hybridrecommender_p3_rp3 = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender_p3_rp3.fit(topK=300, alpha = 0.1)

# Second level: combine the first-level hybrid with the user-side RP3beta.
hybridrecommender = ItemKNNScoresHybridRecommender(URM_all, hybridrecommender_p3_rp3, recommender_rp3Beta_user)
hybridrecommender.fit(alpha = 0.95)

# Ask for 10 items per target user; the whole user_id column is passed in one call.
user_id = test_users['user_id']
recommendations = hybridrecommender.recommend(user_id,cutoff = 10)


RP3betaRecommender_user: URM Detected 1079 (4.15 %) cold users.
P3alphaRecommender: URM Detected 1079 (4.15 %) cold items.
RP3betaRecommender: URM Detected 2 (0.01 %) cold users.
RP3betaRecommender: URM Detected 25 (0.10 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 1079 (4.15 %) cold items.
ItemKNNScoresHybridRecommender: URM Detected 1079 (4.15 %) cold items.
shape of item_weights: (7944, 25975)


In [63]:
# Replace every per-user recommendation list, in place, with a numpy array.
for position, ranked_items in enumerate(recommendations):
    recommendations[position] = np.array(ranked_items)

# Sanity check: one entry per target user is expected.
print(len(recommendations))

7944


In [64]:
# Serialise each user's recommendations as item ids separated by a single space.
# The ids are joined explicitly instead of going through str() of a numpy
# array: numpy's array printing pads numbers to a common width (and can wrap
# long arrays), which leaves irregular runs of spaces in the output.
test_users['item_list'] = [
    ' '.join(str(item_id) for item_id in ranked_items)
    for ranked_items in recommendations
]

# Write only the two submission columns. The frame read from the target-users
# CSV also contains an 'Unnamed: 0' index column, which would otherwise be
# written as an extra first column.
# NOTE(review): assumes the expected layout is `user_id,item_list` — confirm.
test_users.to_csv('submission.csv', columns=['user_id', 'item_list'], index=False)

# Show the frame as the cell output (last expression) for a visual check.
test_users
