# Data loading 
Next cells are used to load the data we need.

In [1]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt

sys.path.append('../..')

from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from GraphBased.P3alphaRecommender import P3alphaRecommender
from GraphBased.RP3betaRecommender import RP3betaRecommender
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender
from KNN.UserKNNCFRecommender import UserKNNCFRecommender
from KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from Base.NonPersonalizedRecommender import TopPop
from MatrixFactorization.PureSVDRecommender import PureSVDRecommender
from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
from GraphBased.P3alphaRecommender import P3alphaRecommender
from GraphBased.RP3betaRecommender import RP3betaRecommender
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender
from KNN.UserKNNCFRecommender import UserKNNCFRecommender
from KNN.ItemKNNScoresHybridMultipleRecommender import ItemKNNScoresHybridMultipleRecommender
from GraphBased.RP3betaRecommender_user import RP3betaRecommender_user
from MatrixFactorization.PureSVDRecommender import PureSVDRecommender
from MatrixFactorization.IALSRecommender import IALSRecommender



In [2]:
#data which contains users (row), items(col) and implicit interaction (data)
dataset = pd.read_csv('../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


In [3]:
unique_users, min_users, max_users = len(dataset.row.unique()), dataset.row.min(), dataset.row.max()
unique_items, min_items, max_items = len(dataset.col.unique()), dataset.col.min(), dataset.col.max()
print('users stats:',unique_users, min_users, max_users)
print('items stats:',unique_items, min_items, max_items)

users stats: 7947 0 7946
items stats: 24896 0 25974


In [4]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

(7947, 25975)

In [5]:
items_per_user = np.ediff1d(URM_all.indptr) #tells in which position each of the rows begin
users_per_item = np.ediff1d(sps.csc_matrix(URM_all).indptr)
URM_all = sps.csr_matrix(URM_all)

In [6]:
items_per_user = np.sort(items_per_user) #sorting based on the single value. Losing reference to the user. 
users_per_item = np.sort(users_per_item)

In [7]:
ICM_df = pd.read_csv('../data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [8]:
items = ICM_df.row
features = ICM_df.col
data = ICM_df.data
ICM_all = sps.coo_matrix((data, (items, features)))
ICM_all = ICM_all.tocsr() #fast row access -> fast access to users 
ICM_all.shape

(25975, 20000)

In [9]:
ICM_all = sps.csr_matrix(ICM_all)
features_per_item = np.ediff1d(ICM_all.indptr)

ICM_all = sps.csc_matrix(ICM_all)
items_per_feature = np.ediff1d(ICM_all.indptr)

ICM_all = sps.csr_matrix(ICM_all)

In [10]:
features_per_item = np.sort(features_per_item)
items_per_feature = np.sort(items_per_feature)

In [11]:
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)



In [12]:
grouped_users = dataset.groupby(['row']).count()

# All users present into the dataser
sorted_users = grouped_users.sort_values(by=['col'], ascending=True)
sorted_users = sorted_users.index.to_numpy()
print(len(sorted_users))

'''block_size = int(len(sorted_users)*0.20)

start_pos = 0
end_pos = block_size

users_in_group = sorted_users[start_pos:end_pos]'''


# User for the firt target of users 0-1 interaction
grouped_users = grouped_users[grouped_users['col'] < 2]
users_in_group = grouped_users.sort_values(by=['col'], ascending=True)
users_in_group = users_in_group.index.to_numpy()

user_first_group = users_in_group


7947


In [13]:
test_users = pd.read_csv('../data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [14]:
user_id = test_users['user_id'].to_numpy()
user_first_group = user_first_group[np.isin(user_first_group, user_id)]
print(len(user_id))

# 1085 perche sono stati rimossi i 3 utenti di cui non voglio fare reccomend
print(len(user_first_group)) 

users_middle_flag = np.isin(user_id, user_first_group, invert = True)
user_last_group = user_id[users_middle_flag]

print(len(user_first_group))
print(len(user_last_group))

7944
90
90
7854


In [15]:
# first 

recommender_alpha = P3alphaRecommender(URM_all)
recommender_alpha.fit(topK=170, alpha = 0.45, implicit = True)

recommender_beta_ICM = RP3betaRecommender(ICM_all.T)
recommender_beta_ICM.fit(topK=70, alpha=0.2, beta=0.5, implicit=False)
recommender_beta_ICM.URM_train = URM_all

evaluator_validation_earlystopping = EvaluatorHoldout(URM_validation, cutoff_list=[10])

earlystopping_keywargs = {"validation_every_n": 5,
                            "stop_on_validation": True,
                            "evaluator_object": evaluator_validation_earlystopping,
                            "lower_validations_allowed": 5,
                            "validation_metric": "MAP" ,
                            }

'''recommender_als = IALSRecommender(URM_train)
recommender_als.fit(epochs = 135,
        num_factors = 153,
        confidence_scaling = "log",
        alpha = 30.61917675327107,
        epsilon = 0.001001609694823079,
        reg = 1.3918031693071822e-05,
        validation_every_n = 5,
        stop_on_validation = True,
        evaluator_object = evaluator_validation_earlystopping,
        lower_validations_allowed = 5,
        validation_metric = "MAP" )'''

hybridrecommender_first_group = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender_first_group.fit(topK=200, alpha = 0.1)


# second 

recommender_userKNN_top_model = UserKNNCFRecommender(URM_all)
recommender_userKNN_top_model.fit(topK= 75, shrink = 200)

recommender_SVD_top_model = PureSVDRecommender(URM_all)
recommender_SVD_top_model.fit(num_factors = 1200)

recommender_alpha_top_model = P3alphaRecommender(URM_all)
recommender_alpha_top_model.fit(topK=350, alpha = 0.45, implicit = True)

recommender_beta_ICM_top_model = RP3betaRecommender(ICM_all.T)
recommender_beta_ICM_top_model.fit(topK=70, alpha=0.35, beta=0.4, implicit=False)
recommender_beta_ICM_top_model.URM_train = URM_all



P3alphaRecommender: URM Detected 1079 (4.15 %) cold items.
RP3betaRecommender: URM Detected 2 (0.01 %) cold users.
RP3betaRecommender: URM Detected 25 (0.10 %) cold items.
IALSRecommender: URM Detected 39 (0.49 %) cold users.
IALSRecommender: URM Detected 1983 (7.63 %) cold items.
IALSRecommender: Epoch 1 of 135. Elapsed time 15.54 sec
IALSRecommender: Epoch 2 of 135. Elapsed time 31.13 sec
IALSRecommender: Epoch 3 of 135. Elapsed time 47.13 sec
IALSRecommender: Epoch 4 of 135. Elapsed time 1.06 min
IALSRecommender: Validation begins...
EvaluatorHoldout: Processed 4935 ( 100.00% ) in 4.81 sec. Users per second: 1027
IALSRecommender: CUTOFF: 10 - ROC_AUC: 0.1407692, PRECISION: 0.0278825, PRECISION_RECALL_MIN_DEN: 0.1135411, RECALL: 0.1112441, MAP: 0.0504508, MRR: 0.1000286, NDCG: 0.0760842, F1: 0.0445890, HIT_RATE: 0.2788247, ARHR: 0.1118416, NOVELTY: 0.0049285, AVERAGE_POPULARITY: 0.0638079, DIVERSITY_MEAN_INTER_LIST: 0.9959667, DIVERSITY_HERFINDAHL: 0.9995765, COVERAGE_ITEM: 0.4611742

In [16]:
from KNN.ItemKNNScoresHybridRecommender import ItemKNNScoresHybridRecommender
from KNN.ItemKNNScoresHybridMultipleRecommender import ItemKNNScoresHybridMultipleRecommender

hybridrecommender_p3_rp3_top_model = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha_top_model.W_sparse, recommender_beta_ICM_top_model.W_sparse)
hybridrecommender_p3_rp3_top_model.fit(topK=450, alpha = 0.1)

hybridrecommender_top_model = ItemKNNScoresHybridMultipleRecommender(URM_all, hybridrecommender_p3_rp3_top_model, recommender_userKNN_top_model, recommender_SVD_top_model)
hybridrecommender_top_model.fit(alpha = 0.7, beta=0.25)

# make reccomendations for group

recommendations_first = hybridrecommender_first_group.recommend(user_first_group,cutoff = 10)
recommendations_last = hybridrecommender_top_model.recommend(user_last_group,cutoff = 10)

ItemKNNSimilarityHybridRecommender: URM Detected 1079 (4.15 %) cold items.
ItemKNNScoresHybridMultipleRecommender: URM Detected 1079 (4.15 %) cold items.
shape of item_weights: (7854, 25975)


In [17]:
for index in range(len(recommendations_first)):
    recommendations_first[index]=np.array(recommendations_first[index])

print(len(recommendations_first))

for index in range(len(recommendations_last)):
    recommendations_last[index]=np.array(recommendations_last[index])

print(len(recommendations_last))

90
7854


In [18]:
elementi = test_users['user_id'].to_numpy()
recommendations = []

for index in elementi:
    if index in user_first_group:
        local_index, = np.where(user_first_group == index) 
        recommendations.append(recommendations_first[local_index[0]])
    
    if index in user_last_group:
        local_index, = np.where(user_last_group == index) 
        recommendations.append(recommendations_last[local_index[0]])
    
    print(index, " - ", local_index[0])

print(recommendations)

50,  4411,  1393,  5337, 21327,
        9806]), array([18512,  3516, 17674, 12952, 14918,  3831,  2696, 19874, 19970,
       23404]), array([ 9438, 10269,  6974, 11357, 14253,  4444,  8987, 19220, 25675,
        9851]), array([22438, 23981,    43,  3759, 25239, 10762,  3631,  4423, 16897,
        8758]), array([18574,  3037,  1573,  6358, 14684,  3630,  8709, 19303,  4363,
       11083]), array([ 2426, 10594, 12992,  9322, 19089, 16651, 10792, 23600,  9555,
       10345]), array([14684, 20308, 19874, 18419,  6500,  2812, 19246,  9526, 20236,
       14413]), array([12158,  5032, 15903, 21492, 15910, 17724, 18793, 16172, 25342,
       12048]), array([20308,  4502, 24355,  5581, 24070, 25141,  9034,  8819,  7883,
       18419]), array([15234,  6544,  2122, 24425,  1722, 19681,  8249, 25918, 20735,
        7644]), array([24176,  6780,  9544, 15255, 16208, 20973,  9654, 10954,  6152,
        2002]), array([19089, 19709, 14895, 11730, 19480,  3165, 22914, 17760, 11687,
       12914]), array(

In [19]:
test_users['item_list']= recommendations

test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])

test_users
test_users.to_csv('submission.csv', index=False)
