# Data loading 
Next cells are used to load the data we need.

In [210]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
sys.path.append('../..')


In [211]:
#data which contains users (row), items(col) and implicit interaction (data)
dataset = pd.read_csv('../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


In [212]:
unique_users, min_users, max_users = len(dataset.row.unique()), dataset.row.min(), dataset.row.max()
unique_items, min_items, max_items = len(dataset.col.unique()), dataset.col.min(), dataset.col.max()
print('users stats:',unique_users, min_users, max_users)
print('items stats:',unique_items, min_items, max_items)

users stats: 7947 0 7946
items stats: 24896 0 25974


In [213]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

(7947, 25975)

In [214]:
items_per_user = np.ediff1d(URM_all.indptr) #tells in which position each of the rows begin
users_per_item = np.ediff1d(sps.csc_matrix(URM_all).indptr)
URM_all = sps.csr_matrix(URM_all)

In [215]:
items_per_user = np.sort(items_per_user) #sorting based on the single value. Losing reference to the user. 
users_per_item = np.sort(users_per_item)

In [216]:
ICM_df = pd.read_csv('../data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [217]:
items = ICM_df.row
features = ICM_df.col
data = ICM_df.data
ICM_all = sps.coo_matrix((data, (items, features)))
ICM_all = ICM_all.tocsr() #fast row access -> fast access to users 
ICM_all.shape

(25975, 20000)

In [218]:
ICM_all = sps.csr_matrix(ICM_all)
features_per_item = np.ediff1d(ICM_all.indptr)

ICM_all = sps.csc_matrix(ICM_all)
items_per_feature = np.ediff1d(ICM_all.indptr)

ICM_all = sps.csr_matrix(ICM_all)

In [219]:
features_per_item = np.sort(features_per_item)
items_per_feature = np.sort(items_per_feature)

In [220]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)



In [221]:
# These are the base recommender for all classess 
# We are going to tune just the hybrid fot the specific class fo user

from GraphBased.P3alphaRecommender import P3alphaRecommender
from GraphBased.RP3betaRecommender import RP3betaRecommender
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender

recommender_alpha = P3alphaRecommender(URM_train)
recommender_alpha.fit(topK=170, alpha = 0.45, implicit = True)

recommender_beta_ICM = RP3betaRecommender(ICM_all.T)
recommender_beta_ICM.fit(topK=70, alpha=0.2, beta=0.5, implicit=False)
recommender_beta_ICM.URM_train = URM_train


hybridrecommender_all_user = ItemKNNSimilarityHybridRecommender(URM_train, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender_all_user.fit(topK=200, alpha = 0.1)

P3alphaRecommender: URM Detected 49 (0.62 %) cold users.
P3alphaRecommender: URM Detected 2036 (7.84 %) cold items.
RP3betaRecommender: URM Detected 2 (0.01 %) cold users.
RP3betaRecommender: URM Detected 25 (0.10 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 49 (0.62 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2036 (7.84 %) cold items.


In [222]:
from skopt.space import Real, Integer, Categorical

hyperparameters_range_dictionary = {}
hyperparameters_range_dictionary["topK"] = Integer(1, 800)
hyperparameters_range_dictionary["alpha"] = Real(low = 0, high = 1, prior = 'uniform')

grouped_users = dataset.groupby(['row']).count()

# All users present into the dataser
sorted_users = grouped_users.sort_values(by=['col'], ascending=True)
sorted_users = sorted_users.index.to_numpy()
print(len(sorted_users))

# User for the firt target of users 0-2 interaction
grouped_users = grouped_users[grouped_users['col'] < 3]
users_in_group = grouped_users.sort_values(by=['col'], ascending=True)
users_in_group = users_in_group.index.to_numpy()

user_first_group = users_in_group

print(len(user_first_group))

users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert = True)
users_not_in_group = sorted_users[users_not_in_group_flag]

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10], ignore_users = users_not_in_group)

from ParameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

recommender_class = ItemKNNSimilarityHybridRecommender

parameterSearch = SearchBayesianSkopt(recommender_class,
                                 evaluator_validation=evaluator_validation,
                                 evaluator_test=evaluator_validation)

from ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

import os

output_folder_path = "result_experiments/"

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
    
n_cases = 10
n_random_starts = int(n_cases*0.3)
metric_to_optimize = "MAP" 

parameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       parameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path,
                       output_file_name_root = recommender_class.RECOMMENDER_NAME,
                       metric_to_optimize = metric_to_optimize,
                      )

from Base.DataIO import DataIO

data_loader = DataIO(folder_path = output_folder_path)

search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

hyperparameters_list = search_metadata["hyperparameters_list"]

best_parameters = search_metadata["hyperparameters_best"]
best_parameters

7947
1088
EvaluatorHoldout: Ignoring 6859 Users
Iteration No: 1 started. Evaluating function at random point.
ItemKNNSimilarityHybridRecommender: URM Detected 49 (0.62 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2036 (7.84 %) cold items.
SearchBayesianSkopt: Testing config: {'topK': 348, 'alpha': 0.0006178440919720752}
EvaluatorHoldout: Processed 301 ( 100.00% ) in 0.18 sec. Users per second: 1651
SearchBayesianSkopt: New best config found. Config 0: {'topK': 348, 'alpha': 0.0006178440919720752} - results: ROC_AUC: 0.0442968, PRECISION: 0.0066445, PRECISION_RECALL_MIN_DEN: 0.0664452, RECALL: 0.0664452, MAP: 0.0334889, MRR: 0.0334889, NDCG: 0.0410867, F1: 0.0120809, HIT_RATE: 0.0664452, ARHR: 0.0334889, NOVELTY: 0.0051970, AVERAGE_POPULARITY: 0.0133460, DIVERSITY_MEAN_INTER_LIST: 0.9751783, DIVERSITY_HERFINDAHL: 0.9971938, COVERAGE_ITEM: 0.0865833, COVERAGE_ITEM_CORRECT: 0.0007700, COVERAGE_USER: 0.2766544, COVERAGE_USER_CORRECT: 0.0183824, DIVERSITY_GINI: 0.0656402,

{'topK': 724, 'alpha': 0.4112573481807089}

In [223]:
from skopt.space import Real, Integer, Categorical

hyperparameters_range_dictionary = {}
hyperparameters_range_dictionary["topK"] = Integer(1, 800)
hyperparameters_range_dictionary["alpha"] = Real(low = 0, high = 1, prior = 'uniform')

grouped_users = dataset.groupby(['row']).count()

# All users present into the dataser
sorted_users = grouped_users.sort_values(by=['col'], ascending=True)
sorted_users = sorted_users.index.to_numpy()
print(len(sorted_users))

# User for the firt target of users 0-2 interaction
grouped_users = grouped_users[grouped_users['col'] > 9]
users_in_group = grouped_users.sort_values(by=['col'], ascending=True)
users_in_group = users_in_group.index.to_numpy()

user_last_group = users_in_group

print(len(user_last_group))

users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert = True)
users_not_in_group = sorted_users[users_not_in_group_flag]

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10], ignore_users = users_not_in_group)

from ParameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

recommender_class = ItemKNNSimilarityHybridRecommender

parameterSearch = SearchBayesianSkopt(recommender_class,
                                 evaluator_validation=evaluator_validation,
                                 evaluator_test=evaluator_validation)

from ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

import os

output_folder_path = "result_experiments/"

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
    
n_cases = 10
n_random_starts = int(n_cases*0.3)
metric_to_optimize = "MAP" 

parameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       parameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path,
                       output_file_name_root = recommender_class.RECOMMENDER_NAME,
                       metric_to_optimize = metric_to_optimize,
                      )

from Base.DataIO import DataIO

data_loader = DataIO(folder_path = output_folder_path)

search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

hyperparameters_list = search_metadata["hyperparameters_list"]

best_parameters = search_metadata["hyperparameters_best"]
best_parameters

7947
2413
EvaluatorHoldout: Ignoring 5534 Users
Iteration No: 1 started. Evaluating function at random point.
ItemKNNSimilarityHybridRecommender: URM Detected 49 (0.62 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2036 (7.84 %) cold items.
SearchBayesianSkopt: Testing config: {'topK': 84, 'alpha': 0.6258946273009242}
EvaluatorHoldout: Processed 2255 ( 100.00% ) in 1.48 sec. Users per second: 1520
SearchBayesianSkopt: New best config found. Config 0: {'topK': 84, 'alpha': 0.6258946273009242} - results: ROC_AUC: 0.2261444, PRECISION: 0.0525055, PRECISION_RECALL_MIN_DEN: 0.1344214, RECALL: 0.1279747, MAP: 0.0584221, MRR: 0.1641532, NDCG: 0.0991392, F1: 0.0744611, HIT_RATE: 0.5250554, ARHR: 0.1956358, NOVELTY: 0.0047084, AVERAGE_POPULARITY: 0.1224078, DIVERSITY_MEAN_INTER_LIST: 0.9769555, DIVERSITY_HERFINDAHL: 0.9976522, COVERAGE_ITEM: 0.2311453, COVERAGE_ITEM_CORRECT: 0.0266410, COVERAGE_USER: 0.9345213, COVERAGE_USER_CORRECT: 0.3468711, DIVERSITY_GINI: 0.0795245, SHANNO

{'topK': 547, 'alpha': 0.3295857028765683}

In [224]:
test_users = pd.read_csv('../data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [230]:
user_id = test_users['user_id'].to_numpy()
user_first_group = user_first_group[np.isin(user_first_group, user_id)]
user_last_group = user_last_group[np.isin(user_last_group, user_id)]
print(len(user_id))
print(len(user_last_group))
print(len(user_first_group)) # 1085 perche sono stati rimossi i 3 utenti di cui non voglio fare reccomend

users_middle_flag = np.isin(user_id, user_first_group, invert = True)
user_middle = user_id[users_middle_flag]

users_middle_flag = np.isin(user_middle, user_last_group, invert = True)
user_middle = user_middle[users_middle_flag]

print(len(user_first_group))
print(len(user_last_group))
print(len(user_middle))

7944
2413
1085
1085
2413
4446


In [231]:
recommender_alpha = P3alphaRecommender(URM_all)
recommender_alpha.fit(topK=170, alpha = 0.45, implicit = True)

recommender_beta_ICM = RP3betaRecommender(ICM_all.T)
recommender_beta_ICM.fit(topK=70, alpha=0.2, beta=0.5, implicit=False)
recommender_beta_ICM.URM_train = URM_all

hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender.fit(topK=200, alpha = 0.1)

hybridrecommender_first_group = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender_first_group.fit(topK=724, alpha = 0.4112573481807089)

hybridrecommender_last_group = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender_last_group.fit(topK=547, alpha = 0.3295857028765683)

recommendations_all = hybridrecommender.recommend(user_middle,cutoff = 10)
recommendations_first = hybridrecommender_first_group.recommend(user_first_group,cutoff = 10)
recommendations_last = hybridrecommender_last_group.recommend(user_last_group,cutoff = 10)


P3alphaRecommender: URM Detected 1079 (4.15 %) cold items.
RP3betaRecommender: URM Detected 2 (0.01 %) cold users.
RP3betaRecommender: URM Detected 25 (0.10 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 1079 (4.15 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 1079 (4.15 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 1079 (4.15 %) cold items.


In [232]:
for index in range(len(recommendations_all)):
    recommendations_all[index]=np.array(recommendations_all[index])

print(len(recommendations_all))

for index in range(len(recommendations_first)):
    recommendations_first[index]=np.array(recommendations_first[index])

print(len(recommendations_first))

for index in range(len(recommendations_last)):
    recommendations_last[index]=np.array(recommendations_last[index])

print(len(recommendations_last))

4446
1085
2413


In [233]:
elementi = test_users['user_id'].to_numpy()
recommendations = []

for index in elementi:
    if index in user_first_group:
        local_index, = np.where(user_first_group == index) 
        recommendations.append(recommendations_first[local_index[0]])
        
    if index in user_middle:
        local_index, = np.where(user_middle == index) 
        recommendations.append(recommendations_all[local_index[0]])

    if index in user_last_group:
        local_index, = np.where(user_last_group == index) 
        recommendations.append(recommendations_last[local_index[0]])
    
    print(index, " - ", local_index[0])

print(recommendations)

55, 24950, 10786,   964,  5942,
       12061]), array([14684, 19970, 19874,   800, 25201, 20308, 20569, 16324, 10418,
       17668]), array([10269, 25675,  9438, 14253, 12211,  4502, 12061,  9851, 22554,
       10786]), array([ 4423, 23981,  3759, 22438, 10762, 12195, 25239,    43,  3631,
       10161]), array([ 3037,  9769,   497, 12048, 15559,  8709,  5032,  4936,  7611,
        6358]), array([ 2426, 23154, 10594, 15691, 19089,  6734,  7494, 12409, 20146,
       23600]), array([14684, 19874, 20308, 22653, 18419,  6612, 19879, 10940,  4257,
       10418]), array([ 5032, 12158, 18793, 13609,   497,  8097, 16172, 20333, 25342,
       22548]), array([ 4502, 24355, 18419, 20308,  5581, 10418, 17668, 14137,  1714,
       19879]), array([20735, 23369, 15234,  2122, 18778,  7934,  1722,  5414, 19681,
       22507]), array([ 6780,  9544, 24176, 20973,  1518, 10954,   526, 24740,  2002,
       19603]), array([19089, 24300, 12409, 19480, 19709, 14895, 11730,  7494, 17723,
       13352]), array(

In [234]:
test_users['item_list']= recommendations
#test_users['item_list'] =  test_users['item_list'].apply(lambda x: x.replace('[','').replace(']','')) 
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])

#convert the string columns to int
#test_users['item_list'] = test_users['item_list'].astype(int)
test_users
test_users.to_csv('submission.csv', index=False)
