# Data loading 
Next cells are used to load the data we need.

In [180]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
sys.path.append('../..')


In [181]:
#data which contains users (row), items(col) and implicit interaction (data)
dataset = pd.read_csv('../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


In [182]:
unique_users, min_users, max_users = len(dataset.row.unique()), dataset.row.min(), dataset.row.max()
unique_items, min_items, max_items = len(dataset.col.unique()), dataset.col.min(), dataset.col.max()
print('users stats:',unique_users, min_users, max_users)
print('items stats:',unique_items, min_items, max_items)

users stats: 7947 0 7946
items stats: 24896 0 25974


In [183]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

(7947, 25975)

In [184]:
items_per_user = np.ediff1d(URM_all.indptr) #tells in which position each of the rows begin
users_per_item = np.ediff1d(sps.csc_matrix(URM_all).indptr)
URM_all = sps.csr_matrix(URM_all)

In [185]:
items_per_user = np.sort(items_per_user) #sorting based on the single value. Losing reference to the user. 
users_per_item = np.sort(users_per_item)

In [186]:
ICM_df = pd.read_csv('../data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [187]:
items = ICM_df.row
features = ICM_df.col
data = ICM_df.data
ICM_all = sps.coo_matrix((data, (items, features)))
ICM_all = ICM_all.tocsr() #fast row access -> fast access to users 
ICM_all.shape

(25975, 20000)

In [188]:
ICM_all = sps.csr_matrix(ICM_all)
features_per_item = np.ediff1d(ICM_all.indptr)

ICM_all = sps.csc_matrix(ICM_all)
items_per_feature = np.ediff1d(ICM_all.indptr)

ICM_all = sps.csr_matrix(ICM_all)

In [189]:
features_per_item = np.sort(features_per_item)
items_per_feature = np.sort(items_per_feature)

In [190]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)



In [191]:
# These are the base recommender for all classess 
# We are going to tune just the hybrid fot the specific class fo user

from GraphBased.P3alphaRecommender import P3alphaRecommender
from GraphBased.RP3betaRecommender import RP3betaRecommender
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender

recommender_alpha = P3alphaRecommender(URM_train)
recommender_alpha.fit(topK=170, alpha = 0.45, implicit = True)

recommender_beta_ICM = RP3betaRecommender(ICM_all.T)
recommender_beta_ICM.fit(topK=70, alpha=0.2, beta=0.5, implicit=False)
recommender_beta_ICM.URM_train = URM_train


hybridrecommender_all_user = ItemKNNSimilarityHybridRecommender(URM_train, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender_all_user.fit(topK=200, alpha = 0.1)

P3alphaRecommender: URM Detected 35 (0.44 %) cold users.
P3alphaRecommender: URM Detected 2037 (7.84 %) cold items.
RP3betaRecommender: URM Detected 2 (0.01 %) cold users.
RP3betaRecommender: URM Detected 25 (0.10 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 35 (0.44 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2037 (7.84 %) cold items.


In [192]:
from skopt.space import Real, Integer, Categorical

hyperparameters_range_dictionary = {}
hyperparameters_range_dictionary["topK"] = Integer(1, 800)
hyperparameters_range_dictionary["alpha"] = Real(low = 0, high = 1, prior = 'uniform')

grouped_users = dataset.groupby(['row']).count()

# All users present into the dataser
sorted_users = grouped_users.sort_values(by=['col'], ascending=True)
sorted_users = sorted_users.index.to_numpy()
print(len(sorted_users))

# User for the firt target of users 0-2 interaction
grouped_users = grouped_users[grouped_users['col'] < 3]
users_in_group = grouped_users.sort_values(by=['col'], ascending=True)
users_in_group = users_in_group.index.to_numpy()

user_first_group = users_in_group

print(len(user_first_group))

users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert = True)
users_not_in_group = sorted_users[users_not_in_group_flag]

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10], ignore_users = users_not_in_group)

from ParameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

recommender_class = ItemKNNSimilarityHybridRecommender

parameterSearch = SearchBayesianSkopt(recommender_class,
                                 evaluator_validation=evaluator_validation,
                                 evaluator_test=evaluator_validation)

from ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

import os

output_folder_path = "result_experiments/"

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
    
n_cases = 10
n_random_starts = int(n_cases*0.3)
metric_to_optimize = "MAP" 

parameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       parameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path,
                       output_file_name_root = recommender_class.RECOMMENDER_NAME,
                       metric_to_optimize = metric_to_optimize,
                      )

from Base.DataIO import DataIO

data_loader = DataIO(folder_path = output_folder_path)

search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

hyperparameters_list = search_metadata["hyperparameters_list"]

best_parameters = search_metadata["hyperparameters_best"]
best_parameters

7947
1088
EvaluatorHoldout: Ignoring 6859 Users
Iteration No: 1 started. Evaluating function at random point.
ItemKNNSimilarityHybridRecommender: URM Detected 35 (0.44 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2037 (7.84 %) cold items.
SearchBayesianSkopt: Testing config: {'topK': 660, 'alpha': 0.7860923010122383}
EvaluatorHoldout: Processed 283 ( 100.00% ) in 0.17 sec. Users per second: 1664
SearchBayesianSkopt: New best config found. Config 0: {'topK': 660, 'alpha': 0.7860923010122383} - results: ROC_AUC: 0.0530035, PRECISION: 0.0081272, PRECISION_RECALL_MIN_DEN: 0.0812721, RECALL: 0.0812721, MAP: 0.0412516, MRR: 0.0412516, NDCG: 0.0503461, F1: 0.0147767, HIT_RATE: 0.0812721, ARHR: 0.0412516, NOVELTY: 0.0051219, AVERAGE_POPULARITY: 0.0572642, DIVERSITY_MEAN_INTER_LIST: 0.9848984, DIVERSITY_HERFINDAHL: 0.9981418, COVERAGE_ITEM: 0.0766891, COVERAGE_ITEM_CORRECT: 0.0008855, COVERAGE_USER: 0.2601103, COVERAGE_USER_CORRECT: 0.0211397, DIVERSITY_GINI: 0.0555293, SHANN

{'topK': 462, 'alpha': 0.2493259345951197}

In [193]:
from skopt.space import Real, Integer, Categorical

hyperparameters_range_dictionary = {}
hyperparameters_range_dictionary["topK"] = Integer(1, 800)
hyperparameters_range_dictionary["alpha"] = Real(low = 0, high = 1, prior = 'uniform')

grouped_users = dataset.groupby(['row']).count()

# All users present into the dataser
sorted_users = grouped_users.sort_values(by=['col'], ascending=True)
sorted_users = sorted_users.index.to_numpy()
print(len(sorted_users))

# User for the firt target of users 0-2 interaction
grouped_users = grouped_users[grouped_users['col'] > 9]
users_in_group = grouped_users.sort_values(by=['col'], ascending=True)
users_in_group = users_in_group.index.to_numpy()

user_last_group = users_in_group

print(len(user_last_group))

users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert = True)
users_not_in_group = sorted_users[users_not_in_group_flag]

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10], ignore_users = users_not_in_group)

from ParameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

recommender_class = ItemKNNSimilarityHybridRecommender

parameterSearch = SearchBayesianSkopt(recommender_class,
                                 evaluator_validation=evaluator_validation,
                                 evaluator_test=evaluator_validation)

from ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

import os

output_folder_path = "result_experiments/"

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
    
n_cases = 10
n_random_starts = int(n_cases*0.3)
metric_to_optimize = "MAP" 

parameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       parameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path,
                       output_file_name_root = recommender_class.RECOMMENDER_NAME,
                       metric_to_optimize = metric_to_optimize,
                      )

from Base.DataIO import DataIO

data_loader = DataIO(folder_path = output_folder_path)

search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

hyperparameters_list = search_metadata["hyperparameters_list"]

best_parameters = search_metadata["hyperparameters_best"]
best_parameters

7947
2413
EvaluatorHoldout: Ignoring 5534 Users
Iteration No: 1 started. Evaluating function at random point.
ItemKNNSimilarityHybridRecommender: URM Detected 35 (0.44 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2037 (7.84 %) cold items.
SearchBayesianSkopt: Testing config: {'topK': 341, 'alpha': 0.5564063313816416}
EvaluatorHoldout: Processed 2258 ( 100.00% ) in 1.65 sec. Users per second: 1372
SearchBayesianSkopt: New best config found. Config 0: {'topK': 341, 'alpha': 0.5564063313816416} - results: ROC_AUC: 0.2249176, PRECISION: 0.0508857, PRECISION_RECALL_MIN_DEN: 0.1289598, RECALL: 0.1233763, MAP: 0.0580111, MRR: 0.1632791, NDCG: 0.0977254, F1: 0.0720535, HIT_RATE: 0.5088574, ARHR: 0.1939272, NOVELTY: 0.0046432, AVERAGE_POPULARITY: 0.1345542, DIVERSITY_MEAN_INTER_LIST: 0.9739715, DIVERSITY_HERFINDAHL: 0.9973540, COVERAGE_ITEM: 0.2046968, COVERAGE_ITEM_CORRECT: 0.0269875, COVERAGE_USER: 0.9357646, COVERAGE_USER_CORRECT: 0.3373394, DIVERSITY_GINI: 0.0655137, SHAN

{'topK': 45, 'alpha': 0.16599037774567918}

In [194]:
test_users = pd.read_csv('../data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [195]:
user_id = test_users['user_id'].to_numpy()
user_first_group = user_first_group[np.isin(user_first_group, user_id)]
user_last_group = user_last_group[np.isin(user_last_group, user_id)]
print(len(user_id))
print(len(user_last_group))
print(len(user_first_group)) # 1085 perche sono stati rimossi i 3 utenti di cui non voglio fare reccomend

users_middle_flag = np.isin(user_id, user_first_group, invert = True)
user_middle = user_id[users_middle_flag]

users_middle_flag = np.isin(user_middle, user_last_group, invert = True)
user_middle = user_middle[users_middle_flag]

print(len(user_first_group))
print(len(user_last_group))
print(len(user_middle))

7944
2413
1085
1085
2413
4446


In [196]:
recommender_alpha = P3alphaRecommender(URM_all)
recommender_alpha.fit(topK=170, alpha = 0.45, implicit = True)

recommender_beta_ICM = RP3betaRecommender(ICM_all.T)
recommender_beta_ICM.fit(topK=70, alpha=0.2, beta=0.5, implicit=False)
recommender_beta_ICM.URM_train = URM_all

hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender.fit(topK=200, alpha = 0.1)

hybridrecommender_first_group = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender_first_group.fit(topK=169, alpha = 0.37626543131707546)

hybridrecommender_last_group = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha.W_sparse, recommender_beta_ICM.W_sparse)
hybridrecommender_last_group.fit(topK=497, alpha = 0.13958873680066322)

recommendations_all = hybridrecommender.recommend(user_middle,cutoff = 10)
recommendations_first = hybridrecommender_first_group.recommend(user_first_group,cutoff = 10)
recommendations_last = hybridrecommender_last_group.recommend(user_last_group,cutoff = 10)


P3alphaRecommender: URM Detected 1079 (4.15 %) cold items.
RP3betaRecommender: URM Detected 2 (0.01 %) cold users.
RP3betaRecommender: URM Detected 25 (0.10 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 1079 (4.15 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 1079 (4.15 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 1079 (4.15 %) cold items.


In [197]:
for index in range(len(recommendations_all)):
    recommendations_all[index]=np.array(recommendations_all[index])

print(len(recommendations_all))

for index in range(len(recommendations_first)):
    recommendations_first[index]=np.array(recommendations_first[index])

print(len(recommendations_first))

for index in range(len(recommendations_last)):
    recommendations_last[index]=np.array(recommendations_last[index])

print(len(recommendations_last))

4446
1085
2413


In [198]:
elementi = test_users['user_id'].to_numpy()
recommendations = []

for index in elementi:
    if elementi[index] in user_first_group:
        local_index = np.where(user_first_group == elementi[index]) 
        recommendations.append(recommendations_first[local_index])
        
    if elementi[index] in user_middle:
        local_index = np.where(user_middle == elementi[index]) 
        recommendations.append(recommendations_all[local_index])

    if elementi[index] in user_last_group:
        local_index = np.where(user_last_group == elementi[index]) 
        recommendations.append(recommendations_last[local_index])

print(recommendations)

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [178]:
test_users['item_list']= recommendations
#test_users['item_list'] =  test_users['item_list'].apply(lambda x: x.replace('[','').replace(']','')) 
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])

#convert the string columns to int
#test_users['item_list'] = test_users['item_list'].astype(int)
test_users
test_users.to_csv('submission.csv', index=False)
