In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot

from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Utils.DataReader import load_urm, load_icm, load_target

In [2]:
# Interaction matrix returned by the project data reader.
URM_all = load_urm()

# Two nested hold-outs, each called with train_percentage=0.85:
#   URM_all              -> URM_train_validation + URM_test
#   URM_train_validation -> URM_train            + URM_validation
# NOTE(review): no random seed is set in this notebook; if the split helper
# samples randomly (as its name suggests) the partitions change on every run.
URM_train_validation, URM_test = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage=0.85,
)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(
    URM_train_validation,
    train_percentage=0.85,
)

# One evaluator per held-out matrix, both computing metrics at cutoff 10.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

EvaluatorHoldout: Ignoring 1555 ( 3.7%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 914 ( 2.2%) Users that have less than 1 test interactions


## Pure SVD

In [3]:
from Recommenders.MatrixFactorization.PureSVDRecommender import PureSVDRecommender

# Class (not instance) of the model under study; the search object and the
# output file names below are derived from it.
recommender_class = PureSVDRecommender

### Hyperparameter Tuning

In [4]:
from skopt.space import Integer
# Basic hyperparameter tuning: only the number of latent factors (num_factors)
# is explored, as an integer between 30 and 35 (both bounds are tested in the
# search log below).

hyperparameters_range_dictionary = {
    "num_factors": Integer(30, 35),
}

We create a Bayesian optimizer object, passing it the recommender class and the validation evaluator.

In [5]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Search object bound to the model class and to the validation evaluator.
# Only the validation evaluator is supplied here; evaluator_test is not passed.
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                           evaluator_validation=evaluator_validation)

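We provide the data needed to create an instance of the model. Two sets of arguments are prepared: one builds the model on `URM_train` (used during the search), the other builds it on `URM_all` (used for the final fit with the best configuration).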

In [6]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs


def _make_search_input_args(URM):
    """Bundle the constructor/fit arguments for a model built on ``URM``.

    The URM is the only positional constructor argument; no constructor
    keyword arguments and no fit arguments are supplied. Fresh containers
    are created on every call, so the returned objects share no state.
    """
    return SearchInputRecommenderArgs(
        CONSTRUCTOR_POSITIONAL_ARGS=[URM],   # For a CBF model simply put [URM_train, ICM_train]
        CONSTRUCTOR_KEYWORD_ARGS={},
        FIT_POSITIONAL_ARGS=[],
        FIT_KEYWORD_ARGS={},
    )


# Arguments used while searching: the model is built on URM_train.
recommender_input_args = _make_search_input_args(URM_train)

# Arguments used for the "last" fit: the model is built on URM_all.
recommender_input_args_last_test = _make_search_input_args(URM_all)

Let's do some experiments!

In [8]:
import os

# Folder where the search writes its results.
output_folder_path = "Experiments/"

# Create the folder if needed. exist_ok=True keeps the cell safe to re-run and
# avoids the race between a separate "exists" check and the creation.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget: total number of configurations to evaluate, 30% of which
# (rounded down) are random starts.
n_cases = 10
n_random_starts = int(n_cases*0.3)

# Metric and cutoff that drive the optimization.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

### Bayesian Search

In [9]:
# Run the search over hyperparameters_range_dictionary: n_cases configurations
# are evaluated, the first n_random_starts of them at random points.
# Per the log below, once the search completes the best configuration is
# refit with the "last test" arguments (URM_all) and that model is saved.
# NOTE(review): the log reports the function value as the negated MAP, so the
# optimizer presumably minimizes -metric_to_optimize; confirm in the search class.
hyperparameterSearch.search(recommender_input_args,
                            recommender_input_args_last_test = recommender_input_args_last_test,
                            hyperparameter_search_space = hyperparameters_range_dictionary,
                            n_cases = n_cases,
                            n_random_starts = n_random_starts,
                            save_model = "last",
                            output_folder_path = output_folder_path, # Folder where the results are saved
                            output_file_name_root = recommender_class.RECOMMENDER_NAME, # Prefix used to name the output files
                            metric_to_optimize = metric_to_optimize,
                            cutoff_to_optimize = cutoff_to_optimize,
                            )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'num_factors': 33}
PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... done in 2.94 sec
EvaluatorHoldout: Processed 40074 (100.0%) in 1.06 min. Users per second: 633
SearchBayesianSkopt: New best config found. Config 0: {'num_factors': 33} - results: PRECISION: 0.0252707, PRECISION_RECALL_MIN_DEN: 0.0466731, RECALL: 0.0443830, MAP: 0.0107460, MAP_MIN_DEN: 0.0196231, MRR: 0.0837180, NDCG: 0.0412209, F1: 0.0322048, HIT_RATE: 0.1966113, ARHR_ALL_HITS: 0.0943697, NOVELTY: 0.0041310, AVERAGE_POPULARITY: 0.1617287, DIVERSITY_MEAN_INTER_LIST: 0.9221194, DIVERSITY_HERFINDAHL: 0.9922096, COVERAGE_ITEM: 0.0143632, COVERAGE_ITEM_HIT: 0.0116701, ITEMS_IN_GT: 0.9860040, COVERAGE_USER: 0.9626462, COVERAGE_USER_HIT: 0.1892671, USERS_IN_GT: 0.9626462, DIVERSITY_GINI: 0.0050493, SHANNON_ENTROPY: 7.3624500, RATIO_DIVERSITY_HERFINDAHL: 0.9925382, RATIO_DIVE



Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 0.5530
Function value obtained: -0.0107
Current minimum: -0.0109
Iteration No: 5 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'num_factors': 31}
SearchBayesianSkopt: Config 4 was already explored at index 2. Config: {'num_factors': 31} - results: PRECISION: 0.0255752, PRECISION_RECALL_MIN_DEN: 0.0469048, RECALL: 0.0445992, MAP: 0.0108858, MAP_MIN_DEN: 0.0198775, MRR: 0.0846326, NDCG: 0.0416747, F1: 0.0325085, HIT_RATE: 0.1985826, ARHR_ALL_HITS: 0.0954891, NOVELTY: 0.0041183, AVERAGE_POPULARITY: 0.1661883, DIVERSITY_MEAN_INTER_LIST: 0.9210189, DIVERSITY_HERFINDAHL: 0.9920996, COVERAGE_ITEM: 0.0140368, COVERAGE_ITEM_HIT: 0.0110581, ITEMS_IN_GT: 0.9860040, COVERAGE_USER: 0.9626462, COVERAGE_USER_HIT: 0.1911648, USERS_IN_GT: 0.9626462, DIVERSITY_GINI: 0.0049522, SHANNON_ENTROPY: 7.3336282, RATIO_DIVERSITY_HERFINDAHL: 0.9924281, RATIO_DIVERSITY_GINI: 0.0097838, RATIO_SHANNO



Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 0.4037
Function value obtained: -0.0109
Current minimum: -0.0109
Iteration No: 6 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'num_factors': 35}
PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... done in 2.01 sec
EvaluatorHoldout: Processed 40074 (100.0%) in 46.91 sec. Users per second: 854
SearchBayesianSkopt: Config 5 is suboptimal. Config: {'num_factors': 35} - results: PRECISION: 0.0250287, PRECISION_RECALL_MIN_DEN: 0.0461878, RECALL: 0.0439390, MAP: 0.0106386, MAP_MIN_DEN: 0.0194352, MRR: 0.0830866, NDCG: 0.0408527, F1: 0.0318913, HIT_RATE: 0.1950891, ARHR_ALL_HITS: 0.0935450, NOVELTY: 0.0041353, AVERAGE_POPULARITY: 0.1606836, DIVERSITY_MEAN_INTER_LIST: 0.9221702, DIVERSITY_HERFINDAHL: 0.9922147, COVERAGE_ITEM: 0.0153426, COVERAGE_ITEM_HIT: 0.0116701, ITEMS_IN_GT: 0.9860040, COVERAGE_USER: 0.9626462, COVERAGE_USER



Iteration No: 7 ended. Search finished for the next optimal point.
Time taken: 0.3690
Function value obtained: -0.0107
Current minimum: -0.0109
Iteration No: 8 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'num_factors': 30}
SearchBayesianSkopt: Config 7 was already explored at index 1. Config: {'num_factors': 30} - results: PRECISION: 0.0253805, PRECISION_RECALL_MIN_DEN: 0.0468931, RECALL: 0.0446061, MAP: 0.0108560, MAP_MIN_DEN: 0.0198803, MRR: 0.0846718, NDCG: 0.0416211, F1: 0.0323527, HIT_RATE: 0.1975346, ARHR_ALL_HITS: 0.0954310, NOVELTY: 0.0041094, AVERAGE_POPULARITY: 0.1682646, DIVERSITY_MEAN_INTER_LIST: 0.9181928, DIVERSITY_HERFINDAHL: 0.9918170, COVERAGE_ITEM: 0.0137920, COVERAGE_ITEM_HIT: 0.0111397, ITEMS_IN_GT: 0.9860040, COVERAGE_USER: 0.9626462, COVERAGE_USER_HIT: 0.1901559, USERS_IN_GT: 0.9626462, DIVERSITY_GINI: 0.0048279, SHANNON_ENTROPY: 7.2952357, RATIO_DIVERSITY_HERFINDAHL: 0.9921454, RATIO_DIVERSITY_GINI: 0.0095382, RATIO_SHANNO



Iteration No: 8 ended. Search finished for the next optimal point.
Time taken: 1.0798
Function value obtained: -0.0109
Current minimum: -0.0109
Iteration No: 9 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'num_factors': 31}
SearchBayesianSkopt: Config 8 was already explored at index 2. Config: {'num_factors': 31} - results: PRECISION: 0.0255752, PRECISION_RECALL_MIN_DEN: 0.0469048, RECALL: 0.0445992, MAP: 0.0108858, MAP_MIN_DEN: 0.0198775, MRR: 0.0846326, NDCG: 0.0416747, F1: 0.0325085, HIT_RATE: 0.1985826, ARHR_ALL_HITS: 0.0954891, NOVELTY: 0.0041183, AVERAGE_POPULARITY: 0.1661883, DIVERSITY_MEAN_INTER_LIST: 0.9210189, DIVERSITY_HERFINDAHL: 0.9920996, COVERAGE_ITEM: 0.0140368, COVERAGE_ITEM_HIT: 0.0110581, ITEMS_IN_GT: 0.9860040, COVERAGE_USER: 0.9626462, COVERAGE_USER_HIT: 0.1911648, USERS_IN_GT: 0.9626462, DIVERSITY_GINI: 0.0049522, SHANNON_ENTROPY: 7.3336282, RATIO_DIVERSITY_HERFINDAHL: 0.9924281, RATIO_DIVERSITY_GINI: 0.0097838, RATIO_SHANNO



Iteration No: 9 ended. Search finished for the next optimal point.
Time taken: 1.0233
Function value obtained: -0.0109
Current minimum: -0.0109
Iteration No: 10 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'num_factors': 31}
SearchBayesianSkopt: Config 9 was already explored at index 2. Config: {'num_factors': 31} - results: PRECISION: 0.0255752, PRECISION_RECALL_MIN_DEN: 0.0469048, RECALL: 0.0445992, MAP: 0.0108858, MAP_MIN_DEN: 0.0198775, MRR: 0.0846326, NDCG: 0.0416747, F1: 0.0325085, HIT_RATE: 0.1985826, ARHR_ALL_HITS: 0.0954891, NOVELTY: 0.0041183, AVERAGE_POPULARITY: 0.1661883, DIVERSITY_MEAN_INTER_LIST: 0.9210189, DIVERSITY_HERFINDAHL: 0.9920996, COVERAGE_ITEM: 0.0140368, COVERAGE_ITEM_HIT: 0.0110581, ITEMS_IN_GT: 0.9860040, COVERAGE_USER: 0.9626462, COVERAGE_USER_HIT: 0.1911648, USERS_IN_GT: 0.9626462, DIVERSITY_GINI: 0.0049522, SHANNON_ENTROPY: 7.3336282, RATIO_DIVERSITY_HERFINDAHL: 0.9924281, RATIO_DIVERSITY_GINI: 0.0097838, RATIO_SHANN



Iteration No: 10 ended. Search finished for the next optimal point.
Time taken: 1.0271
Function value obtained: -0.0109
Current minimum: -0.0109
SearchBayesianSkopt: Search complete. Best config is 2: {'num_factors': 31}

SearchBayesianSkopt: Evaluation with constructor data for final test. Using best config: {'num_factors': 31}
PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... done in 1.98 sec
SearchBayesianSkopt: Saving model in result_experiments/PureSVDRecommender

PureSVDRecommender: Saving model in file 'result_experiments/PureSVDRecommender_best_model_last'
PureSVDRecommender: Saving complete


### Search Results:

In [10]:
from Recommenders.DataIO import DataIO

# The metadata archive is named after the recommender, i.e. the same file name
# root that was passed to the search.
metadata_file_name = recommender_class.RECOMMENDER_NAME + "_metadata.zip"

data_loader = DataIO(folder_path=output_folder_path)
search_metadata = data_loader.load_data(metadata_file_name)

# Show which entries the search stored.
search_metadata.keys()

dict_keys(['algorithm_name_recommender', 'algorithm_name_search', 'cutoff_to_optimize', 'exception_list', 'hyperparameters_best', 'hyperparameters_best_index', 'hyperparameters_df', 'metric_to_optimize', 'result_on_earlystopping_df', 'result_on_last', 'result_on_test_best', 'result_on_test_df', 'result_on_validation_best', 'result_on_validation_df', 'time_df', 'time_on_last_df', 'time_on_test_avg', 'time_on_test_total', 'time_on_train_avg', 'time_on_train_total', 'time_on_validation_avg', 'time_on_validation_total'])

In [11]:
# Table of the configurations tested by the search, one row per evaluated case.
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,num_factors
0,33
1,30
2,31
3,33
4,31
5,35
6,33
7,30
8,31
9,31


In [12]:
# Validation metrics for every evaluated case, indexed by case number and cutoff.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.025271,0.046673,0.044383,0.010746,0.019623,0.083718,0.041221,0.032205,0.196611,0.09437,...,0.962646,0.189267,0.962646,0.005049,7.36245,0.992538,0.009976,0.54571,3.210517,0.27811
1,10,0.025381,0.046893,0.044606,0.010856,0.01988,0.084672,0.041621,0.032353,0.197535,0.095431,...,0.962646,0.190156,0.962646,0.004828,7.295236,0.992145,0.009538,0.540728,3.340261,0.276654
2,10,0.025575,0.046905,0.044599,0.010886,0.019878,0.084633,0.041675,0.032509,0.198583,0.095489,...,0.962646,0.191165,0.962646,0.004952,7.333628,0.992428,0.009784,0.543574,3.299045,0.277255
3,10,0.025271,0.046673,0.044383,0.010746,0.019623,0.083718,0.041221,0.032205,0.196611,0.09437,...,0.962646,0.189267,0.962646,0.005049,7.36245,0.992538,0.009976,0.54571,3.210517,0.27811
4,10,0.025575,0.046905,0.044599,0.010886,0.019878,0.084633,0.041675,0.032509,0.198583,0.095489,...,0.962646,0.191165,0.962646,0.004952,7.333628,0.992428,0.009784,0.543574,3.299045,0.277255
5,10,0.025029,0.046188,0.043939,0.010639,0.019435,0.083087,0.040853,0.031891,0.195089,0.093545,...,0.962646,0.187802,0.962646,0.005106,7.378287,0.992543,0.010089,0.546884,3.189771,0.278397
6,10,0.025271,0.046673,0.044383,0.010746,0.019623,0.083718,0.041221,0.032205,0.196611,0.09437,...,0.962646,0.189267,0.962646,0.005049,7.36245,0.992538,0.009976,0.54571,3.210517,0.27811
7,10,0.025381,0.046893,0.044606,0.010856,0.01988,0.084672,0.041621,0.032353,0.197535,0.095431,...,0.962646,0.190156,0.962646,0.004828,7.295236,0.992145,0.009538,0.540728,3.340261,0.276654
8,10,0.025575,0.046905,0.044599,0.010886,0.019878,0.084633,0.041675,0.032509,0.198583,0.095489,...,0.962646,0.191165,0.962646,0.004952,7.333628,0.992428,0.009784,0.543574,3.299045,0.277255
9,10,0.025575,0.046905,0.044599,0.010886,0.019878,0.084633,0.041675,0.032509,0.198583,0.095489,...,0.962646,0.191165,0.962646,0.004952,7.333628,0.992428,0.009784,0.543574,3.299045,0.277255


In [13]:
# Best configuration found by the search, as a dict of hyperparameter name -> value.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'num_factors': 31}

## Recommender Testing

In [15]:
# Train the final model on URM_all, reusing the configuration selected by the
# search rather than a hard-coded copy of it. This keeps the cell in sync with
# the search result when the notebook is re-run and a different value wins.
recommender = PureSVDRecommender(URM_all)
recommender.fit(**best_hyperparameters)

PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... done in 1.35 sec


## Submissions

In [16]:
# Target users for the submission; the 'user_id' column of this file is read below.
test_users = pd.read_csv('../Dataset/data_target_users_test.csv')

In [17]:
# One recommendation list (cutoff 10) per target user, in the same order as
# the rows of test_users.
user_id = test_users['user_id']
recommendations = [
    recommender.recommend(user, cutoff=10)
    for user in user_id
]

In [18]:
# Serialize every recommendation list as item ids separated by a single space.
# Joining the ids explicitly avoids going through numpy's array printing, which
# pads elements to a common width and wraps long lines, producing irregular
# whitespace in the submission file.
test_users['item_list'] = [
    " ".join(str(item) for item in item_list)
    for item_list in recommendations
]

# Forward slashes work on every platform. The previous '..\Submissions\...'
# literal contained the invalid escape sequence '\S' and was only a valid
# directory path on Windows.
submission_path = '../Submissions/Submission_03_10933934_PureSVD.csv'

# Create the target folder if it is missing so that to_csv does not fail.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
test_users.to_csv(submission_path, index=False)