In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot

from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Utils.DataReader import load_urm, load_icm, load_target

In [3]:
# Load the complete user-rating matrix (URM).
URM_all = load_urm()

# Two successive 85/15 holdouts: the first carves out the test set, the second
# splits the remaining interactions into train and validation.
URM_train_validation, URM_test = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage=0.85,
)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(
    URM_train_validation,
    train_percentage=0.85,
)

# One evaluator per held-out matrix, both reporting metrics at cutoff 10.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

EvaluatorHoldout: Ignoring 4358 (10.5%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 3112 ( 7.5%) Users that have less than 1 test interactions


## Pure SVD

In [4]:
from Recommenders.MatrixFactorization.PureSVDRecommender import PureSVDRecommender

# Alias for the model class under study: it is handed to the hyperparameter
# search and its RECOMMENDER_NAME is used to build the result file names.
recommender_class = PureSVDRecommender

### Hyperparameter Tuning

In [5]:
from skopt.space import Integer

# Basic tuning of a single hyperparameter: the number of factors N_k,
# explored as an integer dimension with bounds 30 and 35.
hyperparameters_range_dictionary = dict(num_factors=Integer(30, 35))

We create a Bayesian optimizer object, passing it the recommender class and the validation evaluator.

In [6]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# The search object receives the model class and the evaluator built on the
# validation split.
hyperparameterSearch = SearchBayesianSkopt(
    recommender_class,
    evaluator_validation=evaluator_validation,
)

We provide the data needed to create two instances of the model: one built on `URM_train`, the other on `URM_all`.

In [7]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs


def _make_search_input_args(urm):
    """Bundle the constructor/fit arguments for a model built on ``urm``.

    Only the URM is passed positionally to the constructor; no keyword
    arguments and no fit arguments are supplied.
    """
    return SearchInputRecommenderArgs(
        CONSTRUCTOR_POSITIONAL_ARGS=[urm],  # For a CBF model simply put [URM_train, ICM_train]
        CONSTRUCTOR_KEYWORD_ARGS={},
        FIT_POSITIONAL_ARGS=[],
        FIT_KEYWORD_ARGS={},
    )


# Data needed to create an instance of the model: one on URM_train, the other on URM_all.
recommender_input_args = _make_search_input_args(URM_train)
recommender_input_args_last_test = _make_search_input_args(URM_all)

Let's do some experiments!

In [8]:
import os

# Folder that receives the files produced by the hyperparameter search.
output_folder_path = "result_experiments/"

# Create the directory if it does not exist. exist_ok=True replaces the
# separate existence check, so there is no window between checking and creating.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget: total configurations to evaluate, of which 30% (truncated
# to an integer, i.e. 1 when n_cases is 5) are random starting points.
n_cases = 5
n_random_starts = int(n_cases*0.3)

# Metric and cutoff that the search optimizes.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

### Bayesian Search

In [9]:
# Gather every search option in one mapping, then launch the search.
search_options = dict(
    recommender_input_args_last_test=recommender_input_args_last_test,
    hyperparameter_search_space=hyperparameters_range_dictionary,
    n_cases=n_cases,
    n_random_starts=n_random_starts,
    save_model="last",
    output_folder_path=output_folder_path,  # Where to save the results
    output_file_name_root=recommender_class.RECOMMENDER_NAME,  # How to call the files
    metric_to_optimize=metric_to_optimize,
    cutoff_to_optimize=cutoff_to_optimize,
)

hyperparameterSearch.search(recommender_input_args, **search_options)

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'num_factors': 32}
PureSVDRecommender: URM Detected 1 ( 0.0%) users with no interactions.
PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... done in 1.20 sec
EvaluatorHoldout: Processed 37271 (100.0%) in 41.32 sec. Users per second: 902
SearchBayesianSkopt: New best config found. Config 0: {'num_factors': 32} - results: PRECISION: 0.0230716, PRECISION_RECALL_MIN_DEN: 0.0564688, RECALL: 0.0556349, MAP: 0.0097705, MAP_MIN_DEN: 0.0234995, MRR: 0.0779316, NDCG: 0.0448999, F1: 0.0326170, HIT_RATE: 0.1836012, ARHR_ALL_HITS: 0.0870259, NOVELTY: 0.0040073, AVERAGE_POPULARITY: 0.1753179, DIVERSITY_MEAN_INTER_LIST: 0.9224066, DIVERSITY_HERFINDAHL: 0.9922382, COVERAGE_ITEM: 0.0132207, COVERAGE_ITEM_HIT: 0.0102012, ITEMS_IN_GT: 0.9305505, COVERAGE_USER: 0.8953134, COVERAGE_USER_HIT: 0.1643806, USERS_IN_GT: 0.8953134, DIVERSITY_GINI: 0.0050018, SHANN



Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 0.3471
Function value obtained: -0.0098
Current minimum: -0.0098
Iteration No: 5 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'num_factors': 30}
PureSVDRecommender: URM Detected 1 ( 0.0%) users with no interactions.
PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... done in 1.00 sec
EvaluatorHoldout: Processed 37271 (100.0%) in 43.27 sec. Users per second: 861
SearchBayesianSkopt: New best config found. Config 4: {'num_factors': 30} - results: PRECISION: 0.0232245, PRECISION_RECALL_MIN_DEN: 0.0567925, RECALL: 0.0559642, MAP: 0.0099635, MAP_MIN_DEN: 0.0239475, MRR: 0.0792713, NDCG: 0.0454815, F1: 0.0328264, HIT_RATE: 0.1843256, ARHR_ALL_HITS: 0.0885838, NOVELTY: 0.0039943, AVERAGE_POPULARITY: 0.1792417, DIVERSITY_MEAN_INTER_LIST: 0.9188238, DIVERSITY_HERFINDAHL: 0.9918799, COVERAGE_ITEM: 0.0127311, COVERAGE_ITEM_HIT: 0.

### Search Results:

In [10]:
from Recommenders.DataIO import DataIO

# Metadata archive name: the recommender name followed by "_metadata.zip"
# (presumably the file written by the search above -- same folder and name root).
metadata_file_name = recommender_class.RECOMMENDER_NAME + "_metadata.zip"

data_loader = DataIO(folder_path=output_folder_path)
search_metadata = data_loader.load_data(metadata_file_name)

# Show which entries the metadata contains.
search_metadata.keys()

dict_keys(['algorithm_name_recommender', 'algorithm_name_search', 'cutoff_to_optimize', 'exception_list', 'hyperparameters_best', 'hyperparameters_best_index', 'hyperparameters_df', 'metric_to_optimize', 'result_on_earlystopping_df', 'result_on_last', 'result_on_test_best', 'result_on_test_df', 'result_on_validation_best', 'result_on_validation_df', 'time_df', 'time_on_last_df', 'time_on_test_avg', 'time_on_test_total', 'time_on_train_avg', 'time_on_train_total', 'time_on_validation_avg', 'time_on_validation_total'])

In [11]:
# Table of the hyperparameter configurations stored in the search metadata;
# the bare expression on the last line displays it.
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,num_factors
0,32
1,35
2,31
3,31
4,30


In [12]:
# Validation metrics stored in the search metadata under
# "result_on_validation_df"; the bare expression on the last line displays them.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.023072,0.056469,0.055635,0.00977,0.0235,0.077932,0.0449,0.032617,0.183601,0.087026,...,0.895313,0.164381,0.895313,0.005002,7.338793,0.992699,0.011409,0.556704,2.895877,0.408085
1,10,0.022707,0.055216,0.054397,0.009444,0.022491,0.074948,0.043457,0.032039,0.180677,0.083909,...,0.895313,0.161762,0.895313,0.005178,7.389767,0.993013,0.01181,0.56057,2.822456,0.409542
2,10,0.023074,0.056642,0.055824,0.009753,0.02349,0.07779,0.04492,0.032652,0.184057,0.086875,...,0.895313,0.164789,0.895313,0.004908,7.31131,0.992483,0.011195,0.554619,2.926237,0.407365
3,10,0.023074,0.056642,0.055824,0.009753,0.02349,0.07779,0.04492,0.032652,0.184057,0.086875,...,0.895313,0.164789,0.895313,0.004908,7.31131,0.992483,0.011195,0.554619,2.926237,0.407365
4,10,0.023224,0.056793,0.055964,0.009964,0.023947,0.079271,0.045482,0.032826,0.184326,0.088584,...,0.895313,0.165029,0.895313,0.004807,7.285446,0.992341,0.010965,0.552657,2.96069,0.40676


In [13]:
# Entry stored in the search metadata under "hyperparameters_best";
# the bare expression on the last line displays it.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'num_factors': 30}

## Recommender Testing

In [15]:
# Train the final model on the full URM using the hyperparameters selected by
# the search (loaded into `best_hyperparameters` from the search metadata)
# rather than a hand-copied value, so the fit stays in sync when the search is re-run.
recommender = PureSVDRecommender(URM_all)
recommender.fit(**best_hyperparameters)

PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... done in 1.35 sec


## Submissions

In [16]:
# Load the target users for the submission; the 'user_id' column is read when
# generating recommendations.
test_users = pd.read_csv('../Dataset/data_target_users_test.csv')

In [17]:
# Ask the fitted model for a top-10 list for every target user, in file order.
user_id = test_users['user_id']
recommendations = [recommender.recommend(user, cutoff=10) for user in user_id]

In [18]:
# Serialize each user's recommendation list as item ids separated by a single
# space. Joining the ids explicitly avoids the alignment padding (and possible
# line wrapping) that str() of a NumPy array inserts when the ids have
# different widths, which left irregular whitespace in the submission.
test_users['item_list'] = [
    ' '.join(str(item) for item in items) for items in recommendations
]

# Forward slashes work on every platform (matching the input path used for the
# target users) and avoid the invalid "\S" escape of the backslash version.
test_users.to_csv('../Submissions/Submission_03_10933934_PureSVD.csv', index=False)