In [1]:
import os
import random

import numpy as np
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as plt
import seaborn as snb

# Random seed for reproducibility
SEED = 42
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [2]:
from data_manager import DatasetLoader, DatasetSplitter, URMGenerator

dataset_loader = DatasetLoader()
dataset_splitter = DatasetSplitter(dataset_loader)
dataset_train, dataset_val = dataset_splitter.load_train_val()
URM_generator = URMGenerator(dataset_train, dataset_val)
URM_train, URM_val = URM_generator.generate_explicit_URM()
URM_all = URM_train + URM_val

Loading previusly generated splits...
Generating explicit URM...


In [3]:
from evaluation.evaluator import EvaluatorHoldout

evaluator = EvaluatorHoldout(URM_val, cutoff_list=[10])

In [4]:
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from skopt.space import Real, Integer, Categorical
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

output_folder_path = "result_experiments/ItemKNNCFRecommender/"
recommender_class = ItemKNNCFRecommender
n_cases = 30
n_random_starts = int(n_cases * 0.3)
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# Define hyperparameters
hyperparameters_range_dictionary = {
    "topK": Integer(500, 2000),
    "shrink": Integer(0, 1000),
    "similarity": Categorical(["cosine"]),
    "normalize": Categorical([True, False]),
}

hyperparameter_search = SearchBayesianSkopt(
    recommender_class,
    evaluator_validation=evaluator,
)

recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[
        URM_train,
    ],
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
    EARLYSTOPPING_KEYWORD_ARGS={},
)


In [5]:
hyperparameter_search.search(
    recommender_input_args,
    hyperparameter_search_space=hyperparameters_range_dictionary,
    n_cases=n_cases,
    n_random_starts=n_random_starts,
    save_model="best",
    output_folder_path=output_folder_path,  # Where to save the results
    output_file_name_root=recommender_class.RECOMMENDER_NAME,  # How to call the files
    metric_to_optimize=metric_to_optimize,
    cutoff_to_optimize=cutoff_to_optimize,
)

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 594, 'shrink': 913, 'similarity': 'cosine', 'normalize': False}
Similarity column 24507 (100.0%), 5493.30 column/sec. Elapsed time 4.46 sec
EvaluatorHoldout: Processed 41629 (100.0%) in 52.95 sec. Users per second: 786
SearchBayesianSkopt: New best config found. Config 0: {'topK': 594, 'shrink': 913, 'similarity': 'cosine', 'normalize': False} - results: PRECISION: 0.0316150, PRECISION_RECALL_MIN_DEN: 0.0547142, RECALL: 0.0521715, MAP: 0.0138981, MAP_MIN_DEN: 0.0237744, MRR: 0.1077698, NDCG: 0.0551401, F1: 0.0393715, HIT_RATE: 0.2450695, ARHR_ALL_HITS: 0.1219339, NOVELTY: 0.0036897, AVERAGE_POPULARITY: 0.3968522, DIVERSITY_MEAN_INTER_LIST: 0.5925273, DIVERSITY_HERFINDAHL: 0.9592513, COVERAGE_ITEM: 0.1406129, COVERAGE_ITEM_HIT: 0.0137512, ITEMS_IN_GT: 0.9925736, COVERAGE_USER: 1.0000000, COVERAGE_USER_HIT: 0.2450695, USERS_IN_GT: 1.0000000, DIVERSITY_GINI: 0.0037006, SHANNON_ENTRO



Iteration No: 12 ended. Search finished for the next optimal point.
Time taken: 0.4946
Function value obtained: -0.0214
Current minimum: -0.0214
Iteration No: 13 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 503, 'shrink': 999, 'similarity': 'cosine', 'normalize': True}
Similarity column 24507 (100.0%), 5055.79 column/sec. Elapsed time 4.85 sec
EvaluatorHoldout: Processed 41629 (100.0%) in 51.36 sec. Users per second: 811
SearchBayesianSkopt: Config 12 is suboptimal. Config: {'topK': 503, 'shrink': 999, 'similarity': 'cosine', 'normalize': True} - results: PRECISION: 0.0357035, PRECISION_RECALL_MIN_DEN: 0.0605078, RECALL: 0.0574890, MAP: 0.0169753, MAP_MIN_DEN: 0.0283043, MRR: 0.1275474, NDCG: 0.0635587, F1: 0.0440498, HIT_RATE: 0.2664729, ARHR_ALL_HITS: 0.1465109, NOVELTY: 0.0038195, AVERAGE_POPULARITY: 0.3424230, DIVERSITY_MEAN_INTER_LIST: 0.7301781, DIVERSITY_HERFINDAHL: 0.9730161, COVERAGE_ITEM: 0.1534664, COVERAGE_ITEM_HIT: 0.0177092,



Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.6699
Function value obtained: -0.0214
Current minimum: -0.0214
Iteration No: 22 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 500, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}
SearchBayesianSkopt: Config 21 was already explored at index 10. Config: {'topK': 500, 'shrink': 0, 'similarity': 'cosine', 'normalize': True} - results: PRECISION: 0.0441255, PRECISION_RECALL_MIN_DEN: 0.0716458, RECALL: 0.0676926, MAP: 0.0214337, MAP_MIN_DEN: 0.0345214, MRR: 0.1508167, NDCG: 0.0745194, F1: 0.0534255, HIT_RATE: 0.3070215, ARHR_ALL_HITS: 0.1787956, NOVELTY: 0.0044779, AVERAGE_POPULARITY: 0.1975485, DIVERSITY_MEAN_INTER_LIST: 0.9214757, DIVERSITY_HERFINDAHL: 0.9921454, COVERAGE_ITEM: 0.6211695, COVERAGE_ITEM_HIT: 0.0533725, ITEMS_IN_GT: 0.9925736, COVERAGE_USER: 1.0000000, COVERAGE_USER_HIT: 0.3070215, USERS_IN_GT: 1.0000000, DIVERSITY_GINI: 0.0545728, SHANNON_



Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.5883
Function value obtained: -0.0214
Current minimum: -0.0214
Iteration No: 23 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 1996, 'shrink': 988, 'similarity': 'cosine', 'normalize': True}
Similarity column 24507 (100.0%), 4685.01 column/sec. Elapsed time 5.23 sec
EvaluatorHoldout: Processed 41629 (100.0%) in 1.07 min. Users per second: 647
SearchBayesianSkopt: Config 22 is suboptimal. Config: {'topK': 1996, 'shrink': 988, 'similarity': 'cosine', 'normalize': True} - results: PRECISION: 0.0347762, PRECISION_RECALL_MIN_DEN: 0.0593770, RECALL: 0.0564950, MAP: 0.0164343, MAP_MIN_DEN: 0.0275629, MRR: 0.1244094, NDCG: 0.0617640, F1: 0.0430515, HIT_RATE: 0.2616926, ARHR_ALL_HITS: 0.1423660, NOVELTY: 0.0037415, AVERAGE_POPULARITY: 0.3694545, DIVERSITY_MEAN_INTER_LIST: 0.6762735, DIVERSITY_HERFINDAHL: 0.9676257, COVERAGE_ITEM: 0.0886277, COVERAGE_ITEM_HIT: 0.0150569



Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 0.8458
Function value obtained: -0.0214
Current minimum: -0.0214
Iteration No: 30 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 500, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}
SearchBayesianSkopt: Config 29 was already explored at index 10. Config: {'topK': 500, 'shrink': 0, 'similarity': 'cosine', 'normalize': True} - results: PRECISION: 0.0441255, PRECISION_RECALL_MIN_DEN: 0.0716458, RECALL: 0.0676926, MAP: 0.0214337, MAP_MIN_DEN: 0.0345214, MRR: 0.1508167, NDCG: 0.0745194, F1: 0.0534255, HIT_RATE: 0.3070215, ARHR_ALL_HITS: 0.1787956, NOVELTY: 0.0044779, AVERAGE_POPULARITY: 0.1975485, DIVERSITY_MEAN_INTER_LIST: 0.9214757, DIVERSITY_HERFINDAHL: 0.9921454, COVERAGE_ITEM: 0.6211695, COVERAGE_ITEM_HIT: 0.0533725, ITEMS_IN_GT: 0.9925736, COVERAGE_USER: 1.0000000, COVERAGE_USER_HIT: 0.3070215, USERS_IN_GT: 1.0000000, DIVERSITY_GINI: 0.0545728, SHANNON_



Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 0.6318
Function value obtained: -0.0214
Current minimum: -0.0214
SearchBayesianSkopt: Search complete. Best config is 10: {'topK': 500, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}



In [6]:
from Recommenders.DataIO import DataIO

data_loader = DataIO(folder_path=output_folder_path)
search_metadata = data_loader.load_data(
    recommender_class.RECOMMENDER_NAME + "_metadata.zip"
)

result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.031615,0.054714,0.052172,0.013898,0.023774,0.10777,0.05514,0.039372,0.24507,0.121934,...,1.0,0.24507,1.0,0.003701,5.697423,0.959568,0.007274,0.422222,7.841874,0.211102
1,10,0.031572,0.054633,0.052096,0.013876,0.023736,0.107638,0.055039,0.039316,0.244901,0.121761,...,1.0,0.244901,1.0,0.003597,5.67897,0.959325,0.00707,0.420855,7.863236,0.210923
2,10,0.030993,0.053891,0.051434,0.013565,0.023277,0.105623,0.053983,0.038679,0.24149,0.119269,...,1.0,0.24149,1.0,0.002303,5.420465,0.955804,0.004528,0.401698,8.14719,0.208442
3,10,0.043753,0.071714,0.067817,0.021249,0.034487,0.150606,0.075576,0.05319,0.30719,0.177911,...,1.0,0.30719,1.0,0.023544,8.46768,0.99046,0.046282,0.62752,4.413968,0.243614
4,10,0.031315,0.054226,0.051726,0.013754,0.023563,0.106907,0.054556,0.039012,0.243364,0.120815,...,1.0,0.243364,1.0,0.003105,5.585227,0.958004,0.006104,0.413908,7.969457,0.210018
5,10,0.031096,0.053958,0.051485,0.013618,0.023339,0.105944,0.054129,0.038774,0.241923,0.119678,...,1.0,0.241923,1.0,0.002544,5.471597,0.956482,0.005,0.405487,8.092782,0.208927
6,10,0.031072,0.053954,0.051486,0.013607,0.023321,0.105887,0.054119,0.038755,0.241875,0.119602,...,1.0,0.241875,1.0,0.002464,5.455576,0.95627,0.004843,0.4043,8.110124,0.208773
7,10,0.039528,0.066091,0.062639,0.019025,0.031337,0.139348,0.069847,0.048469,0.286843,0.161996,...,1.0,0.286843,1.0,0.00739,7.137856,0.982519,0.014526,0.528969,5.768751,0.226229
8,10,0.037587,0.063352,0.060144,0.018012,0.029872,0.133745,0.066653,0.046262,0.276466,0.154486,...,1.0,0.276466,1.0,0.005088,6.652998,0.977094,0.010003,0.493038,6.430941,0.220711
9,10,0.043765,0.071077,0.067176,0.021239,0.034232,0.149408,0.073633,0.053001,0.303971,0.177175,...,1.0,0.303971,1.0,0.05667,9.15227,0.991748,0.1114,0.678253,4.058719,0.255407


In [7]:
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,topK,shrink,similarity,normalize
0,594,913,cosine,False
1,624,959,cosine,False
2,1508,200,cosine,False
3,603,17,cosine,True
4,819,358,cosine,False
5,1218,732,cosine,False
6,1296,34,cosine,False
7,530,259,cosine,True
8,1043,418,cosine,True
9,2000,0,cosine,True


In [8]:
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'topK': 500, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}

In [9]:
exception_list = search_metadata["exception_list"]
exception_list

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [10]:
recommender = recommender_class(URM_all)
recommender.fit(**best_hyperparameters)
recommender.save_model(
    folder_path=output_folder_path,
    file_name=recommender_class.RECOMMENDER_NAME
    + "_best_model_trained_on_everything.zip",
)

Similarity column 24507 (100.0%), 3450.89 column/sec. Elapsed time 7.10 sec
ItemKNNCFRecommender: Saving model in file 'result_experiments/ItemKNNCFRecommender/ItemKNNCFRecommender_best_model_trained_on_everything.zip'
ItemKNNCFRecommender: Saving complete


In [None]:
from utils.create_submission import create_submission

create_submission(recommender)