In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot

from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Utils.DataReader import load_urm, load_icm, load_target

Cloning into 'Recommender-Systems-Challenge-2022'...
remote: Enumerating objects: 619, done.[K
remote: Counting objects: 100% (215/215), done.[K
remote: Compressing objects: 100% (162/162), done.[K
remote: Total 619 (delta 107), reused 139 (delta 52), pack-reused 404[K
Receiving objects: 100% (619/619), 106.18 MiB | 23.69 MiB/s, done.
Resolving deltas: 100% (279/279), done.
Updating files: 100% (254/254), done.


# Loading Data

In [2]:
# Complete URM returned by the project's DataReader helper
# (used later for the final fit).
URM_all = load_urm()

# Pre-computed splits, each read from its own .npz sparse-matrix file.
URM_train = sps.load_npz('Dataset/Split/URM_train.npz')
URM_validation = sps.load_npz('Dataset/Split/URM_validation.npz')
URM_test = sps.load_npz('Dataset/Split/URM_test.npz')

# One holdout evaluator per held-out split; both report metrics at cutoff 10.
evaluator_validation = EvaluatorHoldout(
    URM_validation,
    cutoff_list=[10],
)
evaluator_test = EvaluatorHoldout(
    URM_test,
    cutoff_list=[10],
)

EvaluatorHoldout: Ignoring 1569 ( 3.8%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 921 ( 2.2%) Users that have less than 1 test interactions


In [3]:
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender

# Model class to tune; later cells read recommender_class.RECOMMENDER_NAME
# to name the result files and hand the class itself to the search object.
recommender_class = ItemKNNCFRecommender

In [4]:
from skopt.space import Real, Integer, Categorical

# Search space handed to the Bayesian search:
#   - topK and shrink are integer dimensions,
#   - similarity and feature_weighting are pinned to a single option,
#   - normalize is tried both on and off.
hyperparameters_range_dictionary = dict(
    topK=Integer(5, 1000),
    shrink=Integer(0, 1000),
    similarity=Categorical(["cosine"]),
    normalize=Categorical([True, False]),
    feature_weighting=Categorical(["TF-IDF"]),
)

In [6]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Search object for the selected recommender class; only the validation
# evaluator is supplied, so candidates are scored on URM_validation.
hyperparameterSearch = SearchBayesianSkopt(
    recommender_class,
    evaluator_validation=evaluator_validation,
)

In [7]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

# Constructor / fit arguments for the models built during the search.
# The constructor receives only URM_train here; for a CBF model the
# positional list would be [URM_train, ICM_train] instead.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_train],
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
)

In [8]:
# Same argument bundle as recommender_input_args, but the constructor gets
# URM_all; it is passed to the search as `recommender_input_args_last_test`.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_all],
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
)

In [9]:
import os

# Folder handed to the search as `output_folder_path` (where results are saved).
output_folder_path = "result_experiments/"

# exist_ok=True creates the folder when missing and is a no-op otherwise, so
# the cell can be re-run safely and there is no gap between check and create.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget and objective.
n_cases = 50                             # total configurations to evaluate
n_random_starts = int(n_cases * 0.3)     # 30% of the cases (15) are random starts
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [10]:
# Load the Cython IPython extension.
# NOTE(review): the search log in the next cell still prints
# "Unable to load Cython Compute_Similarity, reverting to Python", so this
# extension alone does not seem to enable the compiled similarity - confirm
# whether the Cython module has to be built separately.
%load_ext Cython

In [11]:
# NOTE(review): wildcard import kept as-is; nothing in this cell visibly uses a
# name from it - confirm whether it is still needed.
from Recommenders.Similarity import *

# Keyword options for the Bayesian search, gathered in one place.
search_options = dict(
    recommender_input_args_last_test=recommender_input_args_last_test,
    hyperparameter_search_space=hyperparameters_range_dictionary,
    n_cases=n_cases,
    n_random_starts=n_random_starts,
    save_model="last",
    output_folder_path=output_folder_path,                      # where to save the results
    output_file_name_root=recommender_class.RECOMMENDER_NAME,   # how to call the files
    metric_to_optimize=metric_to_optimize,
    cutoff_to_optimize=cutoff_to_optimize,
)

# Launch the search: models are built from recommender_input_args and the
# options above drive the optimisation and the saved artefacts.
hyperparameterSearch.search(recommender_input_args, **search_options)

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 156, 'shrink': 162, 'similarity': 'cosine', 'normalize': False, 'feature_weighting': 'TF-IDF'}
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 24507 (100.0%), 662.38 column/sec. Elapsed time 37.00 sec
EvaluatorHoldout: Processed 40060 (100.0%) in 58.05 sec. Users per second: 690
SearchBayesianSkopt: New best config found. Config 0: {'topK': 156, 'shrink': 162, 'similarity': 'cosine', 'normalize': False, 'feature_weighting': 'TF-IDF'} - results: PRECISION: 0.0286046, PRECISION_RECALL_MIN_DEN: 0.0563436, RECALL: 0.0542709, MAP: 0.0121248, MAP_MIN_DEN: 0.0238903, MRR: 0.0967683, NDCG: 0.0490555, F1: 0.0374634, HIT_RATE: 0.2267099, ARHR_ALL_HITS: 0.1079455, NOVELTY: 0.0037612, AVERAGE_POPULARITY: 0.3780745, DIVERSITY_MEAN_INTER_LIST: 0.6725353, DIVERSITY_HERFINDAHL: 0.9672519, COVERAGE_ITEM: 0.2406659, COVERAGE_ITEM_HIT: 0.0164035, ITEMS_IN_GT: 0.98628



Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 1.2565
Function value obtained: -0.0172
Current minimum: -0.0172
Iteration No: 44 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 923, 'shrink': 967, 'similarity': 'cosine', 'normalize': True, 'feature_weighting': 'TF-IDF'}
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 24507 (100.0%), 407.14 column/sec. Elapsed time 1.00 min
EvaluatorHoldout: Processed 40060 (100.0%) in 1.17 min. Users per second: 570
SearchBayesianSkopt: Config 43 is suboptimal. Config: {'topK': 923, 'shrink': 967, 'similarity': 'cosine', 'normalize': True, 'feature_weighting': 'TF-IDF'} - results: PRECISION: 0.0375462, PRECISION_RECALL_MIN_DEN: 0.0718861, RECALL: 0.0689683, MAP: 0.0166930, MAP_MIN_DEN: 0.0318142, MRR: 0.1249875, NDCG: 0.0639184, F1: 0.0486224, HIT_RATE: 0.2784324, ARHR_ALL_HITS: 0.1438004, NOVELTY: 0.0041723, AVERAGE_POPULARITY: 0.2422863, DIVE



Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 1.2340
Function value obtained: -0.0172
Current minimum: -0.0172
Iteration No: 49 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 236, 'shrink': 991, 'similarity': 'cosine', 'normalize': False, 'feature_weighting': 'TF-IDF'}
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 24507 (100.0%), 640.09 column/sec. Elapsed time 38.29 sec
EvaluatorHoldout: Processed 40060 (100.0%) in 1.03 min. Users per second: 650
SearchBayesianSkopt: Config 48 is suboptimal. Config: {'topK': 236, 'shrink': 991, 'similarity': 'cosine', 'normalize': False, 'feature_weighting': 'TF-IDF'} - results: PRECISION: 0.0280679, PRECISION_RECALL_MIN_DEN: 0.0556038, RECALL: 0.0536052, MAP: 0.0118239, MAP_MIN_DEN: 0.0234631, MRR: 0.0948901, NDCG: 0.0482228, F1: 0.0368441, HIT_RATE: 0.2235147, ARHR_ALL_HITS: 0.1055937, NOVELTY: 0.0037194, AVERAGE_POPULARITY: 0.3932341, D



Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 1.2708
Function value obtained: -0.0172
Current minimum: -0.0172
SearchBayesianSkopt: Search complete. Best config is 41: {'topK': 138, 'shrink': 1000, 'similarity': 'cosine', 'normalize': True, 'feature_weighting': 'TF-IDF'}

SearchBayesianSkopt: Evaluation with constructor data for final test. Using best config: {'topK': 138, 'shrink': 1000, 'similarity': 'cosine', 'normalize': True, 'feature_weighting': 'TF-IDF'}
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 24507 (100.0%), 536.47 column/sec. Elapsed time 45.68 sec
SearchBayesianSkopt: Saving model in result_experiments/ItemKNNCFRecommender

ItemKNNCFRecommender: Saving model in file 'result_experiments/ItemKNNCFRecommender_best_model_last'
ItemKNNCFRecommender: Saving complete


In [12]:
from Recommenders.DataIO import DataIO

# Metadata archive named after the recommender, read from the same folder
# that was given to the search as its output location.
metadata_file_name = recommender_class.RECOMMENDER_NAME + "_metadata.zip"
data_loader = DataIO(folder_path=output_folder_path)
search_metadata = data_loader.load_data(metadata_file_name)

# Last expression of the cell: show which entries the metadata contains.
search_metadata.keys()

dict_keys(['result_on_validation_df', 'metric_to_optimize', 'result_on_last', 'time_on_last_df', 'result_on_earlystopping_df', 'hyperparameters_df', 'time_on_validation_avg', 'algorithm_name_recommender', 'cutoff_to_optimize', 'result_on_test_best', 'algorithm_name_search', 'hyperparameters_best_index', 'time_on_test_avg', 'result_on_validation_best', 'result_on_test_df', 'time_on_test_total', 'exception_list', 'time_on_validation_total', 'time_on_train_avg', 'time_df', 'hyperparameters_best', 'time_on_train_total'])

In [13]:
# Table of the configurations tried by the search; the rendered output shows
# one row per configuration and one column per hyperparameter.
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,topK,shrink,similarity,normalize,feature_weighting
0,156,162,cosine,False,TF-IDF
1,310,816,cosine,False,TF-IDF
2,736,138,cosine,True,TF-IDF
3,952,504,cosine,True,TF-IDF
4,705,805,cosine,True,TF-IDF
5,720,630,cosine,False,TF-IDF
6,99,557,cosine,True,TF-IDF
7,762,768,cosine,False,TF-IDF
8,342,173,cosine,False,TF-IDF
9,917,495,cosine,False,TF-IDF


In [14]:
# Validation metrics stored by the search; in the rendered output each row is
# keyed by a configuration number and a cutoff (10), with one column per metric.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.028605,0.056344,0.054271,0.012125,0.02389,0.096768,0.049055,0.037463,0.22671,0.107946,...,0.96231,0.218165,0.96231,0.008279,6.323196,0.967571,0.016364,0.468676,7.473729,0.253212
1,10,0.027731,0.055053,0.053082,0.011618,0.023098,0.093273,0.047556,0.03643,0.220769,0.103793,...,0.96231,0.212448,0.96231,0.005724,5.931209,0.96208,0.011313,0.439622,7.94214,0.248787
2,10,0.037676,0.071851,0.068956,0.016799,0.032058,0.125596,0.064209,0.048728,0.278308,0.144628,...,0.96231,0.267818,0.96231,0.040823,8.673812,0.990772,0.080692,0.642904,4.466302,0.288539
3,10,0.037559,0.07189,0.068989,0.016721,0.031875,0.125109,0.063992,0.048638,0.278333,0.144022,...,0.96231,0.267842,0.96231,0.027427,8.396866,0.989912,0.054214,0.622376,4.665729,0.283853
4,10,0.037644,0.072026,0.069132,0.016792,0.032014,0.125776,0.064217,0.048745,0.278682,0.144698,...,0.96231,0.268178,0.96231,0.023709,8.330286,0.989927,0.046864,0.617441,4.675893,0.282628
5,10,0.026897,0.053692,0.051798,0.011168,0.022274,0.09037,0.04611,0.035408,0.216226,0.100183,...,0.96231,0.208076,0.96231,0.003497,5.547265,0.956514,0.006913,0.411164,8.381321,0.244596
6,10,0.03818,0.072736,0.069791,0.017063,0.032602,0.126921,0.065083,0.049358,0.280579,0.146543,...,0.96231,0.270004,0.96231,0.029024,8.932091,0.993745,0.057371,0.662047,3.844888,0.292254
7,10,0.026812,0.053528,0.05164,0.011142,0.022256,0.09018,0.046022,0.035298,0.215652,0.099958,...,0.96231,0.207524,0.96231,0.003396,5.526832,0.956227,0.006713,0.409649,8.403887,0.244375
8,10,0.027591,0.054806,0.052835,0.011564,0.023006,0.093003,0.047367,0.036251,0.220045,0.103399,...,0.96231,0.211751,0.96231,0.005409,5.882174,0.961362,0.010691,0.435987,8.00081,0.248238
9,10,0.026672,0.053256,0.051377,0.011066,0.022064,0.089611,0.045721,0.035115,0.214703,0.099311,...,0.96231,0.206611,0.96231,0.003036,5.461616,0.955342,0.006002,0.404815,8.47126,0.243686


In [15]:
# Best configuration recorded in the search metadata, displayed as a plain dict
# of hyperparameter name -> value.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'topK': 138,
 'shrink': 1000,
 'similarity': 'cosine',
 'normalize': True,
 'feature_weighting': 'TF-IDF'}

These are the best hyperparameters found by the Bayesian search; we will use them to fit our final model.

In [16]:
# Fit the final model on the complete URM with the configuration selected by
# the search. The values are taken from `best_hyperparameters` (loaded from the
# search metadata above) instead of being copied by hand, so re-running the
# search can never leave this cell out of sync with its result.
# The dict holds exactly the fit() keywords previously spelled out here:
# topK, shrink, similarity, normalize and feature_weighting.
recommender = ItemKNNCFRecommender(URM_all)
recommender.fit(**best_hyperparameters)

Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 24507 (100.0%), 532.77 column/sec. Elapsed time 46.00 sec


# Create final recommendations

In [17]:
# Target users for the submission; the following cells read its 'user_id' column.
# NOTE(review): this is an absolute Kaggle path, while the split files above are
# loaded through relative 'Dataset/...' paths - confirm the working directory
# and consider a relative path so the notebook also runs outside Kaggle.
test_users = pd.read_csv('/kaggle/working/Recommender-Systems-Challenge-2022/Dataset/data_target_users_test.csv')

In [18]:
# Query the fitted model once per target user (in file order) and keep the
# 10 recommended items returned for each of them.
user_id = test_users['user_id']
recommendations = [
    recommender.recommend(target_user, cutoff=10)
    for target_user in user_id
]

In [19]:
# Keep every recommendation list as a NumPy array, as this cell always did.
for index, items in enumerate(recommendations):
    recommendations[index] = np.array(items)

# Serialise each list as item ids separated by exactly one space.
# The previous approach stringified the NumPy array and stripped the brackets;
# NumPy pads elements to a common width and may wrap long arrays over several
# lines, which leaves leading/repeated spaces (or newlines) inside the field.
# Joining the ids explicitly gives a stable format. Assigning a plain list also
# avoids relying on index alignment with a temporary DataFrame, and still
# raises if the number of rows and recommendation lists differ.
test_users['item_list'] = [
    " ".join(str(item) for item in items)
    for items in recommendations
]

# NOTE(review): the file name mentions SLIM_EN / rp3Beta / EaseR while the model
# fitted in this notebook is ItemKNNCFRecommender - confirm the intended name.
test_users.to_csv('/kaggle/working/Submission_SLIM_EN_rp3Beta_EaseR_Linear.csv', index=False)