# Data loading 
The next cells load the data we need.

In [1]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
# Add the parent directory to the module search path.
# NOTE(review): presumably the project packages imported in later cells
# (Base, Data_manager, KNN, ParameterTuning) live there — confirm.
sys.path.append('..')


In [2]:
# Training interactions in coordinate form: each line holds the user index
# (`row`), the item index (`col`) and the implicit interaction value (`data`).
dataset = pd.read_csv('data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of the data to check for missing users, items, etc.

In [3]:
# Split the interaction table into its three coordinate vectors.
users = dataset["row"]
items = dataset["col"]
data = dataset["data"]

# Assemble the user-item interaction matrix (users on rows, items on columns)
# in COO format and convert it straight to CSR, which gives fast row access,
# i.e. fast access to a single user's interactions.
URM_all = sps.coo_matrix((data, (users, items))).tocsr()
URM_all.shape

(7947, 25975)

In [4]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Seed NumPy's global random generator so that the random holdout below, and
# every metric and tuned hyperparameter derived from it, can be reproduced
# after a kernel restart.
# NOTE(review): this assumes the split helper draws from NumPy's global RNG —
# confirm against its implementation.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Random global split of the interactions: 85% for training, the remainder
# for validation.
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)

# Evaluator working on the validation interactions with a cutoff of 10.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])



In [5]:
from skopt.space import Real, Integer, Categorical

# Search space handed to the hyperparameter search: both hyperparameters are
# integers ranging between 1 and 800.
hyperparameters_range_dictionary = {
    "topK": Integer(1, 800),
    "shrink": Integer(1, 800),
}

In [6]:
from KNN.UserKNNCFRecommender import UserKNNCFRecommender
from ParameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Recommender class whose hyperparameters are going to be tuned.
recommender_class = UserKNNCFRecommender

# Search object driving the tuning of `recommender_class`.
# NOTE(review): the same evaluator (built on URM_validation) is passed as both
# the validation and the test evaluator, so any "test" figures are computed on
# the validation interactions rather than on independent held-out data.
parameterSearch = SearchBayesianSkopt(recommender_class,
                                 evaluator_validation=evaluator_validation,
                                 evaluator_test=evaluator_validation)

In [7]:
from ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
# Constructor/fit arguments passed to the search as its first argument: the
# recommender is built on the training split only, with no extra constructor
# keywords and no fixed fit arguments.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [8]:
# Same argument bundle, but built on the full interaction matrix; it is handed
# to the search as `recommender_input_args_last_test`.
# NOTE(review): URM_all contains the validation interactions, and the "test"
# evaluator is also built on URM_validation, so a model built from these
# arguments would be scored on data it has already seen — confirm how the
# search uses them.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [9]:
import os

# Folder handed to the search as `output_folder_path`; the search metadata is
# later read back from it.
output_folder_path = "result_experiments/"

# Create the folder when it is missing. `exist_ok=True` makes the call
# idempotent and removes the gap between checking for the folder and
# creating it.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget: `n_cases` is passed to the search as the number of cases and
# 30% of that figure (9) as the number of random starts.
n_cases = 30
n_random_starts = int(n_cases*0.3)

# Name of the metric passed to the search as `metric_to_optimize`.
metric_to_optimize = "MAP"

In [10]:
# Launch the hyperparameter search over `hyperparameters_range_dictionary`,
# optimising `metric_to_optimize`. The output goes to `output_folder_path`,
# using the recommender's name as the file-name root.
# NOTE(review): `save_model = "last"` presumably stores only the final model —
# confirm against the search class.
parameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       parameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path,
                       output_file_name_root = recommender_class.RECOMMENDER_NAME,
                       metric_to_optimize = metric_to_optimize,
                      )

ECALL: 0.1129614, MAP: 0.0547726, MRR: 0.1052556, NDCG: 0.0799163, F1: 0.0441221, HIT_RATE: 0.2741514, ARHR: 0.1159675, NOVELTY: 0.0044989, AVERAGE_POPULARITY: 0.1784748, DIVERSITY_MEAN_INTER_LIST: 0.9524092, DIVERSITY_HERFINDAHL: 0.9952218, COVERAGE_ITEM: 0.2751492, COVERAGE_ITEM_CORRECT: 0.0242926, COVERAGE_USER: 0.6265257, COVERAGE_USER_CORRECT: 0.1401787, DIVERSITY_GINI: 0.0594917, SHANNON_ENTROPY: 9.8671586, 

Iteration No: 16 ended. Search finished for the next optimal point.
Time taken: 4.2741
Function value obtained: -0.0548
Current minimum: -0.0573
Iteration No: 17 started. Searching for the next optimal point.
UserKNNCFRecommender: URM Detected 42 (0.53 %) cold users.
UserKNNCFRecommender: URM Detected 1981 (7.63 %) cold items.
SearchBayesianSkopt: Testing config: {'topK': 95, 'shrink': 13}
Similarity column 7947 ( 100 % ), 16664.18 column/sec, elapsed time 0.01 min
EvaluatorHoldout: Processed 4979 ( 100.00% ) in 3.67 sec. Users per second: 1356
SearchBayesianSkopt: New best 

In [11]:
from Base.DataIO import DataIO

# Open the metadata archive stored in the search output folder; its name is
# the recommender's name followed by "_metadata.zip".
data_loader = DataIO(folder_path=output_folder_path)
metadata_file_name = recommender_class.RECOMMENDER_NAME + "_metadata.zip"
search_metadata = data_loader.load_data(metadata_file_name)

# Pull out the stored hyperparameter list and the best configuration.
hyperparameters_list = search_metadata["hyperparameters_list"]
best_parameters = search_metadata["hyperparameters_best"]

best_parameters

{'topK': 146, 'shrink': 1}

In [12]:
# Target users for which recommendations must be produced; the `user_id`
# column is the one passed to the recommender further down.
test_users = pd.read_csv('data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [13]:
# Fit the final model on the full interaction matrix.
# NOTE(review): these hyperparameters are hard-coded rather than taken from
# the `best_parameters` loaded from the search metadata, and
# `feature_weighting` is not part of `hyperparameters_range_dictionary`, so it
# was never tuned by the search — confirm this is intentional.
recommender = UserKNNCFRecommender(URM_all)
recommender.fit(shrink=100, topK=150, feature_weighting = "TF-IDF")

# Ask for recommendations with cutoff 10 for every target user.
user_id = test_users['user_id']
recommendations = recommender.recommend(user_id,cutoff = 10)


UserKNNCFRecommender: URM Detected 1079 (4.15 %) cold items.
Similarity column 7947 ( 100 % ), 16031.16 column/sec, elapsed time 0.01 min


In [14]:
# Render each user's recommendations as one string of item ids separated by a
# single space. Joining the ids explicitly, instead of stripping the brackets
# from the string form of a NumPy array, keeps the output independent of
# NumPy's print formatting, which pads elements to a common width and wraps
# long arrays over several lines.
item_lists = [
    " ".join(str(item_id) for item_id in user_recommendations)
    for user_recommendations in recommendations
]
print(len(item_lists))

# One recommendation string per target user, in the same order in which the
# user ids were passed to the recommender.
test_users['item_list'] = item_lists

# Write the submission file (columns: user_id, item_list) and preview it.
test_users.to_csv('submission.csv', index=False)
test_users.head()


7944
