# Data loading 
Next cells are used to load the data we need.

In [46]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
sys.path.append('..')


In [47]:
#data which contains users (row), items(col) and implicit interaction (data)
dataset = pd.read_csv('data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of data to check for missing users, items ecc...

In [48]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

(7947, 25975)

In [49]:
ICM_df = pd.read_csv('data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [50]:
items = ICM_df.row
features = ICM_df.col
data = ICM_df.data
ICM_all = sps.coo_matrix((data, (items, features)))
ICM_all = ICM_all.tocsr() #fast row access -> fast access to users 
ICM_all.shape

(25975, 20000)

In [51]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

In [52]:
#evaluator for validation (used for hyperparameter tuning)
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])


In [53]:
from KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from ParameterTuning.SearchBayesianSkopt import SearchBayesianSkopt


recommender_class = ItemKNNCFRecommender

parameterSearch = SearchBayesianSkopt(recommender_class,
                                 evaluator_validation=evaluator_validation)

from ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
from skopt.space import Real, Integer, Categorical

hyperparameters_range_dictionary = {}
hyperparameters_range_dictionary["topK"] = Integer(5, 1000)
hyperparameters_range_dictionary["shrink"] = Integer(0, 1000)
hyperparameters_range_dictionary["similarity"] = Categorical(["cosine"])
hyperparameters_range_dictionary["normalize"] = Categorical([True, False])
    
    
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)


In [54]:
output_folder_path = "result_experiments/"

import os

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

In [55]:
n_cases = 2
metric_to_optimize = "MAP"

parameterSearch.search(recommender_input_args,
                       parameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = 1,
                       save_model = "no",
                       output_folder_path = output_folder_path,
                       output_file_name_root = recommender_class.RECOMMENDER_NAME,
                       metric_to_optimize = metric_to_optimize
                      )

Iteration No: 1 started. Evaluating function at random point.
ItemKNNCFRecommender: URM Detected 195 (2.45 %) cold users.
ItemKNNCFRecommender: URM Detected 3485 (13.42 %) cold items.
SearchBayesianSkopt: Testing config: {'topK': 234, 'shrink': 896, 'similarity': 'cosine', 'normalize': False}
Similarity column 25975 ( 100 % ), 8405.16 column/sec, elapsed time 0.05 min
EvaluatorHoldout: Processed 4495 ( 100.00% ) in 3.01 sec. Users per second: 1495
SearchBayesianSkopt: New best config found. Config 0: {'topK': 234, 'shrink': 896, 'similarity': 'cosine', 'normalize': False} - results: ROC_AUC: 0.0992172, PRECISION: 0.0186874, PRECISION_RECALL_MIN_DEN: 0.0792238, RECALL: 0.0779763, MAP: 0.0380312, MRR: 0.0710179, NDCG: 0.0553456, F1: 0.0301494, HIT_RATE: 0.1868743, ARHR: 0.0766635, NOVELTY: 0.0046662, AVERAGE_POPULARITY: 0.1381434, DIVERSITY_MEAN_INTER_LIST: 0.9704905, DIVERSITY_HERFINDAHL: 0.9970275, COVERAGE_ITEM: 0.3212705, COVERAGE_ITEM_CORRECT: 0.0162464, COVERAGE_USER: 0.5656222, CO

ValueError: array must not contain infs or NaNs

In [None]:
from Base.DataIO import DataIO

data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")
search_metadata

In [None]:
from GraphBased.P3alphaRecommender import P3alphaRecommender
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender

recommender_alpha = P3alphaRecommender(URM_train)
recommender_alpha.fit(topK=450, alpha = 0.5, implicit = True)

recommender_itemKNNCBF = ItemKNNCBFRecommender(URM_train, ICM_all)
recommender_itemKNNCBF.fit(topK=200, shrink=250, feature_weighting = 'TF-IDF')

In [None]:
hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_train, recommender_itemKNNCBF.W_sparse, recommender_alpha.W_sparse)
hybridrecommender.fit(topK=500, alpha = 0.57)

In [None]:
evaluator_validation.evaluateRecommender(hybridrecommender)

In [None]:
test_users = pd.read_csv('data_target_users_test.csv')
test_users

In [None]:
recommender_alpha = P3alphaRecommender(URM_all)
recommender_alpha.fit(topK=450, alpha = 0.5, implicit = True)

recommender_itemKNNCBF = ItemKNNCBFRecommender(URM_all, ICM_all)
recommender_itemKNNCBF.fit(topK=200, shrink=250, feature_weighting = 'TF-IDF')

hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_all, recommender_itemKNNCBF.W_sparse, recommender_alpha.W_sparse)
hybridrecommender.fit(topK=500, alpha = 0.57)

user_id = test_users['user_id']
recommendations = hybridrecommender.recommend(user_id,cutoff = 10)


In [None]:
for index in range(len(recommendations)):
    #print(element)
    recommendations[index]=np.array(recommendations[index])
    #print(type(element))
print(len(recommendations))

In [None]:
test_users['item_list']= recommendations
#test_users['item_list'] =  test_users['item_list'].apply(lambda x: x.replace('[','').replace(']','')) 
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])

#convert the string columns to int
#test_users['item_list'] = test_users['item_list'].astype(int)
test_users
test_users.to_csv('submission.csv', index=False)
