# Data loading 
The next cells load the data we need.

In [8]:
import os

os.environ["MKL_NUM_THREADS"] = "1" # export MKL_NUM_THREADS=1

import pandas as pd
import numpy as np 
import scipy.sparse as sps
import sys
import matplotlib.pyplot as plt
import optuna

sys.path.append('../../')

from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from MatrixFactorization.IALSRecommender_implicit import IALSRecommender_implicit

In [9]:
# Read the training interactions (columns: row, col, data) and show the frame.
train_csv_path = '../data_train.csv'
dataset = pd.read_csv(train_csv_path)
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of the data to check for missing users, items, etc.

In [10]:
# Build the user-rating matrix (URM) from the interaction triplets:
# `row` is used as the user index, `col` as the item index, `data` as the value.
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users
# A bare expression is only rendered when it is the last statement of a cell,
# so the shape has to be printed explicitly to be visible here.
print(URM_all.shape)

# Users for which recommendations have to be produced.
test_users = pd.read_csv('../data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [11]:
# Read the item-content triplets (columns: row, col, data) and show the frame.
icm_csv_path = '../data_ICM_title_abstract.csv'
ICM_df = pd.read_csv(icm_csv_path)
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [12]:
# Build the item-content matrix (ICM): `row` indexes items, `col` indexes features.
items, features, data = ICM_df.row, ICM_df.col, ICM_df.data
# Assemble in COO format, then convert to CSR: fast row access -> fast access to items.
ICM_all = sps.coo_matrix((data, (items, features))).tocsr()
ICM_all.shape

(25975, 20000)

In [13]:
# Draw three separate 80/20 train/validation splits of URM_all, each paired with
# an evaluator that scores recommendations at cutoff 10.
# NOTE(review): no random seed is set in this cell, so the splits presumably
# differ between runs -- confirm how the splitter draws its sample.
URM_train_vector = []
URM_validation_vector = []
Evaluator_vector = []
for fold_index in range(3):
    URM_train_fold, URM_validation_fold = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)
    URM_train_vector.append(URM_train_fold)
    URM_validation_vector.append(URM_validation_fold)
    Evaluator_vector.append(EvaluatorHoldout(URM_validation_fold, cutoff_list=[10]))

# Keep the numbered names bound as well, in case other cells refer to them.
URM_train_1, URM_train_2, URM_train_3 = URM_train_vector
URM_validation_1, URM_validation_2, URM_validation_3 = URM_validation_vector
evaluator_validation_1, evaluator_validation_2, evaluator_validation_3 = Evaluator_vector



In [None]:
class Objective(object):
    """Optuna objective: mean MAP@10 of an iALS model over several hold-out folds.

    For every trial the model is trained on each training URM stacked on top of
    the transposed ICM (features become additional rows), with the two parts
    scaled by ``1 - weight`` and ``weight`` respectively, and is scored by the
    evaluator paired with that fold.

    Parameters
    ----------
    URM_train_vector : sequence of sparse matrices
        One training URM per fold.
    ICM_all : sparse matrix
        Item-content matrix; its transpose must have as many columns as each URM.
    Evaluator_vector : sequence
        One evaluator per fold, aligned with ``URM_train_vector``. Each must
        expose ``evaluateRecommender(recommender)`` returning
        ``(result_dict, _)`` where ``result_dict[10]["MAP"]`` is available.

    Raises
    ------
    ValueError
        If no fold is given or the two sequences differ in length.
    """

    def __init__(self, URM_train_vector, ICM_all, Evaluator_vector):
        # Fail early instead of letting zip() silently drop unmatched folds.
        if len(URM_train_vector) != len(Evaluator_vector):
            raise ValueError(
                "URM_train_vector and Evaluator_vector must have the same length "
                "({} != {})".format(len(URM_train_vector), len(Evaluator_vector))
            )
        if len(URM_train_vector) == 0:
            raise ValueError("At least one train/evaluator fold is required")

        # Hold the implementation-specific arguments as fields of the instance.
        self.URM_train_vector = URM_train_vector
        self.ICM_all = ICM_all
        self.Evaluator_vector = Evaluator_vector
        # Per-fold MAP of the most recent trial (reset at every call).
        self.MAP_vector = np.zeros(len(URM_train_vector))

    def __call__(self, trial):
        """Sample hyper-parameters from ``trial`` and return the mean MAP@10 over the folds."""
        # Search space (sampling order kept stable so studies stay comparable).
        n_factors = trial.suggest_int('n_factors', 500, 900)
        # suggest_float replaces the deprecated suggest_uniform (same uniform range).
        regularization = trial.suggest_float('regularization', 0.1, 0.9)
        iterations = trial.suggest_int('iterations', 60, 90)
        weight = trial.suggest_float('weight', 0.3, 0.9)

        n_folds = len(self.URM_train_vector)
        self.MAP_vector = np.zeros(n_folds)

        # The weighted ICM block is identical for every fold: build it once.
        # Uses the instance's ICM rather than a notebook-level global.
        weighted_ICM_T = self.ICM_all.T * weight

        folds = zip(self.URM_train_vector, self.Evaluator_vector)
        for fold_index, (URM_train, evaluator_validation) in enumerate(folds):
            # Stack interactions and content so both are factorized jointly.
            URM_train_ICM_all = sps.vstack([URM_train * (1 - weight), weighted_ICM_T])
            recommender = IALSRecommender_implicit(URM_train_ICM_all)
            recommender.fit(n_factors = n_factors, regularization = regularization, iterations = iterations)
            result_dict, _ = evaluator_validation.evaluateRecommender(recommender)
            self.MAP_vector[fold_index] = result_dict[10]["MAP"]

        print('printing self map vector: ', self.MAP_vector)
        MAP = np.sum(self.MAP_vector) / n_folds
        print('printing MAP: ', MAP)

        return MAP
    
# Run the hyper-parameter search: 50 trials, keeping the parameters that
# maximize the value returned by the `Objective` instance.
study = optuna.create_study(direction='maximize')
objective = Objective(URM_train_vector, ICM_all, Evaluator_vector)
study.optimize(objective, n_trials=50)

print(study.best_params)

[32m[I 2021-01-01 11:55:46,424][0m A new study created in memory with name: no-name-7b23e680-130f-477f-822d-4abfb2929ce6[0m


Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=87.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 6.20 sec. Users per second: 910
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=87.0), HTML(value='')))


EvaluatorHoldout: Processed 5647 ( 100.00% ) in 5.89 sec. Users per second: 958
Recommender_Base_Class: URM Detected 55 (0.20 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=87.0), HTML(value='')))


EvaluatorHoldout: Processed 5622 ( 100.00% ) in 5.77 sec. Users per second: 974


[32m[I 2021-01-01 12:21:44,991][0m Trial 0 finished with value: 0.07340507238367493 and parameters: {'n_factors': 847, 'regularization': 0.24691729994969044, 'iterations': 87, 'weight': 0.7309664021123765}. Best is trial 0 with value: 0.07340507238367493.[0m


printing self map vector:  [0.07407015 0.07146375 0.07468132]
printing MAP:  0.07340507238367493
Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=64.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 5.93 sec. Users per second: 951
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=64.0), HTML(value='')))


EvaluatorHoldout: Processed 5647 ( 100.00% ) in 5.72 sec. Users per second: 987
Recommender_Base_Class: URM Detected 55 (0.20 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=64.0), HTML(value='')))


EvaluatorHoldout: Processed 5622 ( 100.00% ) in 5.77 sec. Users per second: 974


[32m[I 2021-01-01 12:41:35,794][0m Trial 1 finished with value: 0.07314374005283734 and parameters: {'n_factors': 890, 'regularization': 0.17944747920579163, 'iterations': 64, 'weight': 0.37294500731860225}. Best is trial 0 with value: 0.07340507238367493.[0m


printing self map vector:  [0.07307805 0.07109526 0.07525791]
printing MAP:  0.07314374005283734
Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=88.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 4.98 sec. Users per second: 1132
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=88.0), HTML(value='')))


EvaluatorHoldout: Processed 5647 ( 100.00% ) in 5.04 sec. Users per second: 1120
Recommender_Base_Class: URM Detected 55 (0.20 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=88.0), HTML(value='')))


EvaluatorHoldout: Processed 5622 ( 100.00% ) in 4.97 sec. Users per second: 1131


[32m[I 2021-01-01 12:52:02,286][0m Trial 2 finished with value: 0.07241987782488417 and parameters: {'n_factors': 419, 'regularization': 0.5944657258327788, 'iterations': 88, 'weight': 0.5051703142058501}. Best is trial 0 with value: 0.07340507238367493.[0m


printing self map vector:  [0.07022031 0.07204885 0.07499047]
printing MAP:  0.07241987782488417
Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 4.90 sec. Users per second: 1151
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83.0), HTML(value='')))


EvaluatorHoldout: Processed 5647 ( 100.00% ) in 4.94 sec. Users per second: 1143
Recommender_Base_Class: URM Detected 55 (0.20 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83.0), HTML(value='')))


EvaluatorHoldout: Processed 5622 ( 100.00% ) in 4.85 sec. Users per second: 1158


[32m[I 2021-01-01 12:59:26,715][0m Trial 3 finished with value: 0.06930097814636489 and parameters: {'n_factors': 321, 'regularization': 0.3377970658939474, 'iterations': 83, 'weight': 0.41096453559183266}. Best is trial 0 with value: 0.07340507238367493.[0m


printing self map vector:  [0.06732149 0.06831339 0.07226805]
printing MAP:  0.06930097814636489
Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=89.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 5.59 sec. Users per second: 1009
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=89.0), HTML(value='')))


EvaluatorHoldout: Processed 5647 ( 100.00% ) in 5.59 sec. Users per second: 1010
Recommender_Base_Class: URM Detected 55 (0.20 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=89.0), HTML(value='')))


EvaluatorHoldout: Processed 5622 ( 100.00% ) in 5.51 sec. Users per second: 1021


[32m[I 2021-01-01 13:22:15,314][0m Trial 4 finished with value: 0.0757954395548483 and parameters: {'n_factors': 802, 'regularization': 0.2007193103642127, 'iterations': 89, 'weight': 0.6015568127727112}. Best is trial 4 with value: 0.0757954395548483.[0m


printing self map vector:  [0.07595554 0.07385371 0.07757707]
printing MAP:  0.0757954395548483
Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=72.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 5.75 sec. Users per second: 981
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=72.0), HTML(value='')))


EvaluatorHoldout: Processed 5647 ( 100.00% ) in 5.75 sec. Users per second: 982
Recommender_Base_Class: URM Detected 55 (0.20 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=72.0), HTML(value='')))


EvaluatorHoldout: Processed 5622 ( 100.00% ) in 5.67 sec. Users per second: 991


[32m[I 2021-01-01 13:40:38,025][0m Trial 5 finished with value: 0.06987744744321044 and parameters: {'n_factors': 808, 'regularization': 0.5290031218404988, 'iterations': 72, 'weight': 0.7959816117468789}. Best is trial 4 with value: 0.0757954395548483.[0m


printing self map vector:  [0.07021143 0.06830386 0.07111705]
printing MAP:  0.06987744744321044
Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=78.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 5.50 sec. Users per second: 1025
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=78.0), HTML(value='')))


EvaluatorHoldout: Processed 5647 ( 100.00% ) in 5.52 sec. Users per second: 1024
Recommender_Base_Class: URM Detected 55 (0.20 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=78.0), HTML(value='')))


EvaluatorHoldout: Processed 5622 ( 100.00% ) in 5.39 sec. Users per second: 1042


[32m[I 2021-01-01 13:57:23,628][0m Trial 6 finished with value: 0.07560394917731246 and parameters: {'n_factors': 682, 'regularization': 0.2696512512293183, 'iterations': 78, 'weight': 0.5579354428353392}. Best is trial 4 with value: 0.0757954395548483.[0m


printing self map vector:  [0.07571324 0.07336414 0.07773447]
printing MAP:  0.07560394917731246
Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 5.11 sec. Users per second: 1105
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))


EvaluatorHoldout: Processed 5647 ( 100.00% ) in 5.11 sec. Users per second: 1105
Recommender_Base_Class: URM Detected 55 (0.20 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))


EvaluatorHoldout: Processed 5622 ( 100.00% ) in 5.06 sec. Users per second: 1112


[32m[I 2021-01-01 14:06:19,226][0m Trial 7 finished with value: 0.06963685607772184 and parameters: {'n_factors': 409, 'regularization': 0.10018167194477146, 'iterations': 75, 'weight': 0.6781025156994545}. Best is trial 4 with value: 0.0757954395548483.[0m


printing self map vector:  [0.06853767 0.06811194 0.07226097]
printing MAP:  0.06963685607772184
Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 5.50 sec. Users per second: 1027
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83.0), HTML(value='')))


EvaluatorHoldout: Processed 5647 ( 100.00% ) in 6.08 sec. Users per second: 928
Recommender_Base_Class: URM Detected 55 (0.20 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83.0), HTML(value='')))


EvaluatorHoldout: Processed 5622 ( 100.00% ) in 7.54 sec. Users per second: 746


[32m[I 2021-01-01 14:52:25,917][0m Trial 8 finished with value: 0.06691376679376397 and parameters: {'n_factors': 737, 'regularization': 0.5378023208616096, 'iterations': 83, 'weight': 0.8216463764091804}. Best is trial 4 with value: 0.0757954395548483.[0m


printing self map vector:  [0.06677894 0.06627086 0.0676915 ]
printing MAP:  0.06691376679376397
Recommender_Base_Class: URM Detected 68 (0.24 %) cold users.
Recommender_Base_Class: URM Detected 1 (0.00 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=82.0), HTML(value='')))


EvaluatorHoldout: Processed 5644 ( 100.00% ) in 5.01 sec. Users per second: 1127
Recommender_Base_Class: URM Detected 82 (0.29 %) cold users.
Recommender_Base_Class: URM Detected 3 (0.01 %) cold items.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=82.0), HTML(value='')))