# Pre-processing

In [75]:
import numpy as np
import pandas as pd
import scipy.sparse as sps
import time

## URM

In [76]:
URM_all_dataframe = pd.read_csv("C:/Users/mauro/Downloads/Recommender System/Input/data_train.csv")
URM_all_dataframe.head(10)

Unnamed: 0,user_id,item_id,data
0,0,0,1.0
1,0,2,1.0
2,0,120,1.0
3,0,128,1.0
4,0,211,1.0
5,0,232,1.0
6,0,282,1.0
7,0,453,1.0
8,0,458,1.0
9,0,491,1.0


In [77]:
userID_unique = URM_all_dataframe["user_id"].unique()
itemID_unique = URM_all_dataframe["item_id"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("The number of interactions is {}".format(n_interactions))

Number of items	 38121, Number of users	 35736
Max ID items	 38120, Max Id users	 35735

The number of interactions is 1764607


## ICM

In [78]:
ICM_dataframe = pd.read_csv("C:/Users/mauro/Downloads/Recommender System/Input/data_ICM_metadata.csv")
ICM_dataframe.head(10)

Unnamed: 0,item_id,feature_id,data
0,2519,0,1.0
1,2642,0,1.0
2,3316,0,1.0
3,3400,0,1.0
4,3472,0,1.0
5,4361,0,1.0
6,4661,0,1.0
7,8697,0,1.0
8,9692,0,1.0
9,9693,0,1.0


In [79]:
featureID_unique = ICM_dataframe["feature_id"].unique()
itemID_unique = ICM_dataframe["item_id"].unique()

n_items = len(itemID_unique)
n_features = len(featureID_unique)
n_interactions = len(ICM_dataframe)

print ("Number of items\t {}, Number of features\t {}".format(n_items, n_features))
print ("Max ID items\t {}, Max Id features\t {}\n".format(max(itemID_unique), max(featureID_unique)))
print ("The number of combinations is {}".format(n_interactions))

Number of items	 38121, Number of features	 94331
Max ID items	 38120, Max Id features	 94330

The number of combinations is 2940040


## Construction of URM and ICM matrices

In [80]:
URM_all = sps.csr_matrix((URM_all_dataframe["data"].values, 
                          (URM_all_dataframe["user_id"].values, URM_all_dataframe["item_id"].values)),
                        shape = (n_users, n_items))

URM_all

<35736x38121 sparse matrix of type '<class 'numpy.float64'>'
	with 1764607 stored elements in Compressed Sparse Row format>

In [81]:
ICM_all = sps.csr_matrix((ICM_dataframe["data"].values, 
                          (ICM_dataframe["item_id"].values, ICM_dataframe["feature_id"].values)),
                        shape = (n_items, n_features))

ICM_all

<38121x94331 sparse matrix of type '<class 'numpy.float64'>'
	with 2940040 stored elements in Compressed Sparse Row format>

# Training

In [82]:
!git clone https://github.com/recsyspolimi/RecSys_Course_AT_PoliMi.git
%cd RecSys_Course_AT_PoliMi

c:\Users\mauro\Downloads\Recommender System\RecSys_Course_AT_PoliMi\RecSys_Course_AT_PoliMi\RecSys_Course_AT_PoliMi


Cloning into 'RecSys_Course_AT_PoliMi'...


In [83]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])

EvaluatorHoldout: Ignoring 177 ( 0.5%) Users that have less than 1 test interactions


In [84]:
import optuna
import pandas as pd
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender

def objective_function_KNN_similarities(optuna_trial):
    
    recommender_instance = ItemKNNCFRecommender(URM_train)
    similarity = optuna_trial.suggest_categorical("similarity", ['cosine', 'dice', 'jaccard', 'asymmetric', 'tversky', 'euclidean'])
    
    full_hyperp = {"similarity": similarity,
                   "topK": optuna_trial.suggest_int("topK", 0, 1500),
                   "shrink": optuna_trial.suggest_int("shrink", 0, 1000),
                  }
    
    if similarity == "asymmetric":
        full_hyperp["asymmetric_alpha"] = optuna_trial.suggest_float("asymmetric_alpha", 0, 2, log=False)
        full_hyperp["normalize"] = True     

    elif similarity == "tversky":
        full_hyperp["tversky_alpha"] = optuna_trial.suggest_float("tversky_alpha", 0, 2, log=False)
        full_hyperp["tversky_beta"] = optuna_trial.suggest_float("tversky_beta", 0, 2, log=False)
        full_hyperp["normalize"] = True 

    elif similarity == "euclidean":
        full_hyperp["normalize_avg_row"] = optuna_trial.suggest_categorical("normalize_avg_row", [True, False])
        full_hyperp["similarity_from_distance_mode"] = optuna_trial.suggest_categorical("similarity_from_distance_mode", ["lin", "log", "exp"])
        full_hyperp["normalize"] = optuna_trial.suggest_categorical("normalize", [True, False])
        
    
    recommender_instance.fit(**full_hyperp)
    
    result_df, _ = evaluator_validation.evaluateRecommender(recommender_instance)
    
    return result_df.loc[10]["MAP"]

In [85]:
class SaveResults(object):
    
    def __init__(self):
        self.results_df = pd.DataFrame(columns=["result"])
    
    def __call__(self, optuna_study, optuna_trial):
        hyperparam_dict = optuna_trial.params.copy()
        hyperparam_dict["result"] = optuna_trial.values[0]
        
        new_row = pd.DataFrame([hyperparam_dict])
        
        if not new_row.empty:
            self.results_df = pd.concat([self.results_df, new_row], ignore_index=True)

In [86]:
# Start measuring time
start_time = time.time()

# Create study and optimize
optuna_study = optuna.create_study(direction="maximize")
save_results = SaveResults()
optuna_study.optimize(objective_function_KNN_similarities,
                      callbacks=[save_results],
                      n_trials=50)

# Calculate elapsed time
end_time = time.time()
elapsed_time = end_time - start_time

# Convert the time to minutes or hours
hours = elapsed_time // 3600
minutes = (elapsed_time % 3600) // 60
seconds = elapsed_time % 60

print('\n')
# Display the time in a more readable format
if hours > 0:
    print(f"Training loop of 50 trials took {int(hours)} hours, {int(minutes)} minutes, and {int(seconds)} seconds.")
elif minutes > 0:
    print(f"Training loop of 50 trials took {int(minutes)} minutes and {int(seconds)} seconds.")
else:
    print(f"Training loop of 50 trials took {int(seconds)} seconds.")

[I 2024-11-10 19:03:42,358] A new study created in memory with name: no-name-a70169b4-9ec8-4046-a773-25dc0b59ae3f


Similarity column 38121 (100.0%), 3034.27 column/sec. Elapsed time 12.56 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 42.29 sec. Users per second: 841


[I 2024-11-10 19:04:37,645] Trial 0 finished with value: 0.03618247919732636 and parameters: {'similarity': 'jaccard', 'topK': 108, 'shrink': 889}. Best is trial 0 with value: 0.03618247919732636.
  self.results_df = pd.concat([self.results_df, new_row], ignore_index=True)


Similarity column 38121 (100.0%), 3189.30 column/sec. Elapsed time 11.95 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 56.26 sec. Users per second: 632


[I 2024-11-10 19:05:47,861] Trial 1 finished with value: 0.031666938961715574 and parameters: {'similarity': 'jaccard', 'topK': 778, 'shrink': 846}. Best is trial 0 with value: 0.03618247919732636.


Similarity column 38121 (100.0%), 3273.31 column/sec. Elapsed time 11.65 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 1.04 min. Users per second: 571


[I 2024-11-10 19:07:05,155] Trial 2 finished with value: 0.0320988803763198 and parameters: {'similarity': 'tversky', 'topK': 1466, 'shrink': 515, 'tversky_alpha': 1.0113686774799142, 'tversky_beta': 0.362472596616771}. Best is trial 0 with value: 0.03618247919732636.


Similarity column 38121 (100.0%), 3224.39 column/sec. Elapsed time 11.82 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 46.77 sec. Users per second: 760


[I 2024-11-10 19:08:05,684] Trial 3 finished with value: 0.03718856811639088 and parameters: {'similarity': 'tversky', 'topK': 956, 'shrink': 246, 'tversky_alpha': 1.1975928916150314, 'tversky_beta': 0.7328826083937854}. Best is trial 3 with value: 0.03718856811639088.


Similarity column 38121 (100.0%), 3592.99 column/sec. Elapsed time 10.61 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 35.73 sec. Users per second: 995


[I 2024-11-10 19:08:52,507] Trial 4 finished with value: 0.03544687300381919 and parameters: {'similarity': 'asymmetric', 'topK': 270, 'shrink': 187, 'asymmetric_alpha': 0.30749688125763774}. Best is trial 3 with value: 0.03718856811639088.


Similarity column 38121 (100.0%), 3313.15 column/sec. Elapsed time 11.51 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 39.13 sec. Users per second: 909


[I 2024-11-10 19:09:43,906] Trial 5 finished with value: 0.03476388894468585 and parameters: {'similarity': 'tversky', 'topK': 357, 'shrink': 916, 'tversky_alpha': 1.8048533328280576, 'tversky_beta': 0.06981944669010698}. Best is trial 3 with value: 0.03718856811639088.


Similarity column 38121 (100.0%), 3521.70 column/sec. Elapsed time 10.82 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 58.06 sec. Users per second: 612


[I 2024-11-10 19:10:55,261] Trial 6 finished with value: 0.03281676060845804 and parameters: {'similarity': 'asymmetric', 'topK': 1291, 'shrink': 620, 'asymmetric_alpha': 1.3443301349452965}. Best is trial 3 with value: 0.03718856811639088.


Similarity column 38121 (100.0%), 332.01 column/sec. Elapsed time 1.91 min
EvaluatorHoldout: Processed 35559 (100.0%) in 56.78 sec. Users per second: 626


[I 2024-11-10 19:13:49,528] Trial 7 finished with value: 0.014677197566128964 and parameters: {'similarity': 'euclidean', 'topK': 1326, 'shrink': 76, 'normalize_avg_row': True, 'similarity_from_distance_mode': 'lin', 'normalize': True}. Best is trial 3 with value: 0.03718856811639088.


Similarity column 38121 (100.0%), 3511.08 column/sec. Elapsed time 10.86 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 51.50 sec. Users per second: 690


[I 2024-11-10 19:14:52,918] Trial 8 finished with value: 0.033480365964546094 and parameters: {'similarity': 'asymmetric', 'topK': 632, 'shrink': 864, 'asymmetric_alpha': 1.7987645783920634}. Best is trial 3 with value: 0.03718856811639088.


Similarity column 38121 (100.0%), 3068.82 column/sec. Elapsed time 12.42 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 33.79 sec. Users per second: 1052


[I 2024-11-10 19:15:39,425] Trial 9 finished with value: 0.038444764502723223 and parameters: {'similarity': 'tversky', 'topK': 18, 'shrink': 879, 'tversky_alpha': 0.2805327624022278, 'tversky_beta': 1.2701936475905395}. Best is trial 9 with value: 0.038444764502723223.


Similarity column 38121 (100.0%), 3611.50 column/sec. Elapsed time 10.56 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 29.28 sec. Users per second: 1215


[I 2024-11-10 19:16:19,404] Trial 10 finished with value: 0.0380563869482273 and parameters: {'similarity': 'cosine', 'topK': 16, 'shrink': 667}. Best is trial 9 with value: 0.038444764502723223.


Similarity column 38121 (100.0%), 3528.60 column/sec. Elapsed time 10.80 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 28.07 sec. Users per second: 1267


[I 2024-11-10 19:16:58,419] Trial 11 finished with value: 0.03801388660116271 and parameters: {'similarity': 'cosine', 'topK': 16, 'shrink': 679}. Best is trial 9 with value: 0.038444764502723223.


Similarity column 38121 (100.0%), 3594.87 column/sec. Elapsed time 10.60 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 43.42 sec. Users per second: 819


[I 2024-11-10 19:17:53,182] Trial 12 finished with value: 0.03092828060852944 and parameters: {'similarity': 'cosine', 'topK': 422, 'shrink': 717}. Best is trial 9 with value: 0.038444764502723223.


Similarity column 38121 (100.0%), 3747.17 column/sec. Elapsed time 10.17 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 29.36 sec. Users per second: 1211


[I 2024-11-10 19:18:33,037] Trial 13 finished with value: 0.04221054143415435 and parameters: {'similarity': 'dice', 'topK': 17, 'shrink': 415}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3094.48 column/sec. Elapsed time 12.32 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 44.12 sec. Users per second: 806


[I 2024-11-10 19:19:30,556] Trial 14 finished with value: 0.035783360495879275 and parameters: {'similarity': 'dice', 'topK': 515, 'shrink': 369}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3651.16 column/sec. Elapsed time 10.44 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 34.98 sec. Users per second: 1017


[I 2024-11-10 19:20:16,515] Trial 15 finished with value: 0.037852493753951805 and parameters: {'similarity': 'dice', 'topK': 161, 'shrink': 442}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3602.22 column/sec. Elapsed time 10.58 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 46.47 sec. Users per second: 765


[I 2024-11-10 19:21:15,343] Trial 16 finished with value: 0.03504238317091502 and parameters: {'similarity': 'dice', 'topK': 906, 'shrink': 320}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 534.40 column/sec. Elapsed time 1.19 min
EvaluatorHoldout: Processed 35559 (100.0%) in 28.01 sec. Users per second: 1270


[I 2024-11-10 19:22:54,949] Trial 17 finished with value: 0.02628149862267783 and parameters: {'similarity': 'euclidean', 'topK': 219, 'shrink': 544, 'normalize_avg_row': False, 'similarity_from_distance_mode': 'log', 'normalize': False}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3635.02 column/sec. Elapsed time 10.49 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 42.67 sec. Users per second: 833


[I 2024-11-10 19:23:49,166] Trial 18 finished with value: 0.03288555974711283 and parameters: {'similarity': 'dice', 'topK': 547, 'shrink': 756}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3122.02 column/sec. Elapsed time 12.21 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 35.30 sec. Users per second: 1007


[I 2024-11-10 19:24:37,436] Trial 19 finished with value: 0.0357747039684091 and parameters: {'similarity': 'tversky', 'topK': 351, 'shrink': 85, 'tversky_alpha': 0.017187888332216794, 'tversky_beta': 1.7347908783596457}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3533.14 column/sec. Elapsed time 10.79 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 48.57 sec. Users per second: 732


[I 2024-11-10 19:25:38,693] Trial 20 finished with value: 0.03036330297466629 and parameters: {'similarity': 'dice', 'topK': 1051, 'shrink': 990}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3697.91 column/sec. Elapsed time 10.31 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 30.47 sec. Users per second: 1167


[I 2024-11-10 19:26:19,670] Trial 21 finished with value: 0.037045066393120156 and parameters: {'similarity': 'cosine', 'topK': 44, 'shrink': 607}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3668.96 column/sec. Elapsed time 10.39 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 29.30 sec. Users per second: 1213


[I 2024-11-10 19:26:59,569] Trial 22 finished with value: 0.03715445981348996 and parameters: {'similarity': 'cosine', 'topK': 30, 'shrink': 776}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3651.64 column/sec. Elapsed time 10.44 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 36.23 sec. Users per second: 981


[I 2024-11-10 19:27:46,609] Trial 23 finished with value: 0.03456204912291857 and parameters: {'similarity': 'cosine', 'topK': 187, 'shrink': 449}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3233.32 column/sec. Elapsed time 11.79 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 38.08 sec. Users per second: 934


[I 2024-11-10 19:28:37,109] Trial 24 finished with value: 0.03065006648909369 and parameters: {'similarity': 'tversky', 'topK': 274, 'shrink': 988, 'tversky_alpha': 0.026506333742532195, 'tversky_beta': 1.4988649521790331}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 548.48 column/sec. Elapsed time 1.16 min
EvaluatorHoldout: Processed 35559 (100.0%) in 30.84 sec. Users per second: 1153


[I 2024-11-10 19:30:17,661] Trial 25 finished with value: 0.024637501634885588 and parameters: {'similarity': 'euclidean', 'topK': 129, 'shrink': 636, 'normalize_avg_row': True, 'similarity_from_distance_mode': 'exp', 'normalize': True}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3270.61 column/sec. Elapsed time 11.66 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 41.06 sec. Users per second: 866


[I 2024-11-10 19:31:11,363] Trial 26 finished with value: 0.0346190023109357 and parameters: {'similarity': 'jaccard', 'topK': 459, 'shrink': 550}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3256.01 column/sec. Elapsed time 11.71 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 27.44 sec. Users per second: 1296


[I 2024-11-10 19:31:50,811] Trial 27 finished with value: 0.039360287418583474 and parameters: {'similarity': 'tversky', 'topK': 14, 'shrink': 779, 'tversky_alpha': 0.5373146415834493, 'tversky_beta': 1.1675418068191288}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3188.65 column/sec. Elapsed time 11.96 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 38.75 sec. Users per second: 918


[I 2024-11-10 19:32:42,246] Trial 28 finished with value: 0.03242754384954573 and parameters: {'similarity': 'tversky', 'topK': 313, 'shrink': 786, 'tversky_alpha': 0.47938157824966265, 'tversky_beta': 1.1213535460826312}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3270.25 column/sec. Elapsed time 11.66 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 32.71 sec. Users per second: 1087


[I 2024-11-10 19:33:27,004] Trial 29 finished with value: 0.03514354078198539 and parameters: {'similarity': 'tversky', 'topK': 100, 'shrink': 917, 'tversky_alpha': 0.5290810557564682, 'tversky_beta': 1.1799918101923113}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3188.68 column/sec. Elapsed time 11.96 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 43.44 sec. Users per second: 819


[I 2024-11-10 19:34:23,687] Trial 30 finished with value: 0.03297913550338981 and parameters: {'similarity': 'tversky', 'topK': 660, 'shrink': 425, 'tversky_alpha': 0.5568690818399755, 'tversky_beta': 1.4333168618026084}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3733.84 column/sec. Elapsed time 10.21 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 27.23 sec. Users per second: 1306


[I 2024-11-10 19:35:01,418] Trial 31 finished with value: 0.039187252842022735 and parameters: {'similarity': 'jaccard', 'topK': 5, 'shrink': 828}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3671.26 column/sec. Elapsed time 10.38 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 34.89 sec. Users per second: 1019


[I 2024-11-10 19:35:47,140] Trial 32 finished with value: 0.03611224604580586 and parameters: {'similarity': 'jaccard', 'topK': 121, 'shrink': 835}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 4835.58 column/sec. Elapsed time 7.88 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 26.17 sec. Users per second: 1359


[I 2024-11-10 19:36:21,459] Trial 33 finished with value: 0.031401672248714485 and parameters: {'similarity': 'jaccard', 'topK': 1, 'shrink': 823}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3478.75 column/sec. Elapsed time 10.96 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 33.75 sec. Users per second: 1054


[I 2024-11-10 19:37:06,618] Trial 34 finished with value: 0.03577370518123776 and parameters: {'similarity': 'jaccard', 'topK': 122, 'shrink': 921}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3662.89 column/sec. Elapsed time 10.41 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 36.47 sec. Users per second: 975


[I 2024-11-10 19:37:54,022] Trial 35 finished with value: 0.039396299554907636 and parameters: {'similarity': 'jaccard', 'topK': 216, 'shrink': 258}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3662.95 column/sec. Elapsed time 10.41 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 35.23 sec. Users per second: 1009


[I 2024-11-10 19:38:40,243] Trial 36 finished with value: 0.040185937344461166 and parameters: {'similarity': 'jaccard', 'topK': 222, 'shrink': 210}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3633.85 column/sec. Elapsed time 10.49 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 35.95 sec. Users per second: 989


[I 2024-11-10 19:39:27,251] Trial 37 finished with value: 0.040147949953060855 and parameters: {'similarity': 'jaccard', 'topK': 238, 'shrink': 206}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3651.00 column/sec. Elapsed time 10.44 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 36.20 sec. Users per second: 982


[I 2024-11-10 19:40:14,455] Trial 38 finished with value: 0.040191841906385015 and parameters: {'similarity': 'jaccard', 'topK': 234, 'shrink': 203}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3600.33 column/sec. Elapsed time 10.59 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 38.23 sec. Users per second: 930


[I 2024-11-10 19:41:04,094] Trial 39 finished with value: 0.040060767550641835 and parameters: {'similarity': 'jaccard', 'topK': 396, 'shrink': 152}. Best is trial 13 with value: 0.04221054143415435.


Similarity column 38121 (100.0%), 3516.46 column/sec. Elapsed time 10.84 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 34.22 sec. Users per second: 1039


[I 2024-11-10 19:41:49,767] Trial 40 finished with value: 0.04399164790732171 and parameters: {'similarity': 'jaccard', 'topK': 272, 'shrink': 25}. Best is trial 40 with value: 0.04399164790732171.


Similarity column 38121 (100.0%), 3645.20 column/sec. Elapsed time 10.46 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 33.47 sec. Users per second: 1062


[I 2024-11-10 19:42:34,276] Trial 41 finished with value: 0.044327081930007714 and parameters: {'similarity': 'jaccard', 'topK': 231, 'shrink': 21}. Best is trial 41 with value: 0.044327081930007714.


Similarity column 38121 (100.0%), 3640.61 column/sec. Elapsed time 10.47 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 34.22 sec. Users per second: 1039


[I 2024-11-10 19:43:19,649] Trial 42 finished with value: 0.043937083996772174 and parameters: {'similarity': 'jaccard', 'topK': 302, 'shrink': 15}. Best is trial 41 with value: 0.044327081930007714.


Similarity column 38121 (100.0%), 3616.42 column/sec. Elapsed time 10.54 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 34.22 sec. Users per second: 1039


[I 2024-11-10 19:44:05,042] Trial 43 finished with value: 0.04391618981553887 and parameters: {'similarity': 'jaccard', 'topK': 295, 'shrink': 8}. Best is trial 41 with value: 0.044327081930007714.


Similarity column 38121 (100.0%), 3459.48 column/sec. Elapsed time 11.02 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 32.87 sec. Users per second: 1082


[I 2024-11-10 19:44:49,498] Trial 44 finished with value: 0.04085855075646533 and parameters: {'similarity': 'asymmetric', 'topK': 331, 'shrink': 2, 'asymmetric_alpha': 0.10118139983188645}. Best is trial 41 with value: 0.044327081930007714.


Similarity column 38121 (100.0%), 3101.48 column/sec. Elapsed time 12.29 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 38.05 sec. Users per second: 935


[I 2024-11-10 19:45:40,956] Trial 45 finished with value: 0.0433216592410449 and parameters: {'similarity': 'jaccard', 'topK': 472, 'shrink': 11}. Best is trial 41 with value: 0.044327081930007714.


Similarity column 38121 (100.0%), 3605.46 column/sec. Elapsed time 10.57 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 38.30 sec. Users per second: 928


[I 2024-11-10 19:46:30,887] Trial 46 finished with value: 0.043056814362178425 and parameters: {'similarity': 'jaccard', 'topK': 516, 'shrink': 6}. Best is trial 41 with value: 0.044327081930007714.


Similarity column 38121 (100.0%), 3588.58 column/sec. Elapsed time 10.62 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 43.11 sec. Users per second: 825


[I 2024-11-10 19:47:26,020] Trial 47 finished with value: 0.041129796800933274 and parameters: {'similarity': 'jaccard', 'topK': 679, 'shrink': 74}. Best is trial 41 with value: 0.044327081930007714.


Similarity column 38121 (100.0%), 3412.47 column/sec. Elapsed time 11.17 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 38.78 sec. Users per second: 917


[I 2024-11-10 19:48:16,870] Trial 48 finished with value: 0.040652519376469465 and parameters: {'similarity': 'jaccard', 'topK': 438, 'shrink': 121}. Best is trial 41 with value: 0.044327081930007714.


Similarity column 38121 (100.0%), 3468.34 column/sec. Elapsed time 10.99 sec
EvaluatorHoldout: Processed 35559 (100.0%) in 36.51 sec. Users per second: 974


[I 2024-11-10 19:49:05,258] Trial 49 finished with value: 0.04312164290334188 and parameters: {'similarity': 'jaccard', 'topK': 378, 'shrink': 41}. Best is trial 41 with value: 0.044327081930007714.




Training loop of 50 trials took 45 minutes and 22 seconds.


In [87]:
pruned_trials = [t for t in optuna_study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in optuna_study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("Study statistics: ")
print("  Number of finished trials: ", len(optuna_study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
print("  Value Validation: ", optuna_study.best_trial.value)

Study statistics: 
  Number of finished trials:  50
  Number of pruned trials:  0
  Number of complete trials:  50
Best trial:
  Value Validation:  0.044327081930007714


In [88]:
optuna_study.best_trial

FrozenTrial(number=41, state=TrialState.COMPLETE, values=[0.044327081930007714], datetime_start=datetime.datetime(2024, 11, 10, 19, 41, 49, 770714), datetime_complete=datetime.datetime(2024, 11, 10, 19, 42, 34, 276324), params={'similarity': 'jaccard', 'topK': 231, 'shrink': 21}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'similarity': CategoricalDistribution(choices=('cosine', 'dice', 'jaccard', 'asymmetric', 'tversky', 'euclidean')), 'topK': IntDistribution(high=1500, log=False, low=0, step=1), 'shrink': IntDistribution(high=1000, log=False, low=0, step=1)}, trial_id=41, value=None)

In [89]:
optuna_study.best_trial.params

{'similarity': 'jaccard', 'topK': 231, 'shrink': 21}

In [90]:
save_results.results_df

Unnamed: 0,result,similarity,topK,shrink,tversky_alpha,tversky_beta,asymmetric_alpha,normalize_avg_row,similarity_from_distance_mode,normalize
0,0.036182,jaccard,108.0,889.0,,,,,,
1,0.031667,jaccard,778.0,846.0,,,,,,
2,0.032099,tversky,1466.0,515.0,1.011369,0.362473,,,,
3,0.037189,tversky,956.0,246.0,1.197593,0.732883,,,,
4,0.035447,asymmetric,270.0,187.0,,,0.307497,,,
5,0.034764,tversky,357.0,916.0,1.804853,0.069819,,,,
6,0.032817,asymmetric,1291.0,620.0,,,1.34433,,,
7,0.014677,euclidean,1326.0,76.0,,,,True,lin,True
8,0.03348,asymmetric,632.0,864.0,,,1.798765,,,
9,0.038445,tversky,18.0,879.0,0.280533,1.270194,,,,


In [91]:
recommender_instance = ItemKNNCFRecommender(URM_train + URM_validation)
recommender_instance.fit(**optuna_study.best_trial.params)

Similarity column 38121 (100.0%), 3220.71 column/sec. Elapsed time 11.84 sec


# Testing

In [92]:
data_target_users_test = pd.read_csv("C:/Users/mauro/Downloads/Recommender System/Input/data_target_users_test.csv")
data_target_users_test.head(10)

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [None]:
target_result = []
for target in data_target_users_test["user_id"]:
    target_result.append(recommender_instance.recommend(target, cutoff = 10, remove_seen_flag=True))

In [None]:
user_ids = data_target_users_test["user_id"]

# Convert `target_result` to the required format for each user
formatted_data = {
    "user_id": user_ids,
    "item_list": [" ".join(map(str, items)) for items in target_result]
}

# Create the DataFrame
submission_df = pd.DataFrame(formatted_data)

submission_df.to_csv("C:/Users/mauro/Downloads/Recommender System/Submissions/submission_optuna_ItemKNNCFRecommender.csv", index=False, header=["user_id", "item_list"])