# Set connection with GitHub

In [1]:
# ! cd /kaggle/working && rm -rf RECsys_Challenge2024

In [2]:
from kaggle_secrets import UserSecretsClient

token = UserSecretsClient().get_secret("Token")

! git clone https://{token}@github.com/madratak/RECsys_Challenge2024.git

Cloning into 'RECsys_Challenge2024'...
remote: Enumerating objects: 4516, done.[K
remote: Counting objects: 100% (128/128), done.[K
remote: Compressing objects: 100% (93/93), done.[K
remote: Total 4516 (delta 64), reused 28 (delta 9), pack-reused 4388 (from 1)[K
Receiving objects: 100% (4516/4516), 175.76 MiB | 36.77 MiB/s, done.
Resolving deltas: 100% (2645/2645), done.
Updating files: 100% (408/408), done.


In [3]:
! pip install PyGithub requests

Collecting PyGithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.5.0-py3-none-any.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.9/375.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.5.0 pynacl-1.5.0


In [4]:
%cd /kaggle/working/RECsys_Challenge2024 
! python run_compile_all_cython.py

/kaggle/working/RECsys_Challenge2024
run_compile_all_cython: Found 11 Cython files in 5 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/usr/bin/python3'
Compiling [1/11]: MatrixFactorization_Cython_Epoch.pyx... 
In file included from [01m[K/usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/ndarraytypes.h:1929[m[K,
                 from [01m[K/usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/arrayobject.h:5[m[K,
                 from [01m[KMatrixFactorization_Cython_Epoch.c:1252[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorization_Cython_Epoch.c:[m[K In function ‘[01m[K__pyx_pf_32MatrixFactorization_Cython_Epoch_32MatrixFactorization_Cython_Epoch_10epochIteration_Cython_ASY_SVD_SGD[m[K’:
26256 |         for (__pyx_t_22 = __pyx_v_start_pos_seen_i

In [5]:
from Utils.notebookFunctions import *
import numpy as np
import pandas as pd
import scipy.sparse as sps
import time
import shutil
import optuna
import json
import os
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit


K_PATH = '/kaggle/working/RECsys_Challenge2024'
GH_PATH = 'TrainedModels/WithoutKFCV/Neural'

np.random.seed(42)

## Import the repository

In [6]:
repo = get_repo_from_github(token)

Repository 'RECsys_Challenge2024' found.


In [7]:
config = {
    'model': 'MultVAE',
    'metric': 'MAP',
    'tune_parameters': True,
    'database_path': '/kaggle/working/history_MultVAE_MAP.db',
    'copy_prev_best_params': False,
    'tune_best_params': True,
    'save_github': True
}

Import the database where previous tuning trials have been saved.

In [8]:
try:
    shutil.copyfile(
        f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/history_{config["model"]}_{config["metric"]}.db', 
        config['database_path']
    )
except FileNotFoundError:
    pass # if not present optuna will create it

# Construction of URM and ICM matrices

In [9]:
URM_all_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_train.csv")

n_users = len(URM_all_dataframe["user_id"].unique())
n_items = len(URM_all_dataframe["item_id"].unique())

URM_all = sps.csr_matrix((URM_all_dataframe["data"].values, 
                          (URM_all_dataframe["user_id"].values, URM_all_dataframe["item_id"].values)),
                        shape = (n_users, n_items))

URM_all

<35736x38121 sparse matrix of type '<class 'numpy.float64'>'
	with 1764607 stored elements in Compressed Sparse Row format>

In [10]:
ICM_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_ICM_metadata.csv")

n_items = len(ICM_dataframe["item_id"].unique())
n_features = len(ICM_dataframe["feature_id"].unique())

ICM_all = sps.csr_matrix((ICM_dataframe["data"].values, 
                          (ICM_dataframe["item_id"].values, ICM_dataframe["feature_id"].values)),
                        shape = (n_items, n_features))

ICM_all

<38121x94331 sparse matrix of type '<class 'numpy.float64'>'
	with 2940040 stored elements in Compressed Sparse Row format>

# Training

In [11]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])

EvaluatorHoldout: Ignoring 141 ( 0.4%) Users that have less than 1 test interactions


In [12]:
from Recommenders.Neural.MultVAE_PyTorch_Recommender import MultVAERecommender_PyTorch_OptimizerMask 

def objective_function_MultVAE(optuna_trial):
    
    recommender_instance = MultVAERecommender_PyTorch_OptimizerMask(URM_train, use_gpu=True)

    early_stopping_params = {
        "validation_every_n": 1,
        "stop_on_validation": True,
        "evaluator_object": evaluator_validation,
        "lower_validations_allowed": 10,
        "validation_metric": "MAP",
        "epochs": 500
    } 

    full_hyperp = {
                    "batch_size": optuna_trial.suggest_int("batch_size", 100, 1000, step=100),
                    "total_anneal_steps": optuna_trial.suggest_int("total_anneal_steps", 10000, 200000),
                    "learning_rate": optuna_trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
                    "l2_reg": optuna_trial.suggest_float("l2_reg", 1e-6, 1e-2, log=True),
                    "dropout": optuna_trial.suggest_float("dropout", 0.1, 0.9, step=0.1),
                    "anneal_cap": optuna_trial.suggest_float("anneal_cap", 0.1, 1.0, step=0.1),
                    # "sgd_mode": optuna_trial.suggest_categorical('optimizer', ['adagrad', 'rmsprop', 'adam', 'adam-w', 'sgd']),
                    "encoding_size": optuna_trial.suggest_int("encoding_size", 50, 500, step=50),
                    "next_layer_size_multiplier": optuna_trial.suggest_float("next_layer_size_multiplier", 1.5, 3.0, step=0.5),
                    "max_n_hidden_layers": optuna_trial.suggest_int("max_n_hidden_layers", 1, 3),
                    **early_stopping_params
                }       
    
    recommender_instance.fit(**full_hyperp)

    epochs = recommender_instance.get_early_stopping_final_epochs_dict()["epochs"]
    optuna_trial.set_user_attr("epochs", epochs) 
    
    result_df, _ = evaluator_validation.evaluateRecommender(recommender_instance)
    
    return result_df.loc[10]["MAP"]

In [13]:
if config['tune_parameters']:
    
    optuna_study = optuna.create_study(direction='maximize', study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}',
                                storage=f'sqlite:///{config["database_path"]}', load_if_exists=True)

    optuna_study.optimize(objective_function_MultVAE, n_trials=5)

[I 2024-12-25 09:39:35,216] Using an existing study with name 'hyperparameters_tuning_MultVAE_MAP' instead of creating a new one.
  user_batch_tensor = torch.sparse_csr_tensor(user_batch_tensor.indptr,


MultVAERecommender_PyTorch: Validation begins...
EvaluatorHoldout: Processed 35595 (100.0%) in 37.73 sec. Users per second: 943
MultVAERecommender_PyTorch: CUTOFF: 10 - PRECISION: 0.0015339, PRECISION_RECALL_MIN_DEN: 0.0020904, RECALL: 0.0016209, MAP: 0.0004901, MAP_MIN_DEN: 0.0006807, MRR: 0.0047740, NDCG: 0.0019078, F1: 0.0015762, HIT_RATE: 0.0149740, ARHR_ALL_HITS: 0.0048354, NOVELTY: 0.0034976, AVERAGE_POPULARITY: 0.1517622, DIVERSITY_MEAN_INTER_LIST: 0.8799058, DIVERSITY_HERFINDAHL: 0.9879881, COVERAGE_ITEM: 0.0746045, COVERAGE_ITEM_HIT: 0.0041447, ITEMS_IN_GT: 0.9957241, COVERAGE_USER: 0.9960544, COVERAGE_USER_HIT: 0.0149149, USERS_IN_GT: 0.9960544, DIVERSITY_GINI: 0.0064998, SHANNON_ENTROPY: 7.8651295, RATIO_DIVERSITY_HERFINDAHL: 0.9880453, RATIO_DIVERSITY_GINI: 0.0106913, RATIO_SHANNON_ENTROPY: 0.5328385, RATIO_AVERAGE_POPULARITY: 2.0743608, RATIO_NOVELTY: 0.2286604, 

MultVAERecommender_PyTorch: New best model found! Updating.
MultVAERecommender_PyTorch: Epoch 1 of 500. Elapse

[I 2024-12-25 11:48:53,966] Trial 9 finished with value: 0.013987589660577717 and parameters: {'batch_size': 500, 'total_anneal_steps': 89991, 'learning_rate': 2.7795799766068534e-05, 'l2_reg': 3.449570205354873e-06, 'dropout': 0.5, 'anneal_cap': 0.1, 'encoding_size': 250, 'next_layer_size_multiplier': 2.5, 'max_n_hidden_layers': 2}. Best is trial 4 with value: 0.035636114622839145.


MultVAERecommender_PyTorch: Validation begins...
EvaluatorHoldout: Processed 35595 (100.0%) in 37.04 sec. Users per second: 961
MultVAERecommender_PyTorch: CUTOFF: 10 - PRECISION: 0.0215395, PRECISION_RECALL_MIN_DEN: 0.0313796, RECALL: 0.0255732, MAP: 0.0086973, MAP_MIN_DEN: 0.0125179, MRR: 0.0656163, NDCG: 0.0291181, F1: 0.0233837, HIT_RATE: 0.1676078, ARHR_ALL_HITS: 0.0748026, NOVELTY: 0.0031015, AVERAGE_POPULARITY: 0.4108261, DIVERSITY_MEAN_INTER_LIST: 0.8494304, DIVERSITY_HERFINDAHL: 0.9849407, COVERAGE_ITEM: 0.0246321, COVERAGE_ITEM_HIT: 0.0139031, ITEMS_IN_GT: 0.9957241, COVERAGE_USER: 0.9960544, COVERAGE_USER_HIT: 0.1669465, USERS_IN_GT: 0.9960544, DIVERSITY_GINI: 0.0026122, SHANNON_ENTROPY: 6.9262466, RATIO_DIVERSITY_HERFINDAHL: 0.9849977, RATIO_DIVERSITY_GINI: 0.0042967, RATIO_SHANNON_ENTROPY: 0.4692321, RATIO_AVERAGE_POPULARITY: 5.6153751, RATIO_NOVELTY: 0.2027668, 

MultVAERecommender_PyTorch: New best model found! Updating.
MultVAERecommender_PyTorch: Epoch 1 of 500. Elapse

[I 2024-12-25 12:02:46,909] Trial 10 finished with value: 0.032691476197164025 and parameters: {'batch_size': 100, 'total_anneal_steps': 196501, 'learning_rate': 0.0009903790338709735, 'l2_reg': 0.008455990329420609, 'dropout': 0.1, 'anneal_cap': 0.8, 'encoding_size': 350, 'next_layer_size_multiplier': 2.0, 'max_n_hidden_layers': 1}. Best is trial 4 with value: 0.035636114622839145.


MultVAERecommender_PyTorch: Validation begins...
EvaluatorHoldout: Processed 35595 (100.0%) in 37.50 sec. Users per second: 949
MultVAERecommender_PyTorch: CUTOFF: 10 - PRECISION: 0.0171878, PRECISION_RECALL_MIN_DEN: 0.0244699, RECALL: 0.0195505, MAP: 0.0065555, MAP_MIN_DEN: 0.0091731, MRR: 0.0523295, NDCG: 0.0226210, F1: 0.0182932, HIT_RATE: 0.1391488, ARHR_ALL_HITS: 0.0582699, NOVELTY: 0.0030189, AVERAGE_POPULARITY: 0.4851381, DIVERSITY_MEAN_INTER_LIST: 0.7567310, DIVERSITY_HERFINDAHL: 0.9756710, COVERAGE_ITEM: 0.0130112, COVERAGE_ITEM_HIT: 0.0074762, ITEMS_IN_GT: 0.9957241, COVERAGE_USER: 0.9960544, COVERAGE_USER_HIT: 0.1385997, USERS_IN_GT: 0.9960544, DIVERSITY_GINI: 0.0014816, SHANNON_ENTROPY: 6.1358151, RATIO_DIVERSITY_HERFINDAHL: 0.9757275, RATIO_DIVERSITY_GINI: 0.0024369, RATIO_SHANNON_ENTROPY: 0.4156828, RATIO_AVERAGE_POPULARITY: 6.6311093, RATIO_NOVELTY: 0.1973637, 

MultVAERecommender_PyTorch: New best model found! Updating.
MultVAERecommender_PyTorch: Epoch 1 of 500. Elapse

[I 2024-12-25 12:18:10,425] Trial 11 finished with value: 0.03321346185147163 and parameters: {'batch_size': 100, 'total_anneal_steps': 193479, 'learning_rate': 0.0008045960003626156, 'l2_reg': 0.008883120771645766, 'dropout': 0.1, 'anneal_cap': 0.7000000000000001, 'encoding_size': 350, 'next_layer_size_multiplier': 2.0, 'max_n_hidden_layers': 1}. Best is trial 4 with value: 0.035636114622839145.


MultVAERecommender_PyTorch: Validation begins...
EvaluatorHoldout: Processed 35595 (100.0%) in 38.47 sec. Users per second: 925
MultVAERecommender_PyTorch: CUTOFF: 10 - PRECISION: 0.0092148, PRECISION_RECALL_MIN_DEN: 0.0128406, RECALL: 0.0101386, MAP: 0.0041241, MAP_MIN_DEN: 0.0057876, MRR: 0.0344390, NDCG: 0.0133779, F1: 0.0096547, HIT_RATE: 0.0761624, ARHR_ALL_HITS: 0.0375716, NOVELTY: 0.0028484, AVERAGE_POPULARITY: 0.6625771, DIVERSITY_MEAN_INTER_LIST: 0.3369935, DIVERSITY_HERFINDAHL: 0.9336984, COVERAGE_ITEM: 0.0020461, COVERAGE_ITEM_HIT: 0.0013641, ITEMS_IN_GT: 0.9957241, COVERAGE_USER: 0.9960544, COVERAGE_USER_HIT: 0.0758619, USERS_IN_GT: 0.9960544, DIVERSITY_GINI: 0.0003845, SHANNON_ENTROPY: 4.1322012, RATIO_DIVERSITY_HERFINDAHL: 0.9337525, RATIO_DIVERSITY_GINI: 0.0006324, RATIO_SHANNON_ENTROPY: 0.2799440, RATIO_AVERAGE_POPULARITY: 9.0564341, RATIO_NOVELTY: 0.1862177, 

MultVAERecommender_PyTorch: New best model found! Updating.
MultVAERecommender_PyTorch: Epoch 1 of 500. Elapse

[I 2024-12-25 12:44:45,274] Trial 12 finished with value: 0.03538348194079205 and parameters: {'batch_size': 100, 'total_anneal_steps': 199766, 'learning_rate': 0.00036885529121181954, 'l2_reg': 0.009710360648759095, 'dropout': 0.1, 'anneal_cap': 0.5, 'encoding_size': 350, 'next_layer_size_multiplier': 2.0, 'max_n_hidden_layers': 1}. Best is trial 4 with value: 0.035636114622839145.


MultVAERecommender_PyTorch: Validation begins...
EvaluatorHoldout: Processed 35595 (100.0%) in 36.96 sec. Users per second: 963
MultVAERecommender_PyTorch: CUTOFF: 10 - PRECISION: 0.0279281, PRECISION_RECALL_MIN_DEN: 0.0407340, RECALL: 0.0337479, MAP: 0.0116452, MAP_MIN_DEN: 0.0167841, MRR: 0.0856247, NDCG: 0.0383358, F1: 0.0305634, HIT_RATE: 0.2110409, ARHR_ALL_HITS: 0.0989486, NOVELTY: 0.0031708, AVERAGE_POPULARITY: 0.3624598, DIVERSITY_MEAN_INTER_LIST: 0.9070532, DIVERSITY_HERFINDAHL: 0.9907028, COVERAGE_ITEM: 0.0601243, COVERAGE_ITEM_HIT: 0.0323444, ITEMS_IN_GT: 0.9957241, COVERAGE_USER: 0.9960544, COVERAGE_USER_HIT: 0.2102082, USERS_IN_GT: 0.9960544, DIVERSITY_GINI: 0.0061985, SHANNON_ENTROPY: 7.9747419, RATIO_DIVERSITY_HERFINDAHL: 0.9907601, RATIO_DIVERSITY_GINI: 0.0101956, RATIO_SHANNON_ENTROPY: 0.5402644, RATIO_AVERAGE_POPULARITY: 4.9542806, RATIO_NOVELTY: 0.2072969, 

MultVAERecommender_PyTorch: New best model found! Updating.
MultVAERecommender_PyTorch: Epoch 1 of 500. Elapse

[I 2024-12-25 12:54:46,154] Trial 13 finished with value: 0.01801190197481806 and parameters: {'batch_size': 200, 'total_anneal_steps': 127945, 'learning_rate': 0.005198053120567999, 'l2_reg': 0.0014079950641827112, 'dropout': 0.9, 'anneal_cap': 0.4, 'encoding_size': 350, 'next_layer_size_multiplier': 1.5, 'max_n_hidden_layers': 1}. Best is trial 4 with value: 0.035636114622839145.


## Some optuna visualizations on recommender parameters

In [14]:
if not config['tune_parameters']:
    optuna_study = optuna.load_study(study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}', storage=f'sqlite:///{config["database_path"]}')
    
fig = optuna.visualization.plot_slice(optuna_study)
fig.show()

In [15]:
if not config['tune_parameters']:
    optuna_study = optuna.load_study(study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}', storage=f'sqlite:///{config["database_path"]}')
    
fig = optuna.visualization.plot_param_importances(optuna_study)
fig.show()

## Let's train the recommender with best parameter values

In [16]:
if config['tune_best_params']:

    if config['tune_parameters']:
        best_params = optuna_study.best_trial.params
    else: 
        with open(f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/best_params_{config["model"]}_{config["metric"]}.json', 'r') as best_params_json:
            best_params = json.load(best_params_json)

    recommender_instance = MultVAERecommender_PyTorch_OptimizerMask(URM_train + URM_validation, use_gpu=True)
    recommender_instance.fit(**best_params)

MultVAERecommender_PyTorch: Epoch 1 of 10. Elapsed time 2.35 sec
MultVAERecommender_PyTorch: Epoch 2 of 10. Elapsed time 4.69 sec
MultVAERecommender_PyTorch: Epoch 3 of 10. Elapsed time 7.03 sec
MultVAERecommender_PyTorch: Epoch 4 of 10. Elapsed time 9.37 sec
MultVAERecommender_PyTorch: Epoch 5 of 10. Elapsed time 11.72 sec
MultVAERecommender_PyTorch: Epoch 6 of 10. Elapsed time 14.06 sec
MultVAERecommender_PyTorch: Epoch 7 of 10. Elapsed time 16.41 sec
MultVAERecommender_PyTorch: Epoch 8 of 10. Elapsed time 18.75 sec
MultVAERecommender_PyTorch: Epoch 9 of 10. Elapsed time 21.10 sec
MultVAERecommender_PyTorch: Epoch 10 of 10. Elapsed time 23.45 sec
MultVAERecommender_PyTorch: Terminating at epoch 10. Elapsed time 23.68 sec


# Testing

Create the recommendations for the submission. 

In [17]:
if config['tune_best_params']:

    data_target_users_test = pd.read_csv('/kaggle/input/recommender-system-2024-challenge-polimi/data_target_users_test.csv')
    create_submission(data_target_users_test, recommender_instance, f'/kaggle/working/submission_{config["model"]}_{config["metric"]}.csv')

Submission file saved as /kaggle/working/submission_MultVAE_MAP.csv


# Save Version on GitHub 

Write or import a json file where best hyperparameters are saved. 

In [18]:
if config['tune_parameters']:
    with open(f'/kaggle/working/best_params_{config["model"]}_{config["metric"]}.json', 'w') as params_file:
        json.dump(optuna_study.best_params, params_file)
        
    if config['save_github']:
        upload_file(
            f'/kaggle/working/best_params_{config["model"]}_{config["metric"]}.json', 
            f'{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/best_params_{config["model"]}_{config["metric"]}.json', 
            f'{config["model"]}_{config["metric"]} tuning results (from kaggle notebook)',
            repo
        )
elif config['copy_prev_best_params']:
    shutil.copyfile(
        f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/'\
        f'best_params_{config["model"]}_{config["metric"]}.json', 
        f'/kaggle/working/best_params_{config["model"]}_{config["metric"]}.json'
    )

File 'TrainedModels/WithoutKFCV/Neural/MultVAERecommender/OptimizingMAP/best_params_MultVAE_MAP.json' created successfully.


Save the history of the tuned model.

In [19]:
if config['save_github'] and config['tune_parameters']:
    upload_file(
        config['database_path'], 
        f'{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/history_{config["model"]}_{config["metric"]}.db',
        f'Tuning {config["model"]}_{config["metric"]} db updated results (from kaggle notebook)',
        repo
    )

File 'TrainedModels/WithoutKFCV/Neural/MultVAERecommender/OptimizingMAP/history_MultVAE_MAP.db' updated successfully.


Save the best trained model and its submission.

In [20]:
if config['save_github'] and config['tune_best_params']: 
    upload_file(
                f'/kaggle/working/submission_{config["model"]}_{config["metric"]}.csv', 
                f'{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/Submission/submission_{config["model"]}_{config["metric"]}.csv', 
                f'New {config["model"]}_{config["metric"]} submission (from kaggle notebook)',
                repo
            )

File 'TrainedModels/WithoutKFCV/Neural/MultVAERecommender/OptimizingMAP/Submission/submission_MultVAE_MAP.csv' created successfully.
