# Set up the connection with GitHub

In [1]:
# Optional cleanup: uncomment to delete an existing RECsys_Challenge2024 folder in /kaggle/working.
# ! cd /kaggle/working && rm -rf RECsys_Challenge2024

In [2]:
from kaggle_secrets import UserSecretsClient

token = UserSecretsClient().get_secret("Token")

! git clone https://{token}@github.com/madratak/RECsys_Challenge2024.git

Cloning into 'RECsys_Challenge2024'...
remote: Enumerating objects: 6727, done.[K
remote: Counting objects: 100% (277/277), done.[K
remote: Compressing objects: 100% (239/239), done.[K
Receiving objects: 100% (6727/6727), 408.86 MiB | 23.24 MiB/s, done.
remote: Total 6727 (delta 105), reused 30 (delta 6), pack-reused 6450 (from 2)[K
Resolving deltas: 100% (3680/3680), done.
Updating files: 100% (503/503), done.


In [3]:
! pip install PyGithub requests

Collecting PyGithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.5.0-py3-none-any.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.9/375.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.5.0 pynacl-1.5.0


In [4]:
# Make the cloned repository the notebook's working directory for the following cells.
# NOTE(review): the later `from Utils...` / `from Recommenders...` imports presumably rely on this.
%cd /kaggle/working/RECsys_Challenge2024 
# Run the repository script that compiles its Cython files with the current Python environment.
! python run_compile_all_cython.py

/kaggle/working/RECsys_Challenge2024
run_compile_all_cython: Found 11 Cython files in 5 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/usr/bin/python3'
Compiling [1/11]: MatrixFactorization_Cython_Epoch.pyx... 
In file included from [01m[K/usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/ndarraytypes.h:1929[m[K,
                 from [01m[K/usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/arrayobject.h:5[m[K,
                 from [01m[KMatrixFactorization_Cython_Epoch.c:1252[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorization_Cython_Epoch.c:[m[K In function ‘[01m[K__pyx_pf_32MatrixFactorization_Cython_Epoch_32MatrixFactorization_Cython_Epoch_10epochIteration_Cython_ASY_SVD_SGD[m[K’:
26256 |         for (__pyx_t_22 = __pyx_v_start_pos_seen_i

In [5]:
from Utils.notebookFunctions import *
import numpy as np
import pandas as pd
import scipy.sparse as sps
import time
import shutil
import optuna
import json
import os
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit


# Absolute path of the cloned repository on the Kaggle machine.
K_PATH = '/kaggle/working/RECsys_Challenge2024'
# Repository-relative folder under which tuning artefacts (history DB, best params, submission) are stored.
GH_PATH = 'TrainedModels/WithKFCV/SLIM'

# Seed NumPy's global random generator.
# NOTE(review): presumably meant to make the random splits reproducible — confirm the split helpers use NumPy's global RNG.
np.random.seed(42)

## Import the repository **RECsys_Challenge2024**

In [6]:
# Obtain the GitHub repository handle using the access token; `repo` is passed to upload_file() in later cells.
repo = get_repo_from_github(token)

Repository 'RECsys_Challenge2024' found.


In [7]:
# Run configuration read by the cells below.
config = {
    # Model name; used to build the study name, file names and repository paths.
    'model': 'SLIM_BPR',
    # Number of folds passed as `k` to split_train_k_folds.
    'n_folds': 5,
    # Metric name; used to build the study name, file names and repository paths.
    'metric': 'MAP',
    # When True, run the Optuna optimisation and write the best parameters to JSON.
    'tune_parameters': True,
    # Local SQLite file used as Optuna storage.
    'database_path': '/kaggle/working/history_SLIM_BPR_MAP.db',
    # When tuning is disabled and this is True, copy the repository's best-params JSON to /kaggle/working.
    'copy_prev_best_params': False,
    # When True, fit the recommender with the best parameters and create the submission file.
    'tune_best_params': True,
    # When True, upload the produced files to GitHub.
    'save_github': True
}

Import the database where previous tuning trials have been saved.

In [8]:
# Location, inside the cloned repository, of the Optuna history saved by earlier runs.
previous_history_db = f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/history_{config["model"]}_{config["metric"]}.db'

try:
    shutil.copyfile(previous_history_db, config['database_path'])
except FileNotFoundError:
    # No previous history available: optuna will create the database itself.
    pass

# Construction of URM and ICM matrices

In [9]:
# Load the interactions; the columns used below are user_id, item_id and data.
URM_all_dataframe = pd.read_csv("/kaggle/working/RECsys_Challenge2024/Dataset/data_train.csv")

urm_user_ids = URM_all_dataframe["user_id"].values
urm_item_ids = URM_all_dataframe["item_id"].values

# Ids are used directly as row/column indices, so each dimension must be
# (largest id + 1). Counting distinct ids only works when the ids are exactly
# 0..n-1; any gap would make the constructor fail with an out-of-bounds index.
# For contiguous ids the resulting shape is unchanged.
n_users = int(urm_user_ids.max()) + 1 if urm_user_ids.size else 0
n_items = int(urm_item_ids.max()) + 1 if urm_item_ids.size else 0

# User Rating Matrix: one row per user id, one column per item id.
URM_all = sps.csr_matrix(
    (URM_all_dataframe["data"].values, (urm_user_ids, urm_item_ids)),
    shape=(n_users, n_items),
)

URM_all

<35736x38121 sparse matrix of type '<class 'numpy.float64'>'
	with 1764607 stored elements in Compressed Sparse Row format>

In [10]:
# Load the item metadata; the columns used below are item_id, feature_id and data.
ICM_dataframe = pd.read_csv("/kaggle/working/RECsys_Challenge2024/Dataset/data_ICM_metadata.csv")

icm_item_ids = ICM_dataframe["item_id"].values
icm_feature_ids = ICM_dataframe["feature_id"].values

# Ids are used directly as row/column indices, so each dimension must be
# (largest id + 1). Counting distinct ids only works when the ids are exactly
# 0..n-1; any gap would make the constructor fail with an out-of-bounds index.
# For contiguous ids the resulting shape is unchanged.
n_items = int(icm_item_ids.max()) + 1 if icm_item_ids.size else 0
n_features = int(icm_feature_ids.max()) + 1 if icm_feature_ids.size else 0

# Item Content Matrix: one row per item id, one column per feature id.
ICM_all = sps.csr_matrix(
    (ICM_dataframe["data"].values, (icm_item_ids, icm_feature_ids)),
    shape=(n_items, n_features),
)

ICM_all

<38121x94331 sparse matrix of type '<class 'numpy.float64'>'
	with 2940040 stored elements in Compressed Sparse Row format>

# Training

In [11]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Data_manager.split_functions.split_train_k_folds import split_train_k_folds

# Split URM_all into a train part and a validation part (train_percentage = 0.80).
# NOTE(review): in this notebook the two parts are only summed back together for the final fit.
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

# K-fold split of URM_all; the Optuna objective iterates over it as (train, validation) pairs.
folds = split_train_k_folds(URM_all, k=config['n_folds'])



In [12]:
from Recommenders.SLIM.SLIM_BPR_Python import SLIM_BPR_Python

def objective_function_SLIM_BPR(optuna_trial):
    """Optuna objective: cross-validated score of SLIM_BPR for one sampled configuration.

    Samples a set of hyperparameters from the trial, fits a fresh
    ``SLIM_BPR_Python`` on the training part of every fold in ``folds`` and
    evaluates it on the matching validation part at cutoff 10.

    Args:
        optuna_trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean over the folds of the metric named by ``config["metric"]``.
    """
    cutoff = 10
    # Score with the configured metric so that the optimised value always matches
    # the study name and the file names, which are built from config["metric"].
    metric = config["metric"]

    full_hyperp = {
                    "topK": optuna_trial.suggest_int("topK", 0, 500),
                    "epochs": optuna_trial.suggest_int("epochs", 1, 50),
                    "lambda_i": optuna_trial.suggest_float("lambda_i", 1e-5, 1e-1, log=True),
                    "lambda_j": optuna_trial.suggest_float("lambda_j", 1e-5, 1e-1, log=True),
                    "learning_rate": optuna_trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)
                }

    validation_results = []

    for URM_train_fold, URM_validation_fold in folds:

        # A new model per fold, trained only on that fold's training interactions.
        recommender_instance = SLIM_BPR_Python(URM_train_fold)
        recommender_instance.fit(**full_hyperp)

        evaluator = EvaluatorHoldout(URM_validation_fold, cutoff_list=[cutoff])
        result_df, _ = evaluator.evaluateRecommender(recommender_instance)

        validation_results.append(result_df.loc[cutoff][metric])

    return np.mean(validation_results)

In [13]:
if config['tune_parameters']:

    tuning_study_name = f'hyperparameters_tuning_{config["model"]}_{config["metric"]}'
    tuning_storage = f'sqlite:///{config["database_path"]}'

    # Reuse the study stored in the SQLite database when it already exists,
    # otherwise create it; the objective value is maximised.
    optuna_study = optuna.create_study(
        study_name=tuning_study_name,
        storage=tuning_storage,
        direction='maximize',
        load_if_exists=True,
    )

    # Run six more trials on top of whatever the study already contains.
    optuna_study.optimize(objective_function_SLIM_BPR, n_trials=6)

[I 2025-01-06 09:56:15,321] Using an existing study with name 'hyperparameters_tuning_SLIM_BPR_MAP' instead of creating a new one.


Epoch 1, Iteration 35736 in 18.12 seconds. Samples per second 1972.36
Epoch 2, Iteration 35736 in 11.94 seconds. Samples per second 2993.85
Epoch 3, Iteration 35736 in 8.83 seconds. Samples per second 4049.21
Epoch 4, Iteration 35736 in 7.24 seconds. Samples per second 4933.09
Epoch 5, Iteration 35736 in 6.12 seconds. Samples per second 5840.73
Epoch 6, Iteration 35736 in 5.39 seconds. Samples per second 6636.05
Epoch 7, Iteration 35736 in 4.84 seconds. Samples per second 7382.21
Epoch 8, Iteration 35736 in 4.67 seconds. Samples per second 7656.45
Epoch 9, Iteration 35736 in 4.18 seconds. Samples per second 8558.54
Epoch 10, Iteration 35736 in 3.98 seconds. Samples per second 8986.43
Epoch 11, Iteration 35736 in 3.88 seconds. Samples per second 9203.29
Epoch 12, Iteration 35736 in 3.71 seconds. Samples per second 9621.17
Epoch 13, Iteration 35736 in 3.57 seconds. Samples per second 10022.96
Epoch 14, Iteration 35736 in 3.51 seconds. Samples per second 10192.22
Epoch 15, Iteration 35736

[I 2025-01-06 10:24:32,272] Trial 35 finished with value: 0.04017609152696382 and parameters: {'topK': 24, 'epochs': 40, 'lambda_i': 0.014383044621009453, 'lambda_j': 4.6398358065652234e-05, 'learning_rate': 0.09395445630129953}. Best is trial 25 with value: 0.04090163381949502.


Epoch 1, Iteration 35736 in 10.80 seconds. Samples per second 3308.42
Epoch 2, Iteration 35736 in 7.89 seconds. Samples per second 4526.49
Epoch 3, Iteration 35736 in 6.46 seconds. Samples per second 5532.73
Epoch 4, Iteration 35736 in 5.48 seconds. Samples per second 6521.49
Epoch 5, Iteration 35736 in 4.87 seconds. Samples per second 7343.76
Epoch 6, Iteration 35736 in 4.50 seconds. Samples per second 7949.16
Epoch 7, Iteration 35736 in 4.06 seconds. Samples per second 8800.22
Epoch 8, Iteration 35736 in 4.03 seconds. Samples per second 8860.77
Epoch 9, Iteration 35736 in 3.79 seconds. Samples per second 9418.76
Epoch 10, Iteration 35736 in 3.52 seconds. Samples per second 10145.86
Epoch 11, Iteration 35736 in 3.46 seconds. Samples per second 10342.56
Epoch 12, Iteration 35736 in 3.35 seconds. Samples per second 10658.78
Epoch 13, Iteration 35736 in 3.42 seconds. Samples per second 10446.69
Epoch 14, Iteration 35736 in 3.29 seconds. Samples per second 10873.83
Epoch 15, Iteration 357

[I 2025-01-06 10:54:44,361] Trial 36 finished with value: 0.02965150432893593 and parameters: {'topK': 469, 'epochs': 29, 'lambda_i': 0.034235563261137136, 'lambda_j': 2.714967511637935e-05, 'learning_rate': 0.021552457998884066}. Best is trial 25 with value: 0.04090163381949502.


Epoch 1, Iteration 35736 in 10.55 seconds. Samples per second 3388.53
Epoch 2, Iteration 35736 in 7.99 seconds. Samples per second 4473.54
Epoch 3, Iteration 35736 in 6.29 seconds. Samples per second 5676.96
Epoch 4, Iteration 35736 in 5.39 seconds. Samples per second 6632.04
Epoch 5, Iteration 35736 in 4.75 seconds. Samples per second 7527.70
Epoch 6, Iteration 35736 in 4.48 seconds. Samples per second 7982.89
Epoch 7, Iteration 35736 in 4.02 seconds. Samples per second 8887.77
Epoch 8, Iteration 35736 in 3.85 seconds. Samples per second 9273.54
Epoch 9, Iteration 35736 in 3.70 seconds. Samples per second 9656.25
Epoch 10, Iteration 35736 in 3.49 seconds. Samples per second 10236.25
Epoch 11, Iteration 35736 in 3.47 seconds. Samples per second 10294.46
Epoch 12, Iteration 35736 in 3.34 seconds. Samples per second 10712.51
Epoch 13, Iteration 35736 in 3.33 seconds. Samples per second 10732.15
Epoch 14, Iteration 35736 in 3.37 seconds. Samples per second 10590.25
Epoch 15, Iteration 357

[I 2025-01-06 11:25:35,866] Trial 37 finished with value: 0.03123923088165621 and parameters: {'topK': 388, 'epochs': 34, 'lambda_i': 0.004985126829308985, 'lambda_j': 0.00012076796144957506, 'learning_rate': 0.05111442843438457}. Best is trial 25 with value: 0.04090163381949502.


Epoch 1, Iteration 35736 in 10.81 seconds. Samples per second 3306.57
Epoch 2, Iteration 35736 in 7.93 seconds. Samples per second 4507.15
Epoch 3, Iteration 35736 in 6.36 seconds. Samples per second 5618.48
Epoch 4, Iteration 35736 in 5.43 seconds. Samples per second 6578.27
Epoch 5, Iteration 35736 in 4.96 seconds. Samples per second 7201.71
Epoch 6, Iteration 35736 in 4.39 seconds. Samples per second 8144.84
Epoch 7, Iteration 35736 in 4.14 seconds. Samples per second 8622.24
Epoch 8, Iteration 35736 in 4.01 seconds. Samples per second 8907.02
Epoch 9, Iteration 35736 in 3.82 seconds. Samples per second 9361.25
Epoch 10, Iteration 35736 in 3.65 seconds. Samples per second 9798.30
Epoch 11, Iteration 35736 in 3.59 seconds. Samples per second 9942.68
Epoch 12, Iteration 35736 in 3.44 seconds. Samples per second 10385.32
Epoch 13, Iteration 35736 in 3.43 seconds. Samples per second 10433.14
Epoch 14, Iteration 35736 in 3.30 seconds. Samples per second 10836.56
Epoch 15, Iteration 35736

[I 2025-01-06 11:55:51,629] Trial 38 finished with value: 0.0329995209507064 and parameters: {'topK': 272, 'epochs': 38, 'lambda_i': 0.00861873131882086, 'lambda_j': 0.004624294579047549, 'learning_rate': 0.07720245549804158}. Best is trial 25 with value: 0.04090163381949502.


Epoch 1, Iteration 35736 in 10.73 seconds. Samples per second 3329.51
Epoch 2, Iteration 35736 in 7.77 seconds. Samples per second 4600.00
Epoch 3, Iteration 35736 in 6.24 seconds. Samples per second 5727.16
Epoch 4, Iteration 35736 in 5.41 seconds. Samples per second 6607.18
Epoch 5, Iteration 35736 in 4.78 seconds. Samples per second 7470.14
Epoch 6, Iteration 35736 in 4.37 seconds. Samples per second 8176.52
Epoch 7, Iteration 35736 in 4.21 seconds. Samples per second 8488.83
Epoch 8, Iteration 35736 in 3.81 seconds. Samples per second 9375.09
Epoch 9, Iteration 35736 in 3.68 seconds. Samples per second 9714.95
Epoch 10, Iteration 35736 in 3.58 seconds. Samples per second 9985.27
Epoch 11, Iteration 35736 in 3.47 seconds. Samples per second 10298.50
Epoch 12, Iteration 35736 in 3.40 seconds. Samples per second 10512.33
Epoch 13, Iteration 35736 in 3.34 seconds. Samples per second 10712.75
Epoch 14, Iteration 35736 in 3.28 seconds. Samples per second 10884.85
Epoch 15, Iteration 3573

[I 2025-01-06 12:24:10,607] Trial 39 finished with value: 0.03114898949972079 and parameters: {'topK': 75, 'epochs': 41, 'lambda_i': 0.03374534361810787, 'lambda_j': 7.676099375971975e-05, 'learning_rate': 0.00025218061449998303}. Best is trial 25 with value: 0.04090163381949502.


Epoch 1, Iteration 35736 in 10.80 seconds. Samples per second 3309.08
Epoch 2, Iteration 35736 in 7.79 seconds. Samples per second 4589.10
Epoch 3, Iteration 35736 in 6.41 seconds. Samples per second 5578.87
Epoch 4, Iteration 35736 in 5.56 seconds. Samples per second 6426.20
Epoch 5, Iteration 35736 in 4.85 seconds. Samples per second 7370.74
Epoch 6, Iteration 35736 in 4.52 seconds. Samples per second 7901.42
Epoch 7, Iteration 35736 in 4.16 seconds. Samples per second 8585.89
Epoch 8, Iteration 35736 in 3.93 seconds. Samples per second 9085.81
Epoch 9, Iteration 35736 in 3.80 seconds. Samples per second 9414.02
Epoch 10, Iteration 35736 in 3.56 seconds. Samples per second 10037.19
Epoch 11, Iteration 35736 in 3.53 seconds. Samples per second 10130.95
Epoch 12, Iteration 35736 in 3.46 seconds. Samples per second 10316.17
Epoch 13, Iteration 35736 in 3.38 seconds. Samples per second 10579.09
Epoch 14, Iteration 35736 in 3.30 seconds. Samples per second 10835.78
Epoch 15, Iteration 357

[I 2025-01-06 12:47:00,435] Trial 40 finished with value: 0.034891086386285707 and parameters: {'topK': 30, 'epochs': 25, 'lambda_i': 0.0640811970265339, 'lambda_j': 2.5629203197373537e-05, 'learning_rate': 0.02447576222001349}. Best is trial 25 with value: 0.04090163381949502.


## Some Optuna visualizations of the recommender hyperparameters

In [14]:
# When tuning was skipped, no study object exists yet: load it from the database.
if not config['tune_parameters']:
    optuna_study = optuna.load_study(
        study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}',
        storage=f'sqlite:///{config["database_path"]}',
    )

# Slice plot of the study's trials.
fig = optuna.visualization.plot_slice(optuna_study)
fig.show()

In [15]:
# When tuning was skipped, load the study from the database before plotting.
if not config['tune_parameters']:
    optuna_study = optuna.load_study(
        study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}',
        storage=f'sqlite:///{config["database_path"]}',
    )

# Hyperparameter importances estimated from the study's trials.
fig = optuna.visualization.plot_param_importances(optuna_study)
fig.show()

## Let's train the recommender with the best hyperparameter values

In [16]:
if config['tune_best_params']:

    if not config['tune_parameters']:
        # No tuning in this run: use the best parameters stored in the repository.
        stored_best_params = f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/best_params_{config["model"]}_{config["metric"]}.json'
        with open(stored_best_params, 'r') as stored_params_file:
            best_params = json.load(stored_params_file)
    else:
        # Tuning just ran: take the parameters of the best trial of the study.
        best_params = optuna_study.best_trial.params

    # Fit the final model on the sum of the train and validation matrices.
    URM_train_validation = URM_train + URM_validation
    recommender_instance = SLIM_BPR_Python(URM_train_validation)
    recommender_instance.fit(**best_params)

Epoch 1, Iteration 35736 in 11.70 seconds. Samples per second 3054.85
Epoch 2, Iteration 35736 in 8.41 seconds. Samples per second 4251.70
Epoch 3, Iteration 35736 in 6.53 seconds. Samples per second 5469.24
Epoch 4, Iteration 35736 in 5.44 seconds. Samples per second 6568.23
Epoch 5, Iteration 35736 in 4.83 seconds. Samples per second 7392.22
Epoch 6, Iteration 35736 in 4.32 seconds. Samples per second 8270.63
Epoch 7, Iteration 35736 in 4.12 seconds. Samples per second 8677.71
Epoch 8, Iteration 35736 in 3.78 seconds. Samples per second 9451.50
Epoch 9, Iteration 35736 in 3.68 seconds. Samples per second 9722.76
Epoch 10, Iteration 35736 in 3.62 seconds. Samples per second 9864.98
Epoch 11, Iteration 35736 in 3.42 seconds. Samples per second 10463.37
Epoch 12, Iteration 35736 in 3.40 seconds. Samples per second 10508.57
Epoch 13, Iteration 35736 in 3.37 seconds. Samples per second 10611.73
Epoch 14, Iteration 35736 in 3.30 seconds. Samples per second 10831.11
Epoch 15, Iteration 3573

# Testing

Create the recommendations for the submission. 

In [17]:
if config['tune_best_params']:

    # Users for which recommendations must be produced.
    data_target_users_test = pd.read_csv('/kaggle/working/RECsys_Challenge2024/Dataset/data_target_users_test.csv')

    # Write the recommendations of the trained model to the submission CSV.
    submission_csv = f'/kaggle/working/submission_{config["model"]}_{config["metric"]}.csv'
    create_submission(data_target_users_test, recommender_instance, submission_csv)

Submission file saved as /kaggle/working/submission_SLIM_BPR_MAP.csv


# Save Version on GitHub 

Write (or copy from the repository) the JSON file in which the best hyperparameters are saved.

In [18]:
# Local copy of the best-parameters file.
local_best_params = f'/kaggle/working/best_params_{config["model"]}_{config["metric"]}.json'

if config['tune_parameters']:
    # Persist the best parameters found by the study.
    with open(local_best_params, 'w') as params_file:
        json.dump(optuna_study.best_params, params_file)

    if config['save_github']:
        github_best_params = f'{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/best_params_{config["model"]}_{config["metric"]}.json'
        commit_message = f'{config["model"]}_{config["metric"]} tuning results (from kaggle notebook)'
        upload_file(local_best_params, github_best_params, commit_message, repo)
elif config['copy_prev_best_params']:
    # No tuning in this run: reuse the best parameters already stored in the repository.
    previous_best_params = (
        f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/'
        f'best_params_{config["model"]}_{config["metric"]}.json'
    )
    shutil.copyfile(previous_best_params, local_best_params)

File 'TrainedModels/WithKFCV/SLIM/SLIM_BPRRecommender/OptimizingMAP/best_params_SLIM_BPR_MAP.json' updated successfully.


Save the history of the tuned model.

In [19]:
# Upload the updated Optuna history only when tuning actually ran.
if config['tune_parameters'] and config['save_github']:
    github_history_db = f'{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/history_{config["model"]}_{config["metric"]}.db'
    commit_message = f'Tuning {config["model"]}_{config["metric"]} db updated results (from kaggle notebook)'
    upload_file(config['database_path'], github_history_db, commit_message, repo)

File 'TrainedModels/WithKFCV/SLIM/SLIM_BPRRecommender/OptimizingMAP/history_SLIM_BPR_MAP.db' updated successfully.


Save the best trained model and its submission.

In [20]:
# Upload the submission only when the final model was trained in this run.
if config['tune_best_params'] and config['save_github']:
    local_submission = f'/kaggle/working/submission_{config["model"]}_{config["metric"]}.csv'
    github_submission = f'{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/Submission/submission_{config["model"]}_{config["metric"]}.csv'
    commit_message = f'New {config["model"]}_{config["metric"]} submission (from kaggle notebook)'
    upload_file(local_submission, github_submission, commit_message, repo)

File 'TrainedModels/WithKFCV/SLIM/SLIM_BPRRecommender/OptimizingMAP/Submission/submission_SLIM_BPR_MAP.csv' updated successfully.
