# Set up the connection with GitHub

In [1]:
# ! cd /kaggle/working && rm -rf RECsys_Challenge2024

In [2]:
# Read the GitHub access token from the Kaggle secret labelled "Token".
from kaggle_secrets import UserSecretsClient

token = UserSecretsClient().get_secret("Token")

# Clone the project repository; IPython substitutes the Python variable `token` into the command.
# NOTE(review): the token is part of the clone URL, so it is presumably stored in the clone's
# remote configuration as well — confirm this is acceptable before sharing the working directory.
! git clone https://{token}@github.com/madratak/RECsys_Challenge2024.git

Cloning into 'RECsys_Challenge2024'...
remote: Enumerating objects: 2369, done.[K
remote: Counting objects: 100% (751/751), done.[K
remote: Compressing objects: 100% (286/286), done.[K
remote: Total 2369 (delta 480), reused 711 (delta 447), pack-reused 1618 (from 1)[K
Receiving objects: 100% (2369/2369), 148.51 MiB | 26.88 MiB/s, done.
Resolving deltas: 100% (1425/1425), done.


In [3]:
! pip install PyGithub requests

Collecting PyGithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.5.0-py3-none-any.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.9/375.9 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.5.0 pynacl-1.5.0


In [4]:
# Switch the working directory to the cloned repository; the following cells import its
# packages (e.g. `Utils`, `Recommenders`) by module name.
%cd /kaggle/working/RECsys_Challenge2024 
# Compile the repository's Cython sources with the current Python environment.
! python run_compile_all_cython.py

/kaggle/working/RECsys_Challenge2024
run_compile_all_cython: Found 11 Cython files in 5 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/opt/conda/bin/python'
Compiling [1/11]: MatrixFactorization_Cython_Epoch.pyx... 
In file included from [01m[K/opt/conda/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1929[m[K,
                 from [01m[K/opt/conda/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/opt/conda/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5[m[K,
                 from [01m[KMatrixFactorization_Cython_Epoch.c:1252[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorization_Cython_Epoch.c:[m[K In function '[01m[K__pyx_pf_32MatrixFactorization_Cython_Epoch_32MatrixFactorization_Cython_Epoch_10epochIteration_Cython_ASY_SVD_SGD[m[K':
26255 |         [01;35m[Kfor[m[K (__pyx_t_21 = __

In [5]:
from Utils.notebookFunctions import *
import numpy as np
import pandas as pd
import scipy.sparse as sps
import time
import shutil
import optuna
import json
import os
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit


# Location of the cloned repository inside the Kaggle working directory.
K_PATH = '/kaggle/working/RECsys_Challenge2024'
# Repository-relative folder that holds the per-model artifacts (tuning history, best params, submissions).
GH_PATH = 'TrainedModels/WithoutKFCV/SingleRecommenders'

# Seed NumPy's global random generator.
# NOTE(review): presumably this is what makes the random train/validation split repeatable —
# confirm that the split helper draws from NumPy's global generator.
np.random.seed(42)

## Import the repository

In [6]:
# Get a handle to the GitHub repository using the token read earlier; this handle is
# later passed to `upload_file` when artifacts are pushed back.
repo = get_repo_from_github(token)

Repository 'RECsys_Challenge2024' found.


In [7]:
# Switches and paths that drive the rest of the notebook.
config = dict(
    # Model tag used in file names and in the Optuna study name.
    model='MultiThreadSLIM',
    # Run the Optuna search and store its best parameters.
    tune_parameters=True,
    # Local SQLite file backing the Optuna study.
    database_path='/kaggle/working/history_MultiThreadSLIM.db',
    # When not tuning, copy the repository's best-params JSON into the working directory.
    copy_prev_best_params=False,
    # Fit the recommender on train + validation with the best parameters and build a submission.
    tune_best_params=False,
    # Upload the produced artifacts to the GitHub repository.
    save_github=True,
)

Import the database where previous tuning trials have been saved.

In [8]:
# Start from the tuning history stored in the cloned repository, when there is one.
previous_history_db = f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/history_{config["model"]}.db'

try:
    shutil.copyfile(previous_history_db, config['database_path'])
except FileNotFoundError:
    # Nothing to copy: optuna will create the database itself.
    pass

# Construction of URM and ICM matrices

In [9]:
# Load the user-item interactions and build the User Rating Matrix (URM).
URM_all_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_train.csv")

# The ids are used directly as row/column indices, so each dimension has to be
# (largest id + 1). Counting distinct ids only gives that value when every id in
# 0..max occurs at least once; with any gap csr_matrix rejects the out-of-range index.
n_users = int(URM_all_dataframe["user_id"].max()) + 1
n_items = int(URM_all_dataframe["item_id"].max()) + 1

URM_all = sps.csr_matrix(
    (
        URM_all_dataframe["data"].values,
        (URM_all_dataframe["user_id"].values, URM_all_dataframe["item_id"].values),
    ),
    shape=(n_users, n_items),
)

URM_all

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1764607 stored elements and shape (35736, 38121)>

In [10]:
# Load the item metadata and build the Item Content Matrix (ICM).
ICM_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_ICM_metadata.csv")

# The ids are used directly as row/column indices, so each dimension has to be
# (largest id + 1). Counting distinct ids only gives that value when every id in
# 0..max occurs at least once; with any gap csr_matrix rejects the out-of-range index.
n_items = int(ICM_dataframe["item_id"].max()) + 1
n_features = int(ICM_dataframe["feature_id"].max()) + 1

ICM_all = sps.csr_matrix(
    (
        ICM_dataframe["data"].values,
        (ICM_dataframe["item_id"].values, ICM_dataframe["feature_id"].values),
    ),
    shape=(n_items, n_features),
)

ICM_all

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2940040 stored elements and shape (38121, 94331)>

# Training

In [11]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Split the interactions into a training and a validation matrix (train_percentage = 0.80).
URM_train, URM_validation = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage=0.80,
)

# Evaluator over the validation interactions, reporting metrics at cutoff 10.
evaluator_validation = EvaluatorHoldout(
    URM_validation,
    cutoff_list=[10],
)

EvaluatorHoldout: Ignoring 141 ( 0.4%) Users that have less than 1 test interactions


Check how many cores are available.

In [12]:
from multiprocessing import Pool, cpu_count, shared_memory
# Number of CPUs reported for this machine; passed as `workers` to the recommender's fit.
workers = cpu_count()
workers

4

In [13]:
from Recommenders.SLIM.SLIMElasticNetRecommender import MultiThreadSLIM_SLIMElasticNetRecommender

def objective_function_MultiThreadSLIM(optuna_trial):
    """Optuna objective: fit one SLIM ElasticNet configuration and score it.

    Samples `topK`, `l1_ratio`, `alpha` and `positive_only` from the trial,
    fits a `MultiThreadSLIM_SLIMElasticNetRecommender` on the notebook-level
    `URM_train` (with `workers=workers`) and evaluates it with the
    notebook-level `evaluator_validation`.

    Args:
        optuna_trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The "MAP" entry of the evaluator's result table at cutoff 10.
    """

    start_time = time.time()

    print("\tNew optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training")

    recommender_instance = MultiThreadSLIM_SLIMElasticNetRecommender(URM_train)

    # Search space; `l1_ratio` and `alpha` are sampled on a log scale.
    full_hyperp = {
        "topK": optuna_trial.suggest_int("topK", 0, 1500),
        "l1_ratio": optuna_trial.suggest_float("l1_ratio", 0.01, 1.0, log=True),
        "alpha": optuna_trial.suggest_float("alpha", 1e-4, 1e-3, log=True),
        "positive_only": optuna_trial.suggest_categorical("positive_only", [True, False]),
    }

    recommender_instance.fit(**full_hyperp, workers=workers, verbose=False)

    result_df, _ = evaluator_validation.evaluateRecommender(recommender_instance)

    # The reported duration covers both the fit and the evaluation above.
    new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)
    # "{:.2f}" prints two decimals; the former "{:2f}" only set a minimum width of 2
    # and therefore printed the default six decimals.
    print("Training done in {:.2f}{}.\n".format(new_time_value, new_time_unit))

    return result_df.loc[10]["MAP"]

In [14]:
if config['tune_parameters']:

    # The study lives in the SQLite file; with load_if_exists=True an already
    # stored study of the same name is reused instead of creating a new one.
    study_name = f'hyperparameters_tuning_{config["model"]}'
    storage_url = f'sqlite:///{config["database_path"]}'

    optuna_study = optuna.create_study(
        study_name=study_name,
        storage=storage_url,
        direction='maximize',
        load_if_exists=True,
    )

    # Run 15 trials of the objective in this session.
    optuna_study.optimize(objective_function_MultiThreadSLIM, n_trials=15)

[I 2024-11-24 12:06:13,714] Using an existing study with name 'hyperparameters_tuning_MultiThreadSLIM' instead of creating a new one.


	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 40.69 sec. Users per second: 875


[I 2024-11-24 12:34:43,671] Trial 40 finished with value: 0.05746141445761977 and parameters: {'topK': 807, 'l1_ratio': 0.15003700513920412, 'alpha': 0.0003702204830326689, 'positive_only': False}. Best is trial 29 with value: 0.05753142718903137.


Training done in 28.497055min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 40.55 sec. Users per second: 878


[I 2024-11-24 13:04:32,265] Trial 41 finished with value: 0.057582817722301505 and parameters: {'topK': 801, 'l1_ratio': 0.13427062297804124, 'alpha': 0.0003786348639101555, 'positive_only': False}. Best is trial 41 with value: 0.057582817722301505.


Training done in 29.807982min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 43.25 sec. Users per second: 823


[I 2024-11-24 13:35:50,767] Trial 42 finished with value: 0.057784079938102396 and parameters: {'topK': 782, 'l1_ratio': 0.14160189163649722, 'alpha': 0.00029655923471111264, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 31.306920min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 49.95 sec. Users per second: 713


[I 2024-11-24 14:11:11,589] Trial 43 finished with value: 0.0572604788883739 and parameters: {'topK': 926, 'l1_ratio': 0.09435687639121573, 'alpha': 0.00028514566305332795, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 35.345554min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 34.52 sec. Users per second: 1031


[I 2024-11-24 14:30:56,324] Trial 44 finished with value: 0.05471606387556492 and parameters: {'topK': 1029, 'l1_ratio': 0.2848283222910839, 'alpha': 0.0003902566038710663, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 19.744357min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 43.54 sec. Users per second: 818


[I 2024-11-24 15:03:09,246] Trial 45 finished with value: 0.05774864714813935 and parameters: {'topK': 1128, 'l1_ratio': 0.18621761340795562, 'alpha': 0.00021642159456306906, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 32.214181min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 43.54 sec. Users per second: 818


[I 2024-11-24 15:35:47,959] Trial 46 finished with value: 0.05775467282947057 and parameters: {'topK': 1170, 'l1_ratio': 0.189468112525039, 'alpha': 0.0002167452284036547, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 32.643655min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 46.50 sec. Users per second: 766


[I 2024-11-24 16:10:04,785] Trial 47 finished with value: 0.057685488420210096 and parameters: {'topK': 1496, 'l1_ratio': 0.18221496953430316, 'alpha': 0.0002009127000007633, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 34.278798min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 43.31 sec. Users per second: 822


[I 2024-11-24 16:43:39,681] Trial 48 finished with value: 0.057771669598679715 and parameters: {'topK': 1443, 'l1_ratio': 0.20625032209093871, 'alpha': 0.00020049706851735298, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 33.579926min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 43.93 sec. Users per second: 810


[I 2024-11-24 17:17:21,264] Trial 49 finished with value: 0.05775397494297405 and parameters: {'topK': 1462, 'l1_ratio': 0.1928087750279425, 'alpha': 0.00020581767428347298, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 33.691621min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 44.01 sec. Users per second: 809


[I 2024-11-24 17:50:39,471] Trial 50 finished with value: 0.057762754265912646 and parameters: {'topK': 1288, 'l1_ratio': 0.21241326105968947, 'alpha': 0.00019199056969821008, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 33.302141min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 44.79 sec. Users per second: 795


[I 2024-11-24 18:24:30,052] Trial 51 finished with value: 0.057760054136368845 and parameters: {'topK': 1293, 'l1_ratio': 0.21431510922896094, 'alpha': 0.00019183524961776047, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 33.841661min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 42.70 sec. Users per second: 834


[I 2024-11-24 18:58:15,775] Trial 52 finished with value: 0.057740411864514424 and parameters: {'topK': 1308, 'l1_ratio': 0.2888546476224872, 'alpha': 0.00015135950950835093, 'positive_only': False}. Best is trial 42 with value: 0.057784079938102396.


Training done in 33.760426min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 42.54 sec. Users per second: 837


[I 2024-11-24 19:31:32,460] Trial 53 finished with value: 0.05781717380495106 and parameters: {'topK': 1468, 'l1_ratio': 0.23369103056818588, 'alpha': 0.00019197240515809089, 'positive_only': False}. Best is trial 53 with value: 0.05781717380495106.


Training done in 33.276639min.

	New optuna trial for MultiThreadSLIM_SLIMElasticNetRecommender model training
EvaluatorHoldout: Processed 35595 (100.0%) in 42.99 sec. Users per second: 828


[I 2024-11-24 20:04:59,122] Trial 54 finished with value: 0.057777868079383204 and parameters: {'topK': 1238, 'l1_ratio': 0.2345632380686937, 'alpha': 0.00018337924645777368, 'positive_only': False}. Best is trial 53 with value: 0.05781717380495106.


Training done in 33.442692min.



## Some optuna visualizations on recommender parameters

In [15]:
# When no tuning ran in this session, load the stored study so it can be plotted.
if not config['tune_parameters']:
    optuna_study = optuna.load_study(
        storage=f'sqlite:///{config["database_path"]}',
        study_name=f'hyperparameters_tuning_{config["model"]}',
    )

# Slice plot of the study.
fig = optuna.visualization.plot_slice(optuna_study)
fig.show()

In [16]:
# When no tuning ran in this session, load the stored study so it can be plotted.
if not config['tune_parameters']:
    optuna_study = optuna.load_study(
        storage=f'sqlite:///{config["database_path"]}',
        study_name=f'hyperparameters_tuning_{config["model"]}',
    )

# Hyperparameter-importance plot of the study.
fig = optuna.visualization.plot_param_importances(optuna_study)
fig.show()

## Let's train the recommender with the best parameter values

In [17]:
if config['tune_best_params']:

    # Best hyperparameters: taken from this session's study when tuning ran,
    # otherwise read from the JSON file stored in the cloned repository.
    if config['tune_parameters']:
        best_params = optuna_study.best_trial.params
    else:
        with open(f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/best_params_{config["model"]}.json', 'r') as best_params_json:
            best_params = json.load(best_params_json)

    # The final model is fit on the training and validation interactions together.
    recommender_instance = MultiThreadSLIM_SLIMElasticNetRecommender(URM_train + URM_validation)

    # Pass `workers` explicitly, as the tuning objective does, so the final fit does not
    # fall back to the recommender's default parallelism. A `workers` entry already
    # present in `best_params` takes precedence.
    fit_kwargs = {'workers': workers, **best_params}
    recommender_instance.fit(**fit_kwargs)

# Testing

Create the recommendations for the submission. 

In [18]:
if config['tune_best_params']:

    # Target users file from the challenge input and destination of the submission CSV.
    target_users_csv = '/kaggle/input/recommender-system-2024-challenge-polimi/data_target_users_test.csv'
    submission_csv = f'/kaggle/working/submission_{config["model"]}.csv'

    data_target_users_test = pd.read_csv(target_users_csv)
    create_submission(data_target_users_test, recommender_instance, submission_csv)

# Save Version on GitHub 

Write or import a JSON file where the best hyperparameters are saved.

In [19]:
# File name of the best-params JSON and its location in the Kaggle working directory.
best_params_filename = f'best_params_{config["model"]}.json'
local_best_params_path = f'/kaggle/working/{best_params_filename}'

if config['tune_parameters']:
    # Write the study's best hyperparameters to the working directory.
    with open(local_best_params_path, 'w') as params_file:
        json.dump(optuna_study.best_params, params_file)

    # Optionally upload the JSON to the model's folder in the GitHub repository.
    if config['save_github']:
        upload_file(
            local_best_params_path,
            f'{GH_PATH}/{config["model"]}Recommender/{best_params_filename}',
            f'{config["model"]} tuning results (from kaggle notebook)',
            repo,
        )
elif config['copy_prev_best_params']:
    # No tuning in this session: copy the JSON from the cloned repository instead.
    shutil.copyfile(
        f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/{best_params_filename}',
        local_best_params_path,
    )

File 'TrainedModels/WithoutKFCV/SingleRecommenders/MultiThreadSLIMRecommender/best_params_MultiThreadSLIM.json' updated successfully.


Save the history of the tuned model.

In [20]:
# Upload the Optuna database only when tuning ran and uploading is enabled.
if config['tune_parameters'] and config['save_github']:
    history_repo_path = f'{GH_PATH}/{config["model"]}Recommender/history_{config["model"]}.db'
    history_commit_message = f'Tuning {config["model"]} db updated results (from kaggle notebook)'

    upload_file(config['database_path'], history_repo_path, history_commit_message, repo)

File 'TrainedModels/WithoutKFCV/SingleRecommenders/MultiThreadSLIMRecommender/history_MultiThreadSLIM.db' updated successfully.


Save the best trained model and its submission.

In [21]:
# Upload the submission only when the final model was trained and uploading is enabled.
if config['tune_best_params'] and config['save_github']:
    submission_local_path = f'/kaggle/working/submission_{config["model"]}.csv'
    submission_repo_path = f'{GH_PATH}/{config["model"]}Recommender/Submission/submission_{config["model"]}.csv'
    submission_commit_message = f'New {config["model"]} submission (from kaggle notebook)'

    upload_file(submission_local_path, submission_repo_path, submission_commit_message, repo)