# Set up the connection with GitHub

In [1]:
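# Uncomment to delete a previous clone before re-running the clone cell below
# (git refuses to clone into an existing, non-empty directory):
# ! cd /kaggle/working && rm -rf RECsys_Challenge2024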

In [2]:
from kaggle_secrets import UserSecretsClient

# Read the GitHub access token from the Kaggle secret named "Token" so that it
# never appears in the notebook source. `token` is reused later to open the
# repository through the GitHub API.
token = UserSecretsClient().get_secret("Token")

# IPython substitutes {token} into the shell command before running it.
# NOTE(review): git normally stores the remote URL (token included) in the
# clone's .git/config, and may echo it in error messages — confirm this is
# acceptable before sharing the working directory or the cell output.
! git clone https://{token}@github.com/madratak/RECsys_Challenge2024.git

Cloning into 'RECsys_Challenge2024'...
remote: Enumerating objects: 5079, done.[K
remote: Counting objects: 100% (151/151), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 5079 (delta 74), reused 82 (delta 32), pack-reused 4928 (from 3)[K
Receiving objects: 100% (5079/5079), 379.22 MiB | 38.93 MiB/s, done.
Resolving deltas: 100% (2923/2923), done.
Updating files: 100% (423/423), done.


In [3]:
! pip install PyGithub requests

Collecting PyGithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.5.0-py3-none-any.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.9/375.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.5.0 pynacl-1.5.0


In [4]:
! pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [5]:
# Make the cloned repository the kernel's working directory. The later imports
# of `Utils`, `Evaluation`, `Data_manager` and `Recommenders` presumably resolve
# relative to this directory — TODO confirm.
%cd /kaggle/working/RECsys_Challenge2024
# Compile the repository's Cython sources with the current Python environment
# (the script reports how many .pyx files it found and builds each of them).
! python run_compile_all_cython.py

/kaggle/working/RECsys_Challenge2024
run_compile_all_cython: Found 11 Cython files in 5 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/usr/bin/python3'
Compiling [1/11]: MatrixFactorizationImpressions_Cython_Epoch.pyx... 
In file included from [01m[K/usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/ndarraytypes.h:1929[m[K,
                 from [01m[K/usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/arrayobject.h:5[m[K,
                 from [01m[KMatrixFactorizationImpressions_Cython_Epoch.c:1252[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorizationImpressions_Cython_Epoch.c:[m[K In function ‘[01m[K__pyx_f_43MatrixFactorizationImpressions_Cython_Epoch_32MatrixFactorization_Cython_Epoch_sampleBPR_Cython[m[K’:
30353 |       [01;35m[K__pyx_t_4

In [6]:
from Utils.notebookFunctions import *
import numpy as np
import pandas as pd
import scipy.sparse as sps
import time
import shutil
import optuna
import json
import os
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit


# Absolute path of the cloned repository inside the Kaggle working directory.
K_PATH = '/kaggle/working/RECsys_Challenge2024'
# Repository-relative folder under which tuning artefacts (history database,
# best-params JSON, submission) for this model family are stored.
GH_PATH = 'TrainedModels/WithoutKFCV/MatrixFactorization'

# Seed NumPy's global random generator.
# NOTE(review): presumably this is what makes the train/validation split
# reproducible — confirm that the split helper draws from np.random.
np.random.seed(42)

## Import the repository **RECsys_Challenge2024**

In [7]:
# Open the GitHub repository with the token read from the Kaggle secrets.
# `repo` is the handle passed to every `upload_file` call further down.
# (`get_repo_from_github` presumably comes from the `Utils.notebookFunctions`
# star-import — TODO confirm.)
repo = get_repo_from_github(token)

Repository 'RECsys_Challenge2024' found.


In [8]:
# Run configuration. 'model' and 'metric' are only used to build file names,
# study names and repository paths; the boolean flags switch whole stages of the
# notebook on or off.
config = dict(
    model='FasterIALS',
    metric='Recall',
    # Run the Optuna search (otherwise an existing study is loaded for the plots).
    tune_parameters=True,
    # Local SQLite file used as Optuna storage.
    database_path='/kaggle/working/history_FasterIALS_Recall.db',
    # When not tuning, copy the best-params JSON stored in the repository.
    copy_prev_best_params=False,
    # Fit the final model with the best hyperparameters and write a submission.
    tune_best_params=True,
    # Upload the produced artefacts to GitHub.
    save_github=True,
)

Import the database where previous tuning trials have been saved.

In [9]:
# Start from the tuning history stored in the cloned repository, if there is
# one, so that new trials are appended to the previous ones.
previous_history_db = f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/history_{config["model"]}_{config["metric"]}.db'

try:
    shutil.copyfile(previous_history_db, config['database_path'])
except FileNotFoundError:
    # No history for this model/metric yet: Optuna will create the database.
    pass

# Construction of URM and ICM matrices

In [10]:
URM_all_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_train.csv")

# The ids are used directly as row/column indices, so the matrix must be sized
# from the largest id. Counting distinct ids gives the same result only when the
# ids are dense (0..n-1); with a gap, csr_matrix would reject the out-of-range
# indices.
n_users = int(URM_all_dataframe["user_id"].max()) + 1
n_items = int(URM_all_dataframe["item_id"].max()) + 1

# User Rating Matrix: one row per user, one column per item, values taken from
# the "data" column.
URM_all = sps.csr_matrix((URM_all_dataframe["data"].values,
                          (URM_all_dataframe["user_id"].values, URM_all_dataframe["item_id"].values)),
                        shape = (n_users, n_items))

URM_all

<35736x38121 sparse matrix of type '<class 'numpy.float64'>'
	with 1764607 stored elements in Compressed Sparse Row format>

In [11]:
ICM_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_ICM_metadata.csv")

# The ids are used directly as row/column indices, so the matrix must be sized
# from the largest id. Counting distinct ids gives the same result only when the
# ids are dense (0..n-1); with a gap, csr_matrix would reject the out-of-range
# indices.
# NOTE(review): this overwrites the `n_items` computed from the interactions;
# confirm that both files cover the same item id range.
n_items = int(ICM_dataframe["item_id"].max()) + 1
n_features = int(ICM_dataframe["feature_id"].max()) + 1

# Item Content Matrix: one row per item, one column per feature, values taken
# from the "data" column.
ICM_all = sps.csr_matrix((ICM_dataframe["data"].values,
                          (ICM_dataframe["item_id"].values, ICM_dataframe["feature_id"].values)),
                        shape = (n_items, n_features))

ICM_all

<38121x94331 sparse matrix of type '<class 'numpy.float64'>'
	with 2940040 stored elements in Compressed Sparse Row format>

# Training

In [12]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Split the interactions into a training and a validation matrix, requesting
# 80% for training. (Presumably a random sample over all interactions, as the
# helper's name suggests — TODO confirm.)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

# Evaluate at cutoff 50 only; the Optuna objective below reads its score from
# the result row indexed by this same cutoff.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[50])

EvaluatorHoldout: Ignoring 141 ( 0.4%) Users that have less than 1 test interactions


In [13]:
from Recommenders.MatrixFactorization.FasterIALSRecommender import FasterIALSRecommender

def objective_function_FasterIALS(optuna_trial):
    """Optuna objective: fit FasterIALS with sampled hyperparameters and score it.

    A fresh recommender is built on the global `URM_train`, fitted with the
    hyperparameters sampled from `optuna_trial`, and evaluated with the global
    `evaluator_validation`.

    Args:
        optuna_trial: Optuna trial used to sample 'factors', 'regularization',
            'iterations' and 'alpha'.

    Returns:
        The "RECALL" entry of the evaluation result row at cutoff 50.
    """
    model = FasterIALSRecommender(URM_train)

    # Sampled one by one, in a fixed order: factors, regularization,
    # iterations, alpha.
    sampled_hyperp = {}
    sampled_hyperp['factors'] = optuna_trial.suggest_int('factors', 10, 500)
    sampled_hyperp['regularization'] = optuna_trial.suggest_float('regularization', 1e-4, 0.1, log=True)
    sampled_hyperp['iterations'] = optuna_trial.suggest_int('iterations', 50, 500)
    sampled_hyperp['alpha'] = optuna_trial.suggest_float('alpha', 0.1, 50, log=True)

    model.fit(**sampled_hyperp)

    results_by_cutoff, _ = evaluator_validation.evaluateRecommender(model)

    # The evaluator was created with cutoff_list=[50], hence the row label 50.
    row_at_cutoff = results_by_cutoff.loc[50]
    return row_at_cutoff["RECALL"]

In [14]:
if config['tune_parameters']:

    study_name = f'hyperparameters_tuning_{config["model"]}_{config["metric"]}'
    storage_url = f'sqlite:///{config["database_path"]}'

    # load_if_exists=True resumes the study when the database already holds one
    # with this name, so earlier trials are kept and new ones are appended.
    optuna_study = optuna.create_study(
        study_name=study_name,
        storage=storage_url,
        direction='maximize',
        load_if_exists=True,
    )

    # Each run of this cell adds 10 trials to the study.
    optuna_study.optimize(objective_function_FasterIALS, n_trials=10)

[I 2024-12-30 19:06:25,337] A new study created in RDB with name: hyperparameters_tuning_FasterIALS_Recall
  check_blas_config()


  0%|          | 0/313 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 42.88 sec. Users per second: 830


[I 2024-12-30 19:13:39,709] Trial 0 finished with value: 0.11623924962387941 and parameters: {'factors': 158, 'regularization': 0.012607162881797813, 'iterations': 313, 'alpha': 0.5005603456478573}. Best is trial 0 with value: 0.11623924962387941.


  0%|          | 0/124 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 41.64 sec. Users per second: 855


[I 2024-12-30 19:16:00,584] Trial 1 finished with value: 0.16430108972364468 and parameters: {'factors': 80, 'regularization': 0.0010375040478024678, 'iterations': 124, 'alpha': 3.7200517764309904}. Best is trial 1 with value: 0.16430108972364468.


  0%|          | 0/264 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 44.50 sec. Users per second: 800


[I 2024-12-30 19:50:18,613] Trial 2 finished with value: 0.2436065234381424 and parameters: {'factors': 274, 'regularization': 0.00028391966588908204, 'iterations': 264, 'alpha': 46.60816400207643}. Best is trial 2 with value: 0.2436065234381424.


  0%|          | 0/412 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 43.30 sec. Users per second: 822


[I 2024-12-30 19:59:20,035] Trial 3 finished with value: 0.18026944124902677 and parameters: {'factors': 198, 'regularization': 0.0002905343800440704, 'iterations': 412, 'alpha': 1.7235139689886068}. Best is trial 2 with value: 0.2436065234381424.


  0%|          | 0/54 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 42.20 sec. Users per second: 843


[I 2024-12-30 20:01:04,630] Trial 4 finished with value: 0.10930090805789232 and parameters: {'factors': 171, 'regularization': 0.06277053449059147, 'iterations': 54, 'alpha': 0.40189151500275555}. Best is trial 2 with value: 0.2436065234381424.


  0%|          | 0/274 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 43.43 sec. Users per second: 820


[I 2024-12-30 20:30:04,862] Trial 5 finished with value: 0.2348716282623931 and parameters: {'factors': 212, 'regularization': 0.006084829321429428, 'iterations': 274, 'alpha': 33.90973191023568}. Best is trial 2 with value: 0.2436065234381424.


  0%|          | 0/186 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 44.17 sec. Users per second: 806


[I 2024-12-30 20:53:58,489] Trial 6 finished with value: 0.24277880692380363 and parameters: {'factors': 271, 'regularization': 0.002490041859256992, 'iterations': 186, 'alpha': 34.32890807226678}. Best is trial 2 with value: 0.2436065234381424.


  0%|          | 0/67 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 38.31 sec. Users per second: 929


[I 2024-12-30 20:55:06,951] Trial 7 finished with value: 0.11882563519711478 and parameters: {'factors': 17, 'regularization': 0.03183782457051688, 'iterations': 67, 'alpha': 34.90444156840907}. Best is trial 2 with value: 0.2436065234381424.


  0%|          | 0/50 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 44.66 sec. Users per second: 797


[I 2024-12-30 21:01:45,354] Trial 8 finished with value: 0.14065225519234936 and parameters: {'factors': 246, 'regularization': 0.0019379807213223863, 'iterations': 50, 'alpha': 0.6646890803023725}. Best is trial 2 with value: 0.2436065234381424.


  0%|          | 0/416 [00:00<?, ?it/s]

EvaluatorHoldout: Processed 35595 (100.0%) in 48.95 sec. Users per second: 727


[I 2024-12-30 22:20:33,530] Trial 9 finished with value: 0.2519049073578284 and parameters: {'factors': 405, 'regularization': 0.01471223031246778, 'iterations': 416, 'alpha': 18.398914241452104}. Best is trial 9 with value: 0.2519049073578284.


## Optuna visualizations of the recommender hyperparameters

In [15]:
# When tuning was skipped in this run, no study object exists yet: load it from
# the SQLite history database instead.
if not config['tune_parameters']:
    optuna_study = optuna.load_study(study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}', storage=f'sqlite:///{config["database_path"]}')
    
# Slice plot of the study's trials, one panel per hyperparameter.
fig = optuna.visualization.plot_slice(optuna_study)
fig.show()

In [16]:
# Same guard as in the previous cell, repeated so that this cell can also be run
# on its own when tuning was skipped.
if not config['tune_parameters']:
    optuna_study = optuna.load_study(study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}', storage=f'sqlite:///{config["database_path"]}')
    
# Estimated importance of each hyperparameter for the objective value.
fig = optuna.visualization.plot_param_importances(optuna_study)
fig.show()

## Train the recommender with the best hyperparameter values

In [17]:
if config['tune_best_params']:

    if not config['tune_parameters']:
        # No tuning in this run: reuse the best hyperparameters stored in the
        # cloned repository.
        stored_params_path = f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/best_params_{config["model"]}_{config["metric"]}.json'
        with open(stored_params_path, 'r') as best_params_json:
            best_params = json.load(best_params_json)
    else:
        # Tuning ran in this session: take the parameters of the best trial.
        best_params = optuna_study.best_trial.params

    # The final model is fitted on training and validation interactions together.
    URM_train_validation = URM_train + URM_validation
    recommender_instance = FasterIALSRecommender(URM_train_validation)
    recommender_instance.fit(**best_params)

  0%|          | 0/416 [00:00<?, ?it/s]

# Testing

Create the recommendations for the submission. 

In [18]:
if config['tune_best_params']:

    target_users_csv = '/kaggle/input/recommender-system-2024-challenge-polimi/data_target_users_test.csv'
    submission_csv = f'/kaggle/working/submission_{config["model"]}_{config["metric"]}.csv'

    # Users for which recommendations must be submitted.
    data_target_users_test = pd.read_csv(target_users_csv)

    # Write the recommendations of the fitted model to the submission file.
    create_submission(data_target_users_test, recommender_instance, submission_csv)

Submission file saved as /kaggle/working/submission_FasterIALS_Recall.csv


# Save Version on GitHub 

Write or import a JSON file in which the best hyperparameters are saved.

In [19]:
# Location of the best-hyperparameters JSON in the Kaggle working directory.
local_params_json = f'/kaggle/working/best_params_{config["model"]}_{config["metric"]}.json'

if config['tune_parameters']:
    # Persist the best hyperparameters of the study tuned in this run.
    with open(local_params_json, 'w') as params_file:
        json.dump(optuna_study.best_params, params_file)

    if config['save_github']:
        repo_params_json = f'{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/best_params_{config["model"]}_{config["metric"]}.json'
        upload_message = f'{config["model"]}_{config["metric"]} tuning results (from kaggle notebook)'
        upload_file(local_params_json, repo_params_json, upload_message, repo)
elif config['copy_prev_best_params']:
    # No tuning in this run: copy the JSON stored in the cloned repository.
    previous_params_json = (
        f'{K_PATH}/{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/'
        f'best_params_{config["model"]}_{config["metric"]}.json'
    )
    shutil.copyfile(previous_params_json, local_params_json)

File 'TrainedModels/WithoutKFCV/MatrixFactorization/FasterIALSRecommender/OptimizingRecall/best_params_FasterIALS_Recall.json' created successfully.


Save the history of the tuned model.

In [20]:
# Only upload the history database when this run actually added trials to it.
if config['tune_parameters'] and config['save_github']:
    repo_history_db = f'{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/history_{config["model"]}_{config["metric"]}.db'
    upload_message = f'Tuning {config["model"]}_{config["metric"]} db updated results (from kaggle notebook)'
    upload_file(config['database_path'], repo_history_db, upload_message, repo)

File 'TrainedModels/WithoutKFCV/MatrixFactorization/FasterIALSRecommender/OptimizingRecall/history_FasterIALS_Recall.db' created successfully.


Save the best trained model and its submission.

In [21]:
# The submission file only exists when the final model was trained in this run.
if config['tune_best_params'] and config['save_github']:
    local_submission_csv = f'/kaggle/working/submission_{config["model"]}_{config["metric"]}.csv'
    repo_submission_csv = f'{GH_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/Submission/submission_{config["model"]}_{config["metric"]}.csv'
    upload_message = f'New {config["model"]}_{config["metric"]} submission (from kaggle notebook)'
    upload_file(local_submission_csv, repo_submission_csv, upload_message, repo)

File 'TrainedModels/WithoutKFCV/MatrixFactorization/FasterIALSRecommender/OptimizingRecall/Submission/submission_FasterIALS_Recall.csv' created successfully.
