# Set up the connection with GitHub

In [1]:
# ! cd /kaggle/working && rm -rf RECsys_Challenge2024

In [2]:
# Fetch the GitHub personal access token stored under the secret name "Token".
from kaggle_secrets import UserSecretsClient

token = UserSecretsClient().get_secret("Token")

# Clone the project repository; IPython substitutes {token} into the shell command.
# NOTE(review): the token becomes part of the remote URL, which git normally records in the
# clone's configuration -- confirm this is acceptable before sharing the working directory.
# NOTE(review): re-running this cell fails if the target folder already exists.
! git clone https://{token}@github.com/madratak/RECsys_Challenge2024.git

Cloning into 'RECsys_Challenge2024'...
remote: Enumerating objects: 1880, done.[K
remote: Counting objects: 100% (262/262), done.[K
remote: Compressing objects: 100% (104/104), done.[K
remote: Total 1880 (delta 153), reused 260 (delta 152), pack-reused 1618 (from 1)[K
Receiving objects: 100% (1880/1880), 72.34 MiB | 25.44 MiB/s, done.
Resolving deltas: 100% (1098/1098), done.


In [3]:
! pip install PyGithub requests

Collecting PyGithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.5.0-py3-none-any.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.9/375.9 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.5.0 pynacl-1.5.0


In [4]:
# Build the repository's Cython modules by running its run_compile_all_cython.py script
# from the repository root.
! cd /kaggle/working/RECsys_Challenge2024 && python run_compile_all_cython.py

run_compile_all_cython: Found 11 Cython files in 5 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/opt/conda/bin/python'
Compiling [1/11]: MatrixFactorizationImpressions_Cython_Epoch.pyx... 
In file included from [01m[K/opt/conda/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1929[m[K,
                 from [01m[K/opt/conda/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/opt/conda/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5[m[K,
                 from [01m[KMatrixFactorizationImpressions_Cython_Epoch.c:1252[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorizationImpressions_Cython_Epoch.c:[m[K In function '[01m[K__pyx_f_43MatrixFactorizationImpressions_Cython_Epoch_32MatrixFactorization_Cython_Epoch_sampleBPR_Cython[m[K':
30351 |       [01;35m[K__pyx_t_4 = (__pyx_v_start_pos_impression

In [17]:
import numpy as np
import pandas as pd
import scipy.sparse as sps
import time
import shutil
import optuna
import json
import os

# Seed NumPy's global random state so code that draws from it is repeatable across runs.
np.random.seed(42)

## Import the repository

In [6]:
from github import Github, Auth

# Build an authenticated GitHub client from the personal access token
auth_token = Auth.Token(token)
github_client = Github(auth=auth_token)

# Name of the repository that later cells upload results to
target_repo_name = 'RECsys_Challenge2024'
repo = None

# Scan the repositories visible to the authenticated user and keep the first name match;
# any API failure is reported and leaves `repo` as None.
try:
    repo = next(
        (
            candidate
            for candidate in github_client.get_user().get_repos()
            if candidate.name == target_repo_name
        ),
        None,
    )
    if repo is not None:
        print(f"Repository '{target_repo_name}' found.")
    else:
        print(f"Repository '{target_repo_name}' not found.")
except Exception as e:
    print("An error occurred while accessing the repositories:", e)

Repository 'RECsys_Challenge2024' found.


In [7]:
def upload_file(filepath_kaggle, filepath_github, commit_message):
    """
    Uploads a file from Kaggle to GitHub, updating it if it already exists in the repository,
    or creating it if it does not.

    The target repository is the module-level ``repo`` object.

    Parameters:
    - filepath_kaggle: Path to the file in the Kaggle environment.
    - filepath_github: Target path in the GitHub repository where the file should be uploaded.
    - commit_message: Message for the commit on GitHub.

    Raises:
    - Any error from the existence check other than "not found" (HTTP status 404), and any
      error from reading the local file or from the update/create call. Such failures are
      propagated instead of being mistaken for "the file does not exist yet".
    """
    # Read the payload once; both the update and the create path send the same bytes.
    with open(filepath_kaggle, "rb") as file:
        payload = file.read()

    try:
        # Check if the file already exists in the GitHub repository
        contents = repo.get_contents(filepath_github)
    except Exception as error:
        # Only a 404 means "file missing". Authentication, rate-limit or network failures
        # must surface rather than trigger a blind create attempt.
        if getattr(error, "status", None) != 404:
            raise
        contents = None

    if contents is None:
        # The file does not exist yet: create it
        repo.create_file(filepath_github, commit_message, payload)
        print(f"File '{filepath_github}' created successfully.")
    else:
        # The file exists: update it, passing the current blob sha as the API requires
        repo.update_file(contents.path, commit_message, payload, contents.sha)
        print(f"File '{filepath_github}' updated successfully.")

In [None]:
# Run configuration read by the cells below.
config = dict(
    # Model tag used to build file names, repository paths and the Optuna study name
    model='SLIMElasticNet',
    # True: run new Optuna trials; False: later cells load the stored study instead
    tune_parameters=True,
    # Local SQLite file used as Optuna storage
    database_path='/kaggle/working/history_SLIMElasticNet.db',
    # When not tuning: copy the best-params JSON from the cloned repository
    copy_prev_best_params=False,
    # True: fit the recommender with the best parameters and build the submission
    tune_best_params=True,
    # True: upload results to the GitHub repository
    save_github=True,
)

Import the database where previous tuning trials have been saved.

In [9]:
# Location of the tuning-history database inside the cloned repository
history_db_in_repo = (
    f'/kaggle/working/RECsys_Challenge2024/TrainedModels/'
    f'{config["model"]}Recommender/history_{config["model"]}.db'
)

try:
    # Copy it to the path that Optuna uses as its storage
    shutil.copyfile(history_db_in_repo, config['database_path'])
except FileNotFoundError:
    # No stored history for this model: optuna will create the database itself
    pass

# Construction of URM and ICM matrices

In [10]:
URM_all_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_train.csv")

# The ids are used directly as row/column indices, so each dimension must be at least
# (largest id + 1). Counting unique ids gives the same number only when the ids are dense
# 0..n-1; with any gap the constructor would reject the out-of-range indices.
n_users = int(URM_all_dataframe["user_id"].max()) + 1
n_items = int(URM_all_dataframe["item_id"].max()) + 1

# User-by-item interaction matrix built from (value, (row, col)) triplets
URM_all = sps.csr_matrix((URM_all_dataframe["data"].values, 
                          (URM_all_dataframe["user_id"].values, URM_all_dataframe["item_id"].values)),
                        shape = (n_users, n_items))

URM_all

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1764607 stored elements and shape (35736, 38121)>

In [11]:
ICM_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_ICM_metadata.csv")

# The ids are used directly as row/column indices, so each dimension must be at least
# (largest id + 1). Counting unique ids gives the same number only when the ids are dense
# 0..n-1; with any gap the constructor would reject the out-of-range indices.
# Note: this reassigns n_items, replacing the value computed from the interaction data.
n_items = int(ICM_dataframe["item_id"].max()) + 1
n_features = int(ICM_dataframe["feature_id"].max()) + 1

# Item-by-feature matrix built from (value, (row, col)) triplets
ICM_all = sps.csr_matrix((ICM_dataframe["data"].values, 
                          (ICM_dataframe["item_id"].values, ICM_dataframe["feature_id"].values)),
                        shape = (n_items, n_features))

ICM_all

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2940040 stored elements and shape (38121, 94331)>

# Training

In [12]:
# Switch the working directory to the repository root; the following cells import
# packages (Evaluation, Data_manager, Recommenders) by their repository-relative names.
%cd /kaggle/working/RECsys_Challenge2024/

/kaggle/working/RECsys_Challenge2024


In [13]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Hold out part of the interactions for validation (train_percentage = 0.80)
holdout_split = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.80)
URM_train, URM_validation = holdout_split

# Evaluator over the validation split, reporting metrics at cutoff 10 only
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])

EvaluatorHoldout: Ignoring 141 ( 0.4%) Users that have less than 1 test interactions


In [None]:
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender

def objective_function_SLIMElasticNet(optuna_trial):
    """
    Optuna objective: fit a SLIMElasticNetRecommender on URM_train with the hyperparameters
    suggested by the trial and score it with evaluator_validation.

    Parameters:
    - optuna_trial: Trial object used to sample topK, l1_ratio, alpha and positive_only.

    Returns:
    - The "MAP" value of the evaluation result row for cutoff 10.
    """
    model = SLIMElasticNetRecommender(URM_train)

    # Sample the search space; l1_ratio and alpha are drawn on a log scale
    topK = optuna_trial.suggest_int("topK", 0, 1500)
    l1_ratio = optuna_trial.suggest_float("l1_ratio", 0.01, 1.0, log=True)
    alpha = optuna_trial.suggest_float("alpha", 1e-4, 1e-1, log=True)
    positive_only = optuna_trial.suggest_categorical("positive_only", [True, False])

    model.fit(topK=topK, l1_ratio=l1_ratio, alpha=alpha, positive_only=positive_only)

    # The evaluator returns a pair; only the metrics table (first element) is needed
    metrics_by_cutoff, _ = evaluator_validation.evaluateRecommender(model)

    return metrics_by_cutoff.loc[10]["MAP"]

In [15]:
if config['tune_parameters']:

    # Open the study in the SQLite storage, reusing it if it already exists,
    # so new trials are appended to the existing history.
    study_settings = dict(
        direction='maximize',
        study_name=f'hyperparameters_tuning_{config["model"]}',
        storage=f'sqlite:///{config["database_path"]}',
        load_if_exists=True,
    )
    optuna_study = optuna.create_study(**study_settings)

    # Run 6 additional trials of the objective defined above
    optuna_study.optimize(objective_function_SLIMElasticNet, n_trials=6)

[I 2024-11-18 02:18:33,864] Using an existing study with name 'hyperparameters_tuning_SLIMElasticNet' instead of creating a new one.


SLIMElasticNetRecommender: Processed 4032 (10.6%) in 5.00 min. Items per second: 13.44
SLIMElasticNetRecommender: Processed 8076 (21.2%) in 10.00 min. Items per second: 13.46
SLIMElasticNetRecommender: Processed 12217 (32.0%) in 15.00 min. Items per second: 13.57
SLIMElasticNetRecommender: Processed 16436 (43.1%) in 20.00 min. Items per second: 13.69
SLIMElasticNetRecommender: Processed 20541 (53.9%) in 25.00 min. Items per second: 13.69
SLIMElasticNetRecommender: Processed 24658 (64.7%) in 30.00 min. Items per second: 13.70
SLIMElasticNetRecommender: Processed 28763 (75.5%) in 35.00 min. Items per second: 13.69
SLIMElasticNetRecommender: Processed 33031 (86.6%) in 40.00 min. Items per second: 13.76
SLIMElasticNetRecommender: Processed 37326 (97.9%) in 45.01 min. Items per second: 13.82
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 45.98 min. Items per second: 13.82
EvaluatorHoldout: Processed 35595 (100.0%) in 42.16 sec. Users per second: 844


[I 2024-11-18 03:05:15,298] Trial 23 finished with value: 0.06005359233171937 and parameters: {'topK': 891, 'l1_ratio': 0.18035799034056169, 'alpha': 0.00022826442877737571, 'positive_only': False}. Best is trial 16 with value: 0.060174874079423415.


SLIMElasticNetRecommender: Processed 3949 (10.4%) in 5.00 min. Items per second: 13.16
SLIMElasticNetRecommender: Processed 7881 (20.7%) in 10.00 min. Items per second: 13.13
SLIMElasticNetRecommender: Processed 11831 (31.0%) in 15.00 min. Items per second: 13.14
SLIMElasticNetRecommender: Processed 15854 (41.6%) in 20.00 min. Items per second: 13.21
SLIMElasticNetRecommender: Processed 19804 (52.0%) in 25.00 min. Items per second: 13.20
SLIMElasticNetRecommender: Processed 23829 (62.5%) in 30.00 min. Items per second: 13.24
SLIMElasticNetRecommender: Processed 27923 (73.2%) in 35.00 min. Items per second: 13.29
SLIMElasticNetRecommender: Processed 31946 (83.8%) in 40.00 min. Items per second: 13.31
SLIMElasticNetRecommender: Processed 36118 (94.7%) in 45.00 min. Items per second: 13.38
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 47.53 min. Items per second: 13.37
EvaluatorHoldout: Processed 35595 (100.0%) in 40.22 sec. Users per second: 885


[I 2024-11-18 03:53:27,824] Trial 24 finished with value: 0.060060629168086424 and parameters: {'topK': 927, 'l1_ratio': 0.2510044196633321, 'alpha': 0.00010901446116481857, 'positive_only': True}. Best is trial 16 with value: 0.060174874079423415.


SLIMElasticNetRecommender: Processed 4018 (10.5%) in 5.00 min. Items per second: 13.39
SLIMElasticNetRecommender: Processed 7995 (21.0%) in 10.00 min. Items per second: 13.32
SLIMElasticNetRecommender: Processed 11978 (31.4%) in 15.00 min. Items per second: 13.31
SLIMElasticNetRecommender: Processed 16054 (42.1%) in 20.00 min. Items per second: 13.38
SLIMElasticNetRecommender: Processed 20091 (52.7%) in 25.00 min. Items per second: 13.39
SLIMElasticNetRecommender: Processed 24165 (63.4%) in 30.00 min. Items per second: 13.42
SLIMElasticNetRecommender: Processed 28273 (74.2%) in 35.00 min. Items per second: 13.46
SLIMElasticNetRecommender: Processed 32423 (85.1%) in 40.00 min. Items per second: 13.51
SLIMElasticNetRecommender: Processed 36584 (96.0%) in 45.01 min. Items per second: 13.55
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 46.96 min. Items per second: 13.53
EvaluatorHoldout: Processed 35595 (100.0%) in 39.62 sec. Users per second: 898


[I 2024-11-18 04:41:05,390] Trial 25 finished with value: 0.06012662849472609 and parameters: {'topK': 935, 'l1_ratio': 0.2810739052996396, 'alpha': 0.0001112660406807617, 'positive_only': True}. Best is trial 16 with value: 0.060174874079423415.


SLIMElasticNetRecommender: Processed 4138 (10.9%) in 5.00 min. Items per second: 13.79
SLIMElasticNetRecommender: Processed 8214 (21.5%) in 10.00 min. Items per second: 13.69
SLIMElasticNetRecommender: Processed 12312 (32.3%) in 15.00 min. Items per second: 13.68
SLIMElasticNetRecommender: Processed 16497 (43.3%) in 20.00 min. Items per second: 13.74
SLIMElasticNetRecommender: Processed 20690 (54.3%) in 25.00 min. Items per second: 13.79
SLIMElasticNetRecommender: Processed 24889 (65.3%) in 30.00 min. Items per second: 13.83
SLIMElasticNetRecommender: Processed 29131 (76.4%) in 35.00 min. Items per second: 13.87
SLIMElasticNetRecommender: Processed 33367 (87.5%) in 40.00 min. Items per second: 13.90
SLIMElasticNetRecommender: Processed 37637 (98.7%) in 45.00 min. Items per second: 13.94
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 45.61 min. Items per second: 13.93
EvaluatorHoldout: Processed 35595 (100.0%) in 39.02 sec. Users per second: 912


[I 2024-11-18 05:27:21,388] Trial 26 finished with value: 0.060000137124661186 and parameters: {'topK': 1086, 'l1_ratio': 0.3806135014421157, 'alpha': 0.00010329482086123877, 'positive_only': True}. Best is trial 16 with value: 0.060174874079423415.


SLIMElasticNetRecommender: Processed 4748 (12.5%) in 5.00 min. Items per second: 15.82
SLIMElasticNetRecommender: Processed 9481 (24.9%) in 10.00 min. Items per second: 15.80
SLIMElasticNetRecommender: Processed 14301 (37.5%) in 15.00 min. Items per second: 15.89
SLIMElasticNetRecommender: Processed 19158 (50.3%) in 20.00 min. Items per second: 15.96
SLIMElasticNetRecommender: Processed 24193 (63.5%) in 25.00 min. Items per second: 16.13
SLIMElasticNetRecommender: Processed 29221 (76.7%) in 30.00 min. Items per second: 16.23
SLIMElasticNetRecommender: Processed 34442 (90.3%) in 35.00 min. Items per second: 16.40
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 38.75 min. Items per second: 16.40
EvaluatorHoldout: Processed 35595 (100.0%) in 35.90 sec. Users per second: 991


[I 2024-11-18 06:06:42,524] Trial 27 finished with value: 0.05930898311471236 and parameters: {'topK': 1363, 'l1_ratio': 0.4120749146112008, 'alpha': 0.0001767040245037006, 'positive_only': True}. Best is trial 16 with value: 0.060174874079423415.


SLIMElasticNetRecommender: Processed 4038 (10.6%) in 5.00 min. Items per second: 13.45
SLIMElasticNetRecommender: Processed 8046 (21.1%) in 10.00 min. Items per second: 13.41
SLIMElasticNetRecommender: Processed 12105 (31.8%) in 15.00 min. Items per second: 13.45
SLIMElasticNetRecommender: Processed 16222 (42.6%) in 20.00 min. Items per second: 13.51
SLIMElasticNetRecommender: Processed 20264 (53.2%) in 25.00 min. Items per second: 13.51
SLIMElasticNetRecommender: Processed 24345 (63.9%) in 30.01 min. Items per second: 13.52
SLIMElasticNetRecommender: Processed 28509 (74.8%) in 35.01 min. Items per second: 13.57
SLIMElasticNetRecommender: Processed 32688 (85.7%) in 40.01 min. Items per second: 13.62
SLIMElasticNetRecommender: Processed 36891 (96.8%) in 45.01 min. Items per second: 13.66
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 46.53 min. Items per second: 13.65
EvaluatorHoldout: Processed 35595 (100.0%) in 40.53 sec. Users per second: 878


[I 2024-11-18 06:53:55,224] Trial 28 finished with value: 0.06010310771309341 and parameters: {'topK': 915, 'l1_ratio': 0.22058256754066696, 'alpha': 0.00012167373828956269, 'positive_only': True}. Best is trial 16 with value: 0.060174874079423415.


## Some optuna visualizations on recommender parameters

In [None]:
# When no tuning ran in this session, fetch the stored study so there is something to plot
if not config['tune_parameters']:
    stored_study = dict(
        study_name=f'hyperparameters_tuning_{config["model"]}',
        storage=f'sqlite:///{config["database_path"]}',
    )
    optuna_study = optuna.load_study(**stored_study)

# Slice plot of the study's trials
fig = optuna.visualization.plot_slice(optuna_study)
fig.show()

In [None]:
# When no tuning ran in this session, fetch the stored study so there is something to plot
if not config['tune_parameters']:
    stored_study = dict(
        study_name=f'hyperparameters_tuning_{config["model"]}',
        storage=f'sqlite:///{config["database_path"]}',
    )
    optuna_study = optuna.load_study(**stored_study)

# Hyperparameter importance plot for the study
fig = optuna.visualization.plot_param_importances(optuna_study)
fig.show()

## Let's train the recommender with the best parameter values

In [16]:
if config['tune_best_params']:

    if config['tune_parameters']:
        # Tuning ran in this session: take the parameters of the study's best trial
        best_params = optuna_study.best_trial.params
    else:
        # No tuning in this session: read the best parameters stored in the cloned repository.
        # The repository lives under /kaggle/working/RECsys_Challenge2024/ (the same root the
        # other cells use for TrainedModels/...); nothing creates /kaggle/working/TrainedModels.
        best_params_path = (
            f'/kaggle/working/RECsys_Challenge2024/TrainedModels/'
            f'{config["model"]}Recommender/best_params_{config["model"]}.json'
        )
        with open(best_params_path, 'r') as best_params_json:
            best_params = json.load(best_params_json)

    # Fit on train + validation interactions so the final model uses all available data
    recommender_instance = SLIMElasticNetRecommender(URM_train + URM_validation)
    recommender_instance.fit(**best_params)

SLIMElasticNetRecommender: Processed 3373 ( 8.8%) in 5.00 min. Items per second: 11.24
SLIMElasticNetRecommender: Processed 6727 (17.6%) in 10.00 min. Items per second: 11.21
SLIMElasticNetRecommender: Processed 10109 (26.5%) in 15.00 min. Items per second: 11.23
SLIMElasticNetRecommender: Processed 13493 (35.4%) in 20.00 min. Items per second: 11.24
SLIMElasticNetRecommender: Processed 16908 (44.4%) in 25.00 min. Items per second: 11.27
SLIMElasticNetRecommender: Processed 20322 (53.3%) in 30.01 min. Items per second: 11.29
SLIMElasticNetRecommender: Processed 23696 (62.2%) in 35.01 min. Items per second: 11.28
SLIMElasticNetRecommender: Processed 27110 (71.1%) in 40.01 min. Items per second: 11.29
SLIMElasticNetRecommender: Processed 30529 (80.1%) in 45.01 min. Items per second: 11.30
SLIMElasticNetRecommender: Processed 33974 (89.1%) in 50.01 min. Items per second: 11.32
SLIMElasticNetRecommender: Processed 37403 (98.1%) in 55.01 min. Items per second: 11.33
SLIMElasticNetRecommende

# Testing

Create the recommendations for the submission. 

In [17]:
def create_submission(data_target_users_test, recommender_instance, cutoff=10, output_file=f"/kaggle/working/submission_{config['model']}.csv"):
    """
    Write a submission CSV with one row per target user.

    Parameters:
    - data_target_users_test: Table with a "user_id" column listing the users to recommend for.
    - recommender_instance: Fitted recommender exposing recommend(user, cutoff, remove_seen_flag).
    - cutoff: Number of items requested per user.
    - output_file: Destination CSV path. The default is computed once, when the function
      is defined, from config['model'].

    The file has two columns: "user_id" and "item_list" (recommended items joined by spaces).
    """
    target_users = data_target_users_test["user_id"]

    # One recommendation list per target user, excluding items the user has already seen
    recommendations = [
        recommender_instance.recommend(user, cutoff=cutoff, remove_seen_flag=True)
        for user in target_users
    ]

    submission_df = pd.DataFrame({
        "user_id": target_users,
        "item_list": [" ".join(str(item) for item in items) for items in recommendations],
    })
    submission_df.to_csv(output_file, index=False, header=["user_id", "item_list"])

    print(f"Submission file saved as {output_file}")

In [18]:
if config['tune_best_params']:

    # Users for which the competition expects recommendations
    target_users_csv = '/kaggle/input/recommender-system-2024-challenge-polimi/data_target_users_test.csv'
    data_target_users_test = pd.read_csv(target_users_csv)

    # Writes the CSV to create_submission's default output path
    create_submission(data_target_users_test, recommender_instance)

Submission file saved as /kaggle/working/submission_SLIMElasticNet.csv


# Save Version on GitHub 

Write or import the JSON file where the best hyperparameters are saved.

In [19]:
# Local copy of the best-params JSON and its path relative to the repository root
local_best_params = f'/kaggle/working/best_params_{config["model"]}.json'
repo_best_params = f'TrainedModels/{config["model"]}Recommender/best_params_{config["model"]}.json'

if config['tune_parameters']:
    # Tuning ran: dump the study's best parameters to the local JSON file
    with open(local_best_params, 'w') as params_file:
        json.dump(optuna_study.best_params, params_file)

    if config['save_github']:
        upload_file(
            local_best_params,
            repo_best_params,
            f'{config["model"]} tuning results (from kaggle notebook)'
        )
elif config['copy_prev_best_params']:
    # No tuning: reuse the JSON already stored in the cloned repository
    shutil.copyfile(
        f'/kaggle/working/RECsys_Challenge2024/{repo_best_params}',
        local_best_params
    )

File 'TrainedModels/SLIMElasticNetRecommender/best_params_SLIMElasticNet.json' updated successfully.


Save the history of the tuned model.

In [None]:
# Push the Optuna SQLite history only when tuning ran and GitHub uploads are enabled
if config['tune_parameters'] and config['save_github']:
    history_db_remote = f'TrainedModels/{config["model"]}Recommender/history_{config["model"]}.db'
    upload_file(
        config['database_path'],
        history_db_remote,
        f'Tuning {config["model"]} db updated results (from kaggle notebook)'
    )

File 'TrainedModels/SLIMElasticNetRecommender/history_SLIMElasticNet.db' updated successfully.


Save the best trained model and its submission.

In [None]:
if config['save_github'] and config['tune_best_params']:

    # Push the submission CSV
    submission_local = f'/kaggle/working/submission_{config["model"]}.csv'
    submission_remote = f'TrainedModels/{config["model"]}Recommender/Submission/submission_{config["model"]}.csv'
    upload_file(
        submission_local,
        submission_remote,
        f'New {config["model"]} submission (from kaggle notebook)'
    )

    # Persist the fitted recommender; the code below expects the result at <file_name>.zip
    model_file_name = f'best_{config["model"]}_tuned'
    recommender_instance.save_model(folder_path='/kaggle/working/', file_name=model_file_name)

    zip_file_path = f'/kaggle/working/{model_file_name}.zip'
    zip_size_mb = os.path.getsize(zip_file_path) / (1024 * 1024)

    # 50MB limitation management for GitHub pushes: larger archives are not uploaded.
    if zip_size_mb >= 50:
        print(f"\nThe best recommender was not saved on GitHub because its size is more than 50 MB.")
    else:
        upload_file(
            zip_file_path,
            f'TrainedModels/{config["model"]}Recommender/best_{config["model"]}_tuned.zip',
            f'New {config["model"]} recommender tuned with best parameters (from kaggle notebook)'
        )

File 'TrainedModels/SLIMElasticNetRecommender/Submission/submission_SLIMElasticNet.csv' updated successfully. 
SLIMElasticNetRecommender: Saving model in file '/kaggle/working/best_SLIMElasticNet_tuned' 
SLIMElasticNetRecommender: Saving complete

The best recommender was not saved on GitHub because its size is more than 50 MB.
