# Set connection with GitHub

In [None]:
# ! cd /kaggle/working && rm -rf RECsys_Challenge2024

In [None]:
# Read the GitHub access token saved as the Kaggle secret named "Token".
from kaggle_secrets import UserSecretsClient

token = UserSecretsClient().get_secret("Token")

# Clone the project repository, authenticating with the token embedded in the URL.
# NOTE(review): git keeps the clone URL (token included) in the clone's .git/config and
# may echo it in error output — consider a credential helper if this notebook is shared.
! git clone https://{token}@github.com/madratak/RECsys_Challenge2024.git

In [None]:
# PyGithub / requests: presumably required by the repository's GitHub helpers used below
# (get_repo_from_github, upload_file) — TODO confirm.
! pip install PyGithub requests

In [None]:
# "implicit" is not imported directly in this notebook; presumably a dependency of one of
# the repository's recommenders — TODO confirm.
! pip install implicit

In [None]:
# Provides the timeout_decorator module used to cap the duration of each tuning trial.
!pip install timeout-decorator

In [None]:
# Make the cloned repository the working directory so its packages (Utils, Recommenders, ...)
# can be imported, then run the repository's run_compile_all_cython.py script
# (presumably builds the Cython extensions of the recommenders — TODO confirm).
%cd /kaggle/working/RECsys_Challenge2024
! python run_compile_all_cython.py

In [None]:
from Utils.notebookFunctions import *
from Utils.xgboost_functions import *
import numpy as np
import pandas as pd
import scipy.sparse as sps
import time
import shutil
import optuna
import json
import os
import gc
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit
from tqdm import tqdm
from xgboost import XGBRanker

import warnings
import string
from scipy.stats import skew, kurtosis
from numpy import linalg as LA

import timeout_decorator

%matplotlib inline
from xgboost import plot_importance

# Root of the cloned repository on the Kaggle machine.
K_PATH = '/kaggle/working/RECsys_Challenge2024'
# Repository-relative folder holding this model's tuning artifacts
# (history database, best-params JSON, submission).
GH_PATH = 'TrainedModels/WithKFCV'
# Repository-relative folder passed to fit_recommenders for the base recommenders.
GH_PATH_1 = 'TrainedModels/WithoutKFCV'
# Sub-folder (under GH_PATH) of the XGBoost artifacts.
D_PATH = 'XGBoost'

# Seed NumPy's global random generator for reproducibility.
np.random.seed(42)

In [None]:
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from Recommenders.GraphBased.P3alphaRecommender import P3alphaRecommender
from Recommenders.Hybrid.LinearCombinationRecommender import LinearCombinationRecommender
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from Recommenders.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from Recommenders.KNN.UserKNNCFRecommender import UserKNNCFRecommender
from Recommenders.MatrixFactorization.FasterIALSRecommender import FasterIALSRecommender
from Recommenders.MatrixFactorization.NMFRecommender import NMFRecommender
from Recommenders.MatrixFactorization.PureSVDRecommender import PureSVDItemRecommender
from Recommenders.MatrixFactorization.PureSVDRecommender import ScaledPureSVDRecommender
from Recommenders.Neural.MultVAE_PyTorch_Recommender import MultVAERecommender_PyTorch_OptimizerMask 
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender
from Recommenders.SLIM.SLIM_BPR_Python import SLIM_BPR_Python

## Import the repository

In [None]:
# Repository handle obtained with the access token; it is passed to fit_recommenders
# and upload_file in the cells below.
repo = get_repo_from_github(token)

In [None]:
# Run configuration read by the cells below.
config = dict(
    # Model and metric names: used to build file paths and the Optuna study name.
    model='XGBoostNoCont',
    metric='MAP',
    # Forwarded to create_XGBoost_dataframe.
    categorical=True,
    contents=False,
    # Run the Optuna hyperparameter search.
    tune_parameters=True,
    # SQLite file used as Optuna storage.
    database_path='/kaggle/working/history_XGBoostNoCont_MAP.db',
    # When not tuning, copy the best-params JSON from the cloned repository.
    copy_prev_best_params=False,
    # Train the final model with the best parameters and build the submission.
    tune_best_params=True,
    # Upload the produced artifacts to GitHub.
    save_github=True,
)

Import the database where previous tuning trials have been saved.

In [None]:
# Seed the local Optuna storage with the history database found in the cloned repository.
history_db_name = f'history_{config["model"]}_{config["metric"]}.db'
history_db_source = (
    f'{K_PATH}/{GH_PATH}/{D_PATH}/{config["model"]}Recommender/'
    f'Optimizing{config["metric"]}/{history_db_name}'
)

try:
    shutil.copyfile(history_db_source, config['database_path'])
except FileNotFoundError:
    # No previous history available: Optuna will create the database itself.
    pass

Read the best trial value stored in the imported Optuna database, so that the new tuning run can be compared against it.

In [None]:
# Read the best objective value stored in the imported Optuna database so that the new
# tuning run can be compared against it. Falls back to 0 when no usable history exists.
try:
    optuna_study = optuna.load_study(
        study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}',
        storage=f'sqlite:///{config["database_path"]}',
    )

    # Fix: this used to read `study.best_trial` (an undefined name); the NameError was
    # swallowed by a bare `except`, so the saved best value was always reported as 0.
    best_saved_value = optuna_study.best_trial.value
except Exception as load_error:
    # Best effort: e.g. the study does not exist yet or has no completed trial.
    print(f"No previous best value available ({type(load_error).__name__}: {load_error}).")
    best_saved_value = 0

print("Best value:", best_saved_value)

# Construction of URM and ICM matrices

In [None]:
# Build the user-item interaction matrix (URM) from the interaction list.
URM_all_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_train.csv")

urm_user_ids = URM_all_dataframe["user_id"]
urm_item_ids = URM_all_dataframe["item_id"]

# Matrix dimensions are taken as the number of distinct user and item ids.
n_users = len(urm_user_ids.unique())
n_items = len(urm_item_ids.unique())

urm_coordinates = (urm_user_ids.values, urm_item_ids.values)
URM_all = sps.csr_matrix(
    (URM_all_dataframe["data"].values, urm_coordinates),
    shape=(n_users, n_items),
)

URM_all

In [None]:
# Build the item-feature matrix (ICM) from the item metadata list.
ICM_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_ICM_metadata.csv")

icm_item_ids = ICM_dataframe["item_id"]
icm_feature_ids = ICM_dataframe["feature_id"]

# Note: n_items is re-assigned here from the items that appear in the metadata file.
n_items = len(icm_item_ids.unique())
n_features = len(icm_feature_ids.unique())

icm_coordinates = (icm_item_ids.values, icm_feature_ids.values)
ICM_all = sps.csr_matrix(
    (ICM_dataframe["data"].values, icm_coordinates),
    shape=(n_items, n_features),
)

ICM_all

## Import dataframes

In [None]:
# Kaggle input directory that may contain previously built dataframes for this model.
dataframes_path = f'/kaggle/input/dataframes-{config["model"].lower()}-kfcv'

print(
    "Dataframe directory is ready to be used."
    if os.path.exists(dataframes_path)
    else "No dataframes directory is present."
)

# Training
Split the dataset in train, validation and test set.

In [None]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Two successive holdout splits with train_percentage=0.80: first into train/test, then the
# resulting train part is split again into train/validation.
# NOTE(review): presumably the split uses NumPy's global RNG seeded earlier — confirm, since
# the cached parquet dataframes only match these matrices if the split is reproducible.
URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train, train_percentage = 0.80)

# Evaluator on the test split, with metrics computed at cutoff 10.
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

Function to fit the hybrid recommender that is going to be used as candidate generator.

## Candidate Generators

Select the best previously trained recommenders to use inside the hybrid recommender (candidate generator).

In [None]:
# Recommender classes used as candidate generators, keyed by model name.
# The dict is passed to fit_recommenders together with the "cg" tag.
cg_recommenders = {
    "ItemKNNCF": ItemKNNCFRecommender,
    "SLIMElasticNet": SLIMElasticNetRecommender,
    "RP3beta": RP3betaRecommender,
}

## Features
Select the other previously trained recommenders to use them as features.

In [None]:
# Recommender classes whose outputs are used as features, keyed by model name.
# The dict is passed to fit_recommenders together with the "f" tag.
f_recommenders = {
    "RP3beta": RP3betaRecommender,
    "P3alpha": P3alphaRecommender,
    "ItemKNNCF": ItemKNNCFRecommender,
    "ItemKNNCBF": ItemKNNCBFRecommender,
    "UserKNNCF": UserKNNCFRecommender,
    "FasterIALS": FasterIALSRecommender,
    "NMF": NMFRecommender,
    "PureSVDItem": PureSVDItemRecommender,
    "ScaledPureSVD": ScaledPureSVDRecommender,
    "MultVAE": MultVAERecommender_PyTorch_OptimizerMask,
    "SLIMElasticNet": SLIMElasticNetRecommender,
    "SLIM_BPR": SLIM_BPR_Python,
}

## Training dataframe building
Build the dataframe with the predictions, the popularity of the item, the user activity, the profile length, item features and others.

Then, set features and target to use for the training.

In [None]:
if config["tune_parameters"]:

    # Location of a previously built training dataframe, if one was provided as Kaggle input.
    training_parquet = dataframes_path + f'/training_dataframe_{config["model"]}.parquet'

    if not os.path.exists(training_parquet):
        # Nothing cached: fit the base recommenders on the train split and build the dataframe,
        # labelling the candidates with the validation split.
        print("***TRAINING CONTENT GENERATION RECOMMENDERS***\n")
        candidate_generator_recommenders = fit_recommenders(
            "Recall", "Train", URM_train, ICM_all, cg_recommenders, GH_PATH_1, "cg", repo,
            take_kfcv_models=True,
        )

        print("***TRAINING FEATURE RECOMMENDERS***\n")
        features_recommenders = fit_recommenders(
            "MAP", "Train", URM_train, ICM_all, f_recommenders, GH_PATH_1, "f", repo,
            take_kfcv_models=True,
        )

        print()
        training_dataframe, groups_train = create_XGBoost_dataframe(
            URM_train, candidate_generator_recommenders, features_recommenders, ICM_all, URM_validation,
            cutoff=50, categorical=config["categorical"], contents=config["contents"],
        )
        training_dataframe.to_parquet(f'/kaggle/working/training_dataframe_{config["model"]}.parquet')

        # The fitted recommenders are no longer needed once the dataframe exists.
        del candidate_generator_recommenders, features_recommenders

    else:
        training_dataframe = pd.read_parquet(training_parquet)
        # Group sizes for the ranker: number of rows per user.
        groups_train = training_dataframe.groupby("UserID").size().values
        print("training_dataframe and groups_train loaded.")

    display(training_dataframe)

    # Separate features and target.
    X_train = training_dataframe.drop(columns=["Label"])
    y_train = training_dataframe["Label"]

    del training_dataframe

## Validation and Testing dataframe building

The first dataset excludes the label column since it is used to evaluate the performance of the model with the hyperparameters chosen by Optuna. In contrast, the second dataset includes the label column and is used to train the final model selected by Optuna.

In [None]:
if config["tune_parameters"] or config["tune_best_params"]:

    # Location of a previously built testing dataframe, if one was provided as Kaggle input.
    testing_parquet = dataframes_path + f'/testing_dataframe_{config["model"]}.parquet'

    if not os.path.exists(testing_parquet):
        # Nothing cached: fit the base recommenders on train+validation and build the
        # dataframe, labelling the candidates with the test split.
        URM_train_validation = URM_train + URM_validation

        print("***TRAINING CONTENT GENERATION RECOMMENDERS***\n")
        candidate_generator_recommenders = fit_recommenders(
            "Recall", "TrainVal", URM_train_validation, ICM_all, cg_recommenders, GH_PATH_1, "cg", repo,
            take_kfcv_models=True,
        )

        print("***TRAINING FEATURE RECOMMENDERS***\n")
        features_recommenders = fit_recommenders(
            "MAP", "TrainVal", URM_train_validation, ICM_all, f_recommenders, GH_PATH_1, "f", repo,
            take_kfcv_models=True,
        )

        print()
        testing_dataframe, groups_test = create_XGBoost_dataframe(
            URM_train_validation, candidate_generator_recommenders, features_recommenders, ICM_all, URM_test,
            cutoff=50, categorical=config["categorical"], contents=config["contents"],
        )
        testing_dataframe.to_parquet(f'/kaggle/working/testing_dataframe_{config["model"]}.parquet')

        del candidate_generator_recommenders, features_recommenders

    else:
        testing_dataframe = pd.read_parquet(testing_parquet)
        # Group sizes for the ranker: number of rows per user.
        groups_test = testing_dataframe.groupby("UserID").size().values
        print("testing_dataframe and groups_test loaded.")

    if config["tune_parameters"]:
        # Label-free copy: the candidates to be re-ranked when scoring each Optuna trial.
        validation_dataframe = testing_dataframe.drop(columns=["Label"])
        display(validation_dataframe)

## XGBoost training
Let's train XGBoost to rerank those predictions, using as label whether each item should be recommended or not.

In [None]:
from Recommenders.XGBoost.XGBoostRankerRecommender import XGBoostRankerRecommender
from timeout_decorator.timeout_decorator import TimeoutError

@timeout_decorator.timeout(1800, timeout_exception=optuna.TrialPruned, use_signals=False)
def objective_function_XGBoostRanker(optuna_trial):
    """Optuna objective: fit an XGBoost ranker with sampled hyperparameters and score it.

    The ranker is trained on the notebook-level X_train / y_train / groups_train and
    re-ranks the candidates in validation_dataframe; the score is measured with
    evaluator_test.

    Args:
        optuna_trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The MAP at cutoff 10 reported by evaluator_test.

    Raises:
        optuna.TrialPruned: raised by the timeout decorator when the trial exceeds
            1800 seconds.
    """
    ranker = XGBoostRankerRecommender(URM_train + URM_validation, X_train, y_train, validation_dataframe)

    # Sampled one by one, in a fixed order, so the search space reads top to bottom.
    search_space = {}
    search_space["n_estimators"] = optuna_trial.suggest_int("n_estimators", 50, 500)
    search_space["learning_rate"] = optuna_trial.suggest_float("learning_rate", 1e-4, 0.3, log=True)
    search_space["reg_alpha"] = optuna_trial.suggest_float("reg_alpha", 1e-4, 1.0, log=True)
    search_space["reg_lambda"] = optuna_trial.suggest_float("reg_lambda", 1e-4, 1.0, log=True)
    search_space["max_depth"] = optuna_trial.suggest_int("max_depth", 3, 7)
    search_space["max_leaves"] = optuna_trial.suggest_int("max_leaves", 2, 128)
    search_space["grow_policy"] = optuna_trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    search_space["booster"] = optuna_trial.suggest_categorical("booster", ["gbtree", "dart"])
    # The tree method is fixed rather than tuned (candidates once considered:
    # "hist", "gpu_hist", "approx").
    search_space["tree_method"] = "hist"
    search_space["objective"] = optuna_trial.suggest_categorical("objective", ["map", "pairwise", "ndcg"])

    ranker.fit(groups_train, **search_space)

    result_df, _ = evaluator_test.evaluateRecommender(ranker)

    return result_df.loc[10]["MAP"]

In [None]:
if config['tune_parameters']:

    tuning_study_name = f'hyperparameters_tuning_{config["model"]}_{config["metric"]}'
    tuning_storage_url = f'sqlite:///{config["database_path"]}'

    # Resume the study stored in the database when it already exists, otherwise create it.
    optuna_study = optuna.create_study(
        study_name=tuning_study_name,
        storage=tuning_storage_url,
        direction='maximize',
        load_if_exists=True,
    )

    # Overall search budget: 5 hours.
    optuna_study.optimize(objective_function_XGBoostRanker, timeout=3600 * 5)

Check whether the new tuning run found a better value than the one previously saved; if it did not, skip retraining the final model.

In [None]:
# Skip the final training when the new tuning run did not beat the previously saved best value.
if config["tune_parameters"]:
    if optuna_study.best_trial.value <= best_saved_value:
        config["tune_best_params"] = False
        print("Best trial value didn't improve during last new training.")

### Some optuna visualizations on recommender parameters

In [None]:
# When no tuning ran in this session, load the stored study so there is something to plot.
if not config['tune_parameters']:
    optuna_study = optuna.load_study(
        storage=f'sqlite:///{config["database_path"]}',
        study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}',
    )

# Slice plot of the study's trials.
fig = optuna.visualization.plot_slice(optuna_study)
fig.show()

In [None]:
# When no tuning ran in this session, load the stored study so there is something to plot.
if not config['tune_parameters']:
    optuna_study = optuna.load_study(
        storage=f'sqlite:///{config["database_path"]}',
        study_name=f'hyperparameters_tuning_{config["model"]}_{config["metric"]}',
    )

# Hyperparameter importance plot of the study.
fig = optuna.visualization.plot_param_importances(optuna_study)
fig.show()

# Testing
## Prediction dataframe building

In [None]:
# Show the testing dataframe; it is used below as the training data of the final model.
if config["tune_best_params"]:
    display(testing_dataframe)

In [None]:
if config["tune_best_params"]:

    # Location of a previously built prediction dataframe, if one was provided as Kaggle input.
    prediction_parquet = dataframes_path + f'/prediction_dataframe_{config["model"]}.parquet'

    if not os.path.exists(prediction_parquet):
        # Nothing cached: fit the base recommenders on the whole URM and build the dataframe.
        print("***TRAINING CONTENT GENERATION RECOMMENDERS***\n")
        candidate_generator_recommenders = fit_recommenders(
            "Recall", "TrainValTest", URM_all, ICM_all, cg_recommenders, GH_PATH_1, "cg", repo,
            take_kfcv_models=True,
        )

        print("***TRAINING FEATURE RECOMMENDERS***\n")
        features_recommenders = fit_recommenders(
            "MAP", "TrainValTest", URM_all, ICM_all, f_recommenders, GH_PATH_1, "f", repo,
            take_kfcv_models=True,
        )

        print()
        # No label matrix is passed here.
        # NOTE(review): unlike the labelled calls, the result is not unpacked into
        # (dataframe, groups) — confirm that create_XGBoost_dataframe returns a single
        # dataframe in this mode.
        prediction_dataframe = create_XGBoost_dataframe(
            URM_all, candidate_generator_recommenders, features_recommenders, ICM_all,
            cutoff=50, categorical=config["categorical"], contents=config["contents"],
        )
        prediction_dataframe.to_parquet(f'/kaggle/working/prediction_dataframe_{config["model"]}.parquet')

        del candidate_generator_recommenders, features_recommenders

    else:
        prediction_dataframe = pd.read_parquet(prediction_parquet)
        print("prediction_dataframe loaded.")

    display(prediction_dataframe)

    # The labelled testing dataframe becomes the training data of the final model.
    X_test = testing_dataframe.drop(columns=["Label"])
    y_test = testing_dataframe["Label"]

    del testing_dataframe

### Train best XGBoost model

In [None]:
if config['tune_best_params']:

    # Best hyperparameters: from the study tuned in this session, or from the JSON file
    # stored in the cloned repository when no tuning was run.
    if config['tune_parameters']:
        tuned_params = optuna_study.best_trial.params
    else:
        best_params_path = (
            f'{K_PATH}/{GH_PATH}/{D_PATH}/{config["model"]}Recommender/'
            f'Optimizing{config["metric"]}/best_params_{config["model"]}_{config["metric"]}.json'
        )
        with open(best_params_path, 'r') as best_params_json:
            tuned_params = json.load(best_params_json)

    # Fix: every tuning trial is fitted with tree_method="hist", but that value is fixed in
    # the objective rather than suggested, so it is absent from the trial params (and from
    # the saved JSON). Re-add it as a default so the final model is trained with the same
    # tree method that was tuned; an explicit value in tuned_params still wins.
    best_params = {"tree_method": "hist", **tuned_params}

    # Final model: trained on the labelled testing dataframe, it re-ranks the candidates
    # of the prediction dataframe.
    recommender_instance = XGBoostRankerRecommender(URM_all, X_test, y_test, prediction_dataframe)
    recommender_instance.fit(
        groups_test,
        **best_params
    )

## Feature importance

In [None]:
# Feature importance by 'gain' (in XGBoost: average gain of the splits that use the feature).
# recommender_instance.model is presumably the fitted XGBoost model — confirm in XGBoostRankerRecommender.
if config['tune_best_params']:
    plot_importance(recommender_instance.model, importance_type='gain', title='Gain')

In [None]:
# Feature importance by 'cover' (in XGBoost: average coverage of the splits that use the feature).
if config['tune_best_params']:
    plot_importance(recommender_instance.model, importance_type='cover', title='Cover')

In [None]:
# Feature importance by 'weight' (in XGBoost: number of times the feature is used to split).
if config['tune_best_params']:
    plot_importance(recommender_instance.model, importance_type='weight', title='Weight (Frequence)')

# Submission

Create the recommendations for the submission.

In [None]:
if config['tune_best_params']:

    # Output file of the submission, named after the model and the optimized metric.
    submission_path = f'/kaggle/working/submission_{config["model"]}_{config["metric"]}.csv'

    # Users for which recommendations must be produced.
    data_target_users_test = pd.read_csv(
        '/kaggle/input/recommender-system-2024-challenge-polimi/data_target_users_test.csv'
    )
    create_submission(data_target_users_test, recommender_instance, submission_path)

# Save Version on GitHub 

Write or import a json file where best hyperparameters are saved. 

In [None]:
# The best-params JSON has the same file name locally and inside the repository.
best_params_filename = f'best_params_{config["model"]}_{config["metric"]}.json'
best_params_local = f'/kaggle/working/{best_params_filename}'
best_params_in_repo = (
    f'{GH_PATH}/{D_PATH}/{config["model"]}Recommender/'
    f'Optimizing{config["metric"]}/{best_params_filename}'
)

if config['tune_parameters']:
    # Persist the best hyperparameters found by the study.
    with open(best_params_local, 'w') as params_file:
        json.dump(optuna_study.best_params, params_file)

    if config['save_github']:
        upload_file(
            best_params_local,
            best_params_in_repo,
            f'{config["model"]}_{config["metric"]} tuning results (from kaggle notebook)',
            repo
        )
elif config['copy_prev_best_params']:
    # No tuning in this run: reuse the file already stored in the cloned repository.
    shutil.copyfile(f'{K_PATH}/{best_params_in_repo}', best_params_local)

Save the history of the tuned model.

In [None]:
# Upload the updated Optuna history database, only when tuning ran and uploads are enabled.
if config['save_github']:
    if config['tune_parameters']:
        history_db_in_repo = (
            f'{GH_PATH}/{D_PATH}/{config["model"]}Recommender/'
            f'Optimizing{config["metric"]}/history_{config["model"]}_{config["metric"]}.db'
        )
        upload_file(
            config['database_path'],
            history_db_in_repo,
            f'Tuning {config["model"]}_{config["metric"]} db updated results (from kaggle notebook)',
            repo
        )

Save the best trained model and its submission.

In [None]:
# Upload the submission file, only when the final model was trained and uploads are enabled.
if config['save_github']:
    if config['tune_best_params']:
        submission_filename = f'submission_{config["model"]}_{config["metric"]}.csv'
        upload_file(
            f'/kaggle/working/{submission_filename}',
            f'{GH_PATH}/{D_PATH}/{config["model"]}Recommender/Optimizing{config["metric"]}/Submission/{submission_filename}',
            f'New {config["model"]}_{config["metric"]} submission (from kaggle notebook)',
            repo
        )