# Pre-processing

In [3]:
import numpy as np
import pandas as pd
import scipy.sparse as sps
import time

## URM

In [6]:
URM_all_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_train.csv")
URM_all_dataframe.head(10)

Unnamed: 0,user_id,item_id,data
0,0,0,1.0
1,0,2,1.0
2,0,120,1.0
3,0,128,1.0
4,0,211,1.0
5,0,232,1.0
6,0,282,1.0
7,0,453,1.0
8,0,458,1.0
9,0,491,1.0


In [7]:
userID_unique = URM_all_dataframe["user_id"].unique()
itemID_unique = URM_all_dataframe["item_id"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("The number of interactions is {}".format(n_interactions))

Number of items	 38121, Number of users	 35736
Max ID items	 38120, Max Id users	 35735

The number of interactions is 1764607


## ICM

In [8]:
ICM_dataframe = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_ICM_metadata.csv")
ICM_dataframe.head(10)

Unnamed: 0,item_id,feature_id,data
0,2519,0,1.0
1,2642,0,1.0
2,3316,0,1.0
3,3400,0,1.0
4,3472,0,1.0
5,4361,0,1.0
6,4661,0,1.0
7,8697,0,1.0
8,9692,0,1.0
9,9693,0,1.0


In [9]:
featureID_unique = ICM_dataframe["feature_id"].unique()
itemID_unique = ICM_dataframe["item_id"].unique()

n_items = len(itemID_unique)
n_features = len(featureID_unique)
n_interactions = len(ICM_dataframe)

print ("Number of items\t {}, Number of features\t {}".format(n_items, n_features))
print ("Max ID items\t {}, Max Id features\t {}\n".format(max(itemID_unique), max(featureID_unique)))
print ("The number of combinations is {}".format(n_interactions))

Number of items	 38121, Number of features	 94331
Max ID items	 38120, Max Id features	 94330

The number of combinations is 2940040


## Construction of URM and ICM matrices

In [10]:
URM_all = sps.csr_matrix((URM_all_dataframe["data"].values, 
                          (URM_all_dataframe["user_id"].values, URM_all_dataframe["item_id"].values)),
                        shape = (n_users, n_items))

URM_all

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1764607 stored elements and shape (35736, 38121)>

In [11]:
ICM_all = sps.csr_matrix((ICM_dataframe["data"].values, 
                          (ICM_dataframe["item_id"].values, ICM_dataframe["feature_id"].values)),
                        shape = (n_items, n_features))

ICM_all

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2940040 stored elements and shape (38121, 94331)>

# Training

In [12]:
!git clone https://github.com/recsyspolimi/RecSys_Course_AT_PoliMi.git
%cd RecSys_Course_AT_PoliMi

Cloning into 'RecSys_Course_AT_PoliMi'...
remote: Enumerating objects: 1601, done.[K
remote: Counting objects: 100% (292/292), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 1601 (delta 218), reused 229 (delta 203), pack-reused 1309 (from 1)[K
Receiving objects: 100% (1601/1601), 51.44 MiB | 28.86 MiB/s, done.
Resolving deltas: 100% (945/945), done.
/kaggle/working/RecSys_Course_AT_PoliMi


In [13]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)
# URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train, train_percentage = 0.80)

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
# evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

EvaluatorHoldout: Ignoring 162 ( 0.5%) Users that have less than 1 test interactions


In [14]:
import optuna
import pandas as pd
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender

def objective_function_SLIMElasticNet(optuna_trial):
    
    recommender_instance = SLIMElasticNetRecommender(URM_train)
    
    full_hyperp = {
                   "topK": optuna_trial.suggest_int("topK", 0, 1500),
                   "l1_ratio": optuna_trial.suggest_float("l1_ratio", 0.01, 1.0, log=True),
                    "alpha": optuna_trial.suggest_float("alpha", 1e-4, 1e-1, log=True)
                  }        
    
    recommender_instance.fit(**full_hyperp)
    
    result_df, _ = evaluator_validation.evaluateRecommender(recommender_instance)
    
    return result_df.loc[10]["MAP"]

In [15]:
class SaveResults(object):
    
    def __init__(self):
        self.results_df = pd.DataFrame(columns=["result"])
    
    def __call__(self, optuna_study, optuna_trial):
        hyperparam_dict = optuna_trial.params.copy()
        hyperparam_dict["result"] = optuna_trial.values[0]
        
        new_row = pd.DataFrame([hyperparam_dict])
        
        if not new_row.empty:
            self.results_df = pd.concat([self.results_df, new_row], ignore_index=True)

In [20]:
# Start measuring time
start_time = time.time()

# Create study and optimize
optuna_study = optuna.create_study(direction="maximize")
save_results = SaveResults()
optuna_study.optimize(objective_function_SLIMElasticNet,
                      callbacks=[save_results],
                      n_trials=1)

# Calculate elapsed time
end_time = time.time()
elapsed_time = end_time - start_time

# Convert the time to minutes or hours
hours = elapsed_time // 3600
minutes = (elapsed_time % 3600) // 60
seconds = elapsed_time % 60

print('\n')
# Display the time in a more readable format
if hours > 0:
    print(f"Training loop of 50 trials took {int(hours)} hours, {int(minutes)} minutes, and {int(seconds)} seconds.")
elif minutes > 0:
    print(f"Training loop of 50 trials took {int(minutes)} minutes and {int(seconds)} seconds.")
else:
    print(f"Training loop of 50 trials took {int(seconds)} seconds.")

[I 2024-11-10 20:59:59,213] A new study created in memory with name: no-name-0e770fcd-5deb-420c-a797-df554c5f65fc


SLIMElasticNetRecommender: Processed 4555 (11.9%) in 5.00 min. Items per second: 15.18

SLIMElasticNetRecommender: Processed 9208 (24.2%) in 10.00 min. Items per second: 15.34

SLIMElasticNetRecommender: Processed 14549 (38.2%) in 15.00 min. Items per second: 16.16

SLIMElasticNetRecommender: Processed 20093 (52.7%) in 20.00 min. Items per second: 16.74

SLIMElasticNetRecommender: Processed 25758 (67.6%) in 25.00 min. Items per second: 17.17

SLIMElasticNetRecommender: Processed 31173 (81.8%) in 30.00 min. Items per second: 17.32

SLIMElasticNetRecommender: Processed 36961 (97.0%) in 35.00 min. Items per second: 17.60

SLIMElasticNetRecommender: Processed 38121 (100.0%) in 36.11 min. Items per second: 17.59

EvaluatorHoldout: Processed 35289 (100.0%) in 28.44 sec. Users per second: 1241


[I 2024-11-10 21:36:34,489] Trial 0 finished with value: 0.0293423295716494 and parameters: {'topK': 724, 'l1_ratio': 0.08036833072159669, 'alpha': 0.0012837195155802992}. Best is trial 0 with value: 0.0293423295716494.






Training loop of 50 trials took 36 minutes and 35 seconds.



  self.results_df = pd.concat([self.results_df, new_row], ignore_index=True)


In [21]:
pruned_trials = [t for t in optuna_study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in optuna_study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("Study statistics: ")
print("  Number of finished trials: ", len(optuna_study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
print("  Value Validation: ", optuna_study.best_trial.value)

Study statistics: 

  Number of finished trials:  1

  Number of pruned trials:  0

  Number of complete trials:  1

Best trial:

  Value Validation:  0.0293423295716494


In [22]:
optuna_study.best_trial

FrozenTrial(number=0, state=TrialState.COMPLETE, values=[0.0293423295716494], datetime_start=datetime.datetime(2024, 11, 10, 20, 59, 59, 217926), datetime_complete=datetime.datetime(2024, 11, 10, 21, 36, 34, 489198), params={'topK': 724, 'l1_ratio': 0.08036833072159669, 'alpha': 0.0012837195155802992}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'topK': IntDistribution(high=1500, log=False, low=0, step=1), 'l1_ratio': FloatDistribution(high=1.0, log=True, low=0.01, step=None), 'alpha': FloatDistribution(high=0.1, log=True, low=0.0001, step=None)}, trial_id=0, value=None)

In [23]:
optuna_study.best_trial.params

{'topK': 724, 'l1_ratio': 0.08036833072159669, 'alpha': 0.0012837195155802992}

In [24]:
save_results.results_df

Unnamed: 0,result,topK,l1_ratio,alpha
0,0.029342,724.0,0.080368,0.001284


In [26]:
recommender_instance = SLIMElasticNetRecommender(URM_train + URM_validation)
recommender_instance.fit(**optuna_study.best_trial.params)

result_df, _ = evaluator_test.evaluateRecommender(recommender_instance)

SLIMElasticNetRecommender: Processed 3365 ( 8.8%) in 5.00 min. Items per second: 11.21

SLIMElasticNetRecommender: Processed 6544 (17.2%) in 10.00 min. Items per second: 10.90

SLIMElasticNetRecommender: Processed 9548 (25.0%) in 15.00 min. Items per second: 10.61

SLIMElasticNetRecommender: Processed 12311 (32.3%) in 20.00 min. Items per second: 10.26

SLIMElasticNetRecommender: Processed 15237 (40.0%) in 25.00 min. Items per second: 10.16

SLIMElasticNetRecommender: Processed 18179 (47.7%) in 30.00 min. Items per second: 10.10

SLIMElasticNetRecommender: Processed 21689 (56.9%) in 35.00 min. Items per second: 10.33

SLIMElasticNetRecommender: Processed 24703 (64.8%) in 40.00 min. Items per second: 10.29

SLIMElasticNetRecommender: Processed 27573 (72.3%) in 45.01 min. Items per second: 10.21

SLIMElasticNetRecommender: Processed 30910 (81.1%) in 50.01 min. Items per second: 10.30

SLIMElasticNetRecommender: Processed 34870 (91.5%) in 55.01 min. Items per second: 10.57

SLIMElasticNet

In [34]:
result_df

Unnamed: 0_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
cutoff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.103773,0.155308,0.13191,0.055224,0.080721,0.290493,0.151447,0.116162,0.549827,0.393182,...,0.995998,0.547627,0.995998,0.135993,12.603534,0.999696,0.22358,0.853794,1.780028,0.236129


In [16]:
# Training in the full dataset with the best parameters
recommender_instance = SLIMElasticNetRecommender(URM_train + URM_validation)

recommender_instance.fit(
    topK=1012,
    l1_ratio=0.3646567004959359,
    alpha=0.0005365840686464791,
)

SLIMElasticNetRecommender: Processed 6359 (16.7%) in 5.00 min. Items per second: 21.19
SLIMElasticNetRecommender: Processed 12438 (32.6%) in 10.00 min. Items per second: 20.73
SLIMElasticNetRecommender: Processed 19317 (50.7%) in 15.00 min. Items per second: 21.46
SLIMElasticNetRecommender: Processed 26354 (69.1%) in 20.00 min. Items per second: 21.96
SLIMElasticNetRecommender: Processed 33423 (87.7%) in 25.00 min. Items per second: 22.28
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 28.56 min. Items per second: 22.24


# Testing

In [17]:
data_target_users_test = pd.read_csv("/kaggle/input/recommender-system-2024-challenge-polimi/data_target_users_test.csv")
data_target_users_test.head(10)

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [18]:
target_result = []
for target in data_target_users_test["user_id"]:
    target_result.append(recommender_instance.recommend(target, cutoff = 10, remove_seen_flag=True))

In [19]:
user_ids = data_target_users_test["user_id"]

# Convert `target_result` to the required format for each user
formatted_data = {
    "user_id": user_ids,
    "item_list": [" ".join(map(str, items)) for items in target_result]
}

# Create the DataFrame
submission_df = pd.DataFrame(formatted_data)

submission_df.to_csv("/kaggle/working/submission_optuna_SLIMElasticNetRecommender.csv", index=False, header=["user_id", "item_list"])