In [22]:
import Utils.not_random
import Utils.submission as submission
from Data.RecSys2022 import RecSys2022, RecSys2022URMType
from Data_manager.split_functions.split_train_validation_random_holdout import \
    split_train_in_two_percentage_global_sample
from Evaluation.Evaluator import EvaluatorHoldout

In [23]:
dataset = RecSys2022(feature_dummies=True)
dataset.build(type=RecSys2022URMType.ONE_INTERACTED)

urm = dataset.get_urm()
icm = dataset.get_icm()

Unzipping dataset...
Loading interactions...
Loading features...
Loading target ids...
Cleaning up...
Building URM and ICM with criteria ONE_INTERACTED...


In [24]:
urm_train, urm_test = split_train_in_two_percentage_global_sample(urm, train_percentage = 0.8)
#urm_train, urm_validation = split_train_in_two_percentage_global_sample(urm_train, train_percentage = 0.8)



In [25]:
from Evaluation.Evaluator import EvaluatorHoldout

cutoff_list=[10]

#evaluator_validation = EvaluatorHoldout(urm_validation, cutoff_list=cutoff_list)
evaluator_test = EvaluatorHoldout(urm_test, cutoff_list=cutoff_list)

EvaluatorHoldout: Ignoring 733 ( 1.8%) Users that have less than 1 test interactions


In [26]:
import scipy.sparse as sps
from Recommenders.SLIM.SLIMElasticNetRecommender import MultiThreadSLIM_SLIMElasticNetRecommender

candidate_generator_recommender = MultiThreadSLIM_SLIMElasticNetRecommender(urm_train)
candidate_generator_recommender.fit(topK=25000, l1_ratio=1.0, alpha=2e-4, workers=8)

100%|█████████▉| 19536/19630 [00:55<00:00, 621.64it/s]

In [27]:
result_df, _ = evaluator_test.evaluateRecommender(candidate_generator_recommender)
print(f'MAP@10: {result_df.loc[10]["MAP"]}')

100%|█████████▉| 19624/19630 [00:59<00:00, 331.25it/s]


EvaluatorHoldout: Processed 40896 (100.0%) in 15.29 sec. Users per second: 2674
MAP@10: 0.03123412155028788


In [28]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [29]:
import pandas as pd
from tqdm import tqdm
import scipy.sparse as sps
import numpy as np
from xgboost import XGBRanker

from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from Recommenders.KNN.ItemKNN_CFCBF_Hybrid_Recommender import ItemKNN_CFCBF_Hybrid_Recommender

In [33]:

from Recommenders.BaseRecommender import BaseRecommender

class XGBoostRecommender(BaseRecommender):
    
    def __init__(self, urm, urm_test, icm):
        super(XGBoostRecommender, self).__init__(urm, verbose = True)
        
        urm_train = self.URM_train
        
        candidate_generator_recommender = MultiThreadSLIM_SLIMElasticNetRecommender(urm_train)
        candidate_generator_recommender.fit(topK=25000, l1_ratio=1.0, alpha=2e-4, workers=8)
        
        n_users, n_items = urm_train.shape

        training_dataframe = pd.DataFrame(index=range(0,n_users), columns = ["ItemID"])
        training_dataframe.index.name='UserID'
        
        for user_id in tqdm(range(n_users)):    
            recommendations = candidate_generator_recommender.recommend(user_id, cutoff = 20)
            training_dataframe.loc[user_id, "ItemID"] = recommendations
        
        recommendations_dataframe = training_dataframe.copy()
        training_dataframe = training_dataframe.explode("ItemID")
        urm_validation_coo = sps.coo_matrix(urm_test)

        correct_recommendations = pd.DataFrame({"UserID": urm_validation_coo.row,
                                                "ItemID": urm_validation_coo.col})
        
        training_dataframe = pd.merge(training_dataframe, correct_recommendations, on=['UserID','ItemID'], how='left', indicator='Exist')
        
        training_dataframe["Label"] = training_dataframe["Exist"] == "both"
        training_dataframe.drop(columns = ['Exist'], inplace=True)
        
        rp3 = RP3betaRecommender(urm_train)
        rp3.fit(alpha=0.719514, beta=0.229898, min_rating=0, topK=80, implicit=True, normalize_similarity=True)

        knn = ItemKNN_CFCBF_Hybrid_Recommender(urm_train, icm)
        knn.fit(topK=100, shrink=75, normalize=True, feature_weighting='TF-IDF', ICM_bias=0.5)

        other_algorithms = {
            "rp3": rp3,
            "knn": knn,
        }
        
        training_dataframe = training_dataframe.set_index('UserID')

        for user_id in tqdm(range(n_users)):  
            for rec_label, rec_instance in other_algorithms.items():
                
                item_list = training_dataframe.loc[user_id, "ItemID"].values.tolist()
                
                all_item_scores = rec_instance._compute_item_score([user_id], items_to_compute = item_list)

                training_dataframe.loc[user_id, rec_label] = all_item_scores[0, item_list] 

        training_dataframe = training_dataframe.reset_index()
        training_dataframe = training_dataframe.rename(columns = {"index": "UserID"})
        
        item_popularity = np.ediff1d(sps.csc_matrix(urm_train).indptr)

        training_dataframe['item_popularity'] = item_popularity[training_dataframe["ItemID"].values.astype(int)]

        user_popularity = np.ediff1d(sps.csr_matrix(urm_train).indptr)

        training_dataframe['user_profile_len'] = user_popularity[training_dataframe["UserID"].values.astype(int)]

        #features_df = pd.DataFrame.sparse.from_spmatrix(icm)
        
        #training_dataframe = training_dataframe.set_index('ItemID').join(features_df, how='inner')
        #training_dataframe = training_dataframe.reset_index()
        #training_dataframe = training_dataframe.rename(columns = {"index": "ItemID"})
        training_dataframe = training_dataframe.sort_values("UserID").reset_index()
        training_dataframe.drop(columns = ['index'], inplace=True)
        
        groups = training_dataframe.groupby("UserID").size().values
        
        y_train = training_dataframe["Label"]
        X_train = training_dataframe.drop(columns=["Label"])
        
        X_train["UserID"] = X_train["UserID"].astype("category")
        X_train["ItemID"] = X_train["ItemID"].astype("category")
        
        self.groups = groups
        self.y_train = y_train
        self.X_train = X_train
        self.recommendations_dataframe = recommendations_dataframe
        
    def fit(self):
        n_estimators = 50
        learning_rate = 1e-1
        reg_alpha = 1e-1
        reg_lambda = 1e-1
        max_depth = 5
        max_leaves = 0
        grow_policy = "depthwise"
        objective = "pairwise"
        booster = "gbtree"
        use_user_profile = False
        random_seed = None

        XGB_model = XGBRanker(objective='rank:{}'.format(objective),
                            n_estimators = int(n_estimators),
                            random_state = random_seed,
                            learning_rate = learning_rate,
                            reg_alpha = reg_alpha,
                            reg_lambda = reg_lambda,
                            max_depth = int(max_depth),
                            max_leaves = int(max_leaves),
                            grow_policy = grow_policy,
                            verbosity = 2, # 2 if self.verbose else 0,
                            booster = booster,
                            enable_categorical = True,
                            tree_method = "hist",  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
                            )

        XGB_model.fit(self.X_train,
                self.y_train,
                group=self.groups,
                verbose=True)
        
        self.model = XGB_model
    
    def _get_item_values(self, user_id, model, recommendations_dataframe, X_train):
        values = model.predict(X_train[X_train["UserID"] == user_id])
        
        items = []
        
        while len(items) < 10:
            index = np.argmax(values)
            item = recommendations_dataframe.loc[user_id, "ItemID"][index]
            items.append(item)
            values[index] = -np.inf

        return items
    
    def recommend(self, user_id_array, cutoff = None, remove_seen_flag=True, items_to_compute = None,
                  remove_top_pop_flag = False, remove_custom_items_flag = False, return_scores = False):
        recommendations = []
        
        for user_id in user_id_array:
            items = self._get_item_values(user_id, self.model, self.recommendations_dataframe, self.X_train)
            recommendations.append(items)
        
        return recommendations

In [34]:
xgmodel = XGBoostRecommender(urm_train, urm_test, icm)

100%|█████████▉| 19624/19630 [00:57<00:00, 340.94it/s]
100%|██████████| 41629/41629 [00:15<00:00, 2755.70it/s]


RP3betaRecommender: Similarity column 19630 (100.0%), 4655.47 column/sec. Elapsed time 4.22 sec
Similarity column 19630 (100.0%), 4255.95 column/sec. Elapsed time 4.61 sec


100%|██████████| 41629/41629 [02:31<00:00, 275.53it/s]


In [37]:
xgmodel.fit()

[01:09:25] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[01:09:25] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[01:09:25] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[01:09:25] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[01:09:25] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.ma

In [38]:
from Evaluator import Evaluator


In [39]:
evaluator_custom = Evaluator(urm_test)

evaluator_custom.calculate_map(xgmodel)

  0%|          | 26/40896 [00:00<02:37, 259.42it/s]

Ignoring 733 ( 1.8%) Users that have less than 1 test interactions
[01:09:39] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.


100%|██████████| 40896/40896 [02:49<00:00, 241.06it/s]


0.011171974943798658