In [7]:
import Utils.not_random
import Utils.submission as submission
from Data.RecSys2022 import RecSys2022, RecSys2022URMType
from Data_manager.split_functions.split_train_validation_random_holdout import \
    split_train_in_two_percentage_global_sample
from Evaluation.Evaluator import EvaluatorHoldout

Setting seed random library, os and numpy seed to 18


In [15]:
dataset = RecSys2022(feature_dummies=True)
dataset.build(type=RecSys2022URMType.ONE_INTERACTED)

urm = dataset.get_urm()
icm = dataset.get_icm()

Unzipping dataset...
Loading interactions...
Loading features...
Loading target ids...
Cleaning up...
Building URM and ICM with criteria ONE_INTERACTED...


In [16]:
urm_train, urm_test = split_train_in_two_percentage_global_sample(urm, train_percentage = 0.8)
#urm_train, urm_validation = split_train_in_two_percentage_global_sample(urm_train, train_percentage = 0.8)



In [17]:
from Evaluation.Evaluator import EvaluatorHoldout

cutoff_list=[10]

#evaluator_validation = EvaluatorHoldout(urm_validation, cutoff_list=cutoff_list)
evaluator_test = EvaluatorHoldout(urm_test, cutoff_list=cutoff_list)

EvaluatorHoldout: Ignoring 750 ( 1.8%) Users that have less than 1 test interactions


In [18]:
import scipy.sparse as sps
from Recommenders.SLIM.SLIMElasticNetRecommender import MultiThreadSLIM_SLIMElasticNetRecommender

candidate_generator_recommender = MultiThreadSLIM_SLIMElasticNetRecommender(urm_train)
candidate_generator_recommender.fit(topK=25000, l1_ratio=1.0, alpha=2e-4, workers=8)

100%|█████████▉| 19600/19630 [00:54<00:00, 722.61it/s]

In [19]:
result_df, _ = evaluator_test.evaluateRecommender(candidate_generator_recommender)
print(f'MAP@10: {result_df.loc[10]["MAP"]}')

100%|█████████▉| 19624/19630 [00:55<00:00, 355.08it/s]


EvaluatorHoldout: Processed 40879 (100.0%) in 15.20 sec. Users per second: 2690
MAP@10: 0.031017054008013717


In [20]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [21]:
import pandas as pd
from tqdm import tqdm
import scipy.sparse as sps
import numpy as np
from xgboost import XGBRanker

from Recommenders.SLIM.SLIMElasticNetRecommender import MultiThreadSLIM_SLIMElasticNetRecommender
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from Recommenders.KNN.ItemKNN_CFCBF_Hybrid_Recommender import ItemKNN_CFCBF_Hybrid_Recommender

In [25]:

from Recommenders.BaseRecommender import BaseRecommender

class XGBoostRecommender(BaseRecommender):
    
    def __init__(self, urm, urm_test, icm):
        super(XGBoostRecommender, self).__init__(urm, verbose = True)
        
        urm_train = self.URM_train
        
        candidate_generator_recommender = MultiThreadSLIM_SLIMElasticNetRecommender(urm_train)
        candidate_generator_recommender.fit(topK=25000, l1_ratio=1.0, alpha=2e-4, workers=8)
        
        n_users, n_items = urm_train.shape

        training_dataframe = pd.DataFrame(index=range(0,n_users), columns = ["ItemID"])
        training_dataframe.index.name='UserID'
        
        for user_id in tqdm(range(n_users)):    
            recommendations = candidate_generator_recommender.recommend(user_id, cutoff = 30)
            training_dataframe.loc[user_id, "ItemID"] = recommendations
        
        recommendations_dataframe = training_dataframe.copy()
        training_dataframe = training_dataframe.explode("ItemID")
        urm_validation_coo = sps.coo_matrix(urm_test)

        correct_recommendations = pd.DataFrame({"UserID": urm_validation_coo.row,
                                                "ItemID": urm_validation_coo.col})
        
        training_dataframe = pd.merge(training_dataframe, correct_recommendations, on=['UserID','ItemID'], how='left', indicator='Exist')
        
        training_dataframe["Label"] = training_dataframe["Exist"] == "both"
        training_dataframe.drop(columns = ['Exist'], inplace=True)
        
        rp3 = RP3betaRecommender(urm_train)
        rp3.fit(alpha=0.719514, beta=0.229898, min_rating=0, topK=80, implicit=True, normalize_similarity=True)

        knn = ItemKNN_CFCBF_Hybrid_Recommender(urm_train, icm)
        knn.fit(topK=100, shrink=75, normalize=True, feature_weighting='TF-IDF', ICM_bias=0.5)

        other_algorithms = {
            "rp3": rp3,
            "knn": knn,
        }
        
        training_dataframe = training_dataframe.set_index('UserID')

        for user_id in tqdm(range(n_users)):  
            for rec_label, rec_instance in other_algorithms.items():
                
                item_list = training_dataframe.loc[user_id, "ItemID"].values.tolist()
                
                all_item_scores = rec_instance._compute_item_score([user_id], items_to_compute = item_list)

                training_dataframe.loc[user_id, rec_label] = all_item_scores[0, item_list] 

        training_dataframe = training_dataframe.reset_index()
        training_dataframe = training_dataframe.rename(columns = {"index": "UserID"})
        
        item_popularity = np.ediff1d(sps.csc_matrix(urm_train).indptr)

        training_dataframe['item_popularity'] = item_popularity[training_dataframe["ItemID"].values.astype(int)]

        user_popularity = np.ediff1d(sps.csr_matrix(urm_train).indptr)

        training_dataframe['user_profile_len'] = user_popularity[training_dataframe["UserID"].values.astype(int)]

        #features_df = pd.DataFrame.sparse.from_spmatrix(icm)
        
        #training_dataframe = training_dataframe.set_index('ItemID').join(features_df, how='inner')
        #training_dataframe = training_dataframe.reset_index()
        #training_dataframe = training_dataframe.rename(columns = {"index": "ItemID"})
        training_dataframe = training_dataframe.sort_values("UserID").reset_index()
        training_dataframe.drop(columns = ['index'], inplace=True)
        
        groups = training_dataframe.groupby("UserID").size().values
        
        y_train = training_dataframe["Label"]
        X_train = training_dataframe.drop(columns=["Label"])
        
        X_train["UserID"] = X_train["UserID"].astype("category")
        X_train["ItemID"] = X_train["ItemID"].astype("category")
        
        self.groups = groups
        self.y_train = y_train
        self.X_train = X_train
        self.recommendations_dataframe = recommendations_dataframe
        
    def fit(self):
        n_estimators = 50
        learning_rate = 1e-1
        reg_alpha = 1e-1
        reg_lambda = 1e-1
        max_depth = 5
        max_leaves = 0
        grow_policy = "depthwise"
        objective = "map"
        eval_metric = "map"
        booster = "gbtree"
        use_user_profile = False
        random_seed = None

        XGB_model = XGBRanker(objective='rank:{}'.format(objective),
                              eval_metric='{}'.format(eval_metric),
                            n_estimators = int(n_estimators),
                            random_state = random_seed,
                            learning_rate = learning_rate,
                            reg_alpha = reg_alpha,
                            reg_lambda = reg_lambda,
                            max_depth = int(max_depth),
                            max_leaves = int(max_leaves),
                            grow_policy = grow_policy,
                            verbosity = 2, # 2 if self.verbose else 0,
                            booster = booster,
                            enable_categorical = True,
                            tree_method = "hist",  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
                            )

        XGB_model.fit(self.X_train,
                self.y_train,
                group=self.groups,
                verbose=True)
        
        self.model = XGB_model
    
    def _get_item_values(self, user_id, model, recommendations_dataframe, X_train):
        values = model.predict(X_train[X_train["UserID"] == user_id])
        
        items = []
        
        while len(items) < 10:
            index = np.argmax(values)
            item = recommendations_dataframe.loc[user_id, "ItemID"][index]
            items.append(item)
            values[index] = -np.inf

        return items
    
    def recommend(self, user_id_array, cutoff = None, remove_seen_flag=True, items_to_compute = None,
                  remove_top_pop_flag = False, remove_custom_items_flag = False, return_scores = False):
        recommendations = []
        
        for user_id in user_id_array:
            items = self._get_item_values(user_id, self.model, self.recommendations_dataframe, self.X_train)
            recommendations.append(items)
        
        return recommendations

In [26]:
xgmodel = XGBoostRecommender(urm_train, urm_test, icm)

100%|█████████▉| 19624/19630 [01:03<00:00, 310.09it/s]
100%|██████████| 41629/41629 [00:15<00:00, 2750.66it/s]


RP3betaRecommender: Similarity column 19630 (100.0%), 4716.45 column/sec. Elapsed time 4.16 sec
Similarity column 19630 (100.0%), 4258.57 column/sec. Elapsed time 4.61 sec


100%|██████████| 41629/41629 [03:31<00:00, 197.02it/s]


In [27]:
xgmodel.fit()

[14:51:59] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[14:51:59] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[14:51:59] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[14:51:59] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[14:51:59] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.ma

In [None]:
from Evaluator import Evaluator


In [17]:
evaluator_custom = Evaluator(urm_test)

evaluator_custom.calculate_map(xgmodel)

  0%|          | 26/40856 [00:00<02:40, 254.23it/s]

Ignoring 773 ( 1.9%) Users that have less than 1 test interactions
[14:10:27] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.


100%|██████████| 40856/40856 [02:32<00:00, 268.14it/s]


0.0

In [3]:
%pip install "setuptools<58.0.0"
%pip install ml_metrics

Collecting setuptools<58.0.0
  Downloading setuptools-57.5.0-py3-none-any.whl (819 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.3/819.3 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 65.7.0
    Uninstalling setuptools-65.7.0:
      Successfully uninstalled setuptools-65.7.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.3.3 requires pyqt5<5.16, which is not installed.
spyder 5.3.3 requires pyqtwebengine<5.16, which is not installed.
anaconda-client 1.11.0 requires setuptools>=58.0.4, but you have setuptools 57.5.0 which is incompatible.[0m[31m
[0mSuccessfully installed setuptools-57.5.0
Note: you may need to restart the kernel to use updated packages.
Collecti

In [4]:
from ml_metrics import mapk

In [54]:
preds = xgmodel.model.predict(xgmodel.X_train)

In [55]:
preds

array([ 0.14942063, -0.6721526 , -0.32680368, ..., -0.6749064 ,
       -0.6749064 , -0.6749064 ], dtype=float32)

In [56]:
# create a list of lists from preds each list with 30 elements

preds_list = []

for i in range(0, len(preds), 30):
    preds_list.append(preds[i:i+30])

preds_list

[array([ 0.14942063, -0.6721526 , -0.32680368, -0.7574864 , -0.21649988,
        -0.40446413, -0.18042652, -0.7074972 , -0.32680368, -0.39039558,
        -0.32680368, -0.18042652, -0.32622963, -0.04508866, -0.6790098 ,
        -0.32680368, -0.33788466, -0.50772005,  0.14942063,  0.26585487,
         0.19356501,  0.1386826 ,  0.24232852, -0.12499449,  0.1386826 ,
        -0.32622963, -0.12499449, -0.25104997, -0.32622963, -0.19312803],
       dtype=float32),
 array([-0.95273185, -0.95273185, -0.95273185, -0.95273185, -0.95273185,
        -0.95273185, -0.95273185, -0.95273185, -0.95273185, -0.95273185,
        -0.95273185, -0.95273185, -0.9083642 , -0.95273185, -0.95273185,
        -0.95273185, -0.95273185, -0.95273185,  0.00307419, -0.5879935 ,
         0.7064144 ,  0.68855166, -0.95273185,  0.48509836, -0.95273185,
        -0.95273185, -0.95273185, -0.95273185, -0.95273185, -0.95273185],
       dtype=float32),
 array([ 1.2901275 ,  0.264992  ,  1.4925232 ,  0.31153446,  0.31940898,
   

In [59]:
xgmodel.recommendations_dataframe

Unnamed: 0_level_0,ItemID
UserID,Unnamed: 1_level_1
0,"[497, 2002, 18, 317, 2008, 626, 101, 88, 11790..."
1,"[5535, 11206, 1858, 1358, 539, 13102, 13105, 1..."
2,"[7103, 497, 7152, 549, 5406, 546, 7863, 1323, ..."
3,"[18, 19, 20, 49, 424, 742, 1538, 284, 18203, 4..."
4,"[19, 101, 20, 439, 350, 24, 23, 44, 47, 451, 1..."
...,...
41624,"[17915, 497, 818, 13689, 12588, 8208, 7103, 15..."
41625,"[44, 47, 735, 318, 21, 400, 398, 321, 401, 161..."
41626,"[213, 9715, 19, 46, 44, 482, 321, 214, 1362, 7..."
41627,"[3707, 972, 1494, 4094, 4095, 1257, 15757, 125..."


In [60]:
preds_list[0]

array([ 0.14942063, -0.6721526 , -0.32680368, -0.7574864 , -0.21649988,
       -0.40446413, -0.18042652, -0.7074972 , -0.32680368, -0.39039558,
       -0.32680368, -0.18042652, -0.32622963, -0.04508866, -0.6790098 ,
       -0.32680368, -0.33788466, -0.50772005,  0.14942063,  0.26585487,
        0.19356501,  0.1386826 ,  0.24232852, -0.12499449,  0.1386826 ,
       -0.32622963, -0.12499449, -0.25104997, -0.32622963, -0.19312803],
      dtype=float32)

In [61]:
urm_test.indices[urm_test.indptr[0]:urm_test.indptr[1+1]]

array([   10,    37,   101,  3240,  3559,  6032,  7473,  9178, 17272,
       18532,   321,   942,  5335, 17106], dtype=int32)

In [79]:
model_recommendations = xgmodel.recommendations_dataframe.loc[3, "ItemID"]

for rec in model_recommendations:
    if rec in urm_test.indices[urm_test.indptr[3]:urm_test.indptr[3+1]]:
        print(rec)

20
424
51


In [72]:
def __get_item_values(user_id, recommendations_dataframe, values):  
    items = []
        
    while len(items) < 10:
        index = np.argmax(values)
        item = recommendations_dataframe.loc[user_id, "ItemID"][index]
        items.append(item)
        values[index] = -np.inf

    return items

In [80]:
__get_item_values(3, xgmodel.recommendations_dataframe, preds_list[3])

# 20
# 424 check
# 51 check

[46, 51, 318, 1404, 1999, 788, 424, 1109, 49, 1811]

In [81]:
model_recommendations[0:11]

[18, 19, 20, 49, 424, 742, 1538, 284, 18203, 436, 52]

In [None]:
# 20 check
# 424 check
# 51