In [50]:
!pip install -qq catboost optuna implicit lightfm

In [51]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight
from implicit.nearest_neighbours import ItemItemRecommender

from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRanker, Pool, EShapCalcType, EFeaturesSelectionAlgorithm

import optuna
from optuna.samplers import TPESampler
from BorutaShap import BorutaShap

In [52]:
# from google.colab import drive
# drive.mount('/content/drive')
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c gb-recsys-project
!unzip gb-recsys-project.zip
!rm gb-recsys-project.zip

kaggle.json
Downloading gb-recsys-project.zip to /content
 59% 14.0M/23.9M [00:00<00:00, 147MB/s]
100% 23.9M/23.9M [00:00<00:00, 177MB/s]
Archive:  gb-recsys-project.zip
replace hh_demographic.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [53]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
    
    return precision

def ap_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(recommended_list, bought_list)
    
    if sum(flags) == 0:
        return 0
    
    sum_ = 0
    for i in range(k):
        
        if flags[i]:
            p_k = precision_at_k(recommended_list, bought_list, k=i+1)
            sum_ += p_k
            
    result = sum_ / sum(flags)
    
    return result

def map_k(recommend_list, bought_list, k=5):
    return np.mean([ap_k(rec, bt, k) for rec, bt in zip(recommend_list, bought_list)])

def recall_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)
    
    return recall

def reduce_memory(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and str(col_type)[:4] != 'uint' and str(col_type) != 'category':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif str(col_type)[:4] != 'uint':
            df[col] = df[col].astype('category')
    return df

def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [54]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')
test = pd.read_csv('test_users.csv')

N_preds = 30

In [55]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [56]:
item_features.head(2)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [57]:
user_features.head(2)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


##### Dealing with implicit data as users do not rate bought products at any way.

In [58]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [59]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1 = reduce_memory(data_train_lvl_1)
data_val_lvl_1 = reduce_memory(data_val_lvl_1)
data_train_lvl_2 = reduce_memory(data_train_lvl_2)
data_val_lvl_2 = reduce_memory(data_val_lvl_2)

print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')
print(f'Level 1 Valid - users: {data_val_lvl_1.user_id.nunique()}, items: {data_val_lvl_1.item_id.nunique()}')
print(f'Level 2 Train - users: {data_train_lvl_2.user_id.nunique()}, items: {data_train_lvl_2.item_id.nunique()}')
print(f'Level 2 Valid - users: {data_val_lvl_2.user_id.nunique()}, items: {data_val_lvl_2.item_id.nunique()}')

Level 1 Train - users: 2498, items: 83685
Level 1 Valid - users: 2154, items: 27649
Level 2 Train - users: 2154, items: 27649
Level 2 Valid - users: 2042, items: 24329


In [60]:
# ищем общих пользователей
# common_users = list(set(data_train_lvl_1.user_id.values)&(set(data_val_lvl_1.user_id.values))&set(data_val_lvl_2.user_id.values))

# оставляем общих пользователей
# data_train_lvl_1 = data_train_lvl_1[data_train_lvl_1.user_id.isin(common_users)]
# data_val_lvl_1 = data_val_lvl_1[data_val_lvl_1.user_id.isin(common_users)]
# data_train_lvl_2 = data_train_lvl_2[data_train_lvl_2.user_id.isin(common_users)]
# data_val_lvl_2 = data_val_lvl_2[data_val_lvl_2.user_id.isin(common_users)]

# print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')
# print(f'Level 1 Valid - users: {data_val_lvl_1.user_id.nunique()}, items: {data_val_lvl_1.item_id.nunique()}')
# print(f'Level 2 Train - users: {data_train_lvl_2.user_id.nunique()}, items: {data_train_lvl_2.item_id.nunique()}')
# print(f'Level 2 Valid - users: {data_val_lvl_2.user_id.nunique()}, items: {data_val_lvl_2.item_id.nunique()}')

In [61]:
result = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

In [62]:
# top 5 popular recommendations
popular_recs = popularity_recommendation(data_train_lvl_1, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.head(2)

Unnamed: 0,user_id,actual,popular_recommendation
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [63]:
def prefilter_items(data, take_n_popular=5000, item_features=None):
    # Уберем самые популярные товары (их и так купят)
    popularity = data.groupby('item_id')['user_id'].nunique().reset_index() / data['user_id'].nunique()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

    top_popular = popularity[popularity['share_unique_users'] > 0.2].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]

    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.02].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]

    # Уберем не интересные для рекоммендаций категории (department)
    if item_features is not None:
        department_size = pd.DataFrame(item_features.groupby('department')['item_id'].nunique().sort_values(ascending=False)).reset_index()
        department_size.columns = ['department', 'n_items']
        rare_departments = department_size[department_size['n_items'] < 150].department.tolist()
        items_in_rare_departments = item_features[item_features['department'].isin(rare_departments)].item_id.unique().tolist()

        data = data[~data['item_id'].isin(items_in_rare_departments)]

    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
    data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))
    data = data[data['price'] > 2]

    # Уберем слишком дорогие товарыs
    data = data[data['price'] < 50]

    # уберем товары, не продававшиеся более 12-18 месяцев
    data = data[data['week_no'] >= data['week_no'].max() - 52]

    # Возьмем топ по популярности
    popularity = data.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
    top = popularity.sort_values('n_sold', ascending=False).head(take_n_popular).item_id.tolist()
    
    # Заведем фиктивный item_id (если юзер покупал товары из топ-N, то он "купил" такой товар)
    data.loc[~data['item_id'].isin(top), 'item_id'] = 999999
    
    return data

def postfilter(recommendations, item_info, N=5):
    """Пост-фильтрация товаров
    
    Input
    -----
    recommendations: list
        Ранжированный список item_id для рекомендаций
    item_info: pd.DataFrame
        Датафрейм с информацией о товарах
    """
    
    # Уникальность
    unique_recommendations = []
    [unique_recommendations.append(item) for item in recommendations if item not in unique_recommendations]
    
    # Разные категории
    categories_used = []
    final_recommendations = []
    CATEGORY_NAME = 'sub_commodity_desc'
    for item in unique_recommendations:
        category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
        
        if category not in categories_used:
            final_recommendations.append(item)
            
        unique_recommendations.remove(item)
        categories_used.append(category)
    
    n_rec = len(final_recommendations)
    if n_rec < N:
        final_recommendations.extend(unique_recommendations[:N - n_rec])
    else:
        final_recommendations = final_recommendations[:N]
    
    assert len(final_recommendations) == N, 'Количество рекомендаций != {}'.format(N)
    return final_recommendations

In [64]:
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, data_type='quantity', weighting=True, normalize=False, alpha=1.):
        self.data_type = data_type
        self.normalize = normalize
        self.alpha = alpha

        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data, data_type=self.data_type, normalize=self.normalize)
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix, K1=150, B=0.8)  # default: 100, 0.8 ver. 0.5.2
            # self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix, alpha=self.alpha)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data, data_type, normalize=False):
        """Готовит user-item матрицу"""
        if data_type == 'quantity':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', columns='item_id',
                                              values='quantity',
                                              aggfunc='count',
                                              fill_value=0
                                              )
        elif data_type == 'sales':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', columns='item_id',
                                              values='sales_value',
                                              aggfunc='sum',
                                              fill_value=0
                                              )
            if normalize:
                user_item_matrix = user_item_matrix / user_item_matrix.max()  # normalize
        elif data_type == 'quantity_sum':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', columns='item_id',
                                              values='quantity',
                                              aggfunc='sum',
                                              fill_value=0
                                              )
            if normalize:
                user_item_matrix = user_item_matrix / user_item_matrix.max()
        else:
            raise ValueError(f'Agg data type must be "quantity", "sales" or "quantity_sum", given: {data_type}')

        user_item_matrix = user_item_matrix.astype(float)

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).tocsr())  # ver. 0.5.2
        # own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=50, regularization=0.001, iterations=15, num_threads=4, alpha=1.):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads,
                                        use_gpu=False,
                                        random_state=42)
        model.fit(csr_matrix(user_item_matrix).tocsr()*alpha)  # ver. 0.5.2
        # model.fit(csr_matrix(user_item_matrix).T.tocsr()*alpha)

        return model

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[0][1]  # ver. 0.5.2
        # top_rec = recs[1][0]
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        user_id = self.userid_to_id[user]
        
        """ ver. 0.5.2 """
        
        model_name = model.__class__.__name__
        
        if model_name == 'ItemItemRecommender':

            res = model.recommend(userid=user_id,
                                  user_items=csr_matrix(self.user_item_matrix).tocsr()[user_id, :],
                                  N=N-1,
                                  filter_already_liked_items=False,
                                  filter_items=[self.itemid_to_id[999999]],
                                  recalculate_user=True)[0].tolist()
            res = [self.id_to_itemid[rec] for rec in res]

        elif model_name == 'AlternatingLeastSquares':
            res = [self.id_to_itemid[rec] for rec in model.recommend(userid=user_id,
                                        user_items=csr_matrix(self.user_item_matrix).tocsr()[user_id,:],
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[999999]],
                                        recalculate_user=True)[0]]
        """
        res = [self.id_to_itemid[rec[0]] for rec in model.recommend(userid=user_id,
                                                                    user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                                                    N=N,
                                                                    filter_already_liked_items=False,
                                                                    filter_items=[self.itemid_to_id[999999]],
                                                                    recalculate_user=True)]
        """
        res = self._extend_with_top_popular(res, N=N)
        
        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model, N=N)

    def get_own_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""
        
        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        if 999999 in res: res.remove(999999)  # prev. ver.

        res = self._extend_with_top_popular(res, N=N) 

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами: берем N похожих пользователей и с помощью трюка рекомендуем юзеру их топ товары"""
        res = []

        # Находим топ-N похожих пользователей
        similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
        similar_users = [rec for rec in similar_users[0]]  # ver. 0.5.2
        # similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]   # удалим юзера из запроса

        for user in similar_users:
            user = self.id_to_userid[user]  ## нужно подать для предикта оригинальный идентификатор # ver. 0.4.8
            res.extend(self.get_own_recommendations(user, N=1))
        res = pd.Series(res).drop_duplicates().tolist()

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

#### First Model

In [65]:
data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

In [66]:
print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')

Level 1 Train - users: 2460, items: 5001


In [67]:
recommender = MainRecommender(data_train_lvl_1, data_type='quantity', normalize=False, weighting=True)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [68]:
list_out = result.loc[~result.user_id.isin(data_train_lvl_1.user_id), 'user_id'].tolist()
list_out

[296, 814, 1273, 1355, 1644, 1679, 1813, 1984, 2010, 2348]

In [69]:
def rule(x, y, model, N=5):
    if x in y:
        return recommender.overall_top_purchases[:N]
    if model == 'als':
        return recommender.get_als_recommendations(x, N=N)
    elif model == 'own':
        return recommender.get_own_recommendations(x, N=N)
    elif model == 'similar_items':
        return recommender.get_similar_items_recommendation(x, N=N)
    elif model == 'similar_users':
        return recommender.get_similar_users_recommendation(x, N=N)

In [70]:
result['als'] = result['user_id'].apply(lambda x: rule(x, list_out, model='als', N=5))
result['own'] = result['user_id'].apply(lambda x: rule(x, list_out, model='own', N=5))
result['similar_items'] = result['user_id'].apply(lambda x: rule(x, list_out, model='similar_items', N=5))
result['similar_users'] = result['user_id'].apply(lambda x: rule(x, list_out, model='similar_users', N=5))

In [71]:
def calc_precision_at_k(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean() * 100

def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean() * 100

def calc_map_at_k(df_data):
    for col_name in df_data.columns[2:]:
        yield col_name, map_k(df_data[col_name].values.tolist(), df_data['actual'].values.tolist())*100

In [72]:
for pred in calc_precision_at_k(result, 5):
    print(*pred, sep=': ')

popular_recommendation: 17.595171773444754
als: 20.10213556174559
own: 29.1736304549675
similar_items: 10.97493036211699
similar_users: 11.002785515320335


In [73]:
for pred in calc_map_at_k(result):
    print(*pred, sep=': ')

popular_recommendation: 29.00753120808831
als: 36.5010187764366
own: 36.484576498504076
similar_items: 18.140101103889407
similar_users: 23.007930981120396


In [74]:
for pred in calc_recall(result, 5):
    print(*pred, sep=': ')

popular_recommendation: 2.1638250971927806
als: 2.3812058913228915
own: 3.3062448031748675
similar_items: 1.186896403618512
similar_users: 1.5136714284703174


#### Second level model

In [206]:
users_lvl_1 = data_train_lvl_1.user_id.unique()
users_lvl_2 = data_val_lvl_1.user_id.unique()
users_lvl_3 = data_val_lvl_2.user_id.unique()

new_users_lvl_2 = list(set(users_lvl_2) - set(users_lvl_1))
new_users_lvl_3 = list(set(users_lvl_3) - (set(users_lvl_1) | set(users_lvl_2)))

add_to_lvl_2 = list(set(users_lvl_3) - (set(users_lvl_2)))

new_users_lvl_2, new_users_lvl_3, len(add_to_lvl_2)

([1984, 296, 1355, 1644, 2348, 814, 1679, 1813, 1273, 2010],
 [512, 2339, 1636, 206, 1105, 2259, 2039],
 126)

In [207]:
train_val_lvl1 = pd.concat([data_train_lvl_1, data_val_lvl_1])
train_val_lvl1 = train_val_lvl1.merge(item_features[['item_id', 'department']], on='item_id', how='inner')

In [208]:
condition = ((data_train_lvl_2.user_id.isin(data_train_lvl_1.user_id.tolist())) | (pd.Series(data=add_to_lvl_2).isin(data_train_lvl_1.user_id.tolist())))
val_list_out = data_train_lvl_2.loc[~condition, 'user_id'].unique().tolist()
val_list_out

[1813, 1273, 1355, 1679, 1984, 2010, 1644, 296, 814, 2348]

In [209]:
df_candidates = pd.DataFrame(data=data_train_lvl_2['user_id'].unique().tolist() + add_to_lvl_2)
df_candidates.columns = ['user_id']
df_candidates['candidates'] = df_candidates['user_id'].apply(lambda x: rule(x, new_users_lvl_2+new_users_lvl_3, model='own', N=N_preds))
df_candidates.head()

Unnamed: 0,user_id,candidates
0,2070,"[999270, 1067779, 879194, 1097398, 1055168, 83..."
1,2021,"[6424471, 1026984, 9835223, 1131438, 1104649, ..."
2,1753,"[996070, 1044078, 12648296, 8273833, 7441210, ..."
3,2120,"[1029743, 1029743, 1106523, 5569230, 916122, 8..."
4,1346,"[10356272, 1034886, 827919, 5584646, 1022527, ..."


In [210]:
df_candidates = pd.DataFrame({'user_id': df_candidates['user_id'].values.repeat(N_preds),
                              'item_id': np.concatenate(df_candidates['candidates'].values)})

df_train = data_train_lvl_2[['user_id', 'item_id']].copy()
df_train['target'] = 1

df_train = df_candidates.merge(df_train, on=['user_id', 'item_id'], how='left')
df_train.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)

df_train['target'].fillna(0, inplace=True)
df_train.head()

Unnamed: 0,user_id,item_id,target
0,2070,999270,0.0
1,2070,1067779,0.0
2,2070,879194,0.0
3,2070,1097398,0.0
4,2070,1055168,0.0


In [211]:
df_train['target'].value_counts()

0.0    58580
1.0     9083
Name: target, dtype: int64

In [212]:
df_train = df_train.merge(item_features, on='item_id', how='left')
df_train = df_train.merge(user_features, on='user_id', how='left')
df_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,999270,0.0,1425,MEAT-PCKGD,National,BACON,ECONOMY,1 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1067779,0.0,69,MEAT-PCKGD,Private,LUNCHMEAT,HAM,1 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,1097398,0.0,111,DRUG GM,National,CIGARETTES,CIGARETTES,CTN,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,1055168,0.0,69,GROCERY,Private,ICE CREAM/MILK/SHERBTS,TRADITIONAL,56 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [213]:
data_department = data_train_lvl_2.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0,GROCERY
1,2070,40630625006,594,1019940,1,1.0,311,-0.29,201,86,0.0,0.0,GROCERY


In [214]:
def create_multiple_features(df_train, train_val_lvl1=train_val_lvl1):
    df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('sales_value').sum().rename('total_item_sales_value'), how='left',on='item_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('quantity').sum().rename('total_quantity_value'), how='left',on='item_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('user_id').count().rename('item_freq'), how='left',on='item_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('user_id').count().rename('user_freq'), how='left',on='user_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('sales_value').sum().rename('total_user_sales_value'), how='left',on='user_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_week')/train_val_lvl1.week_no.nunique(), how='left',on='item_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_week')/train_val_lvl1.week_no.nunique(), how='left',on='user_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='item_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='user_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('user_id').count().rename('item_freq_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='item_id')
    df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('user_id').count().rename('user_freq_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='user_id')
    return df_train

df_train = create_multiple_features(df_train)

In [215]:
def create_user_features(df_train):
    # Средний чек
    users_sales = data_train_lvl_2.groupby('user_id')['sales_value'].mean().reset_index()
    users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
    df_train = df_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')

    # Количество уникальных категорий покупателя
    users_departments = data_department.groupby('user_id')['department'].nunique().reset_index()
    users_departments.rename(columns = {'department':'users_unique_departments'}, inplace=True)
    df_train = df_train.merge(users_departments, on='user_id', how='left')

    # Среднее время покупки
    bought_time = data_train_lvl_2.groupby('user_id')['trans_time'].mean().reset_index()
    bought_time.rename(columns = {'trans_time':'mean_trans_time_by_user'}, inplace=True)
    df_train = df_train.merge(bought_time, on='user_id', how='left')

    # Средний чек корзины 
    baskets_sales_value = data_train_lvl_2.groupby(['user_id','basket_id'])['sales_value'].mean().reset_index()
    mean_basket_sales_value = baskets_sales_value.groupby('user_id')['sales_value'].mean().reset_index()
    mean_basket_sales_value.rename(columns = {'sales_value':'mean_sales_value_per_basket'}, inplace=True)
    df_train = df_train.merge(mean_basket_sales_value, on='user_id', how='left')

    # Количество купленных уникальных товаров 
    unique_bought_items = data_train_lvl_2.groupby('user_id')['item_id'].nunique().reset_index()
    unique_bought_items.rename(columns = {'item_id':'unique_bought_items'}, inplace=True)
    df_train = df_train.merge(unique_bought_items, on='user_id', how='left')

    # Среднее количество уникальных категорий в корзине
    users_baskets = data_department.groupby(['user_id', 'basket_id'])['department'].nunique().reset_index()
    users_baskets = users_baskets.groupby('user_id')['department'].mean().reset_index()
    users_baskets.rename(columns={'department': 'avg_basket_department'}, inplace=True)
    df_train = df_train.merge(users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left')

    # Средняя сумма покупки в категории
    department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
    department_sales.rename(columns={'sales_value': 'mean_sales_value_category'}, inplace=True)
    df_train = df_train.merge(department_sales, on='department', how='left')

    # Средная цена купленных товаров пользователем
    users_sales = data_train_lvl_2.groupby('user_id')[['sales_value', 'quantity']].sum().reset_index()
    users_sales['avg_price'] = users_sales['sales_value'] / users_sales['quantity']
    df_train = df_train.merge(users_sales[['user_id', 'avg_price']], on='user_id', how='left')
    return df_train

df_train = create_user_features(df_train)

In [216]:
def create_item_features(df_train):    
    # Среднее количество покупок товара в неделю
    num_purchase_week = data_train_lvl_2.groupby('item_id').agg({'week_no': 'nunique', 'quantity': 'sum'}).reset_index()
    num_purchase_week['avg_num_purchases_week'] = num_purchase_week['quantity'] / num_purchase_week['week_no']
    df_train = df_train.merge(num_purchase_week[['item_id', 'avg_num_purchases_week']], on='item_id', how='left')
    df_train['avg_num_purchases_week'].fillna(0, inplace=True)

    # Цена товара
    items_sales = data_department.groupby('item_id')[['sales_value', 'quantity']].sum().reset_index()
    items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
    items_sales['price'].fillna(0, inplace=True)
    df_train = df_train.merge(items_sales[['item_id', 'price']], on='item_id', how='left')

    # Среднее время покупки товара
    bought_item_time = data_train_lvl_2.groupby('item_id')['trans_time'].mean().reset_index()   
    bought_item_time.rename(columns = {'trans_time':'mean_trans_time_by_item'}, inplace=True)
    df_train = df_train.merge(bought_item_time, on = 'item_id', how = 'left')

    # Количество магазинов, где есть товар
    items_stores = data_department.groupby('item_id')['store_id'].sum().reset_index()
    items_stores.rename(columns={'store_id': 'n_stores_with_item'}, inplace=True)
    items_stores['n_stores_with_item'].fillna(0, inplace = True)
    df_train = df_train.merge(items_stores, on='item_id', how='left')

    # Количество уникальных магазинов, где есть товар
    items_stores = data_department.groupby('item_id')['store_id'].nunique().reset_index()
    items_stores.rename(columns={'store_id': 'n_unique_stores_with_item'}, inplace=True)
    items_stores['n_unique_stores_with_item'].fillna(0, inplace = True)
    df_train = df_train.merge(items_stores, on='item_id', how='left')
    return df_train

df_train = create_item_features(df_train)

In [217]:
# Построим признак, отражающий средний интервал между покупками пользователя.
users_days = train_val_lvl1.groupby('user_id')['day'].unique().reset_index()
users_days['day'] = users_days['day'].apply(lambda x: sorted(x))
users_days.head()

Unnamed: 0,user_id,day
0,1,"[224, 235, 246, 263, 274, 276, 282, 291, 300, ..."
1,2,"[239, 249, 263, 291, 318, 335, 345, 415, 432, ..."
2,3,"[229, 242, 243, 248, 253, 260, 266, 291, 304, ..."
3,4,"[231, 244, 248, 251, 264, 288, 292, 302, 321, ..."
4,5,"[223, 258, 300, 320, 334, 418, 433, 449, 521, ..."


In [218]:
def avg_ndays(days):
    diff = 0
    if len(days) > 1:
        for i in range(len(days) - 1):
            diff += days[i+1] - days[i]
        return diff / (len(days) - 1)
    else:
        return 0
    
users_days['avg_interval'] = users_days['day'].apply(avg_ndays)

df_train = df_train.merge(users_days[['user_id', 'avg_interval']], on='user_id', how='left')
df_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,unique_bought_items,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,mean_trans_time_by_item,n_stores_with_item,n_unique_stores_with_item,avg_interval
0,2070,999270,0.0,1425,MEAT-PCKGD,National,BACON,ECONOMY,1 LB,45-54,...,156.0,1.790698,3.833539,0.035173,19.333333,2.589914,1569.337662,344916.0,49.0,4.351064
1,2070,1067779,0.0,69,MEAT-PCKGD,Private,LUNCHMEAT,HAM,1 LB,45-54,...,156.0,1.790698,3.833539,0.035173,4.6,3.06,1510.909091,8092.0,15.0,4.351064


In [219]:
users_items = pd.concat([data_train_lvl_2, data_val_lvl_2]).groupby('user_id')['item_id'].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])
users_items.head()

Unnamed: 0,user_id,item_id
0,1,"[9677939, 10356149, 13417048, 15741823, 15830875]"
1,2,"[7407562, 10149597, 13776981, 15572067, 17215077]"
2,3,"[7167218, 7167249, 9526563, 9526886, 13842214]"
3,4,"[963365, 1038692, 1133312, 5570830, 7431408]"
4,6,"[1020683, 1047193, 1100691, 1123407, 10311503]"


In [220]:
def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_train['Last5sales'] = df_train[['user_id', 'item_id']].apply(code_last_sales, axis=1)
df_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,mean_trans_time_by_item,n_stores_with_item,n_unique_stores_with_item,avg_interval,Last5sales
0,2070,999270,0.0,1425,MEAT-PCKGD,National,BACON,ECONOMY,1 LB,45-54,...,1.790698,3.833539,0.035173,19.333333,2.589914,1569.337662,344916.0,49.0,4.351064,0
1,2070,1067779,0.0,69,MEAT-PCKGD,Private,LUNCHMEAT,HAM,1 LB,45-54,...,1.790698,3.833539,0.035173,4.6,3.06,1510.909091,8092.0,15.0,4.351064,0


In [221]:
for column in df_train.columns:
    if df_train[column].dtype == 'object':
        df_train[column].fillna('unk', inplace=True)
    elif df_train[column].dtype in ['float64', 'float32']:  # drop float32 to reproduce 0.18035
        df_train[column].fillna(0.0, inplace=True)
print(f'Length of train data: {df_train.shape[0]}')
df_train.dropna(inplace=True)
df_train = reduce_memory(df_train)
df_train.info()

Length of train data: 67663
<class 'pandas.core.frame.DataFrame'>
Int64Index: 67663 entries, 0 to 67662
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   user_id                      67663 non-null  int16   
 1   item_id                      67663 non-null  int32   
 2   target                       67663 non-null  float32 
 3   manufacturer                 67663 non-null  int16   
 4   department                   67663 non-null  category
 5   brand                        67663 non-null  category
 6   commodity_desc               67663 non-null  category
 7   sub_commodity_desc           67663 non-null  category
 8   curr_size_of_product         67663 non-null  category
 9   age_desc                     67663 non-null  category
 10  marital_status_code          67663 non-null  category
 11  income_desc                  67663 non-null  category
 12  homeowner_desc               676

In [None]:
# df_train['total_quantity_value'] = df_train['total_quantity_value'].astype(np.float32)
# df_train['item_freq'] = df_train['item_freq'].astype(np.float32)

In [223]:
X, y = df_train.drop('target', axis=1), df_train['target']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# the idea to set user_id and item_id as category (catboost combines cat features and may find out user_id-item_id is a good feature)
# did not approved 
cat_cols = X.select_dtypes(include=['category']).columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

In [None]:
X.sample(5)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,mean_trans_time_by_item,n_stores_with_item,n_unique_stores_with_item,avg_interval,Last5sales
49477,2195,1080853,649,PRODUCE,National,ORGANICS FRUIT & VEGETABLES,ORGANIC SALAD MIX,5 OZ,unk,unk,...,1.727273,2.31392,2.585094,1.8,3.49,1703.555542,34988.0,7.0,11.755555,0
34199,1887,901062,2,PRODUCE,National,APPLES,APPLES GALA (BULK&BAG),,unk,unk,...,2.0,2.31392,1.27,28.0,2.472381,1494.490967,578548.0,67.0,15.435898,0
55811,1721,849697,1225,GROCERY,National,FLUID MILK PRODUCTS,REFRIGERATED COFFEE CREAMERS,32 OZ,unk,unk,...,4.0,2.531705,3.695714,2.0,2.738333,1794.199951,1670.0,5.0,21.0,0
29907,485,857215,3020,MEAT,National,PORK,LOIN - CHOPS BONELESS,,35-44,U,...,2.666667,6.485481,0.012061,0.0,0.0,0.0,0.0,0.0,6.070588,0
55269,2276,967144,320,GROCERY,National,FRZN MEAT/MEAT DINNERS,FRZN SS PREMIUM ENTREES/DNRS/T,15 OZ,unk,unk,...,2.0,2.531705,1.903585,2.5,2.4,1496.0,1661.0,5.0,26.35,0


In [92]:
from google.colab import output
output.enable_custom_widget_manager()

In [98]:
def select_features(algorithm: EFeaturesSelectionAlgorithm, steps: int = 1):
    print('Algorithm:', algorithm)
    model = CatBoostClassifier(iterations=400, auto_class_weights='SqrtBalanced', random_seed=42)
    summary = model.select_features(
        train_pool,
        eval_set=valid_pool,
        features_for_select=list(range(X.shape[1])),
        num_features_to_select=30,
        steps=steps,
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=True,
        logging_level='Silent',
        plot=True
    )
    print('Selected features:', summary['selected_features_names'])
    return summary

feature_set = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, steps=11)

Algorithm: EFeaturesSelectionAlgorithm.RecursiveByShapValues


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Selected features: ['department', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'total_item_sales_value', 'total_quantity_value', 'item_freq', 'user_freq', 'total_user_sales_value', 'user_quantity_per_week', 'item_quantity_per_basket', 'user_quantity_per_basket', 'item_freq_per_basket', 'user_freq_per_basket', 'avg_cheque', 'users_unique_departments', 'mean_trans_time_by_user', 'unique_bought_items', 'avg_basket_department', 'mean_sales_value_category', 'avg_price', 'avg_num_purchases_week', 'price', 'mean_trans_time_by_item', 'n_stores_with_item', 'n_unique_stores_with_item', 'Last5sales']


In [224]:
# ALS
# drop = ['department', 'brand', 'mean_sales_value_per_basket', 'hh_comp_desc', 'homeowner_desc', 'curr_size_of_product']
# X_train.drop(drop, axis=1, inplace=True)
# X_valid.drop(drop, axis=1, inplace=True)

# when user_id, item_id are set to categories
#drop = ['mean_sales_value_per_basket', 'homeowner_desc', 'avg_basket_department', 'brand', 'user_quantity_per_basket']
#X_train.drop(drop, axis=1, inplace=True)
#X_valid.drop(drop, axis=1, inplace=True)

""" own """
drop = ['item_id', 'avg_interval', 'marital_status_code', 'user_id', 'age_desc', 'kid_category_desc', 'manufacturer',
        'household_size_desc', 'mean_sales_value_per_basket', 'item_quantity_per_week', 'brand']

X_train.drop(drop, axis=1, inplace=True)
X_valid.drop(drop, axis=1, inplace=True)

cat_cols = X_train.select_dtypes(include=['category']).columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

In [None]:
params_model = {
                'eval_metric': 'Logloss',
                'iterations': 2000,
                'silent': True,
                'one_hot_max_size': 15,
                'use_best_model': True,
                'early_stopping_rounds': 50,
                'allow_writing_files': False,
                'random_seed': 42,
                }

In [278]:
def objective(trial):
    param_trials = {
                    'depth': trial.suggest_int('depth', 3, 15),
                    'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.5),
                    'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1., 6.),
                    'bagging_temperature': trial.suggest_float('bagging_temperature', 1., 5.),
                    'subsample': trial.suggest_float('subsample', 0.3, 1.),
                    'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.),
                    'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
                    'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
                    'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Lossguide']),
                    'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 255),
                    'max_leaves': trial.suggest_int('max_leaves', 5, 64),
                    }
    param_trials.update(params_model)
    if param_trials['grow_policy'] == 'SymmetricTree':
        param_trials['min_data_in_leaf'] = None
        param_trials['max_leaves'] = None
    if param_trials['grow_policy'] != 'SymmetricTree':
        param_trials['boosting_type'] = 'Plain'
    
    opt_model = CatBoostClassifier(**param_trials)
    opt_model.fit(train_pool, eval_set=valid_pool)
    
    y_pred = opt_model.predict_proba(X_valid)[:,1]
    score = log_loss(y_valid, y_pred)

    # y_pred = opt_model.predict_proba(X_valid)[:,1]
    # score = roc_auc_score(y_valid, y_pred)
    
    return score

In [None]:
# optuna.logging.set_verbosity(optuna.logging.INFO)
# study = optuna.create_study(sampler=TPESampler(seed=42), direction="minimize")  # minimize for logloss, maximize for roc_auc
# study.optimize(objective, n_trials=50, timeout=6000)

# print(f'Number of completed trials: {len(study.trials)}')
# print('Best trial')
# trial = study.best_trial
# print(f'Best score: {trial.value}')
# print('Best params')
# for key, value in trial.params.items():
#     print(f'{key}: {value}')

In [176]:
params_model = {
                'eval_metric': 'Logloss', 
                'iterations': 2000,
                'silent': True,
                'one_hot_max_size': 15,
                'use_best_model': True,
                'early_stopping_rounds': 50,
                'allow_writing_files': False,
                'random_seed': 42,
                'depth': 15, 
                'learning_rate': 0.05036379129844091, 
                'l2_leaf_reg': 2.6760037989729177, 
                'bagging_temperature': 3.885879965461792, 
                'subsample': 0.8002252887959993, 
                'colsample_bylevel': 0.5385267306992663, 
                'boosting_type': 'Plain', 
                'auto_class_weights': 'SqrtBalanced', 
                'grow_policy': 'Lossguide', 
                'min_data_in_leaf': 185, 
                'max_leaves': 56
}

params_model = {
                'eval_metric': 'Logloss', 
                'iterations': 2000,
                'silent': True,
                'one_hot_max_size': 15,
                'use_best_model': True,
                'early_stopping_rounds': 50,
                'allow_writing_files': False,
                'random_seed': 42,
                'depth': 13,
                'learning_rate': 0.074520547965526,
                'l2_leaf_reg': 2.311240422752083,
                'bagging_temperature': 3.182687625787709,
                'subsample': 0.9985249510045562,
                'colsample_bylevel': 0.34481632847272464,
                'boosting_type': 'Plain',
                'auto_class_weights': 'SqrtBalanced',
                'grow_policy': 'Depthwise',
                'min_data_in_leaf': 6,
}

In [225]:
X_ = X.drop(drop, axis=1)
train_pool = Pool(X_, y, cat_features=cat_cols)

In [227]:
# from imblearn.under_sampling import RandomUnderSampler

# sampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42) ###
# X_, y = sampler.fit_resample(X_, y)

# train_pool = Pool(X_, y, cat_features=cat_cols) ###

In [230]:
cat = CatBoostClassifier(**params_model).fit(train_pool, eval_set=train_pool)  # eval_set=valid_pool/train_pool

In [231]:
print(f'ROC_AUC: ', f'{roc_auc_score(y, cat.predict_proba(X_)[:,1]):.4f}')
imp = cat.feature_importances_[:10]
for name, value in zip(X_.columns, imp):
    print(name, ' - ', value)

ROC_AUC:  1.0000
department  -  1.0876896688392186
commodity_desc  -  2.9745320750642765
sub_commodity_desc  -  2.8171601532700787
curr_size_of_product  -  2.6762089642940374
income_desc  -  1.357530973123536
homeowner_desc  -  1.0242249406728292
hh_comp_desc  -  1.2760298850666563
total_item_sales_value  -  2.5768846278974737
total_quantity_value  -  2.244656569491351
item_freq  -  2.3240777753168467


In [232]:
rank_results = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
rank_results.columns = ['user_id', 'actual']
cold_users = data_val_lvl_2.loc[~data_val_lvl_2.user_id.isin(data_train_lvl_1.user_id.tolist()), 'user_id'].unique().tolist()

rank_results['own'] = rank_results['user_id'].apply(lambda x: rule(x, cold_users, model='own', N=5))

In [235]:
df_predict = df_train.copy()
df_predict['proba_item_purchase'] = cat.predict_proba(X.drop(drop, axis=1))[:, 1]

In [236]:
def rerank(user_id, N, out=cold_users):
    if user_id in df_predict.user_id:
        return df_predict[df_predict['user_id']==user_id].sort_values('proba_item_purchase', ascending=False).head(N).item_id.tolist()
    else:
        return rule(user_id, cold_users, model='own', N=5)

In [237]:
rank_results['reranked_own'] = rank_results['user_id'].apply(lambda user_id: rerank(user_id, N=5))

In [238]:
rank_results['own_postfiltered'] = rank_results.user_id.apply(lambda x: postfilter(rule(x, cold_users, model='own', N=20), 
                                                                                   item_features, N=5)) 

In [239]:
rank_results.loc[rank_results['reranked_own'].str.len() == 0, 'reranked_own'] = rank_results.\
loc[rank_results['reranked_own'].str.len() == 0, 'user_id'].apply(lambda x: rule(x, cold_users, model='own', N=5))

In [240]:
def rerank_post(user_id, N_rank=20, N_post=5):  
    try:
        out = rerank(user_id, N=N_rank)
        out = postfilter(out, item_features, N=N_post)
    except AssertionError:
        out = rule(user_id, cold_users, model='own', N=N_post)
    return out

rank_results['postfiltered_reranked_own'] = rank_results['user_id'].apply(lambda user_id: rerank_post(user_id))

In [None]:
rank_results.head()

Unnamed: 0,user_id,actual,own,reranked_own,own_postfiltered,postfiltered_reranked_own
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1005274, 898121, 9392953, 1062002, 8090521, 1...","[9297615, 8293439, 9527558, 865456, 940947]","[1013167, 1088462, 986912, 7441419, 9297615]","[9297615, 9527558, 940947, 1082212, 1029743]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[827656, 9677270, 904777, 913210, 1008714, 126...","[8090521, 983584, 1092937, 1106523, 998206]","[946839, 1136486, 870547, 1019387, 5569230]","[8090521, 983584, 1092937, 1106523, 998206]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1097398, 6548453, 878715, 896613, 930118, 556...","[1098844, 1024306, 6548453, 1029743, 1126899]","[996070, 1084036, 5569845, 1108094, 1058404]","[1098844, 6548453, 1126899, 951412, 13003092]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1021715, 1032703, 909268, 5568378, 1062002, 9...","[1106523, 1126899, 9338009, 993638, 1122358]","[993638, 928932, 914577, 12731436, 939681]","[1106523, 9338009, 1122358, 893018, 5587656]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1004906, 886103, 865456, 1044500, 1040807, 90...","[872137, 12302069, 1029743, 1106523, 1070820]","[999198, 930385, 845307, 12808385, 981086]","[872137, 1029743, 893018, 899624, 1044500]"


In [241]:
(rank_results.reranked_own.str.len() == 0).sum(), len(rank_results)

(0, 2042)

In [242]:
print('-'*15, 'mean precision@5', '-'*15)

for pred in calc_precision_at_k(rank_results, 5):
    print(*pred, sep=': ')
    
print('-'*15, 'map@5', '-'*15)
for pred in calc_map_at_k(rank_results):
    print(*pred, sep=': ')

--------------- mean precision@5 ---------------
own: 24.916748285994128
reranked_own: 27.835455435847212
own_postfiltered: 9.578844270323215
postfiltered_reranked_own: 22.96767874632713
--------------- map@5 ---------------
own: 32.385936989879205
reranked_own: 49.639990749809556
own_postfiltered: 15.674107628686471
postfiltered_reranked_own: 48.098949831320056


In [116]:
def transform_data_for_eval(dataset, rec_col, user_col='user_id'):
    '''
    Func for transforming recommendations into kaggle evaluation format

    Parameters:
    dataset (pd.DataFrame): Dataset with 2 required columns:
        rec_col - column with recommendations should be iterable
        user_col - columns with user id

    rec_col (str): name of column in dataset with recommendations

    user_col (str): name of column in dataset with user id

    Returns:
    pd.DataFrame: DataFrame in suitable format

   '''
    eval_dataset = dataset[[user_col, rec_col]].copy()
    eval_dataset[rec_col] = eval_dataset[rec_col].apply(lambda x: ' '.join([str(i) for i in x]))
    eval_dataset.rename(columns={
        user_col: 'UserId',
        rec_col: 'Predicted'
    }, inplace=True)
    return eval_dataset

In [243]:
new = test[~test.user_id.isin(data_train_lvl_1.user_id.tolist())].user_id.tolist()
print(len(new))

13


In [244]:
test['preds'] = test.user_id.apply(lambda x: rerank(x, 5))

In [245]:
(test.preds.str.len() == 0).sum(), len(test)

(34, 1708)

In [246]:
test.loc[test.preds.str.len() == 0, 'preds'] = test.loc[test.preds.str.len() == 0, 'user_id'].\
                                                    apply(lambda x: rule(x, new, model='own', N=5))

In [247]:
# reranked als
submission = transform_data_for_eval(test[['user_id', 'preds']], rec_col='preds', user_col='user_id')
submission.to_csv('submission.csv', index=False)

In [248]:
!kaggle competitions submit -c gb-recsys-project -f submission.csv -m 'test1'  # private: 0.18230

100% 71.6k/71.6k [00:03<00:00, 22.1kB/s]
Successfully submitted to GB RecSys Project

#### CatBoostRanker

In [124]:
import catboost

In [195]:
advanced_parameters = {
    'iterations': 2000,
    'verbose': False,
    'random_seed': 42,
    'one_hot_max_size': 15, # 1 for YetiRank, 15 for RMSE
    'early_stopping_rounds': 50,
    'allow_writing_files': False,
    'grow_policy': 'SymmetricTree',
    'boosting_type': 'Plain',
    'use_best_model': True,
    'depth': 8,
    'learning_rate': 0.18905265981777902, 
    'l2_leaf_reg': 4.927368361978105, 
    'subsample': 0.9, 
    'colsample_bylevel': 0.8, 
}

In [126]:
default_parameters = {
                      'iterations': 2000,
                      'verbose': False,
                      'random_seed': 0,
                      'allow_writing_files': False,
}

In [196]:
train = Pool(
    data=X_,
    label=y,
    cat_features=cat_cols,
    group_id=X['user_id'].values
)

In [155]:
from copy import deepcopy

def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=train):
    parameters = deepcopy(advanced_parameters)
    parameters['loss_function'] = loss_function
    
    if additional_params is not None:
        parameters.update(additional_params)
        
    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=False)
    
    return model

In [197]:
model = fit_model('QueryRMSE', {'custom_metric': ['PrecisionAt:top=5', 'RecallAt:top=5', 'MAP:top=5']}) # QueryRMSE
# model = fit_model('YetiRankPairwise', {'custom_metric': ['NDCG', 'PFound', 'AverageGain:top=10']})  # very slow, task_type=GPU?

In [198]:
def rerank_cat(df, user_id, N=5):
    return df[df['user_id'] == user_id].sort_values('probabilities', ascending=False).head(N).item_id.tolist()

In [199]:
X_predict = X.copy()
X_predict['probabilities'] = catboost.CatBoost.predict(model, X_, prediction_type='Probability')[:, 1]

In [200]:
rank_results['cat_ranker'] = rank_results['user_id'].apply(lambda x: rerank_cat(X_predict, x))

In [201]:
rank_results.loc[rank_results.cat_ranker.isna(), 'cat_ranker'] = rank_results.loc[rank_results.cat_ranker.isna(), 'user_id'].\
apply(lambda x: rule(x, cold_users, model='own', N=5))

In [202]:
print('-'*15, 'mean precision@5', '-'*15)

for pred in calc_precision_at_k(rank_results, 5):
    print(*pred, sep=': ')
    
print('-'*15, 'map@5', '-'*15)
for pred in calc_map_at_k(rank_results):
    print(*pred, sep=': ')

--------------- mean precision@5 ---------------
own: 24.916748285994128
reranked_own: 27.610186092066602
own_postfiltered: 9.578844270323215
postfiltered_reranked_own: 22.458374142997062
cat_ranker: 26.797257590597457
--------------- map@5 ---------------
own: 32.385936989879205
reranked_own: 48.032089998911744
own_postfiltered: 15.674107628686471
postfiltered_reranked_own: 46.660613233213624
cat_ranker: 48.1883910109914


In [203]:
test['cat_ranker'] = test['user_id'].apply(lambda x: rerank_cat(X_predict, x))
test.loc[test.cat_ranker.isna(), 'cat_ranker'] = test.loc[test.cat_ranker.isna(), 'user_id'].\
apply(lambda x: rule(x, new, model='own', N=5))

In [204]:
submission = transform_data_for_eval(test[['user_id', 'cat_ranker']], rec_col='cat_ranker', user_col='user_id')
submission.to_csv('submission.csv', index=False)
!kaggle competitions submit -c gb-recsys-project -f submission.csv -m 'test1'  # private: 0.17454

100% 70.2k/70.2k [00:04<00:00, 17.3kB/s]
Successfully submitted to GB RecSys Project

#### 2-model approach with different data split

In [381]:
test_size_weeks = 3
N_preds = 20

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

print(f'Unique users in train set: {data_train.user_id.nunique()}, items: {data_train.item_id.nunique()}')
print(f'Unique users in valid set: {data_test.user_id.nunique()}, items: {data_test.item_id.nunique()}')
print(f'New users in test: {len(data_test[~data_test.user_id.isin(data_train.user_id)])}')

Unique users in train set: 2499, items: 86865
Unique users in valid set: 2042, items: 24329
New users in test: 0


In [382]:
data_train = prefilter_items(data_train, item_features=item_features, take_n_popular=5000)
print(f'Level 1 Train - users: {data_train.user_id.nunique()}, items: {data_train.item_id.nunique()}')

Level 1 Train - users: 2461, items: 5001


In [383]:
recommender = MainRecommender(data_train, data_type='quantity', normalize=False, weighting=True)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [384]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

In [385]:
list_out = result.loc[~result.user_id.isin(data_train.user_id), 'user_id'].tolist()
list_out

[60, 206, 512, 1105, 1636, 1984, 2039, 2259, 2339]

In [386]:
result['own'] = result['user_id'].apply(lambda x: rule(x, list_out, model='own', N=5))

In [387]:
for pred in calc_precision_at_k(result, 5):
    print(*pred, sep=': ')

own: 26.15083251714006


In [388]:
for pred in calc_recall(result, 5):
    print(*pred, sep=': ')

own: 3.5104714529187344


In [389]:
for pred in calc_map_at_k(result):
    print(*pred, sep=': ')

own: 33.09935792795734


In [390]:
df_candidates = pd.DataFrame(data=data_test['user_id'].unique().tolist())
df_candidates.columns = ['user_id']
df_candidates['candidates'] = df_candidates['user_id'].apply(lambda x: rule(x, list_out, model='own', N=N_preds))
df_candidates.head()

Unnamed: 0,user_id,candidates
0,338,"[823031, 1079528, 1135258, 13003092, 1009187, ..."
1,2120,"[1029743, 1029743, 1106523, 5569230, 916122, 1..."
2,2324,"[1112405, 948381, 1006342, 1010578, 907099, 55..."
3,514,"[1034956, 832007, 901062, 1068292, 832768, 102..."
4,1762,"[1032512, 913210, 7441419, 1105488, 854405, 11..."


In [391]:
df_candidates = pd.DataFrame({'user_id': df_candidates['user_id'].values.repeat(N_preds),
                              'item_id': np.concatenate(df_candidates['candidates'].values)})

df_train = data_test[['user_id', 'item_id']].copy()
df_train['target'] = 1

df_train = df_candidates.merge(df_train, on=['user_id', 'item_id'], how='left')
df_train.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)

df_train['target'].fillna(0, inplace=True)
df_train.head()

Unnamed: 0,user_id,item_id,target
0,338,823031,0.0
1,338,1079528,0.0
2,338,1135258,0.0
3,338,13003092,0.0
4,338,1009187,0.0


In [392]:
df_train['target'].value_counts()

0.0    34504
1.0     6072
Name: target, dtype: int64

In [393]:
df_train = df_train.merge(item_features, on='item_id', how='left')
df_train = df_train.merge(user_features, on='user_id', how='left')

In [394]:
data_department = data_test.merge(item_features[['item_id', 'department']], on='item_id', how='inner')

In [395]:
df_train = create_multiple_features(df_train, train_val_lvl1=data_train)

In [396]:
df_train = create_user_features(df_train)
df_train = create_item_features(df_train)

In [397]:
users_days = data_train.groupby('user_id')['day'].unique().reset_index()
users_days['day'] = users_days['day'].apply(lambda x: sorted(x))

In [398]:
users_days['avg_interval'] = users_days['day'].apply(avg_ndays)
df_train = df_train.merge(users_days[['user_id', 'avg_interval']], on='user_id', how='left')

In [399]:
users_items = data_test.groupby('user_id')['item_id'].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])

In [400]:
df_train['Last5sales'] = df_train[['user_id', 'item_id']].apply(code_last_sales, axis=1)

In [401]:
for column in df_train.columns:
    if df_train[column].dtype == 'object':
        df_train[column].fillna('unk', inplace=True)
    elif df_train[column].dtype in ['float64', 'float32']:
        df_train[column].fillna(0.0, inplace=True)
print(f'Length of train data: {df_train.shape[0]}')
df_train.dropna(inplace=True)
df_train = reduce_memory(df_train)
# df_train.info()

Length of train data: 40576


In [402]:
X, y = df_train.drop('target', axis=1), df_train['target']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

cat_cols = X.select_dtypes(include=['category']).columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

In [None]:
feature_set = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, steps=11)

In [403]:
drop = ['mean_trans_time_by_item', 'item_id', 'income_desc', 'department', 'avg_num_purchases_week', 'item_freq', 'avg_cheque',
        'mean_sales_value_per_basket', 'user_freq', 'age_desc', 'household_size_desc']

X_train.drop(drop, axis=1, inplace=True)
X_valid.drop(drop, axis=1, inplace=True)

cat_cols = X_train.select_dtypes(include=['category']).columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

In [404]:
params_model = {
                'eval_metric': 'Logloss',
                'iterations': 1000,
                'silent': True,
                'one_hot_max_size': 15,
                'use_best_model': True,
                'early_stopping_rounds': 50,
                'allow_writing_files': False,
                'random_seed': 42,
                'auto_class_weights': 'SqrtBalanced',
                'bagging_temperature': 2.6004725626586676,
                'boosting_type': 'Plain',
                'colsample_bylevel': 0.9132686318360939,
                'depth': 12,
                'grow_policy': 'SymmetricTree',
                'l2_leaf_reg': 4.948252791068095,
                'learning_rate': 0.1206774907395364,
                'min_data_in_leaf': 240,
                'subsample': 0.8218973936992428
                }

In [405]:
X_ = X.drop(drop, axis=1)
train_pool = Pool(X_, y, cat_features=cat_cols)
cat = CatBoostClassifier(**params_model).fit(train_pool, eval_set=train_pool)  # overfit to perfectly predict only X_
print(f'ROC_AUC: ', f'{roc_auc_score(y, cat.predict_proba(X_)[:,1]):.4f}')

ROC_AUC:  0.9984


In [406]:
df_predict = df_train.copy()
df_predict['proba_item_purchase'] = cat.predict_proba(X.drop(drop, axis=1))[:, 1]

In [407]:
cold_users = test.loc[~test.user_id.isin(data_train.user_id), 'user_id'].unique()

In [408]:
def rerank(user_id, N, out=cold_users):
    if user_id in df_predict.user_id:
        return df_predict[df_predict['user_id']==user_id].sort_values('proba_item_purchase', ascending=False).head(N).item_id.tolist()
    else:
        return rule(user_id, cold_users, model='own', N=5)

In [409]:
test['pred'] = test['user_id'].apply(lambda x: rerank(x, 5))

In [410]:
submission = transform_data_for_eval(test, rec_col='pred', user_col='user_id')
submission.to_csv('submission.csv', index=False)
!kaggle competitions submit -c gb-recsys-project -f submission.csv -m 'test1'  # private: 0.19194

100% 67.1k/67.1k [00:02<00:00, 30.8kB/s]
Successfully submitted to GB RecSys Project

#### LGBMRanker

In [None]:
from lightgbm import LGBMRanker

In [None]:
X_r = X_.copy()

In [None]:
X_r['group'] = np.arange(len(X_))
group = X_r[['user_id', 'group']].groupby('user_id', sort=False).max()['group'].values
grs = np.concatenate((np.array([group[0]+1]), group[1:] - group[:-1]))

In [None]:
start_params = {
                'objective':'lambdarank',
                'boosting_type': 'gbdt',
                'n_estimators': 2000,
                'categorical_column': cat_cols,
                'random_state': 42,
                'is_unbalance': True,
                'n_jobs': -1,
}

In [None]:
def objective(trial):
    param_trials = {
                    'max_depth': trial.suggest_int('max_depth', 5, 10),
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                    'num_leaves': trial.suggest_int('num_leaves', 20, 265),
                    'subsample': trial.suggest_float('subsample', 0.3, 1.),
                    'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.),
                    'max_bin': trial.suggest_int('max_bin', 60, 260),
                    'min_child_samples': trial.suggest_int('min_child_samples', 20, 260),
                    }
    param_trials.update(start_params)
    lgb_rn = LGBMRanker(**param_trials, silent=False)

    eval_h = lgb_rn.fit(X_r, y, group=grs,
                        eval_set=[(X_r, y)], eval_group=[grs], 
                        eval_metric=['ndcg'],
                        eval_at=[5, 10 ], 
                        early_stopping_rounds=50, 
                        verbose=0)
    
    rank_preds = lgb_rn.predict(X_r)
    ranker_prediction = X_[['user_id', 'item_id']]
    ranker_prediction["pred"] = rank_preds
    
    ranker_prediction.drop_duplicates(inplace=True)
    ranker_prediction.sort_values(by="pred", inplace=True, ascending=False)
    
    ranked_res = ranker_prediction.groupby('user_id')['item_id'].unique().reset_index() 
    ranked_res = ranked_res.rename(columns={'item_id': 'lgbm_ranker'})  

    result = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
    result.columns = ['user_id', 'actual']

    result = result.merge(ranked_res, on='user_id', how='left')
    result.loc[result.lgbm_ranker.isna(), 'lgbm_ranker'] = result.loc[result.lgbm_ranker.isna(), 'user_id'].\
        apply(lambda x: rule(x, cold_users, model='own', N=5))
    
    score = result.apply(lambda row: precision_at_k(row['lgbm_ranker'], row['actual'], k=5), axis=1).mean()
    
    return score

In [None]:
# optuna.logging.set_verbosity(optuna.logging.INFO)
# study = optuna.create_study(sampler=TPESampler(seed=42), direction="maximize")
# study.optimize(objective, n_trials=50, timeout=2000)

# print(f'Number of completed trials: {len(study.trials)}')
# print('Best trial')
# trial = study.best_trial
# print(f'Best score: {trial.value}')
# print('Best params')
# for key, value in trial.params.items():
#     print(f'{key}: {value}')

In [None]:
lgb_params = { 
    'objective':'lambdarank',
    'boosting_type': 'gbdt',
    'n_estimators': 2000,
    'categorical_column': cat_cols,
    'random_state': 42,
    'is_unbalance': True,
    'n_jobs': -1,
    'colsample_bylevel': 0.5487710499653865,
    'learning_rate': 0.053489173237731946,
    'max_bin': 260,
    'max_depth': 8,
    'min_child_samples': 211,
    'num_leaves': 181,
    'subsample': 0.6569737833982974 
}

In [None]:
lgb_rn = LGBMRanker(**lgb_params, silent=False)

eval_h = lgb_rn.fit(X_r, y, group=grs,
                    eval_set=[(X_r, y)], eval_group=[grs], 
                    eval_metric=['ndcg'],
                    eval_at=[5, 10 ], 
                    early_stopping_rounds=50, 
                    verbose=0)

In [None]:
eval_h.evals_result_['training']['ndcg@5'][-1]

0.9710420831856927

In [None]:
rank_preds = lgb_rn.predict(X_r)

In [None]:
ranker_prediction = X_[['user_id', 'item_id']]
ranker_prediction["pred"] = rank_preds
ranker_prediction.head()

Unnamed: 0,user_id,item_id,pred
0,2070,999270,-0.001846
1,2070,1067779,-0.402295
2,2070,879194,-4.138107
3,2070,1097398,-2.204079
4,2070,1055168,-1.874535


In [None]:
ranker_prediction = ranker_prediction.drop_duplicates()

In [None]:
ranker_prediction.sort_values(by="pred", inplace=True, ascending=False)
ranker_prediction.head()

Unnamed: 0,user_id,item_id,pred
17760,1832,8293703,2.384857
30280,1802,9707010,2.124743
32438,2126,1126711,2.123397
57870,2097,6391557,2.113609
18270,916,901557,2.091569


In [None]:
ranked_res = ranker_prediction.groupby('user_id')['item_id'].unique().reset_index() 
ranked_res = ranked_res.rename(columns={'item_id': 'lgbm_ranker'})  
ranked_res.head()

Unnamed: 0,user_id,lgbm_ranker
0,1,"[9527558, 8293439, 9297615, 865456, 940947, 10..."
1,2,"[1103898, 1106523, 899624, 901062, 916122, 909..."
2,4,"[1052294, 883932, 990797, 891423, 10149640, 11..."
3,6,"[1098844, 6548453, 1024306, 1126899, 1029743, ..."
4,7,"[1126899, 1106523, 9338009, 1122358, 993638, 5..."


In [None]:
# rank_results.drop('lgbm_ranker', axis=1, inplace=True)

In [None]:
rank_results = rank_results.merge(ranked_res, on='user_id', how='left')

In [None]:
rank_results.loc[rank_results.lgbm_ranker.isna(), 'lgbm_ranker'] = rank_results.loc[rank_results.lgbm_ranker.isna(), 'user_id'].\
apply(lambda x: rule(x, cold_users, model='own', N=5))

In [None]:
print('-'*15, 'mean precision@5', '-'*15)

for pred in calc_precision_at_k(rank_results, 5):
    print(*pred, sep=': ')
    
print('-'*15, 'map@5', '-'*15)
for pred in calc_map_at_k(rank_results):
    print(*pred, sep=': ')

--------------- mean precision@5 ---------------
own: 6.552399608227229
reranked_own: 27.923604309500494
own_postfiltered: 9.578844270323215
postfiltered_reranked_own: 22.761998041136145
cat_ranker: 29.18580375782881
lgbm_ranker: 27.952987267384916
--------------- map@5 ---------------
own: 3.5835131568889644
reranked_own: 47.98740341712917
own_postfiltered: 15.674107628686471
postfiltered_reranked_own: 46.041394602241816
cat_ranker: 47.21739307868103
lgbm_ranker: 26.90346404618375


In [None]:
# test.drop('lgbm_ranker', axis=1, inplace=True)

In [None]:
test = test.merge(ranked_res, on='user_id', how='left')
test.head()

Unnamed: 0,user_id,preds,cat_ranker,lgbm_ranker
0,1,"[9297615, 8293439, 9527558, 865456, 940947]","[940947, 865456, 8293439, 1082212, 9297615, 55...","[9527558, 8293439, 9297615, 865456, 940947, 10..."
1,2,"[899624, 1106523, 916122, 901062, 1103898]","[899624, 1106523, 916122, 1103898, 901062, 111...","[1103898, 1106523, 899624, 901062, 916122, 909..."
2,3,"[8090521, 983584, 1092937, 1106523, 998206]","[8090521, 983584, 1092937, 1106523, 998206]",
3,6,"[1098844, 1024306, 6548453, 1029743, 1126899]","[1024306, 1098844, 6548453, 896613, 993638, 96...","[1098844, 6548453, 1024306, 1126899, 1029743, ..."
4,7,"[1106523, 1126899, 9338009, 993638, 1122358]","[1106523, 1126899, 993638, 9338009, 1122358, 5...","[1126899, 1106523, 9338009, 1122358, 993638, 5..."


In [None]:
test.loc[test.lgbm_ranker.isna(), 'lgbm_ranker'] = test.loc[test.lgbm_ranker.isna(), 'user_id'].\
apply(lambda x: rule(x, new, model='own', N=5))

In [None]:
submission = transform_data_for_eval(test[['user_id', 'lgbm_ranker']], rec_col='lgbm_ranker', user_col='user_id')
submission.to_csv('submission.csv', index=False)

In [None]:
!kaggle competitions submit -c gb-recsys-project -f submission.csv -m 'test1'  # private: 0.17448

100% 372k/372k [00:00<00:00, 388kB/s]
Successfully submitted to GB RecSys Project

#### LightFM

[kaggle beautiful kernel](https://www.kaggle.com/code/niyamatalmass/lightfm-hybrid-recommendation-system)

In [None]:
from lightfm import LightFM
from lightfm.data import Dataset

In [None]:
def df_to_tuple_iterator(df):
    return zip(*df.values.T)

def concat_last_to_list(t):
    return (t[0], list(t[1:])[0])

def df_to_tuple_list_iterator(df):
    return map(concat_last_to_list, zip(*df.values.T))

In [None]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [None]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, item_features=item_features)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


In [None]:
data_train = data_train[data_train.item_id != 999999]

In [None]:
dataset = Dataset()

In [None]:
dataset.fit(users=data_train['user_id'].unique(), items=data_train['item_id'].unique(), )

In [None]:
user_feats = []
for col in user_features.select_dtypes(exclude='number').columns:
    user_feats += user_features[col].unique().tolist()
user_feats = np.array(user_feats)
user_feats

array(['65+', '45-54', '25-34', '35-44', '19-24', '55-64', 'A', 'U', 'B',
       '35-49K', '50-74K', '25-34K', '75-99K', 'Under 15K', '100-124K',
       '15-24K', '125-149K', '150-174K', '250K+', '175-199K', '200-249K',
       'Homeowner', 'Unknown', 'Renter', 'Probable Renter',
       'Probable Owner', '2 Adults No Kids', '2 Adults Kids',
       'Single Female', 'Unknown', 'Single Male', '1 Adult Kids', '2',
       '3', '4', '1', '5+', 'None/Unknown', '1', '2', '3+'], dtype='<U16')

In [None]:
item_feats = []
item_features.manufacturer = item_features.manufacturer.astype(np.object)

for col in item_features.select_dtypes(exclude='number').columns:
    item_feats += item_features[col].unique().tolist()
item_feats = np.array(item_feats)
item_feats

array(['2', '69', '16', ..., '6.3 IN', '35 LD', '2 LTR PET'], dtype='<U30')

In [None]:
dataset.fit_partial(user_features=user_feats)

In [None]:
dataset.fit_partial(item_features=item_feats)

In [None]:
lightfm_mapping = dataset.mapping()
lightfm_mapping = {
    'users_mapping': lightfm_mapping[0],
    'user_features_mapping': lightfm_mapping[1],
    'items_mapping': lightfm_mapping[2],
    'item_features_mapping': lightfm_mapping[3],
}
print('users_mapping len - ', len(lightfm_mapping['users_mapping']))
print('user_features_mapping len - ', len(lightfm_mapping['user_features_mapping']))
print('items_mapping len - ', len(lightfm_mapping['items_mapping']))
print('Users item_features_mapping len - ', len(lightfm_mapping['item_features_mapping']))

users_mapping len -  2467
user_features_mapping len -  2505
items_mapping len -  5000
Users item_features_mapping len -  18505


In [None]:
lightfm_mapping['users_inv_mapping'] = {v: k for k, v in lightfm_mapping['users_mapping'].items()}
lightfm_mapping['items_inv_mapping'] = {v: k for k, v in lightfm_mapping['items_mapping'].items()}

In [None]:
train_mat, train_mat_weights = dataset.build_interactions(df_to_tuple_iterator(data_train[['user_id', 'item_id']]))

In [None]:
user_features['features'] = user_features.iloc[:, :-1].astype(str).apply(lambda x: list(x), axis=1)

In [None]:
known_users_filter = user_features['user_id'].isin(data_train['user_id'].unique())

train_user_features = dataset.build_user_features(
    df_to_tuple_list_iterator(
        user_features.loc[known_users_filter, ['user_id', 'features']]
    )
)
train_user_features

<2467x2505 sparse matrix of type '<class 'numpy.float32'>'
	with 8043 stored elements in Compressed Sparse Row format>

In [None]:
item_features['features'] = item_features.iloc[:, 1:].astype(str).apply(lambda x: list(x), axis=1)

In [None]:
known_items_filter = item_features['item_id'].isin(data_train['item_id'].unique())

train_items_features = dataset.build_item_features(
    df_to_tuple_list_iterator(
        item_features.loc[known_items_filter, ['item_id', 'features']]
    )
)
train_items_features

<5000x18505 sparse matrix of type '<class 'numpy.float32'>'
	with 34898 stored elements in Compressed Sparse Row format>

In [None]:
lfm_model = LightFM(no_components=64, learning_rate=0.05, loss='warp', max_sampled=5, random_state=42)

In [None]:
lfm_model.fit(train_mat,
              sample_weight=train_mat_weights,
              user_features=train_user_features,
              item_features=train_items_features,
              epochs=15, 
              num_threads=4) 

<lightfm.lightfm.LightFM at 0x7f2bd29579d0>

In [None]:
all_cols = list(lightfm_mapping['items_mapping'].values())

In [None]:
def predict_fm(user_id, n=5):
    try:
        pred = lfm_model.predict(lightfm_mapping['users_mapping'][user_id], 
                                 all_cols, 
                                 user_features=train_user_features,
                                 item_features=train_items_features,
                                 num_threads=4)
    except:
        return popularity_recommendation(data_train, n=n)
    top_cols = np.argpartition(pred, -np.arange(n))[-n:][::-1]
    pred = [lightfm_mapping['items_inv_mapping'].get(top) for top in top_cols]
    return pred

In [None]:
new_users = test.loc[~test.user_id.isin(data_train.user_id), 'user_id'].values
new_users

array([  62,   75,  650,  954, 1043, 1105, 1636, 1876, 1987, 2120, 2259,
       2325, 2404])

In [None]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

result['pred'] = result['user_id'].apply(lambda x: predict_fm(x))

for pred in calc_precision_at_k(result, 5):
    print(*pred, sep=': ')

for pred in calc_map_at_k(result):
    print(*pred, sep=': ')

pred: 12.095984329089129
pred: 25.02360158885624


In [None]:
test['pred'] = test['user_id'].apply(lambda x: predict_fm(x))

In [None]:
test.head()

Unnamed: 0,user_id,pred
0,1,"[999104, 940947, 1127831, 872137, 866211]"
1,2,"[5569230, 916122, 1029743, 844179, 1127831]"
2,3,"[1044078, 844179, 1029743, 1106523, 907631]"
3,6,"[854852, 1026118, 866211, 878996, 1081177]"
4,7,"[1029743, 916122, 1106523, 899624, 844179]"


In [None]:
submission = transform_data_for_eval(test[['user_id', 'pred']], rec_col='pred', user_col='user_id')
submission.to_csv('submission.csv', index=False)
!kaggle competitions submit -c gb-recsys-project -f submission.csv -m 'test1'

100% 71.3k/71.3k [00:02<00:00, 26.4kB/s]
Successfully submitted to GB RecSys Project