In [1]:
!pip install -qq catboost optuna implicit

[K     |████████████████████████████████| 76.6 MB 68 kB/s 
[K     |████████████████████████████████| 308 kB 33.8 MB/s 
[K     |████████████████████████████████| 18.5 MB 41.4 MB/s 
[K     |████████████████████████████████| 209 kB 40.7 MB/s 
[K     |████████████████████████████████| 81 kB 7.9 MB/s 
[K     |████████████████████████████████| 78 kB 4.3 MB/s 
[K     |████████████████████████████████| 112 kB 44.5 MB/s 
[K     |████████████████████████████████| 146 kB 37.4 MB/s 
[K     |████████████████████████████████| 49 kB 5.9 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [None]:
# !pip install -qq implicit==0.4.8 --no-use-pep517
# видимо, в новой версии в ItemItemRecommender при K=1 выбирается первый ближайший сосед, но уже не сам пользователь, трюк не удается 

In [2]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight
from implicit.nearest_neighbours import ItemItemRecommender

from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRanker, Pool, EShapCalcType, EFeaturesSelectionAlgorithm

import optuna
from optuna.samplers import TPESampler

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c gb-recsys-project
!unzip gb-recsys-project.zip
!rm gb-recsys-project.zip

Saving kaggle.json to kaggle.json
kaggle.json
Downloading gb-recsys-project.zip to /content
 88% 21.0M/23.9M [00:00<00:00, 215MB/s]
100% 23.9M/23.9M [00:00<00:00, 224MB/s]
Archive:  gb-recsys-project.zip
  inflating: hh_demographic.csv      
  inflating: product.csv             
  inflating: retail_train.csv        
  inflating: sample_submission.csv   
  inflating: test_users.csv          


In [4]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
    
    return precision

def ap_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(recommended_list, bought_list)
    
    if sum(flags) == 0:
        return 0
    
    sum_ = 0
    for i in range(k):
        
        if flags[i]:
            p_k = precision_at_k(recommended_list, bought_list, k=i+1)
            sum_ += p_k
            
    result = sum_ / sum(flags)
    
    return result

def map_k(recommend_list, bought_list, k=5):
    return np.mean([ap_k(rec, bt, k) for rec, bt in zip(recommend_list, bought_list)])

def reduce_memory(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and str(col_type)[:4] != 'uint' and str(col_type) != 'category':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif str(col_type)[:4] != 'uint':
            df[col] = df[col].astype('category')
    return df

def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [5]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')
test = pd.read_csv('test_users.csv')

N_preds = 30

In [None]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
item_features.head(2)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [None]:
user_features.head(2)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


##### Dealing with implicit data as users do not rate bought products at any way.

In [6]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [7]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1 = reduce_memory(data_train_lvl_1)
data_val_lvl_1 = reduce_memory(data_val_lvl_1)
data_train_lvl_2 = reduce_memory(data_train_lvl_2)
data_val_lvl_2 = reduce_memory(data_val_lvl_2)

print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')
print(f'Level 1 Valid - users: {data_val_lvl_1.user_id.nunique()}, items: {data_val_lvl_1.item_id.nunique()}')
print(f'Level 2 Train - users: {data_train_lvl_2.user_id.nunique()}, items: {data_train_lvl_2.item_id.nunique()}')
print(f'Level 2 Valid - users: {data_val_lvl_2.user_id.nunique()}, items: {data_val_lvl_2.item_id.nunique()}')

Level 1 Train - users: 2498, items: 83685
Level 1 Valid - users: 2154, items: 27649
Level 2 Train - users: 2154, items: 27649
Level 2 Valid - users: 2042, items: 24329


In [None]:
# ищем общих пользователей
# common_users = list(set(data_train_lvl_1.user_id.values)&(set(data_val_lvl_1.user_id.values))&set(data_val_lvl_2.user_id.values))

# оставляем общих пользователей
# data_train_lvl_1 = data_train_lvl_1[data_train_lvl_1.user_id.isin(common_users)]
# data_val_lvl_1 = data_val_lvl_1[data_val_lvl_1.user_id.isin(common_users)]
# data_train_lvl_2 = data_train_lvl_2[data_train_lvl_2.user_id.isin(common_users)]
# data_val_lvl_2 = data_val_lvl_2[data_val_lvl_2.user_id.isin(common_users)]

# print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')
# print(f'Level 1 Valid - users: {data_val_lvl_1.user_id.nunique()}, items: {data_val_lvl_1.item_id.nunique()}')
# print(f'Level 2 Train - users: {data_train_lvl_2.user_id.nunique()}, items: {data_train_lvl_2.item_id.nunique()}')
# print(f'Level 2 Valid - users: {data_val_lvl_2.user_id.nunique()}, items: {data_val_lvl_2.item_id.nunique()}')

In [8]:
result = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

In [9]:
# top 5 popular recommendations
popular_recs = popularity_recommendation(data_train_lvl_1, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.head(2)

Unnamed: 0,user_id,actual,popular_recommendation
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [10]:
def prefilter_items(data, take_n_popular=5000, item_features=None):
    # Уберем самые популярные товары (их и так купят)
    popularity = data.groupby('item_id')['user_id'].nunique().reset_index() / data['user_id'].nunique()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

    top_popular = popularity[popularity['share_unique_users'] > 0.2].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]

    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.02].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]

    # Уберем не интересные для рекоммендаций категории (department)
    if item_features is not None:
        department_size = pd.DataFrame(item_features.groupby('department')['item_id'].nunique().sort_values(ascending=False)).reset_index()
        department_size.columns = ['department', 'n_items']
        rare_departments = department_size[department_size['n_items'] < 150].department.tolist()
        items_in_rare_departments = item_features[item_features['department'].isin(rare_departments)].item_id.unique().tolist()

        data = data[~data['item_id'].isin(items_in_rare_departments)]

    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
    data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))
    data = data[data['price'] > 2]

    # Уберем слишком дорогие товарыs
    data = data[data['price'] < 50]

    # уберем товары, не продававшиеся более 12-18 месяцев
    data = data[data['week_no'] >= data['week_no'].max() - 78]

    # Возьмем топ по популярности
    popularity = data.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
    top = popularity.sort_values('n_sold', ascending=False).head(take_n_popular).item_id.tolist()
    
    # Заведем фиктивный item_id (если юзер покупал товары из топ-N, то он "купил" такой товар)
    data.loc[~data['item_id'].isin(top), 'item_id'] = 999999
    
    return data

def postfilter(recommendations, item_info, N=5):
    """Пост-фильтрация товаров
    
    Input
    -----
    recommendations: list
        Ранжированный список item_id для рекомендаций
    item_info: pd.DataFrame
        Датафрейм с информацией о товарах
    """
    
    # Уникальность
    unique_recommendations = []
    [unique_recommendations.append(item) for item in recommendations if item not in unique_recommendations]
    
    # Разные категории
    categories_used = []
    final_recommendations = []
    CATEGORY_NAME = 'sub_commodity_desc'
    for item in unique_recommendations:
        category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
        
        if category not in categories_used:
            final_recommendations.append(item)
            
        unique_recommendations.remove(item)
        categories_used.append(category)
    
    n_rec = len(final_recommendations)
    if n_rec < N:
        final_recommendations.extend(unique_recommendations[:N - n_rec])
    else:
        final_recommendations = final_recommendations[:N]
    
    assert len(final_recommendations) == N, 'Количество рекомендаций != {}'.format(N)
    return final_recommendations

In [89]:
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, data_type='quantity', weighting=True, normalize=False, alpha=1.):
        self.data_type = data_type
        self.normalize = normalize
        self.alpha = alpha

        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data, data_type=self.data_type, normalize=self.normalize)
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix, K1=100, B=0.8)  # default: 100, 0.8 ver. 0.5.2
            # self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix, alpha=self.alpha)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data, data_type, normalize=False):
        """Готовит user-item матрицу"""
        if data_type == 'quantity':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', columns='item_id',
                                              values='quantity',
                                              aggfunc='count',
                                              fill_value=0
                                              )
        elif data_type == 'sales':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', columns='item_id',
                                              values='sales_value',
                                              aggfunc='sum',
                                              fill_value=0
                                              )
            if normalize:
                user_item_matrix = user_item_matrix / user_item_matrix.max()  # normalize
        elif data_type == 'quantity_sum':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', columns='item_id',
                                              values='quantity',
                                              aggfunc='sum',
                                              fill_value=0
                                              )
            if normalize:
                user_item_matrix = user_item_matrix / user_item_matrix.max()
        else:
            raise ValueError(f'Agg data type must be "quantity", "sales" or "quantity_sum", given: {data_type}')

        user_item_matrix = user_item_matrix.astype(float)

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).tocsr())  # ver. 0.5.2
        # own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=50, regularization=0.001, iterations=15, num_threads=4, alpha=1.):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads,
                                        use_gpu=False,
                                        random_state=42)
        model.fit(csr_matrix(user_item_matrix).tocsr()*alpha)  # ver. 0.5.2
        # model.fit(csr_matrix(user_item_matrix).T.tocsr()*alpha)

        return model

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[0][1]  # ver. 0.5.2
        # top_rec = recs[1][0]
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        user_id = self.userid_to_id[user]
        
        """ ver. 0.5.2 """
        
        model_name = model.__class__.__name__
        
        if model_name == 'ItemItemRecommender':

            res = model.recommend(userid=user_id,
                                        user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=None,
                                        recalculate_user=True)[0].tolist()
            res = [self.id_to_itemid[rec] for rec in res]

        elif model_name == 'AlternatingLeastSquares':
            res = [self.id_to_itemid[rec] for rec in model.recommend(userid=user_id,
                                        user_items=csr_matrix(self.user_item_matrix).tocsr()[user_id,:],
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[999999]],
                                        recalculate_user=True)[0]]
        """
        res = [self.id_to_itemid[rec[0]] for rec in model.recommend(userid=user_id,
                                                                    user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                                                    N=N,
                                                                    filter_already_liked_items=False,
                                                                    filter_items=[self.itemid_to_id[999999]],
                                                                    recalculate_user=True)]
        """
        res = self._extend_with_top_popular(res, N=N)
        
        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model, N=N)

    def get_own_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""
        
        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        if 999999 in res: res.remove(999999)  # prev. ver.

        res = self._extend_with_top_popular(res, N=N) 

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""
        res = []

        # Находим топ-N похожих пользователей
        similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
        similar_users = [rec for rec in similar_users[0]]  # ver. 0.5.2
        # similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]   # удалим юзера из запроса

        for user in similar_users:
            user = self.id_to_userid[user]  ## нужно подать для предикта оригинальный идентификатор # ver. 0.4.8
            res.extend(self.get_own_recommendations(user, N=1))
        res = pd.Series(res).drop_duplicates().tolist()

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

In [12]:
data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

In [13]:
print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')

Level 1 Train - users: 2494, items: 5001


In [90]:
recommender = MainRecommender(data_train_lvl_1, data_type='quantity', normalize=False, weighting=True)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [25]:
list_out = result.loc[~result.user_id.isin(data_train_lvl_1.user_id), 'user_id'].tolist()
list_out

[296, 1813, 1984]

In [16]:
def rule(x, y, model, N=5):
    if x in y:
        return recommender.overall_top_purchases[:N]
    if model == 'als':
        return recommender.get_als_recommendations(x, N=N)
    elif model == 'own':
        return recommender.get_own_recommendations(x, N=N)
    elif model == 'similar_items':
        return recommender.get_similar_items_recommendation(x, N=N)
    elif model == 'similar_users':
        return recommender.get_similar_users_recommendation(x, N=N)

In [91]:
result['als'] = result['user_id'].apply(lambda x: rule(x, list_out, model='als', N=N_preds))
result['own'] = result['user_id'].apply(lambda x: rule(x, list_out, model='own', N=N_preds))
result['similar_items'] = result['user_id'].apply(lambda x: rule(x, list_out, model='similar_items', N=N_preds))
result['similar_users'] = result['user_id'].apply(lambda x: rule(x, list_out, model='similar_users', N=N_preds))

In [18]:
def calc_precision_at_k(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean() * 100

In [19]:
def calc_map_at_k(df_data):
    for col_name in df_data.columns[2:]:
        yield col_name, map_k(df_data[col_name].values.tolist(), df_data['actual'].values.tolist())*100

In [92]:
for pred in calc_precision_at_k(result, 5):
    print(*pred, sep=': ')

popular_recommendation: 17.595171773444754
als: 17.97585886722377
own: 2.3955431754874654
similar_items: 8.105849582172702
similar_users: 11.411327762302694


In [93]:
for pred in calc_map_at_k(result):
    print(*pred, sep=': ')

popular_recommendation: 29.00753120808831
als: 15.215427023284498
own: 5.292453316826575
similar_items: 7.218083345520671
similar_users: 10.63401700048636


##### Second level model

In [94]:
users_lvl_1 = data_train_lvl_1.user_id.unique()
users_lvl_2 = data_val_lvl_1.user_id.unique()
users_lvl_3 = data_val_lvl_2.user_id.unique()

new_users_lvl_2 = list(set(users_lvl_2) - set(users_lvl_1))
new_users_lvl_3 = list(set(users_lvl_3) - (set(users_lvl_1) | set(users_lvl_2)))

add_to_lvl_2 = list(set(users_lvl_3) - (set(users_lvl_2)))

new_users_lvl_2, new_users_lvl_3, len(add_to_lvl_2)

([296, 1813, 1984], [2259], 126)

In [95]:
train_val_lvl1 = pd.concat([data_train_lvl_1, data_val_lvl_1])
train_val_lvl1 = train_val_lvl1.merge(item_features[['item_id', 'department']], on='item_id', how='inner')

In [96]:
condition = ((data_train_lvl_2.user_id.isin(data_train_lvl_1.user_id.tolist())) | (pd.Series(data=add_to_lvl_2).isin(data_train_lvl_1.user_id.tolist())))
val_list_out = data_train_lvl_2.loc[~condition, 'user_id'].unique().tolist()
val_list_out

[1813, 1984, 296]

In [97]:
df_candidates = pd.DataFrame(data=data_train_lvl_2['user_id'].unique().tolist() + add_to_lvl_2)
df_candidates.columns = ['user_id']
df_candidates['candidates'] = df_candidates['user_id'].apply(lambda x: rule(x, new_users_lvl_2+new_users_lvl_3, model='als', N=N_preds))
df_candidates.head()  # 'als' for ver. 0.5.2, 'own' otherwise

Unnamed: 0,user_id,candidates
0,2070,"[1029743, 1070820, 866211, 1044078, 5569374, 5..."
1,2021,"[1106523, 1044078, 999270, 844179, 899624, 101..."
2,1753,"[983584, 844179, 1029743, 1070272, 952972, 556..."
3,2120,"[1029743, 1106523, 986912, 5569230, 878996, 86..."
4,1346,"[5569374, 5574377, 1127831, 5569309, 866211, 1..."


In [98]:
df_candidates = pd.DataFrame({'user_id': df_candidates['user_id'].values.repeat(N_preds),
                              'item_id': np.concatenate(df_candidates['candidates'].values)})

df_train = data_train_lvl_2[['user_id', 'item_id']].copy()
df_train['target'] = 1

df_train = df_candidates.merge(df_train, on=['user_id', 'item_id'], how='left')
df_train.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)

df_train['target'].fillna(0, inplace=True)
df_train.head()

Unnamed: 0,user_id,item_id,target
0,2070,1029743,0.0
1,2070,1070820,0.0
2,2070,866211,0.0
3,2070,1044078,0.0
4,2070,5569374,0.0


In [99]:
df_train['target'].value_counts()

0.0    61483
1.0     6917
Name: target, dtype: int64

In [100]:
df_train = df_train.merge(item_features, on='item_id', how='left')
df_train = df_train.merge(user_features, on='user_id', how='left')
df_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1070820,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,866211,0.0,2,PRODUCE,National,GRAPES,GRAPES WHITE,18 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,1044078,0.0,2845,MEAT,National,BEEF,LEAN,,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,5569374,0.0,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [101]:
data_department = data_train_lvl_2.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0,GROCERY
1,2070,40630625006,594,1019940,1,1.0,311,-0.29,201,86,0.0,0.0,GROCERY


In [102]:
df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('sales_value').sum().rename('total_item_sales_value'), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('quantity').sum().rename('total_quantity_value'), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('user_id').count().rename('item_freq'), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('user_id').count().rename('user_freq'), how='left',on='user_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('sales_value').sum().rename('total_user_sales_value'), how='left',on='user_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_week')/train_val_lvl1.week_no.nunique(), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_week')/train_val_lvl1.week_no.nunique(), how='left',on='user_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='user_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('user_id').count().rename('item_freq_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('user_id').count().rename('user_freq_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='user_id')

In [103]:
# Средний чек
users_sales = data_train_lvl_2.groupby('user_id')['sales_value'].mean().reset_index()
users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
df_train = df_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')

# Количество уникальных категорий покупателя
users_departments = data_department.groupby('user_id')['department'].nunique().reset_index()
users_departments.rename(columns = {'department':'users_unique_departments'}, inplace=True)
df_train = df_train.merge(users_departments, on='user_id', how='left')

# Среднее время покупки
bought_time = data_train_lvl_2.groupby('user_id')['trans_time'].mean().reset_index()
bought_time.rename(columns = {'trans_time':'mean_trans_time_by_user'}, inplace=True)
df_train = df_train.merge(bought_time, on='user_id', how='left')

# Средний чек корзины 
baskets_sales_value = data_train_lvl_2.groupby(['user_id','basket_id'])['sales_value'].mean().reset_index()
mean_basket_sales_value = baskets_sales_value.groupby('user_id')['sales_value'].mean().reset_index()
mean_basket_sales_value.rename(columns = {'sales_value':'mean_sales_value_per_basket'}, inplace=True)
df_train = df_train.merge(mean_basket_sales_value, on='user_id', how='left')

# Количество купленных уникальных товаров 
unique_bought_items = data_train_lvl_2.groupby('user_id')['item_id'].nunique().reset_index()
unique_bought_items.rename(columns = {'item_id':'unique_bought_items'}, inplace=True)
df_train = df_train.merge(unique_bought_items, on='user_id', how='left')

# Среднее количество уникальных категорий в корзине
users_baskets = data_department.groupby(['user_id', 'basket_id'])['department'].nunique().reset_index()
users_baskets = users_baskets.groupby('user_id')['department'].mean().reset_index()
users_baskets.rename(columns={'department': 'avg_basket_department'}, inplace=True)
df_train = df_train.merge(users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left')

# Средняя сумма покупки в категории
department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
department_sales.rename(columns={'sales_value': 'mean_sales_value_category'}, inplace=True)
df_train = df_train.merge(department_sales, on='department', how='left')

# Средная цена купленных товаров пользователем
users_sales = data_train_lvl_2.groupby('user_id')[['sales_value', 'quantity']].sum().reset_index()
users_sales['avg_price'] = users_sales['sales_value'] / users_sales['quantity']
df_train = df_train.merge(users_sales[['user_id', 'avg_price']], on='user_id', how='left')

In [104]:
# Среднее количество покупок товара в неделю
num_purchase_week = data_train_lvl_2.groupby('item_id').agg({'week_no': 'nunique', 'quantity': 'sum'}).reset_index()
num_purchase_week['avg_num_purchases_week'] = num_purchase_week['quantity'] / num_purchase_week['week_no']
df_train = df_train.merge(num_purchase_week[['item_id', 'avg_num_purchases_week']], on='item_id', how='left')
df_train['avg_num_purchases_week'].fillna(0, inplace=True)

# Цена товара
items_sales = data_department.groupby('item_id')[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['price'].fillna(0, inplace=True)
df_train = df_train.merge(items_sales[['item_id', 'price']], on='item_id', how='left')

# Среднее время покупки товара
bought_item_time = data_train_lvl_2.groupby('item_id')['trans_time'].mean().reset_index()
bought_item_time.rename(columns = {'trans_time':'mean_trans_time_by_item'}, inplace=True)
df_train = df_train.merge(bought_item_time, on = 'item_id', how = 'left')

# Количество магазинов, где есть товар
items_stores = data_department.groupby('item_id')['store_id'].sum().reset_index()
items_stores.rename(columns={'store_id': 'n_stores_with_item'}, inplace=True)
items_stores['n_stores_with_item'].fillna(0, inplace = True)
df_train = df_train.merge(items_stores, on='item_id', how='left')

# Количество уникальных магазинов, где есть товар
items_stores = data_department.groupby('item_id')['store_id'].nunique().reset_index()
items_stores.rename(columns={'store_id': 'n_unique_stores_with_item'}, inplace=True)
items_stores['n_unique_stores_with_item'].fillna(0, inplace = True)
df_train = df_train.merge(items_stores, on='item_id', how='left')

In [105]:
# Построим признак, отражающий средний интервал между покупками пользователя.
users_days = train_val_lvl1.groupby('user_id')['day'].unique().reset_index()
users_days['day'] = users_days['day'].apply(lambda x: sorted(x))
users_days.head()

Unnamed: 0,user_id,day
0,1,"[51, 67, 88, 94, 101, 108, 111, 128, 137, 146,..."
1,2,"[103, 112, 117, 118, 139, 140, 154, 160, 161, ..."
2,3,"[113, 121, 136, 141, 142, 163, 166, 169, 173, ..."
3,4,"[104, 140, 154, 181, 190, 199, 216, 231, 244, ..."
4,5,"[88, 97, 111, 154, 168, 181, 191, 192, 223, 25..."


In [106]:
def avg_ndays(days):
    diff = 0
    if len(days) > 1:
        for i in range(len(days) - 1):
            diff += days[i+1] - days[i]
        return diff / (len(days) - 1)
    else:
        return 0
    
users_days['avg_interval'] = users_days['day'].apply(avg_ndays)

df_train = df_train.merge(users_days[['user_id', 'avg_interval']], on='user_id', how='left')
df_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,unique_bought_items,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,mean_trans_time_by_item,n_stores_with_item,n_unique_stores_with_item,avg_interval
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,156.0,1.790698,2.531705,0.035173,211.833333,2.336216,1551.780669,4298090.0,110.0,4.776786
1,2070,1070820,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,156.0,1.790698,2.531705,0.035173,68.333333,2.273415,1512.691877,1494402.0,84.0,4.776786


In [107]:
users_items = pd.concat([data_train_lvl_2, data_val_lvl_2]).groupby('user_id')['item_id'].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])
users_items.head()

Unnamed: 0,user_id,item_id
0,1,"[9677939, 10356149, 13417048, 15741823, 15830875]"
1,2,"[7407562, 10149597, 13776981, 15572067, 17215077]"
2,3,"[7167218, 7167249, 9526563, 9526886, 13842214]"
3,4,"[963365, 1038692, 1133312, 5570830, 7431408]"
4,6,"[1020683, 1047193, 1100691, 1123407, 10311503]"


In [108]:
def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_train['Last5sales'] = df_train[['user_id', 'item_id']].apply(code_last_sales, axis=1)
df_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,mean_trans_time_by_item,n_stores_with_item,n_unique_stores_with_item,avg_interval,Last5sales
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,1.790698,2.531705,0.035173,211.833333,2.336216,1551.780669,4298090.0,110.0,4.776786,0
1,2070,1070820,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,1.790698,2.531705,0.035173,68.333333,2.273415,1512.691877,1494402.0,84.0,4.776786,0


In [109]:
for column in df_train.columns:
    if df_train[column].dtype == 'object':
        df_train[column].fillna('unk', inplace=True)
    elif df_train[column].dtype == 'float64':
        df_train[column].fillna(0.0, inplace=True)
print(f'Length of train data: {df_train.shape[0]}')
df_train.dropna(inplace=True)
df_train = reduce_memory(df_train)
df_train.info()

Length of train data: 68400
<class 'pandas.core.frame.DataFrame'>
Int64Index: 64620 entries, 0 to 64619
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   user_id                      64620 non-null  int16   
 1   item_id                      64620 non-null  int32   
 2   target                       64620 non-null  float32 
 3   manufacturer                 64620 non-null  int16   
 4   department                   64620 non-null  category
 5   brand                        64620 non-null  category
 6   commodity_desc               64620 non-null  category
 7   sub_commodity_desc           64620 non-null  category
 8   curr_size_of_product         64620 non-null  category
 9   age_desc                     64620 non-null  category
 10  marital_status_code          64620 non-null  category
 11  income_desc                  64620 non-null  category
 12  homeowner_desc               646

In [None]:
# df_train['total_quantity_value'] = df_train['total_quantity_value'].astype(np.float32)
# df_train['item_freq'] = df_train['item_freq'].astype(np.float32)

In [110]:
X, y = df_train.drop('target', axis=1), df_train['target']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# the idea to set user_id and item_id as category (catboosts combines cat features and may find out user_id-item_id is a good feature)
# did not approved 
cat_cols = X.select_dtypes(include=['category']).columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

In [None]:
X.sample(5)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,mean_trans_time_by_item,n_stores_with_item,n_unique_stores_with_item,avg_interval,Last5sales
49477,2195,1080853,649,PRODUCE,National,ORGANICS FRUIT & VEGETABLES,ORGANIC SALAD MIX,5 OZ,unk,unk,...,1.727273,2.31392,2.585094,1.8,3.49,1703.555542,34988.0,7.0,11.755555,0
34199,1887,901062,2,PRODUCE,National,APPLES,APPLES GALA (BULK&BAG),,unk,unk,...,2.0,2.31392,1.27,28.0,2.472381,1494.490967,578548.0,67.0,15.435898,0
55811,1721,849697,1225,GROCERY,National,FLUID MILK PRODUCTS,REFRIGERATED COFFEE CREAMERS,32 OZ,unk,unk,...,4.0,2.531705,3.695714,2.0,2.738333,1794.199951,1670.0,5.0,21.0,0
29907,485,857215,3020,MEAT,National,PORK,LOIN - CHOPS BONELESS,,35-44,U,...,2.666667,6.485481,0.012061,0.0,0.0,0.0,0.0,0.0,6.070588,0
55269,2276,967144,320,GROCERY,National,FRZN MEAT/MEAT DINNERS,FRZN SS PREMIUM ENTREES/DNRS/T,15 OZ,unk,unk,...,2.0,2.531705,1.903585,2.5,2.4,1496.0,1661.0,5.0,26.35,0


In [None]:
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
def select_features(algorithm: EFeaturesSelectionAlgorithm, steps: int = 1):
    print('Algorithm:', algorithm)
    model = CatBoostClassifier(iterations=200, auto_class_weights='Balanced', random_seed=42)
    summary = model.select_features(
        train_pool,
        eval_set=valid_pool,
        features_for_select=list(range(train_pool.num_col())),
        num_features_to_select=30,
        steps=steps,
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=False,
        logging_level='Silent',
        plot=True
    )
    print('Selected features:', summary['selected_features_names'])
    return summary

feature_set = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, steps=11)

Algorithm: EFeaturesSelectionAlgorithm.RecursiveByShapValues


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Selected features: ['user_id', 'item_id', 'manufacturer', 'department', 'commodity_desc', 'sub_commodity_desc', 'marital_status_code', 'income_desc', 'hh_comp_desc', 'household_size_desc', 'total_item_sales_value', 'total_quantity_value', 'item_freq', 'total_user_sales_value', 'item_quantity_per_week', 'item_freq_per_basket', 'user_freq_per_basket', 'avg_cheque', 'users_unique_departments', 'mean_trans_time_by_user', 'unique_bought_items', 'mean_sales_value_category', 'avg_price', 'avg_num_purchases_week', 'price', 'mean_trans_time_by_item', 'n_stores_with_item', 'n_unique_stores_with_item', 'avg_interval', 'Last5sales']


In [111]:
drop = ['department', 'brand', 'mean_sales_value_per_basket', 'hh_comp_desc', 'homeowner_desc', 'curr_size_of_product']
X_train.drop(drop, axis=1, inplace=True)
X_valid.drop(drop, axis=1, inplace=True)

# when user_id, item_id are categories
#drop = ['mean_sales_value_per_basket', 'homeowner_desc', 'avg_basket_department', 'brand', 'user_quantity_per_basket']
#X_train.drop(drop, axis=1, inplace=True)
#X_valid.drop(drop, axis=1, inplace=True)

""" ver. 0.4.8
X_train.drop(['commodity_desc', 'item_quantity_per_week', 'curr_size_of_product', 'total_quantity_value', 'user_quantity_per_basket',
              'avg_interval', 'homeowner_desc',], axis=1, inplace=True)
X_valid.drop(['commodity_desc', 'item_quantity_per_week', 'curr_size_of_product', 'total_quantity_value', 'user_quantity_per_basket',
              'avg_interval', 'homeowner_desc',], axis=1, inplace=True)
"""

cat_cols = X_train.select_dtypes(include=['category']).columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

In [52]:
params_model = {
                'eval_metric': 'Logloss', # AUC
                'iterations': 2000,
                'silent': True,
                'one_hot_max_size': 15,
                'use_best_model': True,
                'early_stopping_rounds': 50,
                'allow_writing_files': False,
                'random_seed': 42,
                }

In [None]:
def objective(trial):
    param_trials = {
                    'depth': trial.suggest_int('depth', 3, 15),
                    'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.5),
                    'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1., 6.),
                    'bagging_temperature': trial.suggest_float('bagging_temperature', 1., 5.),
                    'subsample': trial.suggest_float('subsample', 0.3, 1.),
                    'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.),
                    'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
                    'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
                    'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
                    'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 255),
                    'max_leaves': trial.suggest_int('max_leaves', 5, 64),
                    }
    param_trials.update(params_model)
    if param_trials['grow_policy'] == 'SymmetricTree':
        param_trials['min_data_in_leaf'] = None
        param_trials['max_leaves'] = None
    if param_trials['grow_policy'] == 'Depthwise':
        param_trials['max_leaves'] = None
    if param_trials['grow_policy'] != 'SymmetricTree':
        param_trials['boosting_type'] = 'Plain'
    
    opt_model = CatBoostClassifier(**param_trials)
    opt_model.fit(train_pool, eval_set=valid_pool)
    
    y_pred = opt_model.predict_proba(X_valid)[:,1]
    score = log_loss(y_valid, y_pred)

    # y_pred = opt_model.predict_proba(X_valid)[:,1]
    # score = roc_auc_score(y_valid, y_pred)
    
    return score

In [None]:
# optuna.logging.set_verbosity(optuna.logging.INFO)
# study = optuna.create_study(sampler=TPESampler(seed=42), direction="minimize")  # minimize for logloss, maximize for roc_auc
# study.optimize(objective, n_trials=50, timeout=6000)

# print(f'Number of completed trials: {len(study.trials)}')
# print('Best trial')
# trial = study.best_trial
# print(f'Best score: {trial.value}')
# print('Best params')
# for key, value in trial.params.items():
#     print(f'{key}: {value}')

In [112]:
params_model = {
                'eval_metric': 'Logloss', 
                'iterations': 2000,
                'silent': True,
                'one_hot_max_size': 15,
                'use_best_model': True,
                'early_stopping_rounds': 50,
                'allow_writing_files': False,
                'random_seed': 42,
                'depth': 15, 
                'learning_rate': 0.05036379129844091, 
                'l2_leaf_reg': 2.6760037989729177, 
                'bagging_temperature': 3.885879965461792, 
                'subsample': 0.8002252887959993, 
                'colsample_bylevel': 0.5385267306992663, 
                'boosting_type': 'Plain', 
                'auto_class_weights': 'SqrtBalanced', 
                'grow_policy': 'Lossguide', 
                'min_data_in_leaf': 185, 
                'max_leaves': 56
}

In [114]:
X_ = X.drop(drop, axis=1)
train_pool = Pool(X_, y, cat_features=cat_cols)

In [115]:
cat = CatBoostClassifier(**params_model).fit(train_pool, eval_set=train_pool)  # eval_set=valid_pool/train_pool

In [None]:
print(f'ROC_AUC: ', f'{roc_auc_score(y, cat.predict_proba(X_)[:,1]):.4f}')
imp = cat.feature_importances_[:10]
for name, value in zip(X_.columns, imp):
    print(name, ' - ', value)

ROC_AUC:  0.9986
user_id  -  3.697980653836688
item_id  -  3.7643138695849445
manufacturer  -  2.3203117882015425
commodity_desc  -  1.5652422556781203
sub_commodity_desc  -  1.8200915956153705
age_desc  -  1.0775203860551508
marital_status_code  -  0.4674321309396743
income_desc  -  1.3961590637853494
household_size_desc  -  0.6192971708230803
kid_category_desc  -  0.408266634134656


In [116]:
rank_results = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
rank_results.columns = ['user_id', 'actual']
cold_users = data_val_lvl_2.loc[~data_val_lvl_2.user_id.isin(data_train_lvl_1.user_id.tolist()), 'user_id'].unique().tolist()

rank_results['als'] = rank_results['user_id'].apply(lambda x: rule(x, cold_users, model='als', N=N_preds))  # 'als' for ver. 0.5.2

In [117]:
df_predict = df_train.copy()
df_predict['proba_item_purchase'] = cat.predict_proba(X.drop(drop, axis=1))[:, 1]

In [118]:
def rerank(user_id, N, out=cold_users):
    if user_id in df_predict.user_id:
        return df_predict[df_predict['user_id']==user_id].sort_values('proba_item_purchase', ascending=False).head(N).item_id.tolist()
    else:
        return rule(user_id, cold_users, model='als', N=5)  # 'als' for ver 0.5.2

In [119]:
rank_results['reranked_als'] = rank_results['user_id'].apply(lambda user_id: rerank(user_id, N=5))  # 'als' for ver 0.5.2

In [120]:
rank_results['als_postfiltered'] = rank_results.user_id.apply(lambda x: postfilter(rule(x, cold_users, model='als', N=20), 
                                                                                   item_features, N=5))  # 'als' for ver 0.5.2

In [121]:
# 'als' for ver 0.5.2, 'own' 0.4.8
rank_results.loc[rank_results['reranked_als'].str.len() == 0, 'reranked_als'] = rank_results.\
loc[rank_results['reranked_als'].str.len() == 0, 'user_id'].apply(lambda x: rule(x, cold_users, model='als', N=5))

In [122]:
def rerank_post(user_id, N_rank=20, N_post=5):  # 'als' for ver 0.5.2
    try:
        out = rerank(user_id, N=N_rank)
        out = postfilter(out, item_features, N=N_post)
    except AssertionError:
        out = rule(user_id, cold_users, model='als', N=N_post)
    return out

rank_results['postfiltered_reranked_als'] = rank_results['user_id'].apply(lambda user_id: rerank_post(user_id))

In [None]:
rank_results.head()

Unnamed: 0,user_id,actual,als,reranked_als,als_postfiltered,postfiltered_reranked_als
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 908318, 5577022, 8090541, 1137775, 10...","[9297615, 909497, 9655212, 5577022, 1102067]","[856942, 5577022, 1137775, 940947, 5569374]","[9297615, 9655212, 1102067, 856942, 872137]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1106523, 1029743, 1050851, 5569230, 946839, 9...","[1106523, 1029743, 1050851, 5569230, 946839]","[1106523, 1050851, 1020581, 1110244, 8090521]","[1106523, 1029743, 1050851, 5569230, 946839]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[965267, 930118, 878996, 866211, 831628, 85485...","[1082185, 6548453, 1024306, 878996, 1106523]","[965267, 878996, 831628, 1023720, 1026118]","[1082185, 1024306, 1106523, 1098248, 871611]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1122358, 1029743, 1112238, 1126899, 12810393,...","[1126899, 1106523, 1122358, 1029743, 866211]","[1122358, 1112238, 12810393, 878996, 839419]","[1126899, 1122358, 866211, 5569327, 1056509]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1029743, 5569230, 844179, 12301109, 5569471, ...","[1029743, 872137, 12302069, 1044078, 1106523]","[1029743, 844179, 5569471, 1034176, 981086]","[1029743, 12302069, 909714, 1075368, 916122]"


In [None]:
(rank_results.reranked_als.str.len() == 0).sum(), len(rank_results)  # 'als' for ver 0.5.2

(0, 2042)

In [123]:
print('-'*15, 'mean precision@5', '-'*15)

for pred in calc_precision_at_k(rank_results, 5):
    print(*pred, sep=': ')
    
print('-'*15, 'map@5', '-'*15)
for pred in calc_map_at_k(rank_results):
    print(*pred, sep=': ')

--------------- mean precision@5 ---------------
als: 15.122428991185114
reranked_als: 25.17140058765916
als_postfiltered: 12.88932419196866
postfiltered_reranked_als: 18.765915768854065
--------------- map@5 ---------------
als: 13.98302092401015
reranked_als: 45.728520513657635
als_postfiltered: 27.165020676896283
postfiltered_reranked_als: 43.72102513875286


In [None]:
def transform_data_for_eval(dataset, rec_col, user_col='user_id'):
    '''
    Func for transforming recommendations into kaggle evaluation format

    Parameters:
    dataset (pd.DataFrame): Dataset with 2 required columns:
        rec_col - column with recommendations should be iterable
        user_col - columns with user id

    rec_col (str): name of column in dataset with recommendations

    user_col (str): name of column in dataset with user id

    Returns:
    pd.DataFrame: DataFrame in suitable format

   '''
    eval_dataset = dataset[[user_col, rec_col]].copy()
    eval_dataset[rec_col] = eval_dataset[rec_col].apply(lambda x: ' '.join([str(i) for i in x]))
    eval_dataset.rename(columns={
        user_col: 'UserId',
        rec_col: 'Predicted'
    }, inplace=True)
    return eval_dataset

In [124]:
new = test[~test.user_id.isin(data_train_lvl_1.user_id.tolist())].user_id.tolist()
print(len(new))

2


In [125]:
# test['preds'] = test.user_id.apply(lambda x: rule(x, new, model='als', N=30))
test['preds'] = test.user_id.apply(lambda x: rerank(x, 5))

In [None]:
# test.loc[~test.user_id.isin(new), 'preds'] = test.loc[~test.user_id.isin(new), 'user_id'].apply(lambda user_id: rerank(user_id, N=5))

In [None]:
(test.preds.str.len() == 0).sum(), len(test)

(86, 1708)

In [126]:
# 'als' for ver 0.5.2
test.loc[test.preds.str.len() == 0, 'preds'] = test.loc[test.preds.str.len() == 0, 'user_id'].\
                                                    apply(lambda x: rule(x, new, model='als', N=5))

In [127]:
# reranked als
submission = transform_data_for_eval(test[['user_id', 'preds']], rec_col='preds', user_col='user_id')
submission.to_csv('submission.csv', index=False)

In [128]:
!kaggle competitions submit -c gb-recsys-project -f submission.csv -m 'test1'  # private: 0.16346

100% 71.2k/71.2k [00:03<00:00, 21.9kB/s]
Successfully submitted to GB RecSys Project

##### CatBoostRanker

In [129]:
X_['group'] = np.arange(len(X_))
group = X_[['user_id', 'group']].groupby('user_id', sort=False).max()['group'].values

In [130]:
grs = np.concatenate((np.array([group[0]+1]), group[1:] - group[:-1]))

In [131]:
default_parameters = {
    'iterations': 2000,
    'verbose': False,
    'random_seed': 42,
    'one_hot_max_size': 15,
    'early_stopping_rounds': 50,
    'allow_writing_files': False,
    'grow_policy': 'SymmetricTree',
    'boosting_type': 'Plain',
    'use_best_model': True,
    'depth': 8, 
    'learning_rate': 0.18905265981777902, 
    'l2_leaf_reg': 4.927368361978105, 
    'subsample': 0.9, 
    'colsample_bylevel': 0.8, 
}

In [132]:
train = Pool(
    data=X_,
    label=y,
    cat_features=cat_cols,
    group_id=np.repeat(grs, N_preds)
)

In [133]:
from copy import deepcopy

def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=train):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    
    if additional_params is not None:
        parameters.update(additional_params)
        
    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=False)
    
    return model

In [134]:
model = fit_model('RMSE', {'custom_metric': ['PrecisionAt:top=5', 'RecallAt:top=5', 'MAP:top=5']})

In [140]:
cat_ranks = model.predict(X_)

cat_ranker_prediction = X_[['user_id', 'item_id']]
cat_ranker_prediction["pred"] = cat_ranks

cat_ranker_prediction.drop_duplicates(inplace=True)
cat_ranker_prediction.sort_values(by="pred", inplace=True, ascending=False)

cat_ranker_prediction.head()

Unnamed: 0,user_id,item_id,pred
2978,895,7024847,1.350675
19828,1906,871633,1.245302
29822,1137,924667,1.199044
37011,2058,995965,1.198162
14580,589,5569230,1.190951


In [143]:
cat_ranked_res = cat_ranker_prediction.groupby('user_id')['item_id'].unique().reset_index() 
cat_ranked_res = cat_ranked_res.rename(columns={'item_id': 'cat_ranker'})  
cat_ranked_res.head()

Unnamed: 0,user_id,cat_ranker
0,1,"[9655212, 9297615, 9527558, 901062, 10149640, ..."
1,2,"[1106523, 916122, 866211, 1029743, 909714, 100..."
2,4,"[1052294, 891423, 883932, 987044, 1137010, 986..."
3,6,"[1082185, 6548453, 1024306, 878996, 1106523, 9..."
4,7,"[1126899, 9338009, 1106523, 1122358, 6034857, ..."


In [146]:
rank_results = rank_results.merge(cat_ranked_res, on='user_id', how='left')

rank_results.loc[rank_results.cat_ranker.isna(), 'cat_ranker'] = rank_results.loc[rank_results.cat_ranker.isna(), 'user_id'].\
apply(lambda x: rule(x, cold_users, model='als', N=5))

In [147]:
print('-'*15, 'mean precision@5', '-'*15)

for pred in calc_precision_at_k(rank_results, 5):
    print(*pred, sep=': ')
    
print('-'*15, 'map@5', '-'*15)
for pred in calc_map_at_k(rank_results):
    print(*pred, sep=': ')

--------------- mean precision@5 ---------------
als: 15.122428991185114
reranked_als: 25.17140058765916
als_postfiltered: 12.88932419196866
postfiltered_reranked_als: 18.765915768854065
cat_ranker: 24.84818805093046
--------------- map@5 ---------------
als: 13.98302092401015
reranked_als: 45.728520513657635
als_postfiltered: 27.165020676896283
postfiltered_reranked_als: 43.72102513875286
cat_ranker: 28.528143558567166


In [148]:
test = test.merge(cat_ranked_res, on='user_id', how='left')
test.loc[test.cat_ranker.isna(), 'cat_ranker'] = test.loc[test.cat_ranker.isna(), 'user_id'].\
apply(lambda x: rule(x, new, model='als', N=5))

In [149]:
submission = transform_data_for_eval(test[['user_id', 'cat_ranker']], rec_col='cat_ranker', user_col='user_id')
submission.to_csv('submission.csv', index=False)
!kaggle competitions submit -c gb-recsys-project -f submission.csv -m 'test1'  # private: 0.16790

100% 371k/371k [00:04<00:00, 94.8kB/s]
Successfully submitted to GB RecSys Project

##### LGBMRanker

In [None]:
from lightgbm import LGBMRanker

In [None]:
lgb_params = { 
    'objective':'lambdarank',
    'boosting_type': 'gbdt',
    'n_estimators': 2000,
    'categorical_column': cat_cols,
    'random_state': 42,
    'is_unbalance': True,
    'n_jobs': -1,
    'max_depth': 8, 
    'learning_rate': 0.08, 
    'num_leaves': 58, 
    'subsample': 0.8, 
    'colsample_bytree': 0.7, 
    'max_bin': 234, 
    'min_child_samples': 204, 
}

In [None]:
lgb_rn = LGBMRanker(**lgb_params, silent=False)

eval_h = lgb_rn.fit(X_, y, group=grs,
                    eval_set=[(X_, y)], eval_group=[grs], 
                    eval_metric=['ndcg'],
                    eval_at=[5, 10 ], 
                    early_stopping_rounds=50, 
                    verbose=0)

In [None]:
eval_h.evals_result_['training']['ndcg@5'][-1]

0.9871283646559261

In [None]:
rank_preds = lgb_rn.predict(X_)

In [None]:
ranker_prediction = X_[['user_id', 'item_id']]
ranker_prediction["pred"] = rank_preds
ranker_prediction.head()

Unnamed: 0,user_id,item_id,pred
0,2070,1029743,-0.403812
1,2070,899624,-0.007976
2,2070,893018,-1.950284
3,2070,1016800,-0.971458
4,2070,917033,-0.996201


In [None]:
ranker_prediction = ranker_prediction.drop_duplicates()

In [None]:
ranker_prediction.sort_values(by="pred", inplace=True, ascending=False)
ranker_prediction.head()

Unnamed: 0,user_id,item_id,pred
15725,817,1082185,3.009591
40252,1698,1082185,2.993699
2123,1567,1082185,2.963474
26729,2006,1082185,2.827805
20101,427,1082185,2.730265


In [None]:
ranked_res = ranker_prediction.groupby('user_id')['item_id'].unique().reset_index() 
ranked_res = ranked_res.rename(columns={'item_id': 'lgbm_ranker'})  
ranked_res.head()

Unnamed: 0,user_id,lgbm_ranker
0,1,"[9297615, 909497, 9655212, 5577022, 1082212, 8..."
1,2,"[866211, 1106523, 916122, 1029743, 1070820, 10..."
2,4,"[891423, 1052294, 883932, 1070820, 1082627, 10..."
3,6,"[1082185, 1024306, 6548453, 1106523, 878996, 1..."
4,7,"[1122358, 1126899, 1106523, 1029743, 866211, 9..."


In [None]:
rank_results = rank_results.merge(ranked_res, on='user_id', how='left')

In [None]:
rank_results.loc[rank_results.lgbm_ranker.isna(), 'lgbm_ranker'] = rank_results.loc[rank_results.lgbm_ranker.isna(), 'user_id'].\
apply(lambda x: rule(x, cold_users, model='als', N=5))

In [None]:
print('-'*15, 'mean precision@5', '-'*15)

for pred in calc_precision_at_k(rank_results, 5):
    print(*pred, sep=': ')
    
print('-'*15, 'map@5', '-'*15)
for pred in calc_map_at_k(rank_results):
    print(*pred, sep=': ')

--------------- mean precision@5 ---------------
als: 15.151811949069542
reranked_als: 24.534769833496576
als_postfiltered: 12.85014691478942
postfiltered_reranked_als: 18.73653281096964
cat_ranker: 24.29970617042116
lgbm_ranker: 24.593535749265428
--------------- map@5 ---------------
als: 13.714179588988742
reranked_als: 45.2827429535314
als_postfiltered: 26.336652519316576
postfiltered_reranked_als: 43.7777505713353
cat_ranker: 28.009859305979028
lgbm_ranker: 27.316224463837823


In [None]:
test = test.merge(ranked_res, on='user_id', how='left')
test.head()

Unnamed: 0,user_id,preds,cat_ranker,lgbm_ranker
0,1,"[9297615, 909497, 9655212, 5577022, 1102067]","[9297615, 909497, 5577022, 9655212, 872137, 11...","[9297615, 909497, 9655212, 5577022, 1082212, 8..."
1,2,"[916122, 1106523, 866211, 1029743, 1044078]","[1106523, 916122, 866211, 1070820, 1029743, 10...","[866211, 1106523, 916122, 1029743, 1070820, 10..."
2,3,"[1106523, 1029743, 1050851, 5569230, 946839]","[1106523, 1029743, 1050851, 5569230, 946839]",
3,6,"[1082185, 6548453, 1024306, 878996, 1106523]","[1082185, 6548453, 1024306, 878996, 1098248, 1...","[1082185, 1024306, 6548453, 1106523, 878996, 1..."
4,7,"[1126899, 1106523, 1122358, 1029743, 866211]","[1126899, 1106523, 1122358, 5569327, 8090537, ...","[1122358, 1126899, 1106523, 1029743, 866211, 9..."


In [None]:
test.loc[test.lgbm_ranker.isna(), 'lgbm_ranker'] = test.loc[test.lgbm_ranker.isna(), 'user_id'].\
apply(lambda x: rule(x, new, model='als', N=5))

In [None]:
submission = transform_data_for_eval(test[['user_id', 'lgbm_ranker']], rec_col='lgbm_ranker', user_col='user_id')
submission.to_csv('submission.csv', index=False)

In [None]:
!kaggle competitions submit -c gb-recsys-project -f submission.csv -m 'test1'  # private: 0.15855

100% 371k/371k [00:04<00:00, 93.8kB/s]
Successfully submitted to GB RecSys Project

##### LightFM
[kaggle beautiful kernel](https://www.kaggle.com/code/niyamatalmass/lightfm-hybrid-recommendation-system)