In [72]:
!pip install -qq catboost optuna implicit

In [73]:
# !pip install -qq implicit==0.4.8

In [74]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight
from implicit.nearest_neighbours import ItemItemRecommender

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRanker, Pool, EShapCalcType, EFeaturesSelectionAlgorithm

import optuna
from optuna.samplers import TPESampler

In [75]:
# from google.colab import drive
# drive.mount('/content/drive')
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c gb-recsys-project
!unzip gb-recsys-project.zip
!rm gb-recsys-project.zip

kaggle.json
Downloading gb-recsys-project.zip to /content
 75% 18.0M/23.9M [00:00<00:00, 181MB/s]
100% 23.9M/23.9M [00:00<00:00, 190MB/s]
Archive:  gb-recsys-project.zip
replace hh_demographic.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


KeyboardInterrupt: ignored

In [None]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
    
    return precision

def ap_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(recommended_list, bought_list)
    
    if sum(flags) == 0:
        return 0
    
    sum_ = 0
    for i in range(k):
        
        if flags[i]:
            p_k = precision_at_k(recommended_list, bought_list, k=i+1)
            sum_ += p_k
            
    result = sum_ / sum(flags)
    
    return result

def map_k(recommend_list, bought_list, k=5):
    return np.mean([ap_k(rec, bt, k) for rec, bt in zip(recommend_list, bought_list)])

def reduce_memory(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and str(col_type)[:4] != 'uint' and str(col_type) != 'category':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif str(col_type)[:4] != 'uint':
            df[col] = df[col].astype('category')
    return df

def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [None]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')
test = pd.read_csv('test_users.csv')

In [None]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
item_features.head(2)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [None]:
user_features.head(2)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [None]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [None]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1 = reduce_memory(data_train_lvl_1)
data_val_lvl_1 = reduce_memory(data_val_lvl_1)
data_train_lvl_2 = reduce_memory(data_train_lvl_2)
data_val_lvl_2 = reduce_memory(data_val_lvl_2)

print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')
print(f'Level 1 Valid - users: {data_val_lvl_1.user_id.nunique()}, items: {data_val_lvl_1.item_id.nunique()}')
print(f'Level 2 Train - users: {data_train_lvl_2.user_id.nunique()}, items: {data_train_lvl_2.item_id.nunique()}')
print(f'Level 2 Valid - users: {data_val_lvl_2.user_id.nunique()}, items: {data_val_lvl_2.item_id.nunique()}')

Level 1 Train - users: 2498, items: 83685
Level 1 Valid - users: 2154, items: 27649
Level 2 Train - users: 2154, items: 27649
Level 2 Valid - users: 2042, items: 24329


In [None]:
# ищем общих пользователей
# common_users = list(set(data_train_lvl_1.user_id.values)&(set(data_val_lvl_1.user_id.values))&set(data_val_lvl_2.user_id.values))

# оставляем общих пользователей
# data_train_lvl_1 = data_train_lvl_1[data_train_lvl_1.user_id.isin(common_users)]
# data_val_lvl_1 = data_val_lvl_1[data_val_lvl_1.user_id.isin(common_users)]
# data_train_lvl_2 = data_train_lvl_2[data_train_lvl_2.user_id.isin(common_users)]
# data_val_lvl_2 = data_val_lvl_2[data_val_lvl_2.user_id.isin(common_users)]

# print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')
# print(f'Level 1 Valid - users: {data_val_lvl_1.user_id.nunique()}, items: {data_val_lvl_1.item_id.nunique()}')
# print(f'Level 2 Train - users: {data_train_lvl_2.user_id.nunique()}, items: {data_train_lvl_2.item_id.nunique()}')
# print(f'Level 2 Valid - users: {data_val_lvl_2.user_id.nunique()}, items: {data_val_lvl_2.item_id.nunique()}')

In [None]:
result = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

In [None]:
# top 5 popular recommendations
popular_recs = popularity_recommendation(data_train_lvl_1, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.head(2)

Unnamed: 0,user_id,actual,popular_recommendation
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [None]:
def prefilter_items(data, take_n_popular=5000, item_features=None):
    # Уберем самые популярные товары (их и так купят)
    popularity = data.groupby('item_id')['user_id'].nunique().reset_index() / data['user_id'].nunique()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

    top_popular = popularity[popularity['share_unique_users'] > 0.2].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]

    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.02].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]

    # Уберем не интересные для рекоммендаций категории (department)
    if item_features is not None:
        department_size = pd.DataFrame(item_features.groupby('department')['item_id'].nunique().sort_values(ascending=False)).reset_index()
        department_size.columns = ['department', 'n_items']
        rare_departments = department_size[department_size['n_items'] < 150].department.tolist()
        items_in_rare_departments = item_features[item_features['department'].isin(rare_departments)].item_id.unique().tolist()

        data = data[~data['item_id'].isin(items_in_rare_departments)]

    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
    data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))
    data = data[data['price'] > 2]

    # Уберем слишком дорогие товарыs
    data = data[data['price'] < 50]

    # уберем товары, не продававшиеся более 12 месяцев
    # data = data[data['week_no'] >= data['week_no'].max() - 52]

    # Возьмем топ по популярности
    popularity = data.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
    top = popularity.sort_values('n_sold', ascending=False).head(take_n_popular).item_id.tolist()
    
    # Заведем фиктивный item_id (если юзер покупал товары из топ-N, то он "купил" такой товар)
    data.loc[~data['item_id'].isin(top), 'item_id'] = 999999
    
    return data

def postfilter(recommendations, item_info, N=5):
    """Пост-фильтрация товаров
    
    Input
    -----
    recommendations: list
        Ранжированный список item_id для рекомендаций
    item_info: pd.DataFrame
        Датафрейм с информацией о товарах
    """
    
    # Уникальность
    unique_recommendations = []
    [unique_recommendations.append(item) for item in recommendations if item not in unique_recommendations]
    
    # Разные категории
    categories_used = []
    final_recommendations = []
    CATEGORY_NAME = 'sub_commodity_desc'
    for item in unique_recommendations:
        category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
        
        if category not in categories_used:
            final_recommendations.append(item)
            
        unique_recommendations.remove(item)
        categories_used.append(category)
    
    n_rec = len(final_recommendations)
    if n_rec < N:
        final_recommendations.extend(unique_recommendations[:N - n_rec])
    else:
        final_recommendations = final_recommendations[:N]
    
    assert len(final_recommendations) == N, 'Количество рекомендаций != {}'.format(N)
    return final_recommendations

In [None]:
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, data_type='quantity', weighting=True, normalize=False, alpha=1.):
        self.data_type = data_type
        self.normalize = normalize
        self.alpha = alpha

        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data, data_type=self.data_type, normalize=self.normalize)
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix, K1=100, B=0.8)  # default: 100, 0.8

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data, data_type, normalize=False):
        """Готовит user-item матрицу"""
        if data_type == 'quantity':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', columns='item_id',
                                              values='quantity',
                                              aggfunc='count',
                                              fill_value=0
                                              )
        elif data_type == 'sales':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', columns='item_id',
                                              values='sales_value',
                                              aggfunc='sum',
                                              fill_value=0
                                              )
            if normalize:
                user_item_matrix = user_item_matrix / user_item_matrix.max()  # normalize
        elif data_type == 'quantity_sum':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', columns='item_id',
                                              values='quantity',
                                              aggfunc='sum',
                                              fill_value=0
                                              )
            if normalize:
                user_item_matrix = user_item_matrix / user_item_matrix.max()
        else:
            raise ValueError(f'Agg data type must be "quantity", "sales" or "quantity_sum", given: {data_type}')

        user_item_matrix = user_item_matrix.astype(float)

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=50, regularization=0.01, iterations=15, num_threads=4, alpha=1.):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads,
                                        random_state=42)
        model.fit(csr_matrix(user_item_matrix).tocsr()*alpha)

        return model

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[0][1]
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        user_id = self.userid_to_id[user]
        model_name = model.__class__.__name__
        
        if model_name == 'ItemItemRecommender':

            res = model.recommend(userid=user_id,
                                        user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=None,
                                        recalculate_user=True)[0].tolist()
            res = [self.id_to_itemid[rec] for rec in res]

        elif model_name == 'AlternatingLeastSquares':
            res = [self.id_to_itemid[rec] for rec in model.recommend(userid=user_id,
                                        user_items=csr_matrix(self.user_item_matrix).tocsr()[user_id,:],
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[999999]],
                                        recalculate_user=True)[0]]

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model, N=N)

    def get_own_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        if 999999 in res: res.remove(999999)

        res = self._extend_with_top_popular(res, N=N) 

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""

        res = []

        # Находим топ-N похожих пользователей
        similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
        similar_users = [rec for rec in similar_users[0]]
        similar_users = similar_users[1:]   # удалим юзера из запроса

        for user in similar_users:
            res.extend(self.get_own_recommendations(user, N=1))
        res = pd.Series(res).drop_duplicates().tolist()

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

In [None]:
data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

In [None]:
print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')

Level 1 Train - users: 2495, items: 5001


In [None]:
recommender = MainRecommender(data_train_lvl_1, data_type='quantity', normalize=False, weighting=True)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [None]:
list_out = result.loc[~result.user_id.isin(data_train_lvl_1.user_id), 'user_id'].tolist()
list_out

[296, 1813, 1984]

In [None]:
def rule(x, y, model, N=5):
    if x in y:
        return recommender.overall_top_purchases[:N]
    if model == 'als':
        return recommender.get_als_recommendations(x, N=N)
    elif model == 'own':
        return recommender.get_own_recommendations(x, N=N)
    elif model == 'similar_items':
        return recommender.get_similar_items_recommendation(x, N=N)
    elif model == 'similar_users':
        return recommender.get_similar_users_recommendation(x, N=N)

In [None]:
result['als'] = result['user_id'].apply(lambda x: rule(x, list_out, model='als', N=30))
result['own'] = result['user_id'].apply(lambda x: rule(x, list_out, model='own', N=30))
result['similar_items'] = result['user_id'].apply(lambda x: rule(x, list_out, model='similar_items', N=30))
result['similar_users'] = result['user_id'].apply(lambda x: rule(x, list_out, model='similar_users', N=30))

In [None]:
def calc_precision_at_k(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean() * 100

In [None]:
def calc_map_at_k(df_data):
    for col_name in df_data.columns[2:]:
        yield col_name, map_k(df_data[col_name].values.tolist(), df_data['actual'].values.tolist())*100

In [None]:
for pred in calc_precision_at_k(result, 5):
    print(*pred, sep=': ')

popular_recommendation: 17.595171773444754
als: 17.948003714020427
own: 1.91272051996286
similar_items: 7.901578458681524
similar_users: 11.411327762302694


In [None]:
for pred in calc_map_at_k(result):
    print(*pred, sep=': ')

popular_recommendation: 29.00753120808831
als: 15.044101600624348
own: 4.90921283400392
similar_items: 7.297676413565457
similar_users: 10.63401700048636


##### Second level model

In [None]:
users_lvl_1 = data_train_lvl_1.user_id.unique()
users_lvl_2 = data_val_lvl_1.user_id.unique()
users_lvl_3 = data_val_lvl_2.user_id.unique()

new_users_lvl_2 = list(set(users_lvl_2) - set(users_lvl_1))
new_users_lvl_3 = list(set(users_lvl_3) - (set(users_lvl_1) | set(users_lvl_2)))

add_to_lvl_2 = list(set(users_lvl_3) - (set(users_lvl_2)))

new_users_lvl_2, new_users_lvl_3, len(add_to_lvl_2)

([296, 1813, 1984], [2259], 126)

In [None]:
train_val_lvl1 = pd.concat([data_train_lvl_1, data_val_lvl_1])
train_val_lvl1 = train_val_lvl1.merge(item_features[['item_id', 'department']], on='item_id', how='inner')

In [None]:
condition = ((data_train_lvl_2.user_id.isin(data_train_lvl_1.user_id.tolist())) | (pd.Series(data=add_to_lvl_2).isin(data_train_lvl_1.user_id.tolist())))
val_list_out = data_train_lvl_2.loc[~condition, 'user_id'].unique().tolist()
val_list_out

[1813, 1984, 296]

In [None]:
df_candidates = pd.DataFrame(data=data_train_lvl_2['user_id'].unique().tolist() + add_to_lvl_2)
df_candidates.columns = ['user_id']
df_candidates['candidates'] = df_candidates['user_id'].apply(lambda x: rule(x, new_users_lvl_2+new_users_lvl_3, model='als', N=30))
df_candidates.head()

Unnamed: 0,user_id,candidates
0,2070,"[1029743, 1070820, 1016800, 899624, 1044078, 9..."
1,2021,"[844179, 999270, 1044078, 883932, 1013928, 896..."
2,1753,"[1106523, 1029743, 983584, 5569230, 1004906, 8..."
3,2120,"[1029743, 1106523, 986912, 5569230, 878996, 86..."
4,1346,"[5569374, 5574377, 1127831, 878996, 866211, 55..."


In [None]:
df_candidates = pd.DataFrame({'user_id': df_candidates['user_id'].values.repeat(30),
                              'item_id': np.concatenate(df_candidates['candidates'].values)})

df_train = data_train_lvl_2[['user_id', 'item_id']].copy()
df_train['target'] = 1

df_train = df_candidates.merge(df_train, on=['user_id', 'item_id'], how='left')
df_train.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)

df_train['target'].fillna(0, inplace=True)
df_train.head()

Unnamed: 0,user_id,item_id,target
0,2070,1029743,0.0
1,2070,1070820,0.0
2,2070,1016800,0.0
3,2070,899624,1.0
4,2070,1044078,0.0


In [None]:
df_train['target'].value_counts()

0.0    61493
1.0     6907
Name: target, dtype: int64

In [None]:
df_train = df_train.merge(item_features, on='item_id', how='left')
df_train = df_train.merge(user_features, on='user_id', how='left')
df_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1070820,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,1016800,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,899624,1.0,69,PRODUCE,Private,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,1044078,0.0,2845,MEAT,National,BEEF,LEAN,,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [None]:
data_department = data_train_lvl_2.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0,GROCERY
1,2070,40630625006,594,1019940,1,1.0,311,-0.29,201,86,0.0,0.0,GROCERY


In [None]:
df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('sales_value').sum().rename('total_item_sales_value'), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('quantity').sum().rename('total_quantity_value'), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('user_id').count().rename('item_freq'), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('user_id').count().rename('user_freq'), how='left',on='user_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('sales_value').sum().rename('total_user_sales_value'), how='left',on='user_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_week')/train_val_lvl1.week_no.nunique(), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_week')/train_val_lvl1.week_no.nunique(), how='left',on='user_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='user_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='item_id').agg('user_id').count().rename('item_freq_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='item_id')

df_train = df_train.merge(train_val_lvl1.groupby(by='user_id').agg('user_id').count().rename('user_freq_per_basket')/train_val_lvl1.basket_id.nunique(), how='left',on='user_id')

In [None]:
# Средний чек
users_sales = data_train_lvl_2.groupby('user_id')['sales_value'].mean().reset_index()
users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
df_train = df_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')

# Количество уникальных категорий покупателя
users_departments = data_department.groupby('user_id')['department'].nunique().reset_index()
users_departments.rename(columns = {'department':'users_unique_departments'}, inplace=True)
df_train = df_train.merge(users_departments, on='user_id', how='left')

# Среднее время покупки
bought_time = data_train_lvl_2.groupby('user_id')['trans_time'].mean().reset_index()
bought_time.rename(columns = {'trans_time':'mean_trans_time_by_user'}, inplace=True)
df_train = df_train.merge(bought_time, on='user_id', how='left')

# Средний чек корзины 
baskets_sales_value = data_train_lvl_2.groupby(['user_id','basket_id'])['sales_value'].mean().reset_index()
mean_basket_sales_value = baskets_sales_value.groupby('user_id')['sales_value'].mean().reset_index()
mean_basket_sales_value.rename(columns = {'sales_value':'mean_sales_value_per_basket'}, inplace=True)
df_train = df_train.merge(mean_basket_sales_value, on='user_id', how='left')

# Количество купленных уникальных товаров 
unique_bought_items = data_train_lvl_2.groupby('user_id')['item_id'].nunique().reset_index()
unique_bought_items.rename(columns = {'item_id':'unique_bought_items'}, inplace=True)
df_train = df_train.merge(unique_bought_items, on='user_id', how='left')

# Среднее количество уникальных категорий в корзине
users_baskets = data_department.groupby(['user_id', 'basket_id'])['department'].nunique().reset_index()
users_baskets = users_baskets.groupby('user_id')['department'].mean().reset_index()
users_baskets.rename(columns={'department': 'avg_basket_department'}, inplace=True)
df_train = df_train.merge(users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left')

# Средняя сумма покупки в категории
department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
department_sales.rename(columns={'sales_value': 'mean_sales_value_category'}, inplace=True)
df_train = df_train.merge(department_sales, on='department', how='left')

# Средная цена купленных товаров пользователем
users_sales = data_train_lvl_2.groupby('user_id')[['sales_value', 'quantity']].sum().reset_index()
users_sales['avg_price'] = users_sales['sales_value'] / users_sales['quantity']
df_train = df_train.merge(users_sales[['user_id', 'avg_price']], on='user_id', how='left')

In [None]:
# Среднее количество покупок товара в неделю
num_purchase_week = data_train_lvl_2.groupby('item_id').agg({'week_no': 'nunique', 'quantity': 'sum'}).reset_index()
num_purchase_week['avg_num_purchases_week'] = num_purchase_week['quantity'] / num_purchase_week['week_no']
df_train = df_train.merge(num_purchase_week[['item_id', 'avg_num_purchases_week']], on='item_id', how='left')
df_train['avg_num_purchases_week'].fillna(0, inplace=True)

# Цена товара
items_sales = data_department.groupby('item_id')[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['price'].fillna(0, inplace=True)
df_train = df_train.merge(items_sales[['item_id', 'price']], on='item_id', how='left')

# Среднее время покупки товара
bought_item_time = data_train_lvl_2.groupby('item_id')['trans_time'].mean().reset_index()
bought_item_time.rename(columns = {'trans_time':'mean_trans_time_by_item'}, inplace=True)
df_train = df_train.merge(bought_item_time, on = 'item_id', how = 'left')

# Количество магазинов, где есть товар
items_stores = data_department.groupby('item_id')['store_id'].sum().reset_index()
items_stores.rename(columns={'store_id': 'n_stores_with_item'}, inplace=True)
items_stores['n_stores_with_item'].fillna(0, inplace = True)
df_train = df_train.merge(items_stores, on='item_id', how='left')

# Количество уникальных магазинов, где есть товар
items_stores = data_department.groupby('item_id')['store_id'].nunique().reset_index()
items_stores.rename(columns={'store_id': 'n_unique_stores_with_item'}, inplace=True)
items_stores['n_unique_stores_with_item'].fillna(0, inplace = True)
df_train = df_train.merge(items_stores, on='item_id', how='left')

In [None]:
# Построим признак, отражающий средний интервал между покупками пользователя.
users_days = train_val_lvl1.groupby('user_id')['day'].unique().reset_index()
users_days['day'] = users_days['day'].apply(lambda x: sorted(x))
users_days.head()

Unnamed: 0,user_id,day
0,1,"[51, 67, 88, 94, 101, 108, 111, 128, 137, 146,..."
1,2,"[103, 112, 117, 118, 139, 140, 154, 160, 161, ..."
2,3,"[113, 121, 136, 141, 142, 163, 166, 169, 173, ..."
3,4,"[104, 140, 154, 181, 190, 199, 216, 231, 244, ..."
4,5,"[88, 97, 111, 154, 168, 181, 191, 192, 223, 25..."


In [None]:
def avg_ndays(days):
    diff = 0
    if len(days) > 1:
        for i in range(len(days) - 1):
            diff += days[i+1] - days[i]
        return diff / (len(days) - 1)
    else:
        return 0
    
users_days['avg_interval'] = users_days['day'].apply(avg_ndays)

df_train = df_train.merge(users_days[['user_id', 'avg_interval']], on='user_id', how='left')
df_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,unique_bought_items,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,mean_trans_time_by_item,n_stores_with_item,n_unique_stores_with_item,avg_interval
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,156.0,1.790698,2.531705,0.035173,211.833333,2.336216,1551.780669,4298090.0,110.0,4.734513
1,2070,1070820,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,156.0,1.790698,2.531705,0.035173,68.333333,2.273415,1512.691877,1494402.0,84.0,4.734513


In [None]:
users_items = pd.concat([data_train_lvl_2, data_val_lvl_2]).groupby('user_id')['item_id'].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])
users_items.head()

Unnamed: 0,user_id,item_id
0,1,"[9677939, 10356149, 13417048, 15741823, 15830875]"
1,2,"[7407562, 10149597, 13776981, 15572067, 17215077]"
2,3,"[7167218, 7167249, 9526563, 9526886, 13842214]"
3,4,"[963365, 1038692, 1133312, 5570830, 7431408]"
4,6,"[1020683, 1047193, 1100691, 1123407, 10311503]"


In [None]:
def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_train['Last5sales'] = df_train[['user_id', 'item_id']].apply(code_last_sales, axis=1)
df_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,mean_trans_time_by_item,n_stores_with_item,n_unique_stores_with_item,avg_interval,Last5sales
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,1.790698,2.531705,0.035173,211.833333,2.336216,1551.780669,4298090.0,110.0,4.734513,0
1,2070,1070820,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,1.790698,2.531705,0.035173,68.333333,2.273415,1512.691877,1494402.0,84.0,4.734513,0


In [None]:
for column in df_train.columns:
    if df_train[column].dtype == 'object':
        df_train[column].fillna('unk', inplace=True)
    elif df_train[column].dtype == 'float64':
        df_train[column].fillna(0.0, inplace=True)
df_train.dropna(inplace=True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64620 entries, 0 to 64619
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   user_id                      64620 non-null  int64  
 1   item_id                      64620 non-null  int64  
 2   target                       64620 non-null  float64
 3   manufacturer                 64620 non-null  int64  
 4   department                   64620 non-null  object 
 5   brand                        64620 non-null  object 
 6   commodity_desc               64620 non-null  object 
 7   sub_commodity_desc           64620 non-null  object 
 8   curr_size_of_product         64620 non-null  object 
 9   age_desc                     64620 non-null  object 
 10  marital_status_code          64620 non-null  object 
 11  income_desc                  64620 non-null  object 
 12  homeowner_desc               64620 non-null  object 
 13  hh_comp_desc    

In [None]:
df_train = reduce_memory(df_train)

In [None]:
X, y = df_train.drop('target', axis=1), df_train['target']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
cat_cols = X.select_dtypes(include=['category']).columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

In [None]:
X.sample(5)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,mean_trans_time_by_item,n_stores_with_item,n_unique_stores_with_item,avg_interval,Last5sales
5899,1367,8090536,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,35-44,U,...,1.829268,2.531705,0.011935,8.333333,3.4888,1619.945923,327387.0,27.0,3.573099,0
17944,1078,1106523,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,unk,unk,...,2.363636,2.531705,0.006357,141.666672,2.355435,1554.849487,2754167.0,107.0,4.07874,0
63179,814,1138443,3739,DELI,National,DELI MEATS,MEAT: TURKEY BULK,,unk,unk,...,3.0,4.527463,1.716,14.5,4.341149,1588.310303,255345.0,44.0,51.599998,0
19648,1421,12301109,4008,MEAT,National,BEEF,PRIMAL,,35-44,A,...,2.333333,6.485481,0.00617,18.0,3.575222,1588.297241,440797.0,41.0,4.952,0
24891,667,1127758,69,SEAFOOD-PCKGD,Private,SEAFOOD - FROZEN,SEAFOOD-FRZ-RAW FILLETS,2 LB,unk,unk,...,2.833333,5.754523,2.144565,6.5,3.959231,1675.75,76762.0,24.0,15.727273,0


In [None]:
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
def select_features(algorithm: EFeaturesSelectionAlgorithm, steps: int = 1):
    print('Algorithm:', algorithm)
    model = CatBoostClassifier(iterations=200, auto_class_weights='Balanced', random_seed=42)
    summary = model.select_features(
        train_pool,
        eval_set=valid_pool,
        features_for_select=list(range(train_pool.num_col())),
        num_features_to_select=30,
        steps=steps,
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=False,
        logging_level='Silent',
        plot=True
    )
    print('Selected features:', summary['selected_features_names'])
    return summary

feature_set = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, steps=11)

Algorithm: EFeaturesSelectionAlgorithm.RecursiveByShapValues


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Selected features: ['user_id', 'department', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'household_size_desc', 'kid_category_desc', 'total_item_sales_value', 'total_quantity_value', 'item_freq', 'user_freq', 'total_user_sales_value', 'item_quantity_per_week', 'user_quantity_per_week', 'item_quantity_per_basket', 'item_freq_per_basket', 'user_freq_per_basket', 'avg_cheque', 'users_unique_departments', 'mean_trans_time_by_user', 'unique_bought_items', 'avg_basket_department', 'mean_sales_value_category', 'avg_price', 'avg_num_purchases_week', 'price', 'mean_trans_time_by_item', 'n_stores_with_item', 'n_unique_stores_with_item', 'Last5sales']


In [None]:
X_train.drop(['department', 'brand', 'mean_sales_value_per_basket', 'hh_comp_desc', 'homeowner_desc', 'curr_size_of_product'], axis=1, inplace=True)
X_valid.drop(['department', 'brand', 'mean_sales_value_per_basket', 'hh_comp_desc', 'homeowner_desc', 'curr_size_of_product'], axis=1, inplace=True)

cat_cols = X_train.select_dtypes('category').columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

In [None]:
params_model = {
                'eval_metric': 'AUC', 
                'iterations': 2000,
                'silent': True,
                'one_hot_max_size': 15,
                'use_best_model': True,
                'early_stopping_rounds': 50,
                'allow_writing_files': False,
                'random_seed': 42,
                }

In [None]:
def objective(trial):
    param_trials = {
                    'depth': trial.suggest_int('depth', 3, 15),
                    'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.5),
                    'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1., 6.),
                    'bagging_temperature': trial.suggest_float('bagging_temperature', 1., 5.),
                    'subsample': trial.suggest_float('subsample', 0.3, 1.),
                    'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.),
                    'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
                    'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
                    'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
                    'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 255),
                    'max_leaves': trial.suggest_int('max_leaves', 5, 64),
                    }
    param_trials.update(params_model)
    if param_trials['grow_policy'] == 'SymmetricTree':
        param_trials['min_data_in_leaf'] = None
        param_trials['max_leaves'] = None
    if param_trials['grow_policy'] == 'Depthwise':
        param_trials['max_leaves'] = None
    if param_trials['grow_policy'] != 'SymmetricTree':
        param_trials['boosting_type'] = 'Plain'
    
    opt_model = CatBoostClassifier(**param_trials)
    opt_model.fit(train_pool, eval_set=valid_pool)
    
    y_pred = opt_model.predict_proba(X_valid)[:,1]
    score = roc_auc_score(y_valid, y_pred)
    
    return score

In [None]:
# optuna.logging.set_verbosity(optuna.logging.INFO)
# study = optuna.create_study(sampler=TPESampler(seed=42), direction="maximize")
# study.optimize(objective, n_trials=200, timeout=6000)

# print(f'Number of completed trials: {len(study.trials)}')
# print('Best trial')
# trial = study.best_trial
# print(f'Best score: {trial.value}')
# print('Best params')
# for key, value in trial.params.items():
#     print(f'{key}: {value}')

In [None]:
params_model = {
                'eval_metric': 'AUC', 
                'iterations': 2000,
                'silent': True,
                'one_hot_max_size': 15,
                'use_best_model': True,
                'early_stopping_rounds': 50,
                'allow_writing_files': False,
                'random_seed': 42,
                'depth': 14, 
                'learning_rate': 0.07664266658077104, 
                'l2_leaf_reg': 4.948252791068095, 
                'bagging_temperature': 2.6441394124695616, 
                'subsample': 0.8688639070760653, 
                'colsample_bylevel': 0.45003597863710665, 
                'boosting_type': 'Plain', 
                'auto_class_weights': 'SqrtBalanced', 
                'grow_policy': 'Lossguide', 
                'min_data_in_leaf': 244, 
                'max_leaves': 24
                }

In [None]:
cat = CatBoostClassifier(**params_model).fit(train_pool, eval_set=valid_pool)

In [None]:
rank_results = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
rank_results.columns = ['user_id', 'actual']
cold_users = data_val_lvl_2.loc[~data_val_lvl_2.user_id.isin(data_train_lvl_1.user_id.tolist()), 'user_id'].unique().tolist()

rank_results['als'] = rank_results['user_id'].apply(lambda x: rule(x, cold_users, model='als', N=30))

In [None]:
df_predict = df_train.copy()
df_predict['proba_item_purchase'] = cat.predict_proba(X.drop(['department', 'brand', 
                                                              'mean_sales_value_per_basket', 'hh_comp_desc', 
                                                              'homeowner_desc', 'curr_size_of_product'], axis=1))[:, 1]

In [None]:
def rerank(user_id, N, out=cold_users):
    if user_id in df_predict.user_id:
        return df_predict[df_predict['user_id']==user_id].sort_values('proba_item_purchase', ascending=False).head(N).item_id.tolist()
    else:
        return rule(user_id, cold_users, model='als', N=5)

In [None]:
rank_results['reranked_als'] = rank_results['user_id'].apply(lambda user_id: rerank(user_id, N=5))

In [None]:
rank_results['als_postfiltered'] = rank_results.user_id.apply(lambda x: postfilter(rule(x, cold_users, model='als', N=20), 
                                                                                   item_features, N=5))

In [None]:
rank_results.loc[rank_results['reranked_als'].str.len() == 0, 'reranked_als'] =\
 rank_results.loc[rank_results['reranked_als'].str.len() == 0, 'user_id'].apply(lambda x: rule(x, cold_users, model='als', N=5))

In [76]:
def rerank_post(user_id, N_rank=20, N_post=5):
    try:
        out = rerank(user_id, N=N_rank)
        out = postfilter(out, item_features, N=N_post)
    except AssertionError:
        out = rule(user_id, cold_users, model='als', N=N_post)
    return out

rank_results['postfiltered_reranked_als'] = rank_results['user_id'].apply(lambda user_id: rerank_post(user_id))

In [77]:
rank_results.head()

Unnamed: 0,user_id,actual,als,reranked_als,als_postfiltered,postfiltered_reranked_als
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1082212, 1062002, 1029743, 940947, 8090541, 8...","[9655212, 9297615, 1124029, 9526630, 865026]","[1082212, 1029743, 8090541, 5577022, 986912]","[9655212, 1124029, 865026, 885290, 1056509]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[916122, 1029743, 907631, 1137346, 1044078, 98...","[916122, 1029743, 907631, 1137346, 1044078]","[916122, 907631, 1044078, 1020581, 998206]","[916122, 1029743, 907631, 1137346, 1044078]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[965267, 878996, 854852, 835098, 866211, 93011...","[1082185, 1029743, 1070820, 899624, 1026118]","[965267, 854852, 866211, 1105488, 1024306]","[1082185, 1070820, 1026118, 919681, 986912]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1112238, 916122, 866211, 1122358, 1126899, 10...","[1106523, 1029743, 1126899, 1124352, 1044078]","[1112238, 866211, 1126899, 12810393, 839419]","[1106523, 1044078, 860703, 844179, 1122358]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[844179, 916122, 1029743, 12301109, 999858, 55...","[1106523, 1029743, 1004906, 1126899, 1044078]","[844179, 1029743, 1004906, 5569471, 1027168]","[1106523, 1004906, 1044078, 844179, 910032]"


In [78]:
(rank_results.reranked_als.str.len() == 0).sum(), len(rank_results)

(0, 2042)

In [79]:
print('-'*15, 'mean precision@5', '-'*15)

for pred in calc_precision_at_k(rank_results, 5):
    print(*pred, sep=': ')
    
print('-'*15, 'map@5', '-'*15)
for pred in calc_map_at_k(rank_results):
    print(*pred, sep=': ')

--------------- mean precision@5 ---------------
als: 15.239960822722823
reranked_als: 21.390793339862878
als_postfiltered: 12.820763956904994
postfiltered_reranked_als: 17.659157688540645
--------------- map@5 ---------------
als: 13.643933674969425
reranked_als: 45.77279899880292
als_postfiltered: 26.544373707693985
postfiltered_reranked_als: 43.339794863423656


In [80]:
def transform_data_for_eval(dataset, rec_col, user_col='user_id'):
    '''
    Func for transforming recommendations into kaggle evaluation format

    Parameters:
    dataset (pd.DataFrame): Dataset with 2 required columns:
        rec_col - column with recommendations should be iterable
        user_col - columns with user id

    rec_col (str): name of column in dataset with recommendations

    user_col (str): name of column in dataset with user id

    Returns:
    pd.DataFrame: DataFrame in suitable format

   '''
    eval_dataset = dataset[[user_col, rec_col]].copy()
    eval_dataset[rec_col] = eval_dataset[rec_col].apply(lambda x: ' '.join([str(i) for i in x]))
    eval_dataset.rename(columns={
        user_col: 'UserId',
        rec_col: 'Predicted'
    }, inplace=True)
    return eval_dataset

In [81]:
new = test[~test.user_id.isin(data_train_lvl_1.user_id.tolist())].user_id.tolist()
print(len(new))

2


In [82]:
# test['preds'] = test.user_id.apply(lambda x: rule(x, new, model='als', N=30))
test['preds'] = test.user_id.apply(lambda x: rerank(x, 5))

In [83]:
# test.loc[~test.user_id.isin(new), 'preds'] = test.loc[~test.user_id.isin(new), 'user_id'].apply(lambda user_id: rerank(user_id, N=5))

In [84]:
(test.preds.str.len() == 0).sum(), len(test)

(86, 1708)

In [85]:
test.loc[test.preds.str.len() == 0, 'preds'] = test.loc[test.preds.str.len() == 0, 'user_id'].apply(lambda x: rule(x, new, model='als', N=5))

In [86]:
# reranked als
submission = transform_data_for_eval(test[['user_id', 'preds']], rec_col='preds', user_col='user_id')
submission.to_csv('submission.csv', index=False)

In [87]:
!kaggle competitions submit -c gb-recsys-project -f submission.csv -m 'test1'

100% 71.5k/71.5k [00:02<00:00, 29.4kB/s]
Successfully submitted to GB RecSys Project