In [1151]:
# !pip install -qq implicit==0.6.0

import implicit
implicit.__version__

'0.6.0'

In [1152]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cdist
import tqdm
from itertools import combinations

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.bpr import BayesianPersonalizedRanking

from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMClassifier, LGBMRanker

import optuna
from optuna.samplers import TPESampler

In [1153]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
    
    return precision

def ap_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(recommended_list, bought_list)
    
    if sum(flags) == 0:
        return 0
    
    sum_ = 0
    for i in range(k):
        
        if flags[i]:
            p_k = precision_at_k(recommended_list, bought_list, k=i+1)
            sum_ += p_k
            
    result = sum_ / sum(flags)
    
    return result

def map_k(recommend_list, bought_list, k=5):
    return np.mean([ap_k(rec, bt, k) for rec, bt in zip(recommend_list, bought_list)])

def recall_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)
    
    return recall

def calc_precision_at_k(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean()

def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean()

def calc_map_at_k(df_data):
    for col_name in df_data.columns[2:]:
        yield col_name, map_k(df_data[col_name].values.tolist(), df_data['actual'].values.tolist())

def reduce_memory(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and str(col_type)[:4] != 'uint' and str(col_type) != 'category':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif str(col_type)[:4] != 'uint':
            df[col] = df[col].astype('category')
    return df

def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

def transform_data_for_eval(dataset, rec_col, user_col='user_id'):
    eval_dataset = dataset[[user_col, rec_col]].copy()
    eval_dataset[rec_col] = eval_dataset[rec_col].apply(lambda x: ' '.join([str(i) for i in x]))
    eval_dataset.rename(columns={
        user_col: 'UserId',
        rec_col: 'Predicted'
    }, inplace=True)
    return eval_dataset

In [1154]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')
test = pd.read_csv('test_user.csv')

N_preds = 10

In [1155]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [1156]:
val_lvl_1_size_weeks = 12
val_lvl_2_size_weeks = 6

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - val_lvl_1_size_weeks]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - val_lvl_1_size_weeks) &
                      (data['week_no'] < data['week_no'].max() - val_lvl_2_size_weeks)]

data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1 = reduce_memory(data_train_lvl_1)
data_val_lvl_1 = reduce_memory(data_val_lvl_1)
data_train_lvl_2 = reduce_memory(data_train_lvl_2)
data_val_lvl_2 = reduce_memory(data_val_lvl_2)

print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')
print(f'Level 1 Valid - users: {data_val_lvl_1.user_id.nunique()}, items: {data_val_lvl_1.item_id.nunique()}')
print(f'Level 2 Train - users: {data_train_lvl_2.user_id.nunique()}, items: {data_train_lvl_2.item_id.nunique()}')
print(f'Level 2 Valid - users: {data_val_lvl_2.user_id.nunique()}, items: {data_val_lvl_2.item_id.nunique()}')

Level 1 Train - users: 2498, items: 82059
Level 1 Valid - users: 2169, items: 27912
Level 2 Train - users: 2169, items: 27912
Level 2 Valid - users: 2197, items: 30040


In [1157]:
result = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

popular_recs = popularity_recommendation(data_train_lvl_1, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.head(2)

Unnamed: 0,user_id,actual,popular_recommendation
0,1,"[836548, 856942, 877391, 933913, 948420, 10368...","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,2,"[831125, 838136, 852864, 899624, 908649, 91504...","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [1158]:
def prefilter_items(data, take_n_popular=5000, item_features=None, n_weeks=95):
    # Уберем самые популярные товары (их и так купят)
    popularity = data.groupby('item_id')['user_id'].nunique().reset_index() / data['user_id'].nunique()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

    top_popular = popularity[popularity['share_unique_users'] > 0.2].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]

    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.02].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]

    # Уберем не интересные для рекоммендаций категории (department)
    if item_features is not None:
        department_size = pd.DataFrame(item_features.groupby('department')['item_id'].nunique().sort_values(ascending=False)).reset_index()
        department_size.columns = ['department', 'n_items']
        rare_departments = department_size[department_size['n_items'] < 150].department.tolist()
        items_in_rare_departments = item_features[item_features['department'].isin(rare_departments)].item_id.unique().tolist()

        data = data[~data['item_id'].isin(items_in_rare_departments)]

    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
    data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))
    data = data[data['price'] > 2]

    # Уберем слишком дорогие товарыs
    data = data[data['price'] < 50]

    # уберем товары, не продававшиеся более n_week недель
    data = data[data['week_no'] >= data['week_no'].max() - n_weeks]

    # Возьмем топ по популярности
    popularity = data.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
    top = popularity.sort_values('n_sold', ascending=False).head(take_n_popular).item_id.tolist()
    
    # Заведем фиктивный item_id (если юзер покупал товары из топ-N, то он "купил" такой товар)
    data.loc[~data['item_id'].isin(top), 'item_id'] = 999999
    
    return data

def postfilter(recommendations, item_info, N=5):
    """Пост-фильтрация товаров
    
    Input
    -----
    recommendations: list
        Ранжированный список item_id для рекомендаций
    item_info: pd.DataFrame
        Датафрейм с информацией о товарах
    """
    
    # Уникальность
    unique_recommendations = []
    [unique_recommendations.append(item) for item in recommendations if item not in unique_recommendations]
    
    # Разные категории
    categories_used = []
    final_recommendations = []
    CATEGORY_NAME = 'sub_commodity_desc'
    for item in unique_recommendations:
        category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
        
        if category not in categories_used:
            final_recommendations.append(item)
            
        unique_recommendations.remove(item)
        categories_used.append(category)
    
    n_rec = len(final_recommendations)
    if n_rec < N:
        final_recommendations.extend(unique_recommendations[:N - n_rec])
    else:
        final_recommendations = final_recommendations[:N]
    
    assert len(final_recommendations) == N, 'Количество рекомендаций != {}'.format(N)
    return final_recommendations

def rule(x, y, model, N=5):
    if x in y:
        return recommender.overall_top_purchases[:N]
    if model == 'als':
        return recommender.get_als_recommendations(x, N=N)
    elif model == 'own':
        return recommender.get_own_recommendations(x, N=N)
    elif model == 'similar_items':
        return recommender.get_similar_items_recommendation(x, N=N)
    elif model == 'similar_users':
        return recommender.get_similar_users_recommendation(x, N=N)
    elif model == 'bayesian':
        return recommender.get_bayesian_recommendations(x, N=N)
    
def get_self_top_purchases(user_id, N=5):
    return recommender.top_purchases[recommender.top_purchases.user_id == user_id].item_id.head(N).tolist()

In [1159]:
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, data_type='quantity', weighting=True, normalize=False, k1=150, 
                 create_mtx=True, user_item_matrix=None):
        self.data_type = data_type
        self.normalize = normalize
        self.k1 = k1
        if user_item_matrix is not None:
            create_mtx = False

        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()
        
        if create_mtx:
            self.user_item_matrix = self._prepare_matrix(data, data_type=self.data_type, normalize=self.normalize)
        elif user_item_matrix is not None:
            self.user_item_matrix = user_item_matrix.copy()
        else:
            raise ValueError(f'Set create_mtx to "True" or pass user_item_matrix to "user_item_matrix" attr.')
        
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix, K1=self.k1, B=0.8)  # default: 100, 0.8 ver. 0.5.2

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
        self.ranker = self.fit_ranker(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data, data_type, normalize=False):
        """Готовит user-item матрицу"""
        if data_type == 'quantity':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', 
                                              columns='item_id',
                                              values='quantity',
                                              aggfunc='count',
                                              fill_value=0
                                              )
        elif data_type == 'sales':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id', 
                                              columns='item_id',
                                              values='sales_value',
                                              aggfunc='sum',
                                              fill_value=0
                                              )
            if normalize:
                user_item_matrix = user_item_matrix / user_item_matrix.max()  # normalize
        elif data_type == 'quantity_sum':
            user_item_matrix = pd.pivot_table(data,
                                              index='user_id',
                                              columns='item_id',
                                              values='quantity',
                                              aggfunc='sum',
                                              fill_value=0
                                              )
            if normalize:
                user_item_matrix = user_item_matrix / user_item_matrix.max()
        elif data_type == 'weighted_sum':
            if 'weighted' not in data.columns:
                return self._prepare_matrix(data, data_type='quantity')
            
            user_item_matrix = pd.pivot_table(data, 
                                              index='user_id', 
                                              columns='item_id', 
                                              values='weighted',
                                              aggfunc='sum',
                                              fill_value=0
                                             )
        else:
            raise ValueError(f'Agg data type must be "quantity", "sales" or "quantity_sum", given: {data_type}')

        user_item_matrix = user_item_matrix.astype(float)

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4): 
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads,
                                        use_gpu=False,
                                        random_state=42)
        model.fit(csr_matrix(user_item_matrix).tocsr())

        return model

    @staticmethod
    def fit_ranker(user_item_matrix, factors=50, learning_rate=0.03, regularization=0.01, iterations=200):
        ranker = BayesianPersonalizedRanking(factors=factors, 
                                     learning_rate=learning_rate, 
                                     regularization=regularization, 
                                     iterations=iterations, 
                                     num_threads=4, 
                                     random_state=42)
        ranker.fit(csr_matrix(user_item_matrix).tocsr())

        return ranker

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)
        top_rec = recs[0][1] 
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, user_id=None, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""
        
        if user_id is not None and len(recommendations) < N:
            recommendations.extend(self.top_purchases[self.top_purchases.user_id == user_id].item_id.head(N).tolist())
            recommendations = pd.Series(data=recommendations).drop_duplicates().tolist()
            if len(recommendations) > N:
                recommendations = recommendations[:N]

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        user_id = self.userid_to_id[user]
                
        model_name = model.__class__.__name__
        
        if model_name == 'ItemItemRecommender':

            res = model.recommend(userid=user_id,
                                  user_items=csr_matrix(self.user_item_matrix).tocsr()[user_id, :],
                                  N=N-1,
                                  filter_already_liked_items=False,
                                  filter_items=[self.itemid_to_id[999999]],
                                  recalculate_user=True) # [0].tolist()
            # ind = np.argsort(res[1])[::-1]  # sorting by scores: to preserve id order with diff. N
            res = [self.id_to_itemid[rec] for rec in res[0]]

        elif model_name == 'AlternatingLeastSquares':
            res = [self.id_to_itemid[rec] for rec in model.recommend(userid=user_id,
                                        user_items=csr_matrix(self.user_item_matrix).tocsr()[user_id,:],
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[999999]],
                                        recalculate_user=True)[0]]

        elif model_name == 'BayesianPersonalizedRanking':
            res = [self.id_to_itemid[rec] for rec in model.recommend(userid=user_id, 
                                        user_items=csr_matrix(self.user_item_matrix).tocsr()[user_id, :],
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[999999]],
                                        )[0]]

        res = self._extend_with_top_popular(res, user, N=N)
        
        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model, N=N)

    def get_bayesian_recommendations(self, user, N=5):
        """ Рекомендации на основе модели ранжирования """

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.ranker, N=N)

    def get_own_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""
        
        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        if 999999 in res: res.remove(999999)  # prev. ver.

        res = self._extend_with_top_popular(res, N=N) 

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами: берем N похожих пользователей и с помощью трюка рекомендуем юзеру их топ товары"""
        res = []

        # Находим топ-N похожих пользователей
        similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
        similar_users = [rec for rec in similar_users[0]]
        similar_users = similar_users[1:]   # удалим юзера из запроса

        for user in similar_users:
            user = self.id_to_userid[user]  ## нужно подать для предикта оригинальный идентификатор
            res.extend(self.get_own_recommendations(user, N=1))
        res = pd.Series(res).drop_duplicates().tolist()

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

In [1160]:
data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=10000, n_weeks=95)

In [1161]:
print(f'Level 1 Train - users: {data_train_lvl_1.user_id.nunique()}, items: {data_train_lvl_1.item_id.nunique()}')

Level 1 Train - users: 2495, items: 10001


In [1162]:
recommender = MainRecommender(data_train_lvl_1, k1=10)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [1163]:
list_out = result.loc[~result.user_id.isin(data_train_lvl_1.user_id), 'user_id'].tolist()
list_out

[1813, 1984]

In [1164]:
result['als'] = result['user_id'].apply(lambda x: rule(x, list_out, model='als', N=N_preds))
result['own'] = result['user_id'].apply(lambda x: rule(x, list_out, model='own', N=N_preds))

In [1166]:
for pred in calc_precision_at_k(result, N_preds):
    print(*pred, sep=': ')

popular_recommendation: 0.18275703088981096
als: 0.1295527893038267
own: 0.2439372982941448


In [1167]:
for pred in calc_recall(result, N_preds):
    print(*pred, sep=': ')

popular_recommendation: 0.023245424460641564
als: 0.027019756635783692
own: 0.04995886675569452


**2 level model**

In [1168]:
names = ['manufacturer', 'department', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product']
for name in names:
    new_name = name + '_freq'
    a = item_features[name].value_counts()
    ind = a.index.tolist()
    for i in ind:
        item_features.loc[item_features[name] == i, new_name] = a[i]

commodities = item_features.commodity_desc.value_counts()
commodities_list = commodities.keys().tolist()
for i, name in enumerate(commodities_list):
    item_features.loc[item_features['commodity_desc'] == name, 'commodity_category'] = i

In [1169]:
class TfidfLDA(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=5, learning_method='batch', max_features=10001, n_jobs=-1, random_state=42):
        self.n_components = n_components
        self.learning_method = learning_method
        self.max_features = max_features
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.pipe = None
        
    def fit(self, X, y=None):
        self.pipe = make_pipeline(TfidfVectorizer(max_features=self.max_features),
                                  LatentDirichletAllocation(n_components=self.n_components,
                                                            learning_method=self.learning_method,
                                                            n_jobs=self.n_jobs,
                                                            random_state=self.random_state)).fit(X['item_id'].values)
        return self
        
    def transform(self, X):
        X_ = np.zeros((X.shape[0], self.n_components))
        for i in range(X_.shape[0]):
            X_[i, :] = self.pipe.transform(X.values[i])
        return pd.DataFrame(data=X_, columns=[f'topic_{i}' for i in range(self.n_components)], index=X.index)
    
def create_multiple_features(df_train, train_val):
    df_train = df_train.merge(train_val.groupby(by='item_id').agg('sales_value').sum().rename('total_item_sales_value'), how='left',on='item_id')
    df_train = df_train.merge(train_val.groupby(by='item_id').agg('quantity').sum().rename('total_quantity_value'), how='left',on='item_id')
    df_train = df_train.merge(train_val.groupby(by='item_id').agg('user_id').count().rename('item_freq'), how='left',on='item_id')
    df_train = df_train.merge(train_val.groupby(by='user_id').agg('user_id').count().rename('user_freq'), how='left',on='user_id')
    df_train = df_train.merge(train_val.groupby(by='user_id').agg('sales_value').sum().rename('total_user_sales_value'), how='left',on='user_id')
    df_train = df_train.merge(train_val.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_week')/train_val.week_no.nunique(), how='left',on='item_id')
    df_train = df_train.merge(train_val.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_week')/train_val.week_no.nunique(), how='left',on='user_id')
    df_train = df_train.merge(train_val.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_basket')/train_val.basket_id.nunique(), how='left',on='item_id')
    df_train = df_train.merge(train_val.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_basket')/train_val.basket_id.nunique(), how='left',on='user_id')
    df_train = df_train.merge(train_val.groupby(by='item_id').agg('user_id').count().rename('item_freq_per_basket')/train_val.basket_id.nunique(), how='left',on='item_id')
    df_train = df_train.merge(train_val.groupby(by='user_id').agg('user_id').count().rename('user_freq_per_basket')/train_val.basket_id.nunique(), how='left',on='user_id')
    return df_train

def create_user_features(df_train, data_train):

    # Средний чек
    users_sales = data_train.groupby('user_id')['sales_value'].mean().reset_index()
    users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
    df_train = df_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')

    # Количество уникальных категорий покупателя
    users_departments = data_train.groupby('user_id')['department'].nunique().reset_index()
    users_departments.rename(columns = {'department':'users_unique_departments'}, inplace=True)
    df_train = df_train.merge(users_departments, on='user_id', how='left')

    # Среднее время покупки
    bought_time = data_train.groupby('user_id')['trans_time'].mean().reset_index()
    bought_time.rename(columns = {'trans_time':'mean_trans_time_by_user'}, inplace=True)
    df_train = df_train.merge(bought_time, on='user_id', how='left')

    # Средний чек корзины 
    baskets_sales_value = data_train.groupby(['user_id','basket_id'])['sales_value'].mean().reset_index()
    mean_basket_sales_value = baskets_sales_value.groupby('user_id')['sales_value'].mean().reset_index()
    mean_basket_sales_value.rename(columns = {'sales_value':'mean_sales_value_per_basket'}, inplace=True)
    df_train = df_train.merge(mean_basket_sales_value, on='user_id', how='left')

    # Количество купленных уникальных товаров 
    unique_bought_items = data_train.groupby('user_id')['item_id'].nunique().reset_index()
    unique_bought_items.rename(columns = {'item_id':'unique_bought_items'}, inplace=True)
    df_train = df_train.merge(unique_bought_items, on='user_id', how='left')

    # Среднее количество уникальных категорий в корзине
    users_baskets = data_train.groupby(['user_id', 'basket_id'])['department'].nunique().reset_index()
    users_baskets = users_baskets.groupby('user_id')['department'].mean().reset_index()
    users_baskets.rename(columns={'department': 'avg_basket_department'}, inplace=True)
    df_train = df_train.merge(users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left')

    # Средняя сумма покупки в категории
    department_sales = data_train.groupby('department')['sales_value'].mean().reset_index()
    department_sales.rename(columns={'sales_value': 'mean_sales_value_category'}, inplace=True)
    df_train = df_train.merge(department_sales, on='department', how='left')

    # Средная цена купленных товаров пользователем
    users_sales = data_train.groupby('user_id')[['sales_value', 'quantity']].sum().reset_index()
    users_sales['avg_price'] = users_sales['sales_value'] / users_sales['quantity']
    df_train = df_train.merge(users_sales[['user_id', 'avg_price']], on='user_id', how='left')
    return df_train

def create_item_features(df_train, train_val):    

    # Среднее количество покупок товара в неделю
    num_purchase_week = train_val.groupby('item_id').agg({'week_no': 'nunique', 'quantity': 'sum'}).reset_index()
    num_purchase_week['avg_num_purchases_week'] = num_purchase_week['quantity'] / num_purchase_week['week_no']
    df_train = df_train.merge(num_purchase_week[['item_id', 'avg_num_purchases_week']], on='item_id', how='left')
    df_train['avg_num_purchases_week'].fillna(0, inplace=True)

    # Цена товара
    items_sales = train_val.groupby('item_id')[['sales_value', 'quantity']].sum().reset_index()
    items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
    items_sales['price'].fillna(0, inplace=True)
    df_train = df_train.merge(items_sales[['item_id', 'price']], on='item_id', how='left')

    # Среднее время покупки товара
    bought_item_time = train_val.groupby('item_id')['trans_time'].mean().reset_index()   
    bought_item_time.rename(columns = {'trans_time':'mean_trans_time_by_item'}, inplace=True)
    df_train = df_train.merge(bought_item_time, on = 'item_id', how = 'left')

    # Количество магазинов, где есть товар
    items_stores = train_val.groupby('item_id')['store_id'].sum().reset_index()
    items_stores.rename(columns={'store_id': 'n_stores_with_item'}, inplace=True)
    items_stores['n_stores_with_item'].fillna(0, inplace = True)
    df_train = df_train.merge(items_stores, on='item_id', how='left')

    # Количество уникальных магазинов, где есть товар
    items_stores = train_val.groupby('item_id')['store_id'].nunique().reset_index()
    items_stores.rename(columns={'store_id': 'n_unique_stores_with_item'}, inplace=True)
    items_stores['n_unique_stores_with_item'].fillna(0, inplace = True)
    df_train = df_train.merge(items_stores, on='item_id', how='left')
    return df_train

def get_new_features(train_val_lvl1):
    
    # час совершения транзакции
    data = train_val_lvl1.copy()
    data['hour'] = data['trans_time'] // 100
    user_item_features = data.groupby(['user_id', 'item_id'])['hour'].median().reset_index()
    user_item_features.columns = ['user_id', 'item_id', 'median_sales_hour']
    
    # день недели совершения транзакции
    data['weekday'] = data['day'] % 7
    df = data.groupby(['user_id', 'item_id'])['weekday'].median().reset_index()
    df.columns = ['user_id', 'item_id', 'median_weekday']
    user_item_features = user_item_features.merge(df, on=['user_id', 'item_id'])
    
    # cреднее кол-во дней между покупками
    df = data.groupby('user_id')['day'].nunique().reset_index()
    df['mean_visits_interval'] = (data.groupby('user_id')['day'].max() - data.groupby('user_id')['day'].min()) / df['day']
    user_item_features = user_item_features.merge(df[['user_id', 'mean_visits_interval']], on=['user_id'])
    
    # кол-во транзакций клиента
    df = data.groupby(['user_id'])['item_id'].count().reset_index()
    df.columns = ['user_id', 'n_transactions']
    user_item_features = user_item_features.merge(df, on=['user_id'])
    
    # mean / max / std кол-ва уникальных товаров в корзине клиента
    df = data.groupby(['user_id', 'basket_id'])['item_id'].nunique().reset_index()
    df1 = df.groupby('user_id')['item_id'].mean().reset_index()
    df1.columns = ['user_id', 'mean_n_items_basket']
    user_item_features = user_item_features.merge(df1, on=['user_id'])

    df2 = df.groupby('user_id')['item_id'].max().reset_index()
    df2.columns = ['user_id', 'max_n_items_basket']
    user_item_features = user_item_features.merge(df2, on=['user_id'])

    df3 = df.groupby('user_id')['item_id'].std().reset_index()
    df3.columns = ['user_id', 'std_n_items_basket']
    user_item_features = user_item_features.merge(df3, on=['user_id'])
    
    # mean / max / std кол-ва уникальных категорий в корзине клиента
    data = data.merge(item_features[['item_id', 'commodity_desc']], on=['item_id'])
    df = data.groupby(['user_id', 'basket_id'])['commodity_desc'].nunique().reset_index()
    df1 = df.groupby('user_id')['commodity_desc'].mean().reset_index()
    df1.columns = ['user_id', 'mean_n_item_categories_basket']
    user_item_features = user_item_features.merge(df1, on=['user_id'])

    df2 = df.groupby('user_id')['commodity_desc'].max().reset_index()
    df2.columns = ['user_id', 'max_n_item_categories_basket']
    user_item_features = user_item_features.merge(df2, on=['user_id'])

    df3 = df.groupby('user_id')['commodity_desc'].std().reset_index()
    df3.columns = ['user_id', 'std_n_item_categories_basket']
    user_item_features = user_item_features.merge(df3, on=['user_id'])
    
    return user_item_features

In [1170]:
id_to_userid = recommender.id_to_userid
id_to_itemid = recommender.id_to_itemid

userid_to_id = recommender.userid_to_id
itemid_to_id = recommender.itemid_to_id

user_factors = recommender.model.user_factors
item_factors = recommender.model.item_factors

In [1171]:
# эмбеддинг юзера = среднее эмбеддингов купленниых им товаров / merge on user
emb_df = data_train_lvl_1[['user_id', 'item_id']].copy()
emb_df['item_id'] = emb_df['item_id'].apply(lambda x: np.array(item_factors[itemid_to_id[x], :]))
emb_df = emb_df.groupby('user_id')['item_id'].sum().reset_index()  # mean

# scalar value
emb_df['item_id'] = emb_df['item_id'].apply(lambda x: x.mean())
emb_df.rename(columns={'item_id': 'scalar_embedding'}, inplace=True)

In [1172]:
# эмбеддинг товара - эмбеддинг юзера  # чем разница меньше, тем товар ближе к интересам пользователя / merge on user-item
diff_emb = data_train_lvl_1[['user_id', 'item_id']].copy()
diff_emb['diff_emb_scalar'] = diff_emb.apply(lambda row: (item_factors[itemid_to_id[row['item_id']]] - user_factors[userid_to_id[row['user_id']]]).mean(), axis=1)
diff_emb.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)

In [1173]:
# merge on user-item; SKIP for ver. 0.4.8
def get_item_score(user_id, item_ids):
    items_scores = recommender.own_recommender.recommend(userid=userid_to_id[user_id],
                                      user_items=csr_matrix(recommender.user_item_matrix).tocsr()[userid_to_id[user_id], :],
                                      N=len(item_ids)-1,
                                      filter_already_liked_items=False,
                                      filter_items=[itemid_to_id[999999]],
                                      recalculate_user=True)
    items_recs = [id_to_itemid[item] for item in items_scores[0]]
    indices = [items_recs.index(item) if item in items_recs else -1 for item in item_ids]
    result = np.array([items_scores[1][i] if i != -1 else 0 for i in indices])
    return result

it_it_df = data_train_lvl_1[['user_id', 'item_id']].copy()
for us in it_it_df.user_id.unique():
    it_it_df.loc[it_it_df.user_id == us, 'item_item'] = get_item_score(us, it_it_df.loc[it_it_df.user_id == us, 'item_id'])
it_it_df.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)

In [1174]:
def get_dot(user_id, item_id):
    try:
        return user_factors[userid_to_id[user_id], :] @ item_factors[itemid_to_id[item_id], :].T
    except:
        return 0
    
def get_inv_rank(user_id):
    try:
        vector = user_factors[userid_to_id[user_id], :] @ item_factors.T
        vector = vector.flatten()
        return 1 / np.argmax(vector)
    except:
        return 0

In [1175]:
train_test = pd.concat([data_train_lvl_1, data_train_lvl_2])

In [1176]:
def self_popularity_rec(user, data, n=5):

    recs_for_user = data[data['user_id'] == user]['item_id'].value_counts().index[:n+1].tolist()
    if 999999 in recs_for_user:
        recs_for_user.remove(999999)
    if len(recs_for_user) > n:
        recs_for_user = recs_for_user[:n]
    if len(recs_for_user) < n:
        recs_for_user += data['item_id'].value_counts().index[:5].tolist()[:(n - len(recs_for_user))]
        
    return recs_for_user

In [1177]:
df_match_candidates = pd.DataFrame(data_train_lvl_2['user_id'].unique())
df_match_candidates.columns = ['user_id']
    
df_match_candidates['candidates'] = df_match_candidates['user_id'].apply(
        lambda x: self_popularity_rec(x, train_test, n=N_preds))

df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'
    
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)
    
df_train = data_train_lvl_2[['user_id', 'item_id']].copy()
df_train['target'] = 1 
    
df_train = df_match_candidates.merge(df_train, on=['user_id', 'item_id'], how='left')
df_train = df_train.drop_duplicates(subset=['user_id', 'item_id']) 
df_train['target'].fillna(0, inplace=True)
df_train['item_id'] = df_train['item_id'].astype("int")

In [1178]:
df_train['target'].value_counts(normalize=True)

0.0    0.574882
1.0    0.425118
Name: target, dtype: float64

In [1179]:
X_lda = train_test.groupby('user_id')['item_id'].unique().reset_index()
X_lda['item_id'] = X_lda['item_id'].apply(lambda x: ' '.join([str(i) for i in x]))
X_lda.head(2)

Unnamed: 0,user_id,item_id
0,1,825123 831447 845307 852014 999999 856942 9910...
1,2,930118 5567388 5567582 5568489 5569230 9365106...


In [1180]:
lda = TfidfLDA().fit_transform(pd.DataFrame(data=X_lda['item_id'].values.reshape(-1,1), 
                                            index=X_lda['user_id'].values, columns=['item_id'])).reset_index()
lda.rename(columns={'index': 'user_id'}, inplace=True)
lda.head(1)

Unnamed: 0,user_id,topic_0,topic_1,topic_2,topic_3,topic_4
0,1,0.011969,0.011969,0.01197,0.011969,0.952123


In [1181]:
pca = PCA(n_components=5, random_state=42)
X_users = pca.fit_transform(user_factors)

X_users = pd.DataFrame(data=X_users).reset_index()
X_users.rename(columns={'index': 'user_id'}, inplace=True)
X_users['user_id'] = X_users['user_id'].apply(lambda x: id_to_userid[x])


pca = PCA(n_components=5, random_state=42)
X_items = pca.fit_transform(item_factors)

X_items = pd.DataFrame(data=X_items).reset_index()
X_items.rename(columns={'index': 'item_id'}, inplace=True)
X_items['item_id'] = X_items['item_id'].apply(lambda x: id_to_itemid[x])

In [1182]:
# week_no	coupon_disc	coupon_match_disc / merge on item
disc_df = train_test.groupby('item_id')[['coupon_disc', 'coupon_match_disc']].mean().reset_index()

In [1183]:
# first create new features in item_features! see 1st approach
df_train = df_train.merge(item_features, on='item_id', how='left')
df_train = df_train.merge(user_features, on='user_id', how='left')
df_train = df_train.merge(X_users, on='user_id', how='left')
df_train = df_train.merge(X_items, on='item_id', how='left')
df_train = df_train.merge(emb_df, on='user_id', how='left')  
df_train = df_train.merge(diff_emb, on=['user_id', 'item_id'], how='left')   
df_train = df_train.merge(it_it_df, on=['user_id', 'item_id'], how='left')  
df_train = df_train.merge(disc_df, on='item_id', how='left')   
df_train = df_train.merge(lda, on='user_id', how='left')
df_train['factor'] = df_train.apply(lambda row: get_dot(row['user_id'], row['item_id']), axis=1)
df_train['inv_rank'] = df_train['user_id'].apply(lambda x: get_inv_rank(x))

df_train.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)
train_test = train_test.merge(item_features[['item_id', 'department']], on='item_id', how='left')

df_train = create_multiple_features(df_train, train_val=train_test)  
df_train = create_user_features(df_train, data_train=train_test) 
df_train = create_item_features(df_train, train_val=train_test)
df_train = df_train.merge(get_new_features(train_test), on=['user_id', 'item_id'], how='left')

users_items = train_test.groupby('user_id')['item_id'].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])

def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_train['Last5sales'] = df_train[['user_id', 'item_id']].apply(code_last_sales, axis=1)

In [1184]:
for column in df_train.columns:
    if df_train[column].dtype == 'object':
        df_train[column].fillna('unk', inplace=True)
    elif df_train[column].dtype in ['float64', 'float32']:
        df_train[column].fillna(0.0, inplace=True)

print(f'Length of train data: {df_train.shape[0]}')
df_train.dropna(inplace=True)
df_train = reduce_memory(df_train)

Length of train data: 21674


In [1185]:
X, y = df_train.drop('target', axis=1), df_train['target']

In [1186]:
best_set = ['item_id', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'manufacturer_freq', 
            'department_freq', 'commodity_desc_freq', 'sub_commodity_desc_freq', 'curr_size_of_product_freq', 
            'age_desc', '0_x', '2_x', '3_x', '0_y', '2_y', '3_y', 'scalar_embedding', 'item_item', 'coupon_disc', 
            'factor', 'total_item_sales_value', 'total_quantity_value', 'total_user_sales_value', 'user_quantity_per_week',
            'mean_trans_time_by_user', 'avg_price', 'avg_num_purchases_week', 'price', 'mean_trans_time_by_item', 
            'n_stores_with_item', 'n_unique_stores_with_item', 'median_sales_hour', 'median_weekday', 'mean_n_items_basket',
            'mean_n_item_categories_basket', 'max_n_item_categories_basket']

In [1187]:
X_r = X[best_set].copy()
cat_cols = X.select_dtypes('category').columns.tolist()

grs = X.groupby('user_id', sort=False).size().to_frame()[0].values

In [1188]:
lgb_params = {
              'objective':'lambdarank',
              'boosting_type': 'gbdt',
              'n_estimators': 2000,
              'categorical_column': cat_cols,
              'random_state': 42,
              'is_unbalance': True,
              'n_jobs': -1,
              'max_depth': 14,
              'learning_rate': 0.015104294331090354,
              'num_leaves': 196,
              'subsample': 0.9381810294606675,
              'colsample_bytree': 0.7906492668877401,
              'max_bin': 42,
              'min_child_samples': 27,
              'subsample_freq': 12
}

In [1189]:
lgb_rn = LGBMRanker(**lgb_params, silent=True)

eval_h = lgb_rn.fit(X_r, 
                    y, 
                    group=grs,
                    eval_set=[(X_r, y)], 
                    eval_group=[grs], 
                    eval_metric=['ndcg', 'map'],
                    eval_at=[5, 10 ], 
                    early_stopping_rounds=200, 
                    verbose=0)

In [1190]:
eval_h.evals_result_['training']['ndcg@5'][-1], eval_h.evals_result_['training']['map@5'][-1]

(1.0, 1.0)

In [1191]:
rank_preds = lgb_rn.predict(X_r)

ranker_prediction = X[['user_id', 'item_id']]
ranker_prediction["pred"] = rank_preds

ranker_prediction = ranker_prediction.drop_duplicates()
ranker_prediction.sort_values(by=["user_id", "pred"], inplace=True, ascending=False)

ranked_res = ranker_prediction.groupby('user_id')['item_id'].unique().reset_index() 
ranked_res = ranked_res.rename(columns={'item_id': 'lgbm_ranker'})  
ranked_res.head()

Unnamed: 0,user_id,lgbm_ranker
0,1,"[940947, 9655212, 856942, 9297615, 877391, 557..."
1,2,"[1106523, 1103898, 8090521, 1076580, 5569230, ..."
2,3,"[1106523, 998206, 1092937, 910032, 1075979, 11..."
3,4,"[883932, 1075368, 902172, 1052294, 6391541, 89..."
4,5,"[962199, 1126899, 1123022, 874972, 1050851, 93..."


In [1192]:
def get_user_prediction(user_id, N=5):
    try:
        return ranker_prediction.loc[ranker_prediction.user_id == user_id, 'item_id'].head(N).tolist()
    except:
        return self_popularity_rec(user_id, train_test, n=N)

In [1193]:
check = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
check.rename(columns={'item_id': 'actual'}, inplace=True)

check['ranker'] = check['user_id'].apply(lambda x: get_user_prediction(x, N=5))

for pred in calc_precision_at_k(check, 5):
    print(*pred, sep=': ')

ranker: 0.42075750122970984


In [1194]:
test1.loc[~test1.user_id.isin(train_test.user_id), 'user_id'].nunique()

2

In [1195]:
test1 = pd.read_csv('retail_test1.csv')

In [1196]:
check2 = test1.groupby('user_id')['item_id'].unique().reset_index()
check2.rename(columns={'item_id': 'actual'}, inplace=True)

check2['ranker'] = check2['user_id'].apply(lambda x: get_user_prediction(x, N=5))

for pred in calc_precision_at_k(check2, 5):
    print(*pred, sep=': ')

ranker: 0.27923033389926427


In [1199]:
test.rename(columns={'UserId': 'user_id'}, inplace=True)

In [1200]:
test['ranker'] = test['user_id'].apply(lambda x: get_user_prediction(x, N=5))

In [923]:
new = test[~test.user_id.isin(data_train_lvl_1.user_id.tolist())].user_id.tolist()
test.loc[test.ranker.isna(), 'ranker'] = test.loc[test.ranker.isna(), 'user_id'].\
                                                                apply(lambda x: get_self_top_purchases(x))
submission = transform_data_for_eval(test[['user_id', 'ranker']], rec_col='ranker', user_col='user_id')
submission.to_csv('submission_1.csv', index=False)
!kaggle competitions submit -c gb-recsys-june-july-2022 -f submission.csv -m 'test1'  

Successfully submitted to GB RecSys June-July 2022



  0%|          | 0.00/76.5k [00:00<?, ?B/s]
100%|##########| 76.5k/76.5k [00:02<00:00, 32.2kB/s]


#### Public MAP@5: 0.19247

## Second Approach

**1st level model**

In [1202]:
data['weighted'] = data['week_no'] ** 4

test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [1203]:
test_users = test['user_id'].unique()
result_users = data_train['user_id'].unique()

fake_ids = list(set(test_users) - set(result_users))

In [1204]:
# fake means unseen users in whole data_train
fake_data = data.iloc[-len(fake_ids):,:]
fake_data['user_id'] = fake_ids
data_train = pd.concat([data_train, fake_data])

In [1205]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)


top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', 
                                  columns='item_id', 
                                  values='weighted',
                                  aggfunc='sum',
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) 

sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [1206]:
# https://github.com/RostislavIllyk/HW_recommendation_systems/blob/main/final_run.ipynb

def get_knn(matrix, sample_vector, n=5):
    dist_line = cdist(sample_vector, matrix, metric='cosine')
    items = dist_line[0].argsort()[-n:][::-1]
    batch = matrix[items]
    dist_selected = dist_line[0][items]
    return batch, (dist_selected.sum() - 1) / (n - 1)

def get_user_recommendations(user_item_matrix, sample_vector, k_neigbours=9, n_recomendation=5, coef1=5.5, coef2=0.001):
    batch, dist = get_knn(user_item_matrix.values, sample_vector, k_neigbours)
    
    # Вектор личных предпочтений
    rec_from_yourself = sample_vector
    
    # Вектор всех предпочтений
    rec_from_all = user_item_matrix.values.mean(axis=0)
    
    # Вектор ближайших предпочтений
    rec_from_neighbours = batch.mean(axis=0)
    
    k1, k2 = dist * coef1, dist * coef2  
     
    # целевой вектор
    rec = rec_from_yourself + rec_from_all / k1 + rec_from_neighbours * k2
    
    # Отбор товаров с максимальными весами
    items = rec[0].argsort()[-n_recomendation-1:][::-1]
    
    idx_to_zero = itemid_to_id[999999]
    items = list(items)
    if idx_to_zero in items:
        items.remove(idx_to_zero)
    items =items[:n_recomendation]
    
    return items

In [1207]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [1208]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result['actual'] = result['actual'].apply(lambda x: list(x))
result.head()

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [1209]:
id_list = result['user_id'].map(userid_to_id).values

In [1210]:
k_neigbours = 18 
n_recomendation = 10

rec_list=[]
for i in tqdm.trange(len(id_list)):    
    sample_vector = np.reshape(user_item_matrix.values[id_list[i]],(1,-1))
    item = get_user_recommendations(user_item_matrix, sample_vector, k_neigbours=k_neigbours, n_recomendation=n_recomendation)
    rec_list.append(item)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2042/2042 [04:34<00:00,  7.44it/s]


In [1211]:
result_list=[]
for i in tqdm.trange(len(rec_list)):
    item = [id_to_itemid[rec] for rec in rec_list[i]]
    result_list.append(item)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2042/2042 [00:00<00:00, 51831.36it/s]


In [1212]:
result['test'] = result_list

In [1213]:
result.apply(lambda x: precision_at_k(x['test'], x['actual'],  5), axis=1).mean()

0.43819784524975514

**For further imputation**

In [1214]:
train_id_list = list(userid_to_id.values())
len(set(train_id_list))

2500

In [1215]:
train_rec_list=[]
for i in tqdm.trange(len(train_id_list)):    
    sample_vector = np.reshape(user_item_matrix.values[train_id_list[i]],(1,-1))
    item = get_user_recommendations(user_item_matrix, sample_vector, k_neigbours=9, n_recomendation=5)
    train_rec_list.append(item)
    
train_result_list=[]
for i in tqdm.trange(len(train_rec_list)):
    item = [id_to_itemid[rec] for rec in train_rec_list[i]]
    train_result_list.append(item)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [05:47<00:00,  7.20it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [00:00<00:00, 125000.12it/s]


In [1216]:
total_result = pd.DataFrame(data={'user_id': train_id_list, 'recs': train_result_list})
total_result.head(2)

Unnamed: 0,user_id,recs
0,0,"[856942, 1082185, 995242, 940947, 5577022]"
1,1,"[1106523, 838136, 1133018, 826784, 916122]"


In [1217]:
total_result['user_id'] = total_result['user_id'] + 1

**2nd level model**

In [1218]:
df_match_candidates = pd.DataFrame(data_test['user_id'].unique())
df_match_candidates.columns = ['user_id']
    
df_match_candidates = df_match_candidates.merge(result[['user_id', 'test']], on='user_id', how='left')
df_match_candidates.rename(columns={'test': 'candidates'}, inplace=True)

df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'
    
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)
    
df_train = data_test[['user_id', 'item_id']].copy()
df_train['target'] = 1 
    
df_train = df_match_candidates.merge(df_train, on=['user_id', 'item_id'], how='left')
df_train = df_train.drop_duplicates(subset=['user_id', 'item_id']) 
df_train['target'].fillna(0, inplace=True)
df_train['item_id'] = df_train['item_id'].astype("int")

In [1219]:
df_train['target'].value_counts()

0.0    13093
1.0     7327
Name: target, dtype: int64

**Feature Engineering**

In [None]:
# do not forget to create freqs in item_features, see 1st approach

In [1220]:
recommender = MainRecommender(data_train, user_item_matrix=user_item_matrix, weighting=False)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [1221]:
id_to_userid = recommender.id_to_userid
id_to_itemid = recommender.id_to_itemid

userid_to_id = recommender.userid_to_id
itemid_to_id = recommender.itemid_to_id

user_factors = recommender.model.user_factors
item_factors = recommender.model.item_factors

In [1222]:
emb_df = data_train[['user_id', 'item_id']].copy()
emb_df['item_id'] = emb_df['item_id'].apply(lambda x: np.array(item_factors[itemid_to_id[x], :]))
emb_df = emb_df.groupby('user_id')['item_id'].sum().reset_index()

# scalar value
emb_df['item_id'] = emb_df['item_id'].apply(lambda x: x.mean())
emb_df.rename(columns={'item_id': 'scalar_embedding'}, inplace=True)

In [1223]:
diff_emb = data_train[['user_id', 'item_id']].copy()
diff_emb['diff_emb_scalar'] = diff_emb.apply(lambda row: (item_factors[itemid_to_id[row['item_id']]] - user_factors[userid_to_id[row['user_id']]]).mean(), axis=1)
diff_emb.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)

In [1225]:
data_train_ = prefilter_items(data_train, item_features=item_features, take_n_popular=10000, n_weeks=52).drop('weighted', axis=1)

train_test = pd.concat([data_train_, data_test])

In [1226]:
scaler = StandardScaler()
pca = PCA(n_components=5, random_state=42)
X_users = pca.fit_transform(scaler.fit_transform(user_factors))

X_users = pd.DataFrame(data=X_users).reset_index()
X_users.rename(columns={'index': 'user_id'}, inplace=True)
X_users['user_id'] = X_users['user_id'].apply(lambda x: id_to_userid[x])


pca = PCA(n_components=5, random_state=42)
X_items = pca.fit_transform(scaler.fit_transform(item_factors))

X_items = pd.DataFrame(data=X_items).reset_index()
X_items.rename(columns={'index': 'item_id'}, inplace=True)
X_items['item_id'] = X_items['item_id'].apply(lambda x: id_to_itemid[x])

In [1227]:
disc_df = train_test.groupby('item_id')[['coupon_disc', 'coupon_match_disc']].mean().reset_index()

In [1228]:
print('Step 1')
df_train = df_train.merge(item_features, on='item_id', how='left')
df_train = df_train.merge(user_features, on='user_id', how='left')
df_train = df_train.merge(X_users, on='user_id', how='left')
df_train = df_train.merge(X_items, on='item_id', how='left')
df_train = df_train.merge(emb_df, on='user_id', how='left')  
df_train = df_train.merge(diff_emb, on=['user_id', 'item_id'], how='left')   
df_train = df_train.merge(disc_df, on='item_id', how='left')   
df_train['factor'] = df_train.apply(lambda row: get_dot(row['user_id'], row['item_id']), axis=1)
df_train['inv_rank'] = df_train['user_id'].apply(lambda x: get_inv_rank(x))

df_train.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)
train_test = train_test.merge(item_features[['item_id', 'department']], on='item_id', how='left')
print('Step 2')
df_train = create_multiple_features(df_train, train_val=train_test)  
df_train = create_user_features(df_train, data_train=train_test) 
df_train = create_item_features(df_train, train_val=train_test)
df_train = df_train.merge(get_new_features(train_test), on=['user_id', 'item_id'], how='left')

print('Step 3')
users_items = train_test.groupby('user_id')['item_id'].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])

def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_train['Last5sales'] = df_train[['user_id', 'item_id']].apply(code_last_sales, axis=1)

Step 1
Step 2
Step 3


In [1229]:
for column in df_train.columns:
    if df_train[column].dtype == 'object':
        df_train[column].fillna('unk', inplace=True)
    elif df_train[column].dtype in ['float64', 'float32']:
        df_train[column].fillna(0.0, inplace=True)

print(f'Length of train data: {df_train.shape[0]}')
df_train.dropna(inplace=True)
df_train = reduce_memory(df_train)

Length of train data: 20420


In [1230]:
X, y = df_train.drop('target', axis=1), df_train['target']

**Straightforward (stupid) but most accurate feature selection**

In [812]:
grs = X.groupby('user_id', sort=False).size().to_frame()[0].values

class BestSet:
    def __init__(self, k_features=35):
        self.k_features = k_features
        self.users = test1.loc[~test1.user_id.isin(data_train.user_id.tolist()), 'user_id'].unique().tolist()

    def fit(self, X, y):

        dim = X.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X, y, self.indices_)
        self.scores_ = [score]

        while dim > self.k_features:
            scores, subsets = [], []
            for p in tqdm.notebook.tqdm(combinations(self.indices_, r=dim-1), total=dim, leave=False):
                score = self._calc_score(X, y, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]
        
        return self

    def transform(self, X):
        best_indices = self.subsets_[np.argmax(self.scores_)]
        return X.iloc[:, list(best_indices)]

    def _calc_score(self, X, y, indices):
        cat_cols = X.select_dtypes('category').columns.tolist()

        model = LGBMRanker(objective='lambdarank',
                           boosting_type='gbdt',
                           n_estimators=500,
                           categorical_column=cat_cols,
                           random_state=42,
                           is_unbalance=True,
                           n_jobs=-1,)

        eval_h = model.fit(X.iloc[:, list(indices)], 
                           y, 
                           group=grs,
                           eval_set=[(X.iloc[:, list(indices)], y)], 
                           eval_group=[grs], 
                           eval_metric=['ndcg'],
                           eval_at=[5, 10 ], 
                           early_stopping_rounds=50, 
                           verbose=0)
    
        rank_preds = model.predict(X.iloc[:, list(indices)])
        
        ranker_prediction = X[['user_id', 'item_id']].copy()
        ranker_prediction["pred"] = rank_preds

        ranker_prediction = ranker_prediction.drop_duplicates()
        ranker_prediction.sort_values(by=["user_id", "pred"], inplace=True, ascending=False)
            
        ranked_res = ranker_prediction.groupby('user_id')['item_id'].unique().reset_index() 
        ranked_res = ranked_res.rename(columns={'item_id': 'lgbm_ranker'})  

        result1 = test1.groupby('user_id')['item_id'].unique().reset_index()
        result1.columns = ['user_id', 'actual']

        result1 = result1.merge(ranked_res, on='user_id', how='left')
        result1.loc[result1.lgbm_ranker.isna(), 'lgbm_ranker'] = result1.loc[result1.lgbm_ranker.isna(), 'user_id'].\
                               apply(lambda x: total_result.loc[total_result.user_id == x, 'recs'].values[0])

        score = result1.apply(lambda row: precision_at_k(row['lgbm_ranker'], row['actual'], k=5), axis=1).mean()

        return score
    
selector = BestSet()
selector.fit(X, y)
X.columns[list(selector.subsets_[np.argmax(selector.scores_)])], max(selector.scores_)

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/67 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/53 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

(Index(['user_id', 'item_id', 'manufacturer', 'brand', 'sub_commodity_desc',
        'curr_size_of_product', 'department_freq', 'commodity_desc_freq',
        'sub_commodity_desc_freq', 'curr_size_of_product_freq', '0_x', '1_x',
        '2_x', '3_x', '4_x', '0_y', '1_y', '2_y', '4_y', 'scalar_embedding',
        'coupon_disc', 'coupon_match_disc', 'factor', 'total_quantity_value',
        'item_freq', 'user_freq', 'total_user_sales_value',
        'user_quantity_per_week', 'avg_cheque', 'users_unique_departments',
        'mean_trans_time_by_user', 'mean_sales_value_per_basket',
        'avg_basket_department', 'avg_price', 'n_stores_with_item',
        'n_unique_stores_with_item', 'median_sales_hour', 'median_weekday',
        'mean_visits_interval', 'n_transactions', 'mean_n_items_basket',
        'std_n_items_basket', 'mean_n_item_categories_basket',
        'max_n_item_categories_basket', 'std_n_item_categories_basket',
        'Last5sales'],
       dtype='object'),
 0.348222811671

In [1231]:
best_set = ['manufacturer', 'brand', 'commodity_desc', 'sub_commodity_desc',
            'curr_size_of_product', 'commodity_desc_freq',
            'curr_size_of_product_freq', '0_x', '1_x', '2_x', '4_x', '0_y', '1_y',
            '2_y', '4_y', 'scalar_embedding', 'diff_emb_scalar', 'coupon_disc',
            'coupon_match_disc', 'inv_rank', 'total_item_sales_value',
            'total_quantity_value', 'item_freq', 'user_freq',
            'total_user_sales_value', 'avg_cheque', 'users_unique_departments',
            'mean_sales_value_per_basket', 'unique_bought_items',
            'avg_basket_department', 'mean_sales_value_category', 'avg_price',
            'avg_num_purchases_week', 'mean_trans_time_by_item',
            'n_stores_with_item', 'n_unique_stores_with_item', 'median_weekday',
            'mean_visits_interval', 'n_transactions', 'mean_n_items_basket',
            'max_n_items_basket', 'std_n_items_basket',
            'mean_n_item_categories_basket', 'max_n_item_categories_basket',
            'std_n_item_categories_basket', 'Last5sales']

**Hyperparameter tunning**

In [1232]:
X_r = X[best_set].copy()
cat_cols = X.select_dtypes('category').columns.tolist()

grs = X.groupby('user_id', sort=False).size().to_frame()[0].values

In [815]:
start_params = {
                'objective':'lambdarank',
                'boosting_type': 'gbdt',
                'n_estimators': 2000,
                'categorical_column': cat_cols,
                'random_state': 42,
                'is_unbalance': True,
                'n_jobs': -1,
}

In [816]:
def objective(trial):
    param_trials = {
                    'max_depth': trial.suggest_int('max_depth', 9, 15),
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                    'num_leaves': trial.suggest_int('num_leaves', 20, 265),
                    'subsample': trial.suggest_float('subsample', 0.3, 1.),
                    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.),
                    'max_bin': trial.suggest_int('max_bin', 10, 260),
                    'min_child_samples': trial.suggest_int('min_child_samples', 20, 260),
                    'subsample_freq': trial.suggest_int('subsample_freq', 1, 20),
                    }
    param_trials.update(start_params)
    lgb_rn = LGBMRanker(**param_trials, silent=True)

    eval_h = lgb_rn.fit(X_r, y, group=grs,
                        eval_set=[(X_r, y)], eval_group=[grs], 
                        eval_metric=['ndcg'],
                        eval_at=[5, 10 ], 
                        early_stopping_rounds=50, 
                        verbose=0)
    
    rank_preds = lgb_rn.predict(X_r)
    ranker_prediction = X[['user_id', 'item_id']].copy()
    ranker_prediction["pred"] = rank_preds

    ranker_prediction = ranker_prediction.drop_duplicates()
    ranker_prediction.sort_values(by=["user_id", "pred"], inplace=True, ascending=False)
            
    ranked_res = ranker_prediction.groupby('user_id')['item_id'].unique().reset_index() 
    ranked_res = ranked_res.rename(columns={'item_id': 'lgbm_ranker'})  

    result1 = test1.groupby('user_id')['item_id'].unique().reset_index()
    result1.columns = ['user_id', 'actual']

    result1 = result1.merge(ranked_res, on='user_id', how='left')
    result1.loc[result1.lgbm_ranker.isna(), 'lgbm_ranker'] = result1.loc[result1.lgbm_ranker.isna(), 'user_id'].\
                               apply(lambda x: total_result.loc[total_result.user_id == x, 'recs'].values[0])

    score = result1.apply(lambda row: precision_at_k(row['lgbm_ranker'], row['actual'], k=5), axis=1).mean()
    
    return score

In [817]:
optuna.logging.set_verbosity(optuna.logging.FATAL)
study = optuna.create_study(sampler=TPESampler(seed=42), direction="maximize")
study.optimize(objective, n_trials=500, timeout=3000)

print(f'Number of completed trials: {len(study.trials)}')
print('Best trial')
trial = study.best_trial
print(f'Best score: {trial.value}')
print('Best params: ', trial.params)

Number of completed trials: 280
Best trial
Best score: 0.3474801061007958
Best params:  {'max_depth': 14, 'learning_rate': 0.021262064401654825, 'num_leaves': 111, 'subsample': 0.7643251606704898, 'colsample_bytree': 0.5767734383871105, 'max_bin': 162, 'min_child_samples': 59, 'subsample_freq': 17}


**Final predictions**

In [1233]:
lgb_params = {
              'objective':'lambdarank',
              'boosting_type': 'gbdt',
              'n_estimators': 2000,
              'categorical_column': cat_cols,
              'random_state': 42,
              'is_unbalance': True,
              'n_jobs': -1,
              'max_depth': 12, 
              'learning_rate': 0.027892346545495567, 
              'num_leaves': 78, 
              'subsample': 0.6169142769038929, 
              'colsample_bytree': 0.3388817982402172, 
              'max_bin': 138, 
              'min_child_samples': 260, 
              'subsample_freq': 12
}

In [1234]:
lgb_rn = LGBMRanker(**lgb_params, silent=True)

eval_h = lgb_rn.fit(X_r, 
                    y, 
                    group=grs,
                    eval_set=[(X_r, y)], 
                    eval_group=[grs], 
                    eval_metric=['ndcg', 'map'],
                    eval_at=[5, 10 ], 
                    early_stopping_rounds=200, 
                    verbose=0)

In [1235]:
rank_preds = lgb_rn.predict(X_r)

ranker_prediction = X[['user_id', 'item_id']]
ranker_prediction["pred"] = rank_preds

ranker_prediction = ranker_prediction.drop_duplicates()
ranker_prediction.sort_values(by=["user_id", "pred"], inplace=True, ascending=False)

ranked_res = ranker_prediction.groupby('user_id')['item_id'].unique().reset_index() 
ranked_res = ranked_res.rename(columns={'item_id': 'lgbm_ranker'})  
ranked_res.head()

Unnamed: 0,user_id,lgbm_ranker
0,1,"[995242, 1074612, 1082185, 940947, 856942, 557..."
1,3,"[9526563, 1053690, 6463658, 1092937, 910032, 1..."
2,6,"[840361, 1119051, 1037863, 845208, 1024306, 10..."
3,7,"[1082185, 1013321, 1022003, 1106523, 1122358, ..."
4,8,"[1116578, 840361, 1005186, 1082185, 1029743, 8..."


In [1236]:
def get_user_prediction(user_id, N=5):
    
    items = ranker_prediction.loc[ranker_prediction.user_id == user_id, 'item_id'].head(N).tolist()
    if len(items) == 0:
        items = total_result.loc[total_result.user_id == user_id, 'recs'].values[0]
    return items

In [1237]:
test['ranker'] = test['user_id'].apply(lambda x: get_user_prediction(x, N=5))

In [1148]:
submission = transform_data_for_eval(test[['user_id', 'ranker']], rec_col='ranker', user_col='user_id')
submission.to_csv('submission_2.csv', index=False)
!kaggle competitions submit -c gb-recsys-june-july-2022 -f submission.csv -m 'test1' 

Successfully submitted to GB RecSys June-July 2022



  0%|          | 0.00/80.2k [00:00<?, ?B/s]
100%|##########| 80.2k/80.2k [00:04<00:00, 19.7kB/s]


#### Public MAP@5: 0.30775

In [960]:
test = test.merge(total_result, on='user_id', how='left')
submission = transform_data_for_eval(test[['user_id', 'recs']], rec_col='recs', user_col='user_id')
submission.to_csv('submission_3.csv', index=False)
!kaggle competitions submit -c gb-recsys-june-july-2022 -f submission.csv -m 'test1' 
# only first model is better

Successfully submitted to GB RecSys June-July 2022



  0%|          | 0.00/80.3k [00:00<?, ?B/s]
100%|##########| 80.3k/80.3k [00:01<00:00, 45.0kB/s]


#### Public MAP@5: 0.33113

## Predict by 1st model

In [1240]:
data = pd.read_csv('retail_train.csv')
test = pd.read_csv('test_user.csv')

In [1241]:
data['weighted'] = data['week_no'] ** 4

test.rename(columns={'UserId': 'user_id'}, inplace=True)
test_users = test['user_id'].unique()
result_users = data['user_id'].unique()

fake_ids = list(set(test_users) - set(result_users))
fake_ids

[2325]

In [1242]:
fake_data = data.iloc[-len(fake_ids):,:]
fake_data['user_id'] = fake_ids
data = pd.concat([data, fake_data])

In [1243]:
popularity = data.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)


top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

data.loc[~data['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data, 
                                  index='user_id', 
                                  columns='item_id', 
                                  values='weighted', 
                                  aggfunc='sum',
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) 

sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [1244]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [1245]:
id_list = test['user_id'].map(userid_to_id).values.astype(int)

In [1246]:
k_neigbours=9 
n_recomendation = 5

rec_list=[]
for i in tqdm.trange(len(id_list)):    
    sample_vector = np.reshape(user_item_matrix.values[id_list[i]],(1,-1))
    item = get_user_recommendations(user_item_matrix, sample_vector, k_neigbours=k_neigbours, n_recomendation=n_recomendation)
    rec_list.append(item)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1885/1885 [04:21<00:00,  7.21it/s]


In [1247]:
result_list=[]
for i in tqdm.trange(len(rec_list)):
    item = [id_to_itemid[rec] for rec in rec_list[i]]
    result_list.append(item)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1885/1885 [00:00<00:00, 124881.74it/s]


In [1248]:
for i in range(len(result_list)):
    for j in range(len(result_list[i])):
        result_list[i][j] = str(result_list[i][j])
    result_list[i] = ' '.join(result_list[i])

In [1249]:
test['Predicted'] = result_list
test.columns = [['UserId', 'Predicted']]
test

Unnamed: 0,UserId,Predicted
0,1,856942 1082185 995242 940947 8293439
1,2,1106523 1133018 838136 826784 916122
2,3,1053690 9526563 6463658 910032 13842214
3,6,1082185 1037863 1119051 840361 1024306
4,7,1082185 1122358 1106523 1022003 1013321
...,...,...
1880,2496,981760 916122 1106523 883404 992870
1881,2497,860776 995785 1066685 834484 897125
1882,2498,1070820 1082185 1022066 901776 1053690
1883,2499,1070820 5568378 1060872 5570048 5569327


In [1250]:
test.to_csv('final.csv', index=False)

In [974]:
!kaggle competitions submit -c gb-recsys-june-july-2022 -f final.csv -m 'test2'  

Successfully submitted to GB RecSys June-July 2022



  0%|          | 0.00/80.3k [00:00<?, ?B/s]
100%|##########| 80.3k/80.3k [00:01<00:00, 55.2kB/s]


#### Public MAP@5: 0.34917

**Predict for all**

In [978]:
train_id_list = list(userid_to_id.values())
len(set(train_id_list))

2500

In [979]:
train_rec_list=[]
for i in tqdm.trange(len(train_id_list)):    
    sample_vector = np.reshape(user_item_matrix.values[train_id_list[i]],(1,-1))
    item = get_user_recommendations(user_item_matrix, sample_vector, k_neigbours=9, n_recomendation=5)
    train_rec_list.append(item)
    
train_result_list=[]
for i in tqdm.trange(len(train_rec_list)):
    item = [id_to_itemid[rec] for rec in train_rec_list[i]]
    train_result_list.append(item)

100%|██████████████████████████████████████████████████████████████████████████████| 2500/2500 [05:27<00:00,  7.63it/s]
100%|██████████████████████████████████████████████████████████████████████████| 2500/2500 [00:00<00:00, 131522.46it/s]


In [980]:
total_result = pd.DataFrame(data={'user_id': train_id_list, 'recs': train_result_list})
total_result['user_id'] = total_result['user_id'] + 1

In [983]:
total_result['recs'] = total_result['recs'].apply(lambda x: ' '.join([str(i) for i in x]))
total_result.columns = [['UserId', 'Predicted']]

In [985]:
total_result.to_csv('total_prediction.csv', index=False)