# Финальный проект
Мы уже прошли всю необходимую теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ). Рекомендуем вам начать делать проект уже сейчас

Целевая метрика - money precision@5. Порог для уcпешной сдачи проекта money precision@5 > 20%
Бизнес ограничения в топ-5 товарах:

- Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)

- 2 новых товара (юзер никогда не покупал)

- 1 дорогой товар, > 7 долларов (price = sum(sales_value) / sum(quantity))

- Все товары из разных категорий (категория - sub_commodity_desc)

- Стоимость каждого рекомендованного товара > 1 доллара (price = sum(sales_value) / sum(quantity))

- НЕТ ограничения на СТМ

- Вам нужно добавить обрааботку холодных юзеров (рекомендовать им топ-5 популярных товаров)

- Будет public тестовый датасет, на котором вы сможете измерять метрику

- Также будет private тестовый датасет для измерения финального качества

- НЕ обязательно использовать 2-ух уровневые рекоммендательные системы в проекте

- Вы сдаете код проекта в виде github репозитория и .csv файл с рекомендациями. В .csv файле 2 столбца: user_id - (item_id1, item_id2, ..., item_id5)

Дедлайн - 4 октября 23:59

Бейзлайн можно найти на github

# Подключение библиотек и скриптов

In [1]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking ### !!!! Пользуйтесь в финальном проекте

from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender, bm25_weight, tfidf_weight

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

np.random.seed(42)

# Функции

In [2]:
# src.metrics
# целевая метрика
def money_precision_at_k(recommended_list, bought_list, prices_dict, k=5):
        
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    if k < len(recommended_list):
        recommended_list = recommended_list[:k]
        
    prices_recommended = np.array([prices_dict[i] for i in recommended_list])
    
    flags = np.isin(recommended_list, bought_list)
    
    precision = (flags*prices_recommended).sum() / prices_recommended.sum()
    
    return precision

In [3]:
# src.utils
def prefilter_items(data,
                    item_features=None,
                    resent_weeks=12, 
                    filter_frequency=False, 
                    cheap_filter=False,
                    rare_department_filter=False,
                    popular_filter=True, 
                    take_n_popular=5000):
    """Предфильтрация товаров"""
    
    # Уберем товары с нулевым количеством и выручкой
    data = data[data['quantity'] != 0]
    data = data[data['sales_value'] != 0]
    
    prices = data.groupby('item_id').agg({
        'sales_value' : 'sum', 
        'quantity': 'sum'
    }).reset_index()
    
    prices['price'] = prices['sales_value'] / prices['quantity']
    #prices.loc[prices['quantity'] == 0, 'price'] = 0

    # 1. Удаление малоценных товаров 
    low = prices[prices['price'] <= 1]
    low_list = low['item_id'].tolist()
    data = data[~data['item_id'].isin(low_list)]

    # 2. Удаление товаров со слишком большой ценой > 50$
    high = prices[prices['price'] > 50]
    high_list = high['item_id'].tolist()
    data = data[~data['item_id'].isin(high_list)]
    
    # Уберем товары, где дисконт больше 0 (наценка или ошибка?)
    data = data[data['retail_disc'] <= 0]
                
    # Уберем товары, где купонный дисконт -10 и ниже (разово купили по купону и вряд ли купят еще)
    data = data[data['coupon_disc'] > -10]
    
    # Уберем товары, которые не продавались за последние несколько недель
    if resent_weeks != 0:
        weeks = data.groupby('item_id')['week_no'].last().reset_index()
        weeks = weeks[weeks['week_no'] > (96 - resent_weeks)]
        sales_items = weeks['item_id'].tolist()
        data = data[data['item_id'].isin(sales_items)]
    
    # Уберем товары с низкой частотой покупки
    if filter_frequency:
        num_users = data['user_id'].nunique()
        frequency = data.groupby('item_id')['user_id'].nunique().reset_index() 
        frequency.rename(columns={'user_id': 'buyers'}, inplace=True)
        frequency['part_buyers']=frequency['buyers']/num_users
        top_frequency  = frequency[frequency['part_buyers'] > 0.01].item_id.tolist()
        data = data[data['item_id'].isin(top_frequency)]

    # Уберем не интересные для рекомендаций категории (department)
    if item_features is not None:
        # Уберем сезонные товары
        items_in_seasons_comms = item_features[item_features['commodity_desc'].isin(['SPRING/SUMMER SEASONAL', 'HALLOWEEN', 'CHRISTMAS  SEASONAL', 'SEASONAL', 'FALL AND WINTER SEASONAL'])].item_id.unique().tolist()
        data = data[~data['item_id'].isin(items_in_seasons_comms)]
        
        del_deps = [' ', 'FLORAL', 'GM MERCH EXP', 'CHARITABLE CONT', 'ELECT &PLUMBING', 'TOYS', 'HOUSEWARES', 'VIDEO RENTAL', 
                    'PROD-WHS SALES', 'DAIRY DELI', 'CNTRL/STORE SUP', 'PHOTO', 'VIDEO', 'PORK', 'GRO BAKERY', 'HBC',
                    'AUTOMOTIVE', 'RX', 'CNTRL/STORE SUP', 'GM MERCH EXP', 'TRAVEL & LEISUR', 'FROZEN GROCERY', 'RESTAURANT',
                    'COUP/STR & MFG', 'CHEF SHOPPE', 'SPIRITS',  'DELI/SNACK BAR', 'PHARMACY SUPPLY', 'MEAT-WHSE', 'SEAFOOD',
                    'MISC SALES TRAN']
        
        items_del_deps = item_features[item_features['department'].isin(del_deps)].item_id.unique().tolist()
        data = data[~data['item_id'].isin(items_del_deps)]
        
        if cheap_filter:
            # уберем товары из дешевых категорий
            item_features = item_features.merge(prices, on='item_id', how='left')
            subcomms = item_features.groupby('sub_commodity_desc')['price'].mean().reset_index()
            valuable_subcomms = subcomms[subcomms['price'] > 4]['sub_commodity_desc'].unique().tolist()
  
            items_in_valuable_subcomms = item_features[item_features['commodity_desc'].isin(valuable_subcomms)]['item_id'].unique().tolist()
            data = data[data['item_id'].isin(items_in_valuable_subcomms)]
            department_size = pd.DataFrame(item_features.\
                                        groupby('department')['item_id'].nunique().\
                                        sort_values(ascending=False)).reset_index()
            
            #уберем редко встречающиеся категории
        if rare_department_filter:
            department_size = pd.DataFrame(item_features.\
                                        groupby('department')['item_id'].nunique().\
                                        sort_values(ascending=False)).reset_index()
            department_size.columns = ['department', 'n_items']
            rare_departments = department_size[department_size['n_items'] < 300].department.tolist()
            items_in_rare_departments = item_features[item_features['department'].isin(rare_departments)].item_id.unique().tolist()

            data = data[~data['item_id'].isin(items_in_rare_departments)]
                  
    # 4. Выбор топ-N самых популярных товаров (N = take_n_popular)
    if popular_filter:
        popularity_sales = data.groupby('item_id').agg({
            'sales_value' : 'sum', 
            'user_id': 'count'
            }).reset_index()
        popularity_sales = popularity_sales[popularity_sales['user_id'] > 2]
        popularity_sales.sort_values('sales_value', ascending=False, inplace=True)
        n_popular = popularity_sales['item_id'][:take_n_popular].tolist()

        # Заведем фиктивный item_id (если юзер не покупал товары из топ-5000, то он "купил" такой товар)
        data.loc[~data['item_id'].isin(n_popular), 'item_id'] = 9999999
        n_popular.append(9999999)

        data = data[data['item_id'].isin(n_popular)]

    return data

In [4]:
def postfilter_items(recommendations, 
                     user_id,
                     vip_users,
                     train_bought_items_dict, 
                     top_popular_items,
                     overall_top_expensive_purchases,
                     top_expensive_item,
                     item_features, 
                     prices_dict, 
                     N=5, 
                     money_rank=False,
                     divide_users=True):
    """Постфильтрация товаров"""

    # Уникальность
    #     recommendations = list(set(recommendations)) - неверно! так теряется порядок
    unique_recommendations = []
    [unique_recommendations.append(item) for item in recommendations if item not in unique_recommendations]
    
    # возможность ранжирования всех предсказаний по цене
    if money_rank:
        prices_unique_recommendations = [prices_dict[item] for item in unique_recommendations]
        recommended_prises_dict = dict(zip(unique_recommendations, prices_unique_recommendations))
        unique_recommendations = sorted(recommended_prises_dict, key=lambda x: recommended_prises_dict.get(x), reverse=True)
    
    # ранжируем только покупателей с высоким средним чеком - это работает
    if divide_users:
        if user_id in vip_users:
            prices_unique_recommendations = [prices_dict[item] for item in unique_recommendations]
            recommended_prises_dict = dict(zip(unique_recommendations, prices_unique_recommendations))
            unique_recommendations = sorted(recommended_prises_dict, key=lambda x: recommended_prises_dict.get(x), reverse=True)
    
    # Разные категории
    categories_used = []
    final_recommendations = []
    CATEGORY_NAME = 'sub_commodity_desc'
    
    # 1 дорогой товар, > 7 долларов
    expensive_list = []
            
    for key in prices_dict.keys():
        if prices_dict[key] > 7:
            expensive_list.append(key)
            
    if user_id in train_bought_items_dict.keys():
        train_bought_items_list = train_bought_items_dict[user_id]
    else:
        train_bought_items_list = []

    #expensive_new_list = [item for item in expensive_list if item not in train_bought_items_list]
    # берем несколько самых популярных товаров от 7 до 8 долл
    expensive_list_recs = []
     # в первую очередь дополняем товарами из рекомендованных
    [expensive_list_recs.append(item) for item in unique_recommendations if item in expensive_list]
    # дополняем популярными дорогими
    [expensive_list_recs.append(item) for item in overall_top_expensive_purchases if item not in expensive_list_recs]
    
    expensive_item = expensive_list_recs[0]         
    category = item_features.loc[item_features['item_id'] == expensive_item, CATEGORY_NAME].values[0]
    if category not in categories_used:
        final_recommendations.append(expensive_item)
        categories_used.append(category)
    
    if expensive_item in unique_recommendations:
        unique_recommendations.remove(expensive_item)
        
    # 2 новых товара (юзер никогда не покупал)    
    new_rec_list = [item for item in unique_recommendations if item not in train_bought_items_list]
    top_popular_items_not_from_recs = [item for item in top_popular_items if item not in unique_recommendations]
    new_popular = [item for item in top_popular_items_not_from_recs if item not in train_bought_items_list]
    # выбираем наиболее дешевые
    prices_new_popular = [prices_dict[item] for item in new_popular]
    new_popular_prises_dict = dict(zip(new_popular, prices_new_popular))
    new_popular = sorted(new_popular, key=lambda x: new_popular_prises_dict.get(x), reverse=False)
    
    new_items = []
    if new_rec_list:
        for item in new_rec_list:
            category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
            if category not in categories_used:
                new_items.append(item)
                categories_used.append(category)
            if len(new_items) == 2:
                break
    if len(new_items) < 2:
        if new_popular:
            for item in new_popular:
                category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
                if category not in categories_used:
                    new_items.append(item)
                    categories_used.append(category)
                if len(new_items) == 2:
                    break
    if len(new_items) < 2:
        new_list = [item for item in item_features['item_id'].unique() if item not in train_bought_items_list]
        for item in new_list:
                category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
                if category not in categories_used:
                    new_items.append(item)
                    categories_used.append(category)
                if len(new_items) == 2:
                    break
    
    if final_recommendations[0] in train_bought_items_list:
        new_items = new_items[:2]
    else:
        new_items = new_items[:1]
    for new_item in new_items:
        if new_item not in final_recommendations:
            final_recommendations.append(new_item)
            
        if new_item in unique_recommendations:
            unique_recommendations.remove(new_item)
            
    if unique_recommendations:
        for item in unique_recommendations:
            category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]

            if category not in categories_used:
                final_recommendations.append(item)
                unique_recommendations.remove(item)
                categories_used.append(category)
            if len(final_recommendations) >= N:
                break

        # Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)
    n_rec = len(final_recommendations)
    if n_rec < N:
        # Более корректно их нужно дополнить топом популярных (например)
        for item in top_popular_items:
            category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
            
            if category not in categories_used:
                if item not in final_recommendations:
                    final_recommendations.append(item)
                    categories_used.append(category)
            if len(final_recommendations) >= N:
                break
        #final_recommendations.extend(top_popular_items[:N - n_rec])
    
    final_recommendations = final_recommendations[:N]

    assert len(final_recommendations) == N, 'Количество рекомендаций != {}'.format(N)

    return final_recommendations

In [5]:
# src.recommenders
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, item_features, user_features, filter_ctm=False, weighting=True):
        
        self.vip_users = self._get_vip_users(data)
        self.high_mean_cheque_users = self._get_high_mean_cheque_users(data)

        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id']).agg({
            'sales_value' : 'sum', 
            'quantity': 'sum',
            'basket_id': 'count'
            }).reset_index()
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 9999999]
        self.top_purchases = self.top_purchases[self.top_purchases['basket_id'] > 1]
        self.top_purchases['price'] = self.top_purchases['sales_value'] / self.top_purchases['quantity']
        #self.top_purchases = self.top_purchases[self.top_purchases['price'] > 4]
        self.top_purchases.sort_values(['basket_id', 'price'], ascending=False, inplace=True)

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['basket_id'].count().reset_index()
        self.overall_top_purchases.sort_values('basket_id', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 9999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()
        
        # Топ дорогих покупок по всему датасету
        self.overall_top_expensive_items = data.groupby('item_id').agg({
            'sales_value' : 'sum', 
            'quantity': 'sum',
            'basket_id': 'count',
            'user_id' : 'nunique'
            }).reset_index()
        self.overall_top_expensive_items['price'] = self.overall_top_expensive_items['sales_value'] / self.overall_top_expensive_items['quantity']
        self.overall_top_expensive_items = self.overall_top_expensive_items[self.overall_top_expensive_items['price'] > 7]
        self.overall_top_expensive_items = self.overall_top_expensive_items[self.overall_top_expensive_items['item_id'] != 9999999]
        self.overall_top_expensive_items.sort_values('user_id', ascending=False, inplace=True)
        self.overall_top_expensive_items = self.overall_top_expensive_items.reset_index()
        
        # топовый товар
        self.top_expensive_item = self.overall_top_expensive_items['item_id'][0]
        
        # если пользователь раньше не покупал дорогих товаров, мы вряд ли угадаем, поэтому оставим только товары с минимальной подходящей ценой
        self.overall_top_expensive_items = self.overall_top_expensive_items[self.overall_top_expensive_items['price'] < 7.1] 
        self.overall_top_expensive_purchases = self.overall_top_expensive_items.item_id.tolist()

        self.user_item_matrix, self.matrix_index, self.matrix_columns = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)
        
        self.user_feat_lightfm, self.item_feat_lightfm, \
            self.user_feat_values, self.item_feat_values = self._prepare_lightfm_features(self.matrix_index, self.matrix_columns, user_features, item_features)
        
        self.item_id_to_ctm = self._prepare_ctm_dict(item_features)
        
        self.prices_dict = self._prepare_prices_dict(data)
        self.bought_items_dict = self._prepare_bought_items_dict(data)

        if weighting:#=='bm25_weight':
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T
        
        #if weighting:#=='tfidf_weight':
         #   self.user_item_matrix = tfidf_weight(self.user_item_matrix.T).T

        self.model_als = self.fit_als(self.user_item_matrix)
        self.model_bpr = self.fit_bpr(self.user_item_matrix)
       # self.model_lightfm = self.fit_lightfm(self.user_item_matrix, self.user_feat_values, self.item_feat_values)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
        self.knn_recommender = self.fit_knn_recommender(self.user_item_matrix)
        self.cosine_recommender = self.fit_cosine_recommender(self.user_item_matrix)
        self.tfidf_recommender = self.fit_tfidf_recommender(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data):
        """Готовит user-item матрицу"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='basket_id',  # Можно пробовать другие варианты
                                          aggfunc='count',
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit
        
        matrix_index = user_item_matrix.index
        matrix_columns = user_item_matrix.columns

        return user_item_matrix, matrix_index, matrix_columns
    
    @staticmethod
    def _prepare_lightfm_features(matrix_index, matrix_columns, user_features, item_features):
        user_feat = pd.DataFrame(matrix_index)
        user_feat = user_feat.merge(user_features, on='user_id', how='left')
        user_feat.set_index('user_id', inplace=True)

        item_feat = pd.DataFrame(matrix_columns)
        item_feat = item_feat.merge(item_features, on='item_id', how='left')
        item_feat.set_index('item_id', inplace=True)
        
        user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
        item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())
        
        user_feat_values = user_feat_lightfm.values
        item_feat_values = user_feat_lightfm.values
        
        return user_feat_lightfm, item_feat_lightfm, user_feat_values, item_feat_values
           
    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id
    
    # Словарь СТМ
    @staticmethod
    def _prepare_ctm_dict(item_features):
            # Словарь {item_id: 0/1}. 0/1 - факт принадлежности товара к СТМ
        item_id_to_ctm = {}
        for i in range(item_features.shape[0]):
            if item_features['brand'][i] == 'Private':
                item_id_to_ctm[item_features['item_id'][i]] = 1
            else:
                item_id_to_ctm[item_features['item_id'][i]] = 0
                
        return item_id_to_ctm
                
    # Словарь цен
    @staticmethod
    def _prepare_prices_dict(data):
        prices = data.groupby('item_id').agg({
            'sales_value' : 'sum', 
            'quantity': 'sum'
            }).reset_index()
    
        prices['price'] = prices['sales_value'] / prices['quantity']
        prices.loc[prices['quantity'] == 0, 'price'] = 0
        prices.sort_values('price', ascending=True, inplace=True)

        prices_dict = dict(zip(prices['item_id'], prices['price']))
        
        return prices_dict
    
    # Словарь предыдущих покупок пользователя
    @staticmethod
    def _prepare_bought_items_dict(data):
        bought_items = data.groupby('user_id')['item_id'].unique().reset_index()
        bought_items_dict = dict(zip(bought_items['user_id'], bought_items['item_id']))
        
        return bought_items_dict
    
    @staticmethod
    def _get_high_mean_cheque_users(data):
        users_purchases = data.groupby(['user_id', 'item_id']).agg({
            'sales_value' : 'sum', 
            'quantity': 'sum'}).reset_index()
        users_purchases['price'] = users_purchases['sales_value'] / users_purchases['quantity']
        users_purchases_mean_price = users_purchases.groupby('user_id')['price'].mean().reset_index()
        high_mean_cheque_users = users_purchases_mean_price[users_purchases_mean_price['price'] > 4.5]['user_id'].unique().tolist()
        
        return high_mean_cheque_users
    
    @staticmethod
    def _get_vip_users(data):
        users_purchases = data.groupby(['user_id', 'item_id']).agg({
            'sales_value' : 'sum', 
            'quantity': 'sum'}).reset_index()
        users_purchases['price'] = users_purchases['sales_value'] / users_purchases['quantity']
        users_purchases['price_flag'] = users_purchases['price'].apply(lambda x: 1 if x > 4.5 else 0)
        flags_count = users_purchases.groupby('user_id').agg({
            'item_id': 'count',
            'price_flag': 'sum'}).reset_index()
        flags_count['part_more_4_5'] = flags_count['price_flag'] / flags_count['item_id']
        vip_users =  flags_count[flags_count['part_more_4_5'] > 0.3]['user_id'].unique().tolist()
        
        return vip_users

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender
    
    @staticmethod
    def fit_knn_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных ближайшими соседями"""

        knn_recommender = ItemItemRecommender(K=5, num_threads=4)
        knn_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return knn_recommender
    
    @staticmethod
    def fit_cosine_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары по косинусной схожести"""

        cosine_recommender = CosineRecommender(K=5, num_threads=4)
        cosine_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return cosine_recommender
    
    @staticmethod
    def fit_tfidf_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, занижая вес популярных"""

        tfidf_recommender = TFIDFRecommender(K=5, num_threads=4)
        tfidf_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return tfidf_recommender

    @staticmethod
    def fit_als(user_item_matrix, n_factors=20, regularization=0.001, iterations=10, num_threads=4):
        """Обучает ALS"""

        model_als = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        model_als.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model_als
    
    @staticmethod
    def fit_bpr(user_item_matrix, n_factors=20, regularization=0.001, iterations=10, num_threads=4):
        """Обучает BPR"""

        model_bpr = BayesianPersonalizedRanking(factors=n_factors,
                                                regularization=regularization,
                                                iterations=iterations,
                                                num_threads=num_threads)
        model_bpr.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model_bpr

    
    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[1][0]  # И берем второй (не товар из аргумента метода)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        if user in self.matrix_index:
            res = [self.id_to_itemid[rec[0]] for rec in model.recommend(userid=self.userid_to_id[user],
                                        user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[9999999]],#None, если не вводим фиктивный item_id
                                        recalculate_user=False)]
        else:
            res = []

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стандартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model_als, N=N)
    
    def get_bpr_recommendations(self, user, N=5):
        """Рекомендации через стандартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model_bpr, N=N)

    def get_own_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)
    
    def get_knn_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые купили ближайшие соседи"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.knn_recommender, N=N)
    
    def get_cosine_recommendations(self, user, N=5):
        """Рекомендуем товары по косинусной схожести"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.cosine_recommender, N=N)
    
    def get_tfidf_recommendations(self, user, N=5):
        """Рекомендуем товары, занижая вес популярных"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.tfidf_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""
        if filter_ctm:
            ctm = [key for key, value in self.item_id_to_ctm.items() if value == 1]
            top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)
            top_users_purchases = top_users_purchases[top_users_purchases['item_id'].isin(ctm)]

        else:
            top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""

        res = []

        # Находим топ-N похожих пользователей
        similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]   # удалим юзера из запроса

        for user in similar_users:
            res.extend(self.get_own_recommendations(user, N=1))

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        
        return res

# Загрузка данных

In [6]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [7]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [8]:
# spliting data
test_size_weeks = 3

train_data = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
test_data = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [9]:
# filtering data
n_items_before = train_data['item_id'].nunique()

train_data = prefilter_items(train_data, 
                             item_features=item_features, 
                             resent_weeks=12, 
                             filter_frequency=False, 
                             cheap_filter=False, 
                             rare_department_filter=False, 
                             popular_filter=True, 
                             take_n_popular=22000)

n_items_after = train_data['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 21595


In [10]:
recommender = MainRecommender(train_data, item_features, user_features)



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21595.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21595.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21595.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21595.0), HTML(value='')))




In [11]:
result = test_data.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [12]:
N = 110
top_popular_items = recommender.overall_top_purchases[:N]
train_users_list = train_data['user_id'].unique().tolist()

In [13]:
result['self_top_recs'] = result['user_id'].apply(lambda x: recommender.top_purchases[recommender.top_purchases['user_id'] == x]['item_id'].unique() if x in train_users_list else top_popular_items)

In [14]:
result['als_recs'] = result['user_id'].apply(lambda x: recommender.get_als_recommendations(user=x, N=N) if x in train_users_list else top_popular_items)

In [15]:
result['bpr_recs'] = result['user_id'].apply(lambda x: recommender.get_bpr_recommendations(user=x, N=N) if x in train_users_list else top_popular_items)

In [16]:
result['own_recs'] = result['user_id'].apply(lambda x: recommender.get_own_recommendations(user=x, N=N) if x in train_users_list else top_popular_items)

In [17]:
result['knn_recs'] = result['user_id'].apply(lambda x: recommender.get_knn_recommendations(user=x, N=N) if x in train_users_list else top_popular_items)

In [18]:
result['cosine_recs'] = result['user_id'].apply(lambda x: recommender.get_cosine_recommendations(user=x, N=N) if x in train_users_list else top_popular_items)

In [19]:
result['tfidf_recs'] = result['user_id'].apply(lambda x: recommender.get_tfidf_recommendations(user=x, N=N) if x in train_users_list else top_popular_items)

In [20]:
%%time
result['final_self_top_recs'] = result.apply(lambda row: postfilter_items(row['self_top_recs'], 
                                                                            row['user_id'], 
                                                                            recommender.vip_users,
                                                                            recommender.bought_items_dict, 
                                                                            top_popular_items,
                                                                            recommender.overall_top_expensive_purchases,
                                                                            recommender.top_expensive_item,
                                                                            item_features, 
                                                                            recommender.prices_dict, 
                                                                            N=5,
                                                                            money_rank=False,
                                                                            divide_users=True), axis=1)

Wall time: 1min 12s


In [21]:
result['final_als_recs'] = result.apply(lambda row: postfilter_items(row['als_recs'], row['user_id'], 
                                                                            recommender.vip_users,
                                                                            recommender.bought_items_dict, 
                                                                            top_popular_items,
                                                                            recommender.overall_top_expensive_purchases,
                                                                            recommender.top_expensive_item,
                                                                            item_features, 
                                                                            recommender.prices_dict, 
                                                                            N=5,
                                                                            money_rank=False,
                                                                            divide_users=True), axis=1)

In [22]:
result['final_bpr_recs'] = result.apply(lambda row: postfilter_items(row['bpr_recs'], row['user_id'], 
                                                                            recommender.vip_users,
                                                                            recommender.bought_items_dict, 
                                                                            top_popular_items,
                                                                            recommender.overall_top_expensive_purchases,
                                                                            recommender.top_expensive_item,
                                                                            item_features, 
                                                                            recommender.prices_dict, 
                                                                            N=5,
                                                                            money_rank=False,
                                                                            divide_users=True), axis=1)

In [23]:
result['final_own_recs'] = result.apply(lambda row: postfilter_items(row['own_recs'], row['user_id'], 
                                                                            recommender.vip_users,
                                                                            recommender.bought_items_dict, 
                                                                            top_popular_items,
                                                                            recommender.overall_top_expensive_purchases,
                                                                            recommender.top_expensive_item,
                                                                            item_features, 
                                                                            recommender.prices_dict, 
                                                                            N=5,
                                                                            money_rank=False,
                                                                            divide_users=True), axis=1)

In [25]:
result['final_knn_recs'] = result.apply(lambda row: postfilter_items(row['knn_recs'], row['user_id'], 
                                                                            recommender.vip_users,
                                                                            recommender.bought_items_dict, 
                                                                            top_popular_items,
                                                                            recommender.overall_top_expensive_purchases,
                                                                            recommender.top_expensive_item,
                                                                            item_features, 
                                                                            recommender.prices_dict, 
                                                                            N=5,
                                                                            money_rank=False,
                                                                            divide_users=True), axis=1)

In [26]:
result['final_cosine_recs'] = result.apply(lambda row: postfilter_items(row['cosine_recs'], row['user_id'], 
                                                                            recommender.vip_users,
                                                                            recommender.bought_items_dict, 
                                                                            top_popular_items,
                                                                            recommender.overall_top_expensive_purchases,
                                                                            recommender.top_expensive_item,
                                                                            item_features, 
                                                                            recommender.prices_dict, 
                                                                            N=5,
                                                                            money_rank=False,
                                                                            divide_users=True), axis=1)

In [27]:
# похоже, единственная функция, возвращающая списки меньше 5
result['final_tfidf_recs'] = result.apply(lambda row: postfilter_items(row['tfidf_recs'], row['user_id'], 
                                                                            recommender.vip_users,
                                                                            recommender.bought_items_dict, 
                                                                            top_popular_items,
                                                                            recommender.overall_top_expensive_purchases,
                                                                            recommender.top_expensive_item,
                                                                            item_features, 
                                                                            recommender.prices_dict,
                                                                            N=5,
                                                                            money_rank=False,
                                                                            divide_users=True), axis=1)

In [28]:
result.apply(lambda row: money_precision_at_k(row['final_self_top_recs'], row['actual'], recommender.prices_dict, k=5), axis=1).mean()

0.20003298012533735

In [29]:
result.apply(lambda row: money_precision_at_k(row['final_als_recs'], row['actual'], recommender.prices_dict, k=5), axis=1).mean()

0.07918718428614631

In [30]:
result.apply(lambda row: money_precision_at_k(row['final_bpr_recs'], row['actual'], recommender.prices_dict, k=5), axis=1).mean()

0.05711043380214204

In [31]:
result.apply(lambda row: money_precision_at_k(row['final_own_recs'], row['actual'], recommender.prices_dict, k=5), axis=1).mean()

0.07688712382952065

In [32]:
result.apply(lambda row: money_precision_at_k(row['final_knn_recs'], row['actual'], recommender.prices_dict, k=5), axis=1).mean()

0.07938718231654371

In [33]:
result.apply(lambda row: money_precision_at_k(row['final_cosine_recs'], row['actual'], recommender.prices_dict, k=5), axis=1).mean()

0.08149234122237256

In [34]:
result.apply(lambda row: money_precision_at_k(row['final_tfidf_recs'], row['actual'], recommender.prices_dict, k=5), axis=1).mean()

0.06803543321442851

In [35]:
result['final_recs'] = result['final_self_top_recs']

In [36]:
# всего товаров попало в рекомендации
all_recs = []
for i in range(result.shape[0]):
    for item in result['final_recs'][i]:
        if item not in all_recs:
            all_recs.append(item)
print(f'Рекомендуем {len(all_recs)} товаров')

Рекомендуем 2282 товаров


In [37]:
# Проверяем категории:
for i in range(result.shape[0]):
    category_list = [] 
    for item in result['final_recs'][i]:
        category = item_features.loc[item_features['item_id'] == item, 'sub_commodity_desc'].values[0]
        category_list.append(category)
    if len(set(category_list)) != 5:
        print(f"Для пользователя {result['user_id'][i]} нет 5 разных категорий")

In [38]:
# Проверяем наличие дорогого товара:
for i in range(result.shape[0]):
    prices_recommended = np.array([recommender.prices_dict[item] for item in result['final_recs'][i]])
    res = [1 if price>7  else 0 for price in prices_recommended]
    if sum(res) == 0:
        print(f"Для пользователя {result['user_id'][i]} нет дорогого товара")

In [39]:
# Проверяем наличие двух новых товаров:
for i in range(result.shape[0]):
    user_id = result['user_id'][i]
    if user_id in recommender.bought_items_dict.keys():
        user_bought_list = recommender.bought_items_dict[user_id]
    else: 
        user_bought_list = []
    flags = np.isin(result['final_recs'][i], user_bought_list)
    if sum(flags) > 3:
        print(f'Для пользователя {user_id} нет двух новых товаров')

In [40]:
# Проверяем цены больше 1 долл.:
for i in range(result.shape[0]):
    prices_recommended = np.array([recommender.prices_dict[item] for item in result['final_recs'][i]])
    res = [1 if price<1  else 0 for price in prices_recommended]
    if sum(res) != 0:
        print(f"Для пользователя {result['user_id'][i]} есть товары с ценой меньше 1 долл")

In [41]:
result[['user_id', 'final_recs']].to_csv('OBerezovskaya_recs_.csv', index=False)