# Финальный проект
Мы уже прошли всю необходимую теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ). Рекомендуем вам начать делать проект уже сейчас

Целевая метрика - money precision@5. Порог для уcпешной сдачи проекта money precision@5 > 20%
Бизнес ограничения в топ-5 товарах:

- Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)

- 2 новых товара (юзер никогда не покупал)

- 1 дорогой товар, > 7 долларов (price = sum(sales_value) / sum(quantity))

- Все товары из разных категорий (категория - sub_commodity_desc)

- Стоимость каждого рекомендованного товара > 1 доллара (price = sum(sales_value) / sum(quantity))

- НЕТ ограничения на СТМ

- Вам нужно добавить обрааботку холодных юзеров (рекомендовать им топ-5 популярных товаров)

- Будет public тестовый датасет, на котором вы сможете измерять метрику

- Также будет private тестовый датасет для измерения финального качества

- НЕ обязательно использовать 2-ух уровневые рекоммендательные системы в проекте

- Вы сдаете код проекта в виде github репозитория и .csv файл с рекомендациями. В .csv файле 2 столбца: user_id - (item_id1, item_id2, ..., item_id5)

Дедлайн - 4 октября 23:59

Бейзлайн можно найти на github

# Подключение библиотек и скриптов

In [1]:
import pandas as pd
import numpy as np

np.random.seed(42)

# Функции

In [2]:
# src.metrics
# целевая метрика
def money_precision_at_k(recommended_list, bought_list, prices_dict, k=5):
        
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    if k < len(recommended_list):
        recommended_list = recommended_list[:k]
        
    prices_recommended = np.array([prices_dict[i] for i in recommended_list])
    
    flags = np.isin(recommended_list, bought_list)
    
    precision = (flags*prices_recommended).sum() / prices_recommended.sum()
    
    return precision

In [3]:
# src.utils
def prefilter_items(data,
                    item_features=None,
                    resent_weeks=12,
                    popular_filter=True, 
                    take_n_popular=22000):
    """Предфильтрация товаров"""
    
    # Уберем товары с нулевым количеством и выручкой
    data = data[data['quantity'] != 0]
    data = data[data['sales_value'] != 0]
    
    prices = data.groupby('item_id').agg({
        'sales_value' : 'sum', 
        'quantity': 'sum'
    }).reset_index()
    
    prices['price'] = prices['sales_value'] / prices['quantity']
    #prices.loc[prices['quantity'] == 0, 'price'] = 0

    # 1. Удаление малоценных товаров 
    low = prices[prices['price'] <= 1]
    low_list = low['item_id'].tolist()
    data = data[~data['item_id'].isin(low_list)]

    # 2. Удаление товаров со слишком большой ценой > 50$
    high = prices[prices['price'] > 50]
    high_list = high['item_id'].tolist()
    data = data[~data['item_id'].isin(high_list)]
    
    # Уберем товары, где дисконт больше 0 (наценка или ошибка?)
    data = data[data['retail_disc'] <= 0]
                
    # Уберем товары, где купонный дисконт -10 и ниже (разово купили по купону и вряд ли купят еще)
    data = data[data['coupon_disc'] > -10]
    
    # Уберем товары, которые не продавались за последние несколько недель
    if resent_weeks != 0:
        weeks = data.groupby('item_id')['week_no'].last().reset_index()
        weeks = weeks[weeks['week_no'] > (96 - resent_weeks)]
        sales_items = weeks['item_id'].tolist()
        data = data[data['item_id'].isin(sales_items)]
    

    # Уберем не интересные для рекомендаций категории (department)
    if item_features is not None:
        # Уберем сезонные товары
        items_in_seasons_comms = item_features[item_features['commodity_desc'].isin(['SPRING/SUMMER SEASONAL', 'HALLOWEEN', 'CHRISTMAS  SEASONAL', 'SEASONAL', 'FALL AND WINTER SEASONAL'])].item_id.unique().tolist()
        data = data[~data['item_id'].isin(items_in_seasons_comms)]
        
        del_deps = [' ', 'FLORAL', 'GM MERCH EXP', 'CHARITABLE CONT', 'ELECT &PLUMBING', 'TOYS', 'HOUSEWARES', 'VIDEO RENTAL', 
                    'PROD-WHS SALES', 'DAIRY DELI', 'CNTRL/STORE SUP', 'PHOTO', 'VIDEO', 'PORK', 'GRO BAKERY', 'HBC',
                    'AUTOMOTIVE', 'RX', 'CNTRL/STORE SUP', 'GM MERCH EXP', 'TRAVEL & LEISUR', 'FROZEN GROCERY', 'RESTAURANT',
                    'COUP/STR & MFG', 'CHEF SHOPPE', 'SPIRITS',  'DELI/SNACK BAR', 'PHARMACY SUPPLY', 'MEAT-WHSE', 'SEAFOOD',
                    'MISC SALES TRAN']
        
        items_del_deps = item_features[item_features['department'].isin(del_deps)].item_id.unique().tolist()
        data = data[~data['item_id'].isin(items_del_deps)]
        
    # 4. Выбор топ-N самых популярных товаров (N = take_n_popular)
    if popular_filter:
        popularity_sales = data.groupby('item_id').agg({
            'sales_value' : 'sum', 
            'user_id': 'count'
            }).reset_index()
        popularity_sales = popularity_sales[popularity_sales['user_id'] > 2]
        popularity_sales.sort_values('sales_value', ascending=False, inplace=True)
        n_popular = popularity_sales['item_id'][:take_n_popular].tolist()

        # Заведем фиктивный item_id (если юзер не покупал товары из топ-5000, то он "купил" такой товар)
        data.loc[~data['item_id'].isin(n_popular), 'item_id'] = 9999999
        n_popular.append(9999999)

        data = data[data['item_id'].isin(n_popular)]

    return data

In [4]:
def postfilter_items(recommendations, 
                     user_id,
                     vip_users,
                     train_bought_items_dict, 
                     top_popular_items,
                     overall_top_expensive_purchases,
                     item_features, 
                     prices_dict, 
                     N=5, 
                     divide_users=True):
    """Постфильтрация товаров"""

    # Уникальность
    #     recommendations = list(set(recommendations)) - неверно! так теряется порядок
    unique_recommendations = []
    [unique_recommendations.append(item) for item in recommendations if item not in unique_recommendations]
    
    # ранжируем только покупателей с высоким средним чеком - это работает
    if divide_users:
        if user_id in vip_users:
            prices_unique_recommendations = [prices_dict[item] for item in unique_recommendations]
            recommended_prises_dict = dict(zip(unique_recommendations, prices_unique_recommendations))
            unique_recommendations = sorted(recommended_prises_dict, key=lambda x: recommended_prises_dict.get(x), reverse=True)
    
    # Разные категории
    categories_used = []
    final_recommendations = []
    CATEGORY_NAME = 'sub_commodity_desc'
    
    # 1 дорогой товар, > 7 долларов
    expensive_list = []
            
    for key in prices_dict.keys():
        if prices_dict[key] > 7:
            expensive_list.append(key)
            
    if user_id in train_bought_items_dict.keys():
        train_bought_items_list = train_bought_items_dict[user_id]
    else:
        train_bought_items_list = []

    # берем несколько самых популярных товаров от 7 до 8 долл
    expensive_list_recs = []
     # в первую очередь дополняем товарами из рекомендованных
    [expensive_list_recs.append(item) for item in unique_recommendations if item in expensive_list]
    # дополняем популярными дорогими
    [expensive_list_recs.append(item) for item in overall_top_expensive_purchases if item not in expensive_list_recs]
    
    expensive_item = expensive_list_recs[0]         
    category = item_features.loc[item_features['item_id'] == expensive_item, CATEGORY_NAME].values[0]
    if category not in categories_used:
        final_recommendations.append(expensive_item)
        categories_used.append(category)
    
    if expensive_item in unique_recommendations:
        unique_recommendations.remove(expensive_item)
        
    # 2 новых товара (юзер никогда не покупал)    
    new_rec_list = [item for item in unique_recommendations if item not in train_bought_items_list]
    top_popular_items_not_from_recs = [item for item in top_popular_items if item not in unique_recommendations]
    new_popular = [item for item in top_popular_items_not_from_recs if item not in train_bought_items_list]
    # выбираем наиболее дешевые
    prices_new_popular = [prices_dict[item] for item in new_popular]
    new_popular_prises_dict = dict(zip(new_popular, prices_new_popular))
    new_popular = sorted(new_popular, key=lambda x: new_popular_prises_dict.get(x), reverse=False)
    
    new_items = []
    if new_rec_list:
        for item in new_rec_list:
            category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
            if category not in categories_used:
                new_items.append(item)
                categories_used.append(category)
            if len(new_items) == 2:
                break
    if len(new_items) < 2:
        if new_popular:
            for item in new_popular:
                category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
                if category not in categories_used:
                    new_items.append(item)
                    categories_used.append(category)
                if len(new_items) == 2:
                    break
    if len(new_items) < 2:
        new_list = [item for item in item_features['item_id'].unique() if item not in train_bought_items_list]
        for item in new_list:
                category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
                if category not in categories_used:
                    new_items.append(item)
                    categories_used.append(category)
                if len(new_items) == 2:
                    break
    
    if final_recommendations[0] in train_bought_items_list:
        new_items = new_items[:2]
    else:
        new_items = new_items[:1]
    for new_item in new_items:
        if new_item not in final_recommendations:
            final_recommendations.append(new_item)
            
        if new_item in unique_recommendations:
            unique_recommendations.remove(new_item)
            
    if unique_recommendations:
        for item in unique_recommendations:
            category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]

            if category not in categories_used:
                final_recommendations.append(item)
                unique_recommendations.remove(item)
                categories_used.append(category)
            if len(final_recommendations) >= N:
                break

        # Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)
    n_rec = len(final_recommendations)
    if n_rec < N:
        # Более корректно их нужно дополнить топом популярных (например)
        for item in top_popular_items:
            category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
            
            if category not in categories_used:
                if item not in final_recommendations:
                    final_recommendations.append(item)
                    categories_used.append(category)
            if len(final_recommendations) >= N:
                break
    
    final_recommendations = final_recommendations[:N]

    assert len(final_recommendations) == N, 'Количество рекомендаций != {}'.format(N)

    return final_recommendations

In [5]:
# src.recommenders
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, item_features, user_features, filter_ctm=False, weighting=True):
        
        self.vip_users = self._get_vip_users(data)

        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id']).agg({
            'sales_value' : 'sum', 
            'quantity': 'sum',
            'basket_id': 'count'
            }).reset_index()
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 9999999]
        self.top_purchases = self.top_purchases[self.top_purchases['basket_id'] > 1]
        self.top_purchases['price'] = self.top_purchases['sales_value'] / self.top_purchases['quantity']
        self.top_purchases.sort_values(['basket_id', 'price'], ascending=False, inplace=True)

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['basket_id'].count().reset_index()
        self.overall_top_purchases.sort_values('basket_id', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 9999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()
        
        # Топ дорогих покупок по всему датасету
        self.overall_top_expensive_items = data.groupby('item_id').agg({
            'sales_value' : 'sum', 
            'quantity': 'sum',
            'basket_id': 'count',
            'user_id' : 'nunique'
            }).reset_index()
        self.overall_top_expensive_items['price'] = self.overall_top_expensive_items['sales_value'] / self.overall_top_expensive_items['quantity']
        self.overall_top_expensive_items = self.overall_top_expensive_items[self.overall_top_expensive_items['price'] > 7]
        self.overall_top_expensive_items = self.overall_top_expensive_items[self.overall_top_expensive_items['item_id'] != 9999999]
        self.overall_top_expensive_items.sort_values('user_id', ascending=False, inplace=True)
        self.overall_top_expensive_items = self.overall_top_expensive_items.reset_index()        
        # если пользователь раньше не покупал дорогих товаров, мы вряд ли угадаем, поэтому оставим только товары с минимальной подходящей ценой
        self.overall_top_expensive_items = self.overall_top_expensive_items[self.overall_top_expensive_items['price'] < 7.1] 
        self.overall_top_expensive_purchases = self.overall_top_expensive_items.item_id.tolist()
        
        self.prices_dict = self._prepare_prices_dict(data)
        self.bought_items_dict = self._prepare_bought_items_dict(data)
                
    # Словарь цен
    @staticmethod
    def _prepare_prices_dict(data):
        prices = data.groupby('item_id').agg({
            'sales_value' : 'sum', 
            'quantity': 'sum'
            }).reset_index()
    
        prices['price'] = prices['sales_value'] / prices['quantity']
        prices.loc[prices['quantity'] == 0, 'price'] = 0
        prices.sort_values('price', ascending=True, inplace=True)

        prices_dict = dict(zip(prices['item_id'], prices['price']))
        
        return prices_dict
    
    # Словарь предыдущих покупок пользователя
    @staticmethod
    def _prepare_bought_items_dict(data):
        bought_items = data.groupby('user_id')['item_id'].unique().reset_index()
        bought_items_dict = dict(zip(bought_items['user_id'], bought_items['item_id']))
        
        return bought_items_dict
    
    @staticmethod
    def _get_vip_users(data):
        users_purchases = data.groupby(['user_id', 'item_id']).agg({
            'sales_value' : 'sum', 
            'quantity': 'sum'}).reset_index()
        users_purchases['price'] = users_purchases['sales_value'] / users_purchases['quantity']
        users_purchases['price_flag'] = users_purchases['price'].apply(lambda x: 1 if x > 4.5 else 0)
        flags_count = users_purchases.groupby('user_id').agg({
            'item_id': 'count',
            'price_flag': 'sum'}).reset_index()
        flags_count['part_more_4_5'] = flags_count['price_flag'] / flags_count['item_id']
        vip_users =  flags_count[flags_count['part_more_4_5'] > 0.3]['user_id'].unique().tolist()
        
        return vip_users

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

# Загрузка данных

In [6]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [7]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [8]:
# spliting data
test_size_weeks = 3

train_data = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
test_data = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [9]:
# filtering data
n_items_before = train_data['item_id'].nunique()

train_data = prefilter_items(train_data, 
                             item_features=item_features, 
                             resent_weeks=12, 
                             popular_filter=True, 
                             take_n_popular=22000)

n_items_after = train_data['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 21595


In [10]:
recommender = MainRecommender(train_data, item_features, user_features)

In [11]:
result = test_data.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [12]:
N = 110
top_popular_items = recommender.overall_top_purchases[:N]
train_users_list = train_data['user_id'].unique().tolist()

In [13]:
result['self_top_recs'] = result['user_id'].apply(lambda x: recommender.top_purchases[recommender.top_purchases['user_id'] == x]['item_id'].unique() if x in train_users_list else top_popular_items)

In [14]:
%%time
result['final_recs'] = result.apply(lambda row: postfilter_items(row['self_top_recs'], 
                                                                 row['user_id'],
                                                                 recommender.vip_users,
                                                                 recommender.bought_items_dict, 
                                                                 top_popular_items,
                                                                 recommender.overall_top_expensive_purchases,
                                                                 item_features, 
                                                                 recommender.prices_dict,
                                                                 N=5,
                                                                 divide_users=True), axis=1)

Wall time: 1min 12s


In [15]:
result.apply(lambda row: money_precision_at_k(row['final_recs'], row['actual'], recommender.prices_dict, k=5), axis=1).mean()

0.20003298012533735

In [16]:
# всего товаров попало в рекомендации
all_recs = []
for i in range(result.shape[0]):
    for item in result['final_recs'][i]:
        if item not in all_recs:
            all_recs.append(item)
print(f'Рекомендуем {len(all_recs)} товаров')

Рекомендуем 2282 товаров


In [17]:
# Проверяем категории:
for i in range(result.shape[0]):
    category_list = [] 
    for item in result['final_recs'][i]:
        category = item_features.loc[item_features['item_id'] == item, 'sub_commodity_desc'].values[0]
        category_list.append(category)
    if len(set(category_list)) != 5:
        print(f"Для пользователя {result['user_id'][i]} нет 5 разных категорий")

In [18]:
# Проверяем наличие дорогого товара:
for i in range(result.shape[0]):
    prices_recommended = np.array([recommender.prices_dict[item] for item in result['final_recs'][i]])
    res = [1 if price>7  else 0 for price in prices_recommended]
    if sum(res) == 0:
        print(f"Для пользователя {result['user_id'][i]} нет дорогого товара")

In [19]:
# Проверяем наличие двух новых товаров:
for i in range(result.shape[0]):
    user_id = result['user_id'][i]
    if user_id in recommender.bought_items_dict.keys():
        user_bought_list = recommender.bought_items_dict[user_id]
    else: 
        user_bought_list = []
    flags = np.isin(result['final_recs'][i], user_bought_list)
    if sum(flags) > 3:
        print(f'Для пользователя {user_id} нет двух новых товаров')

In [20]:
# Проверяем цены больше 1 долл.:
for i in range(result.shape[0]):
    prices_recommended = np.array([recommender.prices_dict[item] for item in result['final_recs'][i]])
    res = [1 if price<=1  else 0 for price in prices_recommended]
    if sum(res) != 0:
        print(f"Для пользователя {result['user_id'][i]} есть товары с ценой меньше 1 долл")

In [21]:
# сохраняем результат
result[['user_id', 'final_recs']].to_csv('OBerezovskaya_recs.csv', index=False)

In [22]:
# проверяем на тесте
test_data1 = pd.read_csv('data/retail_test1.csv')

In [23]:
result1 = test_data1.groupby('user_id')['item_id'].unique().reset_index()
result1.columns=['user_id', 'actual']
result1.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [24]:
result1['self_top_recs'] = result1['user_id'].apply(lambda x: recommender.top_purchases[recommender.top_purchases['user_id'] == x]['item_id'].unique() if x in train_users_list else top_popular_items)

In [27]:
%%time
result1['final_self_top_recs'] = result1.apply(lambda row: postfilter_items(row['self_top_recs'], 
                                                                            row['user_id'], 
                                                                            recommender.vip_users,
                                                                            recommender.bought_items_dict, 
                                                                            top_popular_items,
                                                                            recommender.overall_top_expensive_purchases,
                                                                            item_features, 
                                                                            recommender.prices_dict, 
                                                                            N=5,
                                                                            divide_users=True), axis=1)

Wall time: 1min 12s


In [28]:
result1.apply(lambda row: money_precision_at_k(row['final_self_top_recs'], row['actual'], recommender.prices_dict, k=5), axis=1).mean()

0.17027612568246056

In [29]:
# всего товаров попало в рекомендации
all_recs = []
for i in range(result1.shape[0]):
    for item in result1['final_self_top_recs'][i]:
        if item not in all_recs:
            all_recs.append(item)
print(f'Рекомендуем {len(all_recs)} товаров')

Рекомендуем 2135 товаров
