In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [4]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [5]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [6]:
total_sum_sales = data['sales_value'].sum()
total_quantity = data['quantity'].sum()

f = {'sales_value':'sum', 'quantity':'sum'}
popularity = data.groupby('item_id').agg(f)
popularity['weight_sum_value'] = popularity['sales_value'].apply(lambda x: x/total_sum_sales)
popularity['weight_quantity'] = popularity['quantity'].apply(lambda x: x/total_quantity)
popularity = popularity.reset_index()
# popularity['weight_sum_value'] = popularity['weight_sum_value'].fillna(0)
# popularity['quantity_value_weight'] = popularity['quantity_value_weight'].fillna(0)
popularity

Unnamed: 0,item_id,sales_value,quantity,weight_sum_value,weight_quantity
0,25671,20.94,6,2.817523e-06,2.493948e-08
1,26081,0.99,1,1.332067e-07,4.156580e-09
2,26093,1.59,1,2.139380e-07,4.156580e-09
3,26190,1.54,1,2.072103e-07,4.156580e-09
4,26355,1.98,2,2.664133e-07,8.313160e-09
...,...,...,...,...,...
89046,17991689,2.49,1,3.350349e-07,4.156580e-09
89047,17991691,2.49,1,3.350349e-07,4.156580e-09
89048,18000012,19.96,4,2.685661e-06,1.662632e-08
89049,18024155,3.99,1,5.368632e-07,4.156580e-09


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [7]:
def weighted_random_recommendation(items, items_weights, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    items_weights = np.array(items_weights)
    recs = np.random.choice(items, size=n, p=items_weights, replace=False)
    
    return recs.tolist()

In [8]:
%%time

items = popularity['item_id']
items_weights = popularity['weight_sum_value']

result['weighted_random_recommendation_sales_value'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items, items_weights, n=5))

result.head(2)

Wall time: 5.66 s


Unnamed: 0,user_id,actual,weighted_random_recommendation_sales_value
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1082627, 8181516, 882595, 1037840, 1111986]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[8090532, 868015, 1137383, 894236, 8090541]"


In [9]:
%%time

items = popularity['item_id']
items_weights = popularity['weight_quantity']

result['weighted_random_recommendation_quantity'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items, items_weights, n=5))

result.head(2)

Wall time: 7.45 s


Unnamed: 0,user_id,actual,weighted_random_recommendation_sales_value,weighted_random_recommendation_quantity
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1082627, 8181516, 882595, 1037840, 1111986]","[6534178, 6533889, 480014, 931255, 397896]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[8090532, 868015, 1137383, 894236, 8090541]","[6534178, 5703832, 6533889, 6534166, 6410462]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [10]:
result = pd.read_csv('predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random_recommendation_sales_value,weighted_random_recommendation_quantity,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[1029112, 1060543, 879506, 991303, 1384908]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 962274, 1076199, 884794, 1127831]","[6534166, 6534178, 938700, 6533889, 6544236]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 1098066]","[1082185, 995242, 1029743, 840361, 904360]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[5666390, 6602548, 12648199, 1003241, 1045948]","[6534178, 6533889, 1029743, 6534166, 1082185]","[853178, 1082185, 1139651, 987480, 1139830]","[6533889, 6534178, 397896, 6534166, 6544236]","[6666, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 6666, 826249]","[1082185, 981760, 1098066, 826249, 6666]","[1082185, 1098066, 6534178, 826249, 1127831]"


Данные загрузили не как списки, обработаем

In [11]:
def to_list_int1(str):
    if str != '[]':
        result = str.strip(',[/]/').split(',')
        int_result = [int(x) for x in result]
        return int_result
    return []


def to_list_int2(str):
    if str != '[]' and str != '':
        result = str.strip(',[/]/').split()
        int_result = [int(x) for x in result]
        return int_result
    return []

In [12]:
for col_name in result.columns[2:]:
    result[col_name] = result[col_name].apply(lambda x: to_list_int1(x))
result['actual'] = result['actual'].apply(lambda x: to_list_int2(x))
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random_recommendation_sales_value,weighted_random_recommendation_quantity,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1029112, 1060543, 879506, 991303, 1384908]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 962274, 1076199, 884794, 1127831]","[6534166, 6534178, 938700, 6533889, 6544236]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 1098066]","[1082185, 995242, 1029743, 840361, 904360]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[5666390, 6602548, 12648199, 1003241, 1045948]","[6534178, 6533889, 1029743, 6534166, 1082185]","[853178, 1082185, 1139651, 987480, 1139830]","[6533889, 6534178, 397896, 6534166, 6544236]","[6666, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 6666, 826249]","[1082185, 981760, 1098066, 826249, 6666]","[1082185, 1098066, 6534178, 826249, 1127831]"


In [13]:
import os, sys
    
from metrics import hit_rate_at_k, precision_at_k, recall_at_k, ap_k

In [14]:
hit_rate_at_k_list = []
precision_at_k_list = []
recall_at_k_list = []
ap_k_list = []

for name_col in result.columns[1:]:
    hit_rate_at_k_list.append(round(result.apply(lambda row: hit_rate_at_k(row[name_col], row['actual']), axis=1).mean(),4))
    precision_at_k_list.append(round(result.apply(lambda row: precision_at_k(row[name_col], row['actual']), axis=1).mean(),4))
    recall_at_k_list.append(round(result.apply(lambda row: recall_at_k(row[name_col], row['actual']), axis=1).mean(),4))
    ap_k_list.append(round(result.apply(lambda row: ap_k(row[name_col], row['actual']), axis=1).mean(),4))

  return flags.sum() / len(recommended_list)


In [15]:
metrics_dict = {'hit_rate_at_k':hit_rate_at_k_list, 
                'precision_at_k': precision_at_k_list, 
                'recall_at_k': recall_at_k_list, 
                'ap_k': ap_k_list}

In [16]:
metrics = pd.DataFrame(data=metrics_dict, index=result.columns[1:])
metrics

Unnamed: 0,hit_rate_at_k,precision_at_k,recall_at_k,ap_k
actual,1.0,1.0,0.271,0.0
random_recommendation,0.0034,0.0007,0.0001,0.0
popular_recommendation,0.5313,0.1552,0.025,0.0
weighted_random_recommendation_sales_value,0.097,0.0212,0.0034,0.0
weighted_random_recommendation_quantity,0.2228,0.0469,0.0125,0.0
itemitem,0.4868,0.1368,0.0157,0.0
cosine,0.4765,0.1329,0.0148,0.0
tfidf,0.4853,0.139,0.0154,0.0
own_purchases,0.6126,0.2199,0.0289,0.0


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

Попробую улучшить weighted_random_recommendation_quantity, взяв по топ-5000 товаров

In [17]:
popularity_top = popularity.sort_values(by='quantity', ascending=False).head(5000)
total_quantity_top = popularity_top['quantity'].sum()

popularity_top['weight_quantity_top'] = popularity_top['quantity'].apply(lambda x: x/total_quantity_top)
popularity_top.head(4)

Unnamed: 0,item_id,sales_value,quantity,weight_sum_value,weight_quantity,weight_quantity_top
56233,6534178,467993.62,199684264,0.06297,0.830004,0.833551
56193,6533889,42645.75,16911359,0.005738,0.070293,0.070594
56228,6534166,31298.96,12946508,0.004211,0.053813,0.054043
56341,6544236,6774.1,2578976,0.000911,0.01072,0.010766


In [18]:
%%time

items = popularity_top['item_id']
items_weights = popularity_top['weight_quantity_top']

result['weighted_random_recommendation_quantity_top5000'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items, items_weights, n=5))

result.head(2)

Wall time: 502 ms


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random_recommendation_sales_value,weighted_random_recommendation_quantity,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation_quantity_top5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1029112, 1060543, 879506, 991303, 1384908]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 962274, 1076199, 884794, 1127831]","[6534166, 6534178, 938700, 6533889, 6544236]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 1098066]","[1082185, 995242, 1029743, 840361, 904360]","[6534178, 1051810, 6534166, 420647, 9707240]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[5666390, 6602548, 12648199, 1003241, 1045948]","[6534178, 6533889, 1029743, 6534166, 1082185]","[853178, 1082185, 1139651, 987480, 1139830]","[6533889, 6534178, 397896, 6534166, 6544236]","[6666, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 6666, 826249]","[1082185, 981760, 1098066, 826249, 6666]","[1082185, 1098066, 6534178, 826249, 1127831]","[6534178, 6534166, 6533889, 1404121, 1388206]"


In [19]:
metrics.loc['weighted_random_recommendation_quantity_top5000'] = [round(result.apply(lambda row: hit_rate_at_k(row['weighted_random_recommendation_quantity_top5000'], row['actual']), axis=1).mean(),4),
                                                             round(result.apply(lambda row: precision_at_k(row['weighted_random_recommendation_quantity_top5000'], row['actual']), axis=1).mean(),4),
                                                             round(result.apply(lambda row: recall_at_k(row['weighted_random_recommendation_quantity_top5000'], row['actual']), axis=1).mean(),4),
                                                             round(result.apply(lambda row: ap_k(row['weighted_random_recommendation_quantity_top5000'], row['actual']), axis=1).mean(),4)]

In [20]:
metrics

Unnamed: 0,hit_rate_at_k,precision_at_k,recall_at_k,ap_k
actual,1.0,1.0,0.271,0.0
random_recommendation,0.0034,0.0007,0.0001,0.0
popular_recommendation,0.5313,0.1552,0.025,0.0
weighted_random_recommendation_sales_value,0.097,0.0212,0.0034,0.0
weighted_random_recommendation_quantity,0.2228,0.0469,0.0125,0.0
itemitem,0.4868,0.1368,0.0157,0.0
cosine,0.4765,0.1329,0.0148,0.0
tfidf,0.4853,0.139,0.0154,0.0
own_purchases,0.6126,0.2199,0.0289,0.0
weighted_random_recommendation_quantity_top5000,0.2223,0.0471,0.0121,0.0


Метрики немного стали выше, кроме полноты. Возможно понижение полноты связано с тем, что в рандом попали только 17% от item