In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [4]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [84]:
weight = data_train.groupby('item_id')['sales_value'].sum().reset_index()

In [106]:
# list(zip(items, weights))
# убираем нули, заменив значение на близкое к нулю
weight.loc[weight['sales_value']==0]=0.0000000000001

In [116]:
weights = (weight['sales_value'] / sum(weight['sales_value']))
# weights =np.log1p(weight['sales_value'] / sum(weight['sales_value']))

In [117]:
weights[weights==0].value_counts()

Series([], Name: sales_value, dtype: int64)

Проверим сумму весов - должно равняться == 1

In [118]:
(weights).sum()

0.9999999999994853

In [119]:
items = weight['item_id']
# items = data_train.item_id.unique()

In [120]:
tuples = list(zip(items, weights))
items_weights = pd.DataFrame(tuples,
                  columns = ['item_id', 'weight'])

In [121]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    items=(items_weights['item_id'])
    weights=(items_weights['weight'])
    
    recs = np.random.choice(items ,p=(weights), size=n,replace=False)
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    
    return recs.tolist()

In [122]:
%%time



result['weighted_random_recommendation'] = result['user_id'].map(lambda x: weighted_random_recommendation(items_weights, n=5))
result.head(2)

CPU times: user 1.52 s, sys: 0 ns, total: 1.52 s
Wall time: 1.52 s


Unnamed: 0,user_id,actual,weighted_random_recommendation,my_rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[994065.0, 908846.0, 1105182.0, 9884070.0, 102...","[6534178, 6533889, 480014, 6534166, 904360]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6034857.0, 1117444.0, 328490.0, 8019217.0, 93...","[6534178, 1404121, 6533889, 6534166, 5747233]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5, Precision@3, Recall@5 для каждого алгоритма с помощью функций из вебинара 1. Какой алгоритм показывает лучшее качество?

In [123]:
result.head(2)

Unnamed: 0,user_id,actual,weighted_random_recommendation,my_rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[994065.0, 908846.0, 1105182.0, 9884070.0, 102...","[6534178, 6533889, 480014, 6534166, 904360]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6034857.0, 1117444.0, 328490.0, 8019217.0, 93...","[6534178, 1404121, 6533889, 6534166, 5747233]"


In [124]:
# your_code
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [125]:
# your_code
def recall_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    recall_at_k = flags.sum() / len(bought_list)
    
    
    return recall_at_k

In [126]:
result.apply(lambda row: precision_at_k(row['weighted_random_recommendation'], row['actual'],k=5), axis=1).mean()

0.02184133202742413

In [127]:
result.apply(lambda row: precision_at_k(row['weighted_random_recommendation'], row['actual'],k=3), axis=1).mean()

0.020731309174012415

In [128]:
result.apply(lambda row: recall_at_k(row['weighted_random_recommendation'], row['actual'],k=5), axis=1).mean()

0.004178047672020735

In [129]:
# result.apply(lambda row: recall_at_k(row['weighted_random_recommendation'], row['actual'],k=3), axis=1).mean()

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 (или другое количество) товаров


In [149]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


Ищем топ 5000 популярных товаров и их индексы

In [150]:
# your_code
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [151]:
index_5000=popularity[popularity['item_id'].isin(top_5000)]

In [152]:
index_5000

Unnamed: 0,item_id,n_sold
2694,202291,35911
3414,397896,1214994
3583,420647,168661
3880,480014,371107
4207,545926,20134
...,...,...
85543,15926886,260
85626,15927403,376
85650,15927661,230
85668,15927850,269


In [153]:
# weights_5000 = (popularity['n_sold'] / sum(popularity['n_sold']))

In [158]:
weights_5000 = (index_5000['n_sold'] / sum(index_5000['n_sold']))

In [159]:
weights_5000.sum()

1.0

In [160]:
index_5000=index_5000.item_id

In [161]:
tuple_5000=list(zip(index_5000,weights_5000))

In [162]:
df=pd.DataFrame(tuple_5000,columns=['item_id','weight'])

In [169]:
df

Unnamed: 0,item_id,weight
0,202291,0.000157
1,397896,0.005322
2,420647,0.000739
3,480014,0.001626
4,545926,0.000088
...,...,...
4995,15926886,0.000001
4996,15927403,0.000002
4997,15927661,0.000001
4998,15927850,0.000001


In [164]:
%%time

result['my_rec'] = result['user_id'].map(lambda x: weighted_random_recommendation(df, n=5))
result.head(5)

CPU times: user 384 ms, sys: 3.38 ms, total: 388 ms
Wall time: 386 ms


Unnamed: 0,user_id,actual,weighted_random_recommendation,my_rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[994065.0, 908846.0, 1105182.0, 9884070.0, 102...","[6534178, 6533889, 6534166, 397896, 936355]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6034857.0, 1117444.0, 328490.0, 8019217.0, 93...","[6534178, 1057102, 6534166, 6533889, 6544236]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[870549.0, 5569230.0, 9487603.0, 1116671.0, 11...","[6534178, 6533889, 1070820, 6534166, 6544236]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1024639.0, 6534178.0, 1044078.0, 12262978.0, ...","[5668996, 6534178, 862732, 6533889, 6544236]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1069532.0, 1020581.0, 948670.0, 1025888.0, 84...","[6534178, 6534166, 6533889, 6544236, 1404121]"


In [165]:
result.apply(lambda row: precision_at_k(row['my_rec'], row['actual'],k=5), axis=1).mean()

0.04750244857982409

In [166]:
result.apply(lambda row: precision_at_k(row['my_rec'], row['actual'],k=3), axis=1).mean()

0.07313091740124059

In [167]:
result.apply(lambda row: recall_at_k(row['my_rec'], row['actual'],k=5), axis=1).mean()

0.012127111186797858

In [168]:
# result.apply(lambda row: recall_at_k(row['my_rec'], row['actual'],k=3), axis=1).mean()

Вывод : метрики для weighted_random_recommendation выросли в 2 раза при предсказании на топ 5000 товаров. Таким образом мы улучшили качество предсказаний, взяв топ 5000

### Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [173]:
# Заведем фиктивный item_id (если юзер покупал товары не из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [174]:
user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [175]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

число соседей=5

In [184]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(sparse_user_item,  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1],  # userid - id от 0 до N
                        user_items=sparse_user_item,   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: user 931 ms, sys: 5.96 ms, total: 937 ms
Wall time: 321 ms


In [185]:
recs

(array([3587, 2307, 2148, 3947, 3408], dtype=int32),
 array([ 7431., 12981., 43806., 17219., 56269.]))

In [186]:
%%time
result['itemitem'] = result['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

CPU times: user 62.7 ms, sys: 278 µs, total: 63 ms
Wall time: 62.4 ms


In [187]:
result.head(2)

Unnamed: 0,user_id,actual,weighted_random_recommendation,my_rec,itemitem
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[994065.0, 908846.0, 1105182.0, 9884070.0, 102...","[6534178, 6533889, 6534166, 397896, 936355]","[1098066, 995242, 981760, 1127831, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6034857.0, 1117444.0, 328490.0, 8019217.0, 93...","[6534178, 1057102, 6534166, 6533889, 6544236]","[1098066, 995242, 981760, 1127831, 1082185]"


In [194]:
result.apply(lambda row: precision_at_k(row['itemitem'], row['actual'],k=5), axis=1).mean()

0.14573947110675645

In [197]:
result.apply(lambda row: recall_at_k(row['itemitem'], row['actual'],k=5), axis=1).mean()

0.016217944397612795

In [None]:
число соседей=2,10,100

In [273]:
%%time

model = ItemItemRecommender(K=4, num_threads=4) # K - кол-во билжайших соседей

model.fit(sparse_user_item,  # На вход item-user matrix
          show_progress=True)

# recs = model.recommend(userid=userid_to_id[1],  # userid - id от 0 до N
#                         user_items=sparse_user_item,   # на вход user-item matrix
#                         N=5, # кол-во рекомендаций 
#                         filter_already_liked_items=False, 
#                         filter_items=[itemid_to_id[999999]], 
#                         recalculate_user=False)

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: user 903 ms, sys: 23.7 ms, total: 927 ms
Wall time: 310 ms


In [274]:
%%time
result['itemitem-4'] = result['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

CPU times: user 105 ms, sys: 88 µs, total: 106 ms
Wall time: 104 ms


In [275]:
result.head(2)

Unnamed: 0,user_id,actual,weighted_random_recommendation,my_rec,itemitem,itemitem-10,itemitem-2,itemitem-100,itemitem-1,itemitem-3,itemitem-4
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[994065.0, 908846.0, 1105182.0, 9884070.0, 102...","[6534178, 6533889, 6534166, 397896, 936355]","[1098066, 995242, 981760, 1127831, 1082185]","[1127831, 981760, 840361, 1082185, 995242]","[904360, 1029743, 840361, 995242, 1082185]","[1081177, 995785, 1004906, 1082185, 1029743]","[1081177, 995785, 1004906, 1082185, 1029743]","[840361, 1029743, 1082185, 981760, 995242]","[840361, 1127831, 981760, 1082185, 995242]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6034857.0, 1117444.0, 328490.0, 8019217.0, 93...","[6534178, 1057102, 6534166, 6533889, 6544236]","[1098066, 995242, 981760, 1127831, 1082185]","[1127831, 981760, 840361, 1082185, 995242]","[904360, 1029743, 840361, 995242, 1082185]","[1081177, 995785, 1004906, 1082185, 1029743]","[1081177, 995785, 1004906, 1082185, 1029743]","[840361, 1029743, 1082185, 981760, 995242]","[840361, 1127831, 981760, 1082185, 995242]"


In [280]:
result.apply(lambda row: precision_at_k(row['itemitem-1'], row['actual'],k=5), axis=1).mean()

0.16229187071498333

In [277]:
result.apply(lambda row: precision_at_k(row['itemitem-2'], row['actual'],k=5), axis=1).mean()

0.16297747306561994

In [278]:
result.apply(lambda row: precision_at_k(row['itemitem-3'], row['actual'],k=5), axis=1).mean()

0.19285014691478689

In [276]:
result.apply(lambda row: precision_at_k(row['itemitem-4'], row['actual'],k=5), axis=1).mean()

0.15700293829578643

In [279]:
result.apply(lambda row: precision_at_k(row['itemitem-10'], row['actual'],k=5), axis=1).mean()

0.15700293829578643

Лучше всего сработала модель при числе соседей К-3