In [209]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [210]:
data = pd.read_csv('/Users/admin/Desktop/GeekBrains/Рекомендательные_системы/Урок_2_implicit/webinar_2 2/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [211]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [212]:
items = data_train.item_id.unique()
len(items)

86865

In [213]:
data_weights = data_train.groupby('item_id')['sales_value'].sum().reset_index()
data_weights.sort_values('sales_value', ascending=False, inplace=True)
data_weights

Unnamed: 0,item_id,sales_value
55470,6534178,447799.94
55430,6533889,40483.34
28895,1029743,35764.66
55465,6534166,30170.77
34707,1082185,26029.96
...,...,...
42020,1244713,0.00
14575,900351,0.00
54721,6424439,0.00
63321,9296891,0.00


In [214]:
summa = data_weights['sales_value'].sum()
summa

7052176.859999999

In [215]:
# взяла веса продаж каждого item в зависимости от общей суммы продаж:

data_weights['weight'] = data_weights['sales_value'] /  summa
data_weights

Unnamed: 0,item_id,sales_value,weight
55470,6534178,447799.94,0.063498
55430,6533889,40483.34,0.005741
28895,1029743,35764.66,0.005071
55465,6534166,30170.77,0.004278
34707,1082185,26029.96,0.003691
...,...,...,...
42020,1244713,0.00,0.000000
14575,900351,0.00,0.000000
54721,6424439,0.00,0.000000
63321,9296891,0.00,0.000000


In [216]:
data_weights['weight'].sum()

1.0

In [217]:
def weighted_random_recommendation(items, items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    items = np.array(items)
    items_weights = np.array(items_weights)
    recs = np.random.choice(items, p=items_weights, size=n, replace=False)
    
    return recs.tolist()

In [218]:
%%time
# your_code

items = data_train.item_id.unique()
items_weights = data_weights['weight']

weighted_random_recommendation(items, items_weights, n=5)

CPU times: user 28.5 ms, sys: 2.96 ms, total: 31.5 ms
Wall time: 30.8 ms


[952304, 8203807, 1035321, 9296977, 927360]

In [219]:
result = pd.read_csv('/Users/admin/Desktop/GeekBrains/Рекомендательные_системы/Урок_2_implicit/webinar_2 2/predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[1003302, 836647, 5748558, 977649, 12133834]","[6534178, 6533889, 1029743, 6534166, 1082185]","[833025, 826541, 850801, 913785, 5980723]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[969153, 854667, 1067159, 9858629, 1301443]","[6534178, 6533889, 1029743, 6534166, 1082185]","[9419268, 840505, 927485, 835727, 968687]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [220]:
result['actual'].values

array(['[  821867   834484   856942   865456   889248   907957   914190   943316\n   951954   954486   958046   962568   969231   971585   979707   986947\n   990656   995242  1004906  1005186  1042083  1050310  1060819  1062002\n  1064441  1069733  1074612  1082185  1131115  1132771  6534544 13876341\n 15971874 17178953   883616   917704   931860   961554  1002032  1031190\n  8090541  8293439  9297615  9527329 15926712  1049998   861272   869465\n   877373   908213   933913   940947   945809   959316   978974  1031697\n  1041796  1048918  1081189  1101422  1115576  1122428  1132231  1132814\n  5577022  8091601  9296986  9677939 10356149 13417048 15741823 15830875]',
       '[  835476   851057   872021   878302   879948   909638   913202   920626\n   958154   994891  1053690  1083328  1096727  6463658  7167218  7167249\n  9526563  9526886 13842214]',
       '[  920308   926804   946489  1006718  1017061  1078346  1104227  1108624\n  1110392  1120741 13776852 17105539   825541   870315 

При чтении файла возникает проблема, столбцы считываются как строка, а не массив. Поэтому нужно его обработать.

In [221]:
result1 = data_test.groupby('user_id')['item_id'].unique().reset_index()
result1.columns=['user_id', 'actual']
result1.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [222]:
result['actual'] = result1['actual']
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1003302, 836647, 5748558, 977649, 12133834]","[6534178, 6533889, 1029743, 6534166, 1082185]","[833025, 826541, 850801, 913785, 5980723]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[969153, 854667, 1067159, 9858629, 1301443]","[6534178, 6533889, 1029743, 6534166, 1082185]","[9419268, 840505, 927485, 835727, 968687]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [223]:
result['weighted_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items,items_weights, n=5))

In [224]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1003302, 836647, 5748558, 977649, 12133834]","[6534178, 6533889, 1029743, 6534166, 1082185]","[9297615, 976621, 1022510, 1088147, 965141]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[969153, 854667, 1067159, 9858629, 1301443]","[6534178, 6533889, 1029743, 6534166, 1082185]","[982743, 988520, 985911, 962764, 1004906]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [225]:
# your_code
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.metrics import precision_at_k, recall_at_k

In [226]:
weighted = result.apply(lambda row: precision_at_k(row['weighted_recommendation'], row['actual']), axis=1).mean()
weighted

0.010088148873653264

In [227]:
# загружаем файл из вебинара (с сохраненными метриками)
metrics = pd.read_csv('metrics_precision@5')
metrics

Unnamed: 0,algorithm,precision@5
0,random_recommendation,0.000392
1,popular_recommendation,0.15524
2,itemitem,0.136925
3,cosine,0.132909
4,tfidf,0.138981
5,own_purchases,0.179693


In [228]:
metrics = metrics.append({'algorithm': 'weighted_random_recommendation', 'precision@5': weighted}, ignore_index=True)
metrics

Unnamed: 0,algorithm,precision@5
0,random_recommendation,0.000392
1,popular_recommendation,0.15524
2,itemitem,0.136925
3,cosine,0.132909
4,tfidf,0.138981
5,own_purchases,0.179693
6,weighted_random_recommendation,0.010088


Метрика улучшилась по сравнению с random recomendation. Но у остальных алгоритмов показатели явно выше.

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [229]:
# your_code
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [230]:
top5000 = data_weights.sort_values(by='sales_value', ascending=False).head(5000)
top5000

Unnamed: 0,item_id,sales_value,weight
55470,6534178,447799.94,0.063498
55430,6533889,40483.34,0.005741
28895,1029743,35764.66,0.005071
55465,6534166,30170.77,0.004278
34707,1082185,26029.96,0.003691
...,...,...,...
72352,12188446,265.29,0.000038
37958,1112387,265.26,0.000038
23584,981716,265.17,0.000038
51978,5575861,265.14,0.000038


In [231]:
items = top5000.item_id.unique()

In [232]:
result5000 = data_test.groupby('user_id')['item_id'].unique().reset_index() # актуальные значения item берем из теста
result5000.columns=['user_id', 'actual']
result5000.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [233]:
result5000['random_recommendation'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result5000.head(2)

Unnamed: 0,user_id,actual,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1018670, 1122255, 1112825, 7139529, 7410341]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1071342, 912137, 892008, 1128744, 8019921]"


In [234]:
rand5000 = result5000.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()
rand5000

0.00636630754162585

In [235]:
metrics = metrics.append({'algorithm': 'random5000', 'precision@5': rand5000}, ignore_index=True)
metrics.sort_values(by='precision@5', ascending=False)

Unnamed: 0,algorithm,precision@5
5,own_purchases,0.179693
1,popular_recommendation,0.15524
4,tfidf,0.138981
2,itemitem,0.136925
3,cosine,0.132909
6,weighted_random_recommendation,0.010088
7,random5000,0.006366
0,random_recommendation,0.000392


При подсчете бейзлайна по random на топ-5000 товаров метрику удалось значительно улучшить.

#### Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей  𝐾 .

In [236]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [237]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [238]:
data_train.loc[~data_train['item_id'].isin(top_5000)]

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
6,2375,26984851516,1,1043142,1,1.57,364,-0.68,1642,1,0.0,0.0
8,2375,26984851516,1,1102651,1,1.89,364,0.00,1642,1,0.0,0.0
9,2375,26984851516,1,6423775,1,2.00,364,-0.79,1642,1,0.0,0.0
10,2375,26984851516,1,9487839,1,2.00,364,-0.79,1642,1,0.0,0.0
12,1364,26984896261,1,897044,1,2.99,31742,-0.40,1520,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2282311,2088,41297771158,635,13877226,1,15.65,304,-5.25,1258,91,0.0,0.0
2282314,1541,41297771177,635,1057256,1,1.05,304,0.00,1300,91,0.0,0.0
2282315,1541,41297771180,635,12487271,1,5.19,304,0.00,1301,91,0.0,0.0
2282316,1168,41297772063,635,836262,1,12.40,304,0.00,1526,91,0.0,0.0


In [239]:
data_train['item_id'].isin(top_5000)

0           True
1           True
2           True
3           True
4           True
           ...  
2282320     True
2282321    False
2282322     True
2282323     True
2282324     True
Name: item_id, Length: 2278490, dtype: bool

In [240]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [241]:
user_item_matrix.sum().sum() / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) * 100

5.33770796861036

In [242]:
# преобразования юзеров и item
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [243]:
%%time

model3 = ItemItemRecommender(K=3, num_threads=4) # K - кол-во билжайших соседей

model3.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model3.recommend(userid=userid_to_id[2],  # userid - id от 0 до N - выбираем для кого предсказываем
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: user 2.18 s, sys: 97.3 ms, total: 2.28 s
Wall time: 811 ms


In [244]:
recs

[(2381, 78679.0),
 (3408, 72035.0),
 (2148, 2953.0),
 (300, 1284.0),
 (3587, 1278.0)]

In [245]:
[id_to_itemid[rec[0]] for rec in recs]

[999999, 1082185, 981760, 840361, 1098066]

In [246]:
%%time

result['itemitem3'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model3.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

CPU times: user 50.1 ms, sys: 2.39 ms, total: 52.5 ms
Wall time: 52.1 ms


In [247]:
item3 = result.apply(lambda row: precision_at_k(row['itemitem3'], row['actual']), axis=1).mean()
item3

0.1860920666013684

In [248]:
%%time

model7 = ItemItemRecommender(K=7, num_threads=4) # K - кол-во билжайших соседей

model7.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model7.recommend(userid=userid_to_id[2],  # userid - id от 0 до N - выбираем для кого предсказываем
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: user 2.28 s, sys: 93 ms, total: 2.37 s
Wall time: 827 ms


In [249]:
%%time

result['itemitem7'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model7.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

CPU times: user 65.3 ms, sys: 2.3 ms, total: 67.6 ms
Wall time: 66.7 ms


In [250]:
item7 = result.apply(lambda row: precision_at_k(row['itemitem7'], row['actual']), axis=1).mean()
item7

0.1448579823702236

In [252]:
metrics = metrics.append({'algorithm':'itemitem3K', 'precision@5':item3}, ignore_index=True)
metrics = metrics.append({'algorithm':'itemitem7K', 'precision@5':item7}, ignore_index=True)
metrics.sort_values(by='precision@5', ascending=False)

Unnamed: 0,algorithm,precision@5
8,itemitem3K,0.186092
5,own_purchases,0.179693
1,popular_recommendation,0.15524
9,itemitem7K,0.144858
4,tfidf,0.138981
2,itemitem,0.136925
3,cosine,0.132909
6,weighted_random_recommendation,0.010088
7,random5000,0.006366
0,random_recommendation,0.000392


Алгоритм ItemItemRecommender с числом соседей 𝐾 = 3 оказался наилучшим по метрикам.