In [1]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix

from implicit.nearest_neighbours import ItemItemRecommender

In [2]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [4]:
result = pd.read_json('predictions_basic.json')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1127262, 14110849, 9527642, 981955, 870315]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1300002, 931124, 6514212, 851016, 6772882]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 883404]","[1082185, 1098066, 6534178, 1127831, 1068719]"


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж

Можно сэмплировать товары случайно, но пропорционально какому-либо весу

Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [5]:
items_weights = data_train.groupby('item_id')['quantity'].sum().reset_index()
items_weights.sort_values('quantity', ascending=False, inplace=True)

items_weights.head(2)

Unnamed: 0,item_id,quantity
55470,6534178,190227964
55430,6533889,15978434


In [6]:
items_weights['weight'] = items_weights['quantity'].apply(lambda x: np.log2(x) if x>0 else 0)
sum_ = items_weights['weight'].sum()
items_weights['weight'] = items_weights['weight'].apply(lambda x: x / sum_)

items_weights.head(2)

Unnamed: 0,item_id,quantity,weight
55470,6534178,190227964,0.000135
55430,6533889,15978434,0.000118


In [7]:
items_weights['weight'].sum()

0.9999999999999998

In [8]:
def weighted_random_recommendation(items_weights, n=5):
    
    items = np.array(items_weights['item_id'])
    weights = np.array(items_weights['weight'])
    
    recs = np.random.choice(items, size=n, replace=False, p=weights)
    
    return recs.tolist()

In [9]:
%%time

result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))

CPU times: user 2.74 s, sys: 30.5 ms, total: 2.77 s
Wall time: 2.83 s


### Задание 2. Расчет метрик

Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [10]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision

In [11]:
def calc_metrics(result_df, columns):
    
    columns = columns
    
    for column in columns:
        
        metric = result_df.apply(lambda row: precision_at_k(row[column], row['actual']), axis=1).mean()
        print(f"{column} : {metric}")

In [12]:
columns = list(result.columns[2:])
columns

['random_recommendation',
 'popular_recommendation',
 'itemitem',
 'cosine',
 'tfidf',
 'own_purchases',
 'weighted_random_recommendation']

In [13]:
calc_metrics(result, columns)

random_recommendation : 0.0004897159647404506
popular_recommendation : 0.15523996082272082
itemitem : 0.13692458374142857
cosine : 0.13290891283055686


  # Remove the CWD from sys.path while we load stuff.


tfidf : 0.16092066601371002
own_purchases : 0.20191740412979084
weighted_random_recommendation : 0.001077375122428991


Алгоритм itemitem при фильтрации покупок не из топа 5000 (в таблице 'own_purchases'), показывает наилучшее качество.

### Задание 3*. Улучшение бейзлайнов и ItemItem

#### Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров

In [14]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [15]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()
len(top_5000)

5000

##### Random recommendation

In [16]:
def random_recommendation(items, n=5):
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [17]:
%%time

items = top_5000

result['random_rec_top'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))

CPU times: user 982 ms, sys: 26.2 ms, total: 1.01 s
Wall time: 995 ms


##### Weighted random recommender¶

In [18]:
items_weights = data_train.groupby('item_id')['quantity'].sum().reset_index()
items_weights_top = items_weights.sort_values('quantity', ascending=False).head(5000)
len(items_weights_top)

5000

In [19]:
items_weights_top['weight'] = items_weights['quantity'].apply(lambda x: np.log2(x))
sum_ = items_weights_top['weight'].sum()
items_weights_top['weight'] = items_weights_top['weight'].apply(lambda x: x / sum_)

items_weights_top.head()

Unnamed: 0,item_id,quantity,weight
55470,6534178,190227964,0.00068
55430,6533889,15978434,0.000592
55465,6534166,12439291,0.000583
55576,6544236,2501949,0.000525
43620,1404121,1562004,0.000509


In [20]:
items_weights_top['weight'].sum()

0.9999999999999999

In [21]:
def weighted_random_recommendation(items_weights_top, n=5):
    
    items = np.array(items_weights_top['item_id'])
    weights = np.array(items_weights_top['weight'])
    
    recs = np.random.choice(items, size=n, replace=False, p=weights)
    
    return recs.tolist()

In [22]:
%%time

result['weighted_rand_rec_top'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights_top, n=5))

CPU times: user 337 ms, sys: 6.33 ms, total: 344 ms
Wall time: 345 ms


In [23]:
columns = list(result.columns)
columns

['user_id',
 'actual',
 'random_recommendation',
 'popular_recommendation',
 'itemitem',
 'cosine',
 'tfidf',
 'own_purchases',
 'weighted_random_recommendation',
 'random_rec_top',
 'weighted_rand_rec_top']

In [24]:
columns = ['random_recommendation', 'random_rec_top', 'weighted_random_recommendation', 'weighted_rand_rec_top']

In [25]:
calc_metrics(result, columns)

random_recommendation : 0.0004897159647404506
random_rec_top : 0.006464250734573941
weighted_random_recommendation : 0.001077375122428991
weighted_rand_rec_top : 0.007051909892262479


Бейзлайны, рассчитанные на топ-5000 товаров, показывают лучшее качество.

#### Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей  𝐾

In [26]:
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [27]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', 
                                  columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
sparse_user_item

<2499x5001 sparse matrix of type '<class 'numpy.float64'>'
	with 667080 stored elements in Compressed Sparse Row format>

In [29]:
user_item_matrix.shape

(2499, 5001)

In [30]:
user_item_matrix.sum().sum() / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) * 100

5.33770796861036

In [31]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [32]:
user_item_matrix

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
%%time

for i in range(1, 11):

    model = ItemItemRecommender(K=i, num_threads=4) # K - кол-во билжайших соседей

    model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)
    
    result[i] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)])

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 20.5 s, sys: 391 ms, total: 20.9 s
Wall time: 22.2 s


In [34]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation,random_rec_top,...,1,2,3,4,5,6,7,8,9,10
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1127262, 14110849, 9527642, 981955, 870315]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]","[1007120, 953609, 884686, 9931527, 1032681]","[1045478, 1077430, 9836460, 819304, 990804]",...,"[1082185, 1029743, 995785, 1004906, 1081177]","[1082185, 995242, 1029743, 840361, 904360]","[1082185, 981760, 995242, 1029743, 840361]","[1082185, 981760, 995242, 1127831, 840361]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 995242, 840361]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1300002, 931124, 6514212, 851016, 6772882]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 883404]","[1082185, 1098066, 6534178, 1127831, 1068719]","[1097239, 888186, 1110177, 932557, 10456581]","[905235, 1036081, 1043973, 1076744, 887984]",...,"[1082185, 1098066, 6534178, 1127831, 1068719]","[1082185, 1098066, 6534178, 826249, 1127831]","[1082185, 981760, 1098066, 6534178, 826249]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]"


In [35]:
columns = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [36]:
calc_metrics(result, columns)

  # Remove the CWD from sys.path while we load stuff.


1 : 0.20191740412979084
2 : 0.2193437806072451
3 : 0.21897649363368973
4 : 0.16738491674828382
5 : 0.15122428991184922
6 : 0.15543584720861708
7 : 0.1602350636630735
8 : 0.1646425073457374
9 : 0.16405484818804886
10 : 0.16601371204701063


Оптимальное число соседей K - 2 или 3 соседа