In [None]:
!pip install implicit==0.4.4

Collecting implicit==0.4.4
  Downloading implicit-0.4.4.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.3 MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.4.4-cp37-cp37m-linux_x86_64.whl size=3413793 sha256=9bae1f263984fbc122c245929e165ea6df994154a55ba08fe7dec4e4dd4499ef
  Stored in directory: /root/.cache/pip/wheels/44/7e/7d/a17324ea207cfbe76aca878b5b8ca0aa932cf55d163329be37
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.4


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

In [23]:
# Метрики
def precision(recommended_list, bought_list):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    return flags.sum() / len(recommended_list)

def precision_at_k(recommended_list, bought_list, k=5):
    return precision(recommended_list[:k], bought_list)

In [24]:
data = pd.read_csv('transaction_data.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631.0,1.0,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631.0,1.0,0.0,0.0


In [25]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[837208, 849264, 851231, 856942, 861272, 86474..."
1,2,"[868389, 868547, 883665, 911974, 925862, 93493..."


In [26]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()


In [27]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


item_id,51716,202291,397896,420647,480014,818980,819063,819255,819304,819308,...,12781986,12810391,12810393,12810422,12811532,12812261,12946027,12949590,13038913,13158451
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [29]:
%%time

model = AlternatingLeastSquares(factors=100, 
                                regularization=0.001,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)



  0%|          | 0/15 [00:00<?, ?it/s]

CPU times: user 376 ms, sys: 16.5 ms, total: 392 ms
Wall time: 400 ms


In [30]:
def get_recommendations(user, model, N=5):
    res = [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)]
    return res

In [32]:
%%time
    
result['als'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))
print(result.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean())

0.06554445028932193
CPU times: user 35.8 s, sys: 28.8 s, total: 1min 4s
Wall time: 33.3 s


In [33]:
result.head(2)

Unnamed: 0,user_id,actual,als
0,1,"[837208, 849264, 851231, 856942, 861272, 86474...","[999999, 856942, 9575181, 1134161, 823687]"
1,2,"[868389, 868547, 883665, 911974, 925862, 93493...","[999999, 6773238, 1100368, 1113385, 1040807]"


Уменьшим параметры factors, regularization и iterations:

In [40]:
model = AlternatingLeastSquares(factors=50, 
                                regularization=0.0001,
                                iterations=5, 
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)



  0%|          | 0/5 [00:00<?, ?it/s]

In [41]:
%%time
    
result['als_2'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))
print(result.apply(lambda row: precision_at_k(row['als_2'], row['actual']), axis=1).mean())

0.020515518148342977
CPU times: user 19.5 s, sys: 15.9 s, total: 35.4 s
Wall time: 18.2 s


In [42]:
result.head(2)

Unnamed: 0,user_id,actual,als,als_2
0,1,"[837208, 849264, 851231, 856942, 861272, 86474...","[999999, 856942, 9575181, 1134161, 823687]","[999999, 853622, 7167836, 929605, 9187298]"
1,2,"[868389, 868547, 883665, 911974, 925862, 93493...","[999999, 6773238, 1100368, 1113385, 1040807]","[999999, 9187298, 973889, 8293447, 855279]"


Увеличим параметры factors, regularization и iterations:

In [43]:
model = AlternatingLeastSquares(factors=150, 
                                regularization=0.01,
                                iterations=25, 
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)



  0%|          | 0/25 [00:00<?, ?it/s]

In [44]:
%%time
    
result['als_3'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))
print(result.apply(lambda row: precision_at_k(row['als_3'], row['actual']), axis=1).mean())

0.11835875854813173
CPU times: user 45.9 s, sys: 37 s, total: 1min 22s
Wall time: 42.5 s


In [45]:
result.head(2)

Unnamed: 0,user_id,actual,als,als_2,als_3
0,1,"[837208, 849264, 851231, 856942, 861272, 86474...","[999999, 856942, 9575181, 1134161, 823687]","[999999, 853622, 7167836, 929605, 9187298]","[999999, 1082185, 995242, 857849, 856942]"
1,2,"[868389, 868547, 883665, 911974, 925862, 93493...","[999999, 6773238, 1100368, 1113385, 1040807]","[999999, 9187298, 973889, 8293447, 855279]","[999999, 888614, 12518330, 996028, 1035843]"


__Выводы:__

При уменьшении гипер - параметров "factors", "regularization" и "iterations" значение метрики "precision_at_k" падает, а при увеличении гипер - параметров "precision_at_k" растет.