In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
df_interactions = pd.read_csv('interactions.csv')

In [3]:
df_interactions.head()

Unnamed: 0,user_id,week_no,shares_deal
0,1,31,"[3375, 1810]"
1,1,32,"[322, 3965]"
2,1,39,"[2816, 7203, 2227]"
3,1,43,[3497]
4,1,47,[8337]


In [4]:
df_interactions.shape

(600000, 3)

In [5]:
df_shares = pd.read_csv('finviz_shares.csv')

In [6]:
df_shares.head()

Unnamed: 0,ticker,id_share,industry,Index,P/E,EPS (ttm),Insider Own,Perf Week,Forward P/E,EPS next Y,...,Market Cap_full_count,Income_full_count,Sales_full_count,Shs Outstand_full_count,Shs Float_full_count,Avg Volume_full_count,52_w_range_low,52_w_range_high,Volatility_low_%,Volatility_high_%
0,AAP,1,Consumer Cyclical,S&P 500,22.54,9.88,0.1,-4.2,16.78,11.37,...,14230000000000.0,646400000.0,10970000000000.0,62850000.0,62080000.0,679290.0,151.01,244.55,2.73,2.73
1,AAQC,2,Financial,-,,,,-0.1,,,...,388000000.0,,,40000000.0,,55940.0,9.6,9.96,0.16,0.18
2,AAOI,3,Technology,-,,-2.05,6.4,2.08,,15.4,...,104580000.0,-53100000.0,209500000.0,27100000.0,25390000.0,506560.0,3.63,12.97,5.01,5.43
3,AAPL,4,Technology,DJIA S&P500,28.58,6.02,0.07,-0.45,26.29,6.42,...,2817210000000000.0,100560000000000.0,378320000000000.0,16390000000000.0,16310000000000.0,101620000.0,116.21,182.94,1.87,2.63
4,AAON,5,Industrials,-,43.79,1.33,16.6,-4.43,34.57,34.88,...,3120000000000.0,71500000.0,514900000.0,52420000.0,41560000.0,244130.0,58.65,83.79,3.3,3.88


In [7]:
df_interactions.shares_deal = df_interactions.shares_deal.map(lambda x: x.lstrip('['))

In [8]:
df_interactions.shares_deal = df_interactions.shares_deal.map(lambda x: x.rstrip(']'))

In [9]:
df_interactions.shares_deal = df_interactions.shares_deal.map(lambda x: x.split(', '))

In [10]:
df_interactions.head()

Unnamed: 0,user_id,week_no,shares_deal
0,1,31,"[3375, 1810]"
1,1,32,"[322, 3965]"
2,1,39,"[2816, 7203, 2227]"
3,1,43,[3497]
4,1,47,[8337]


In [11]:
def shares_treatment(feature_name, X):
    
    list_of_shares = X[feature_name]
    new_list = []
    
    for i in range(X.shape[0]):
        temp_list = []
        for j in range(len(list_of_shares[i])): 
            if list_of_shares[i][j].isdigit() == True: 
                temp_list.append(float(list_of_shares[i][j]))
        new_list.append(temp_list)
    
    X[feature_name] = new_list
    
    return X

In [12]:
df_interactions = shares_treatment(feature_name = 'shares_deal', X = df_interactions)

In [13]:
# кол-во покупок акций за 1 неделю
df_interactions['count_shares'] = df_interactions.shares_deal.map(lambda x: len(x))

In [14]:
df_interactions

Unnamed: 0,user_id,week_no,shares_deal,count_shares
0,1,31,"[3375.0, 1810.0]",2
1,1,32,"[322.0, 3965.0]",2
2,1,39,"[2816.0, 7203.0, 2227.0]",3
3,1,43,[3497.0],1
4,1,47,[8337.0],1
...,...,...,...,...
599995,100000,35,[1169.0],1
599996,100000,38,"[55.0, 6054.0, 7056.0]",3
599997,100000,42,"[8374.0, 5118.0, 810.0]",3
599998,100000,45,"[5559.0, 7208.0]",2


In [15]:
# "стакаем" номера недель 
s = df_interactions.apply(lambda x: pd.Series(x['shares_deal']), axis=1).stack().reset_index(level=1, drop=True)

In [16]:
s.name = 'id_shares'
df_interactions = df_interactions.drop('shares_deal', axis=1).join(s)

In [17]:
df_interactions.head(20)

Unnamed: 0,user_id,week_no,count_shares,id_shares
0,1,31,2,3375.0
0,1,31,2,1810.0
1,1,32,2,322.0
1,1,32,2,3965.0
2,1,39,3,2816.0
2,1,39,3,7203.0
2,1,39,3,2227.0
3,1,43,1,3497.0
4,1,47,1,8337.0
5,1,49,4,689.0


In [18]:
df_interactions = df_interactions.reset_index()

In [19]:
df_interactions = df_interactions.drop('index', axis=1)

In [20]:
df_interactions

Unnamed: 0,user_id,week_no,count_shares,id_shares
0,1,31,2,3375.0
1,1,31,2,1810.0
2,1,32,2,322.0
3,1,32,2,3965.0
4,1,39,3,2816.0
...,...,...,...,...
2100555,100000,51,6,2034.0
2100556,100000,51,6,7905.0
2100557,100000,51,6,6132.0
2100558,100000,51,6,4887.0


__Будем диверсифицировать кол-во акций каждого инвестора в зависимости от отраслей__

In [21]:
df_shares['industry'].unique()

array(['Consumer Cyclical', 'Financial', 'Technology', 'Industrials',
       'Healthcare', 'Consumer Defensive', 'Real Estate',
       'Basic Materials', 'Communication Services', 'Utilities', 'Energy'],
      dtype=object)

In [22]:
industries = df_shares['industry'].to_list()

In [23]:
def industry_list_shares(industry, X):
    industry_df = X[X['industry'] == industry]
    industry_list = industry_df.id_share.to_list()
    industry_list = [float(i) for i in industry_list]

    return industry_list

In [24]:
financial_list = industry_list_shares('Financial', df_shares)
consumer_list = industry_list_shares('Consumer Cyclical', df_shares)
technology_list = industry_list_shares('Technology', df_shares)
industrial_list = industry_list_shares('Industrials', df_shares)
healthcare_list = industry_list_shares('Healthcare', df_shares)
defensive_list = industry_list_shares('Consumer Defensive', df_shares) 
estate_list = industry_list_shares('Real Estate', df_shares)
materials_list = industry_list_shares('Basic Materials', df_shares)
service_list = industry_list_shares('Communication Services', df_shares) 
utils_list = industry_list_shares('Utilities', df_shares)
energy_list = industry_list_shares('Energy', df_shares)

_Кол-во акций на каждую отрасль_

In [26]:
#id всех акций из датасета df_interactions: 
list_id_shares = df_interactions.id_shares.to_list()

In [27]:
quantity_2 = []
for sh in list_id_shares: 
    if sh in financial_list: 
        quantity_2.append(random.randint(260, 275))
    elif sh in consumer_list:
        quantity_2.append(random.randint(125, 150))
    elif sh in technology_list:
        quantity_2.append(random.randint(250, 260))
    elif sh in industrial_list:
        quantity_2.append(random.randint(15, 20))
    elif sh in healthcare_list:
        quantity_2.append(random.randint(225, 250))
    elif sh in defensive_list:
        quantity_2.append(random.randint(40, 65))
    elif sh in estate_list:
        quantity_2.append(random.randint(10, 15))
    elif sh in materials_list:
        quantity_2.append(random.randint(75, 100))
    elif sh in service_list:
        quantity_2.append(random.randint(1, 5))
    elif sh in utils_list:
        quantity_2.append(random.randint(5, 10))
    elif sh in energy_list:
        quantity_2.append(random.randint(175, 200))

In [28]:
df_interactions['quantity'] = quantity_2

In [29]:
#count shares - это кол-во купленных акций юзером id в определенную неделю. 
# например за 29 неделю пользователь номер 1 купил 2 акции. 
# опционально можно менять подходы: ставить или quantity или count_shares
df_interactions

Unnamed: 0,user_id,week_no,count_shares,id_shares,quantity
0,1,31,2,3375.0,15
1,1,31,2,1810.0,14
2,1,32,2,322.0,264
3,1,32,2,3965.0,273
4,1,39,3,2816.0,270
...,...,...,...,...,...
2100555,100000,51,6,2034.0,258
2100556,100000,51,6,7905.0,258
2100557,100000,51,6,6132.0,270
2100558,100000,51,6,4887.0,261


In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import bm25_weight

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

## Train-test split

In [31]:
test_size_weeks = 4

data_train = df_interactions[df_interactions['week_no'] < df_interactions['week_no'].max() - test_size_weeks]
data_test = df_interactions[df_interactions['week_no'] >= df_interactions['week_no'].max() - test_size_weeks]

## Метрика

In [75]:
def precision_at_k(recommended_list, bought_list):
    
    k=5
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    flags = np.isin(bought_list, recommended_list)
    #print(flags)
    precision = flags.sum() / len(recommended_list)
    
    return precision

_Для бизнеса и для ML я выбрал метрику `precision_at_k.` В первую очередь брокер заинтересован в том, чтобы клиент
совершал сделки, а во вторую очередь зарабатывал. Идеальным вариантом считаю кастомную метрику, которая отражает
доход с каждой рекомендации_

## Бейзлайны

Создадим датафрейм с покупками юзеров на тестовом датасете (последние 3 недели)

In [33]:
result = data_test.groupby('user_id')['id_shares'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[8337.0, 689.0, 1039.0, 6724.0, 7819.0]"
1,2,"[1342.0, 3794.0, 4661.0, 650.0, 731.0, 6062.0,..."


### 1.2 Popularity-based recommendation

In [34]:
def spb100_recommendation(shares, n=5):
    """Топ-n популярных акций из индекса spb-100"""
    
    recs = np.array(shares)
    recs = [float(i) for i in recs]
    rec_list = random.sample(recs, k=n)
    
    return rec_list

In [36]:
spb = pd.read_csv('popular_spb.csv')

In [37]:
%%time

shares_spb = spb.id_share.unique()

result['spb100_recommendation'] = result['user_id'].apply(lambda x: spb100_recommendation(shares_spb, n=5))
result.head(2)

Wall time: 5.75 s


Unnamed: 0,user_id,actual,spb100_recommendation
0,1,"[8337.0, 689.0, 1039.0, 6724.0, 7819.0]","[1897.0, 7392.0, 7360.0, 716.0, 4792.0]"
1,2,"[1342.0, 3794.0, 4661.0, 650.0, 731.0, 6062.0,...","[1219.0, 1505.0, 5511.0, 8058.0, 7147.0]"


In [38]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных акций"""
    
    popular = data.groupby('id_shares')['quantity'].sum().reset_index()
    popular.sort_values('quantity', ascending=False, inplace=True)
    
    recs = popular.head(n).id_shares
    
    return recs.tolist()

In [39]:
####

In [40]:
popular_recs = popularity_recommendation(data_train, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.head(2)

Unnamed: 0,user_id,actual,spb100_recommendation,popular_recommendation
0,1,"[8337.0, 689.0, 1039.0, 6724.0, 7819.0]","[1897.0, 7392.0, 7360.0, 716.0, 4792.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]"
1,2,"[1342.0, 3794.0, 4661.0, 650.0, 731.0, 6062.0,...","[1219.0, 1505.0, 5511.0, 8058.0, 7147.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]"


__Сформируем датасет из популярных акций__

In [41]:
popularity = df_interactions.groupby('id_shares')['quantity'].sum().reset_index()

popularity.head(500)

Unnamed: 0,id_shares,quantity
0,1.0,32707
1,2.0,61070
2,3.0,70067
3,4.0,64593
4,5.0,4306
...,...,...
495,496.0,58864
496,497.0,2743
497,498.0,61363
498,499.0,58491


In [42]:
top_250 = popularity.sort_values('quantity', ascending=False).head(250).id_shares.tolist()

In [43]:
# Заведем фиктивный shares_id (если юзер не купил акцию из топ-250, то он "купил" такую акцию)
data_train.loc[~data_train['id_shares'].isin(top_250), 'id_shares'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='id_shares', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


id_shares,82.0,118.0,120.0,197.0,225.0,368.0,478.0,549.0,623.0,702.0,...,8352.0,8380.0,8386.0,8401.0,8459.0,8476.0,8491.0,8514.0,8519.0,999999.0
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [44]:
user_item_matrix.shape

(100000, 251)

In [45]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

## ItemItemRecommender

In [46]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=True, 
                        filter_items=None, 
                        recalculate_user=True)

  0%|          | 0/251 [00:00<?, ?it/s]

Wall time: 1.22 s


In [47]:
result['itemitem'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=True, 
                                    filter_items=None, 
                                    recalculate_user=True)])

In [48]:
result.head(2)

Unnamed: 0,user_id,actual,spb100_recommendation,popular_recommendation,itemitem
0,1,"[8337.0, 689.0, 1039.0, 6724.0, 7819.0]","[1897.0, 7392.0, 7360.0, 716.0, 4792.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]"
1,2,"[1342.0, 3794.0, 4661.0, 650.0, 731.0, 6062.0,...","[1219.0, 1505.0, 5511.0, 8058.0, 7147.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]"


## Cosine Similarity

In [49]:
%%time

model = CosineRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=True, 
                        filter_items=None, 
                        recalculate_user=False)

  0%|          | 0/251 [00:00<?, ?it/s]

Wall time: 1.1 s


In [50]:
[id_to_itemid[rec[0]] for rec in recs]

[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]

In [51]:
%%time

result['cosine'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=True, 
                                    filter_items=None, 
                                    recalculate_user=True)])

Wall time: 4.78 s


In [52]:
result.head(2)

Unnamed: 0,user_id,actual,spb100_recommendation,popular_recommendation,itemitem,cosine
0,1,"[8337.0, 689.0, 1039.0, 6724.0, 7819.0]","[1897.0, 7392.0, 7360.0, 716.0, 4792.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]"
1,2,"[1342.0, 3794.0, 4661.0, 650.0, 731.0, 6062.0,...","[1219.0, 1505.0, 5511.0, 8058.0, 7147.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]"


## TFIDFRecommender

In [53]:
%%time

model = TFIDFRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=True, 
                        filter_items=None, 
                        recalculate_user=False)

  0%|          | 0/251 [00:00<?, ?it/s]

Wall time: 1.1 s


In [54]:
[id_to_itemid[rec[0]] for rec in recs]

[3340.0, 3899.0, 6499.0, 3914.0, 999999.0]

In [55]:
%%time

result['tfidf'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=True, 
                                    filter_items=None, 
                                    recalculate_user=False)])

Wall time: 4.37 s


In [56]:
result.head(2)

Unnamed: 0,user_id,actual,spb100_recommendation,popular_recommendation,itemitem,cosine,tfidf
0,1,"[8337.0, 689.0, 1039.0, 6724.0, 7819.0]","[1897.0, 7392.0, 7360.0, 716.0, 4792.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3340.0, 3899.0, 6499.0, 3914.0, 999999.0]"
1,2,"[1342.0, 3794.0, 4661.0, 650.0, 731.0, 6062.0,...","[1219.0, 1505.0, 5511.0, 8058.0, 7147.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3340.0, 3899.0, 6499.0, 3914.0, 999999.0]"


## ALS

In [57]:
%%time

model = AlternatingLeastSquares(factors=100,  # Нужно подбирать
                                regularization=0.001,
                                iterations=15,
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=[itemid_to_id[999999]], 
                        filter_items=None, 
                        recalculate_user=False)



  0%|          | 0/15 [00:00<?, ?it/s]

Wall time: 2min 20s


In [58]:
recs

[(130, 0.0003321208),
 (135, 0.0003103339),
 (54, 0.00030463282),
 (9, 0.0002918737),
 (208, 0.0002754843)]

In [59]:
[id_to_itemid[rec[0]] for rec in recs]

[4390.0, 4569.0, 2251.0, 702.0, 7512.0]

In [60]:
def get_recommendations(user, model, N=5):
    res = [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)]
    return res

In [61]:
%%time
    
result['als'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))

Wall time: 2min 19s


In [62]:
result.head(2)

Unnamed: 0,user_id,actual,spb100_recommendation,popular_recommendation,itemitem,cosine,tfidf,als
0,1,"[8337.0, 689.0, 1039.0, 6724.0, 7819.0]","[1897.0, 7392.0, 7360.0, 716.0, 4792.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3340.0, 3899.0, 6499.0, 3914.0, 999999.0]","[5189.0, 702.0, 3317.0, 8339.0, 7321.0]"
1,2,"[1342.0, 3794.0, 4661.0, 650.0, 731.0, 6062.0,...","[1219.0, 1505.0, 5511.0, 8058.0, 7147.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3340.0, 3899.0, 6499.0, 3914.0, 999999.0]","[5189.0, 702.0, 3317.0, 8339.0, 7321.0]"


In [63]:
## Попробуем применить bm-25 взвешивание

In [64]:
user_item_matrix = bm25_weight(user_item_matrix.T).T  # Применяется к item-user матрице ! 

In [65]:
%%time

model = AlternatingLeastSquares(factors=100, 
                                regularization=0.001,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

result['als_bm25'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))

  0%|          | 0/15 [00:00<?, ?it/s]

Wall time: 4min 23s


In [66]:
result.head(2)

Unnamed: 0,user_id,actual,spb100_recommendation,popular_recommendation,itemitem,cosine,tfidf,als,als_bm25
0,1,"[8337.0, 689.0, 1039.0, 6724.0, 7819.0]","[1897.0, 7392.0, 7360.0, 716.0, 4792.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3340.0, 3899.0, 6499.0, 3914.0, 999999.0]","[5189.0, 702.0, 3317.0, 8339.0, 7321.0]","[4368.0, 4552.0, 2242.0, 4390.0, 6091.0]"
1,2,"[1342.0, 3794.0, 4661.0, 650.0, 731.0, 6062.0,...","[1219.0, 1505.0, 5511.0, 8058.0, 7147.0]","[3899.0, 3340.0, 6499.0, 3914.0, 2351.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3899.0, 3340.0, 6499.0, 3914.0, 999999.0]","[3340.0, 3899.0, 6499.0, 3914.0, 999999.0]","[5189.0, 702.0, 3317.0, 8339.0, 7321.0]","[4368.0, 4552.0, 2242.0, 4390.0, 6091.0]"


## Bayesian personalized ranking

In [67]:
## Попробуем применить bm-25 взвешивание + BayesianPersonalizedRanking

In [70]:
%%time

model_bpr = BayesianPersonalizedRanking(factors=20, 
                                regularization=0.001,
                                iterations=15,  
                                num_threads=4)

model_bpr.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix, взвешенную по bm25
          show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

Wall time: 640 ms


In [72]:
def get_recommendations(user, model, N=5):
    res = [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]],  # !!! 
                                    recalculate_user=False)]
    return res

In [73]:
result['bm25_bpr'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model_bpr, N=5))

In [79]:
# Посмотрим результаты по метрике

In [76]:
pd.DataFrame([
    result.apply(lambda row: precision_at_k(row['spb100_recommendation'], row['actual']), axis=1).mean(),
    result.apply(lambda row: precision_at_k(row['popular_recommendation'], row['actual']), axis=1).mean(),
    result.apply(lambda row: precision_at_k(row['itemitem'], row['actual']), axis=1).mean(),
    result.apply(lambda row: precision_at_k(row['cosine'], row['actual']), axis=1).mean(),
    result.apply(lambda row: precision_at_k(row['tfidf'], row['actual']), axis=1).mean(),
    result.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean(),
    result.apply(lambda row: precision_at_k(row['als_bm25'], row['actual']), axis=1).mean(),
    result.apply(lambda row: precision_at_k(row['bm25_bpr'], row['actual']), axis=1).mean(),
], 
    columns=['precision_at_k'], 
    index=['spb100_recommendation', 'popular_recommendation', 'itemitem', 'cosine', 'tfidf', 'als',
          'als_bm25', 'bm25_bpr']
).T

Unnamed: 0,spb100_recommendation,popular_recommendation,itemitem,cosine,tfidf,als,als_bm25,bm25_bpr
precision_at_k,0.00048,0.000452,0.000422,0.000418,0.000422,0.000526,0.00063,0.000586


In [78]:
# Метрики, конечно, удивили.... 
# - Как улучшить/изменить ситуацию (сузить круг бумаг или круг пользователей): 
#        - изменить снова входные id_share. Поставить предфильтрацию по наличию бумаги в s&p500
#        - изменить кол-во уникальных пользователей. Сократить со 100 000 до 2500 и провести снова тест