In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import bs4 as bs
import requests
from lxml import html
from pprint import pprint
import random

__Дизайн взаимодействий user-item__


`1)` Работаем пока только с иностранными компаниями (USA)

`2)` Облигации и ETF тоже вычеркиваем из списка покупок

`3)` Будем выбирать по несколько акций (3-5) в месяц из списка популярных

`4)` Начальные условия: пусть раз в неделю пользователь совершает хотя бы одну сделку на покупку популярного актива из списка `spb_list`. Подробно про алгоритм распределения распишу ниже. 

`5)` Времянные рамки: 24 недели. Гарантированно, что пользователь сделает хотя бы одну покупку за 4 недели. 

__Фиксация покупок ценных бумаг по неделям__

In [2]:
id_list = np.arange(1, 25001, 1)

In [3]:
data=dict()

In [4]:
#Сгенерируем номера недель когда пользователь совершает покупки ценных бумаг
for i in range(len(id_list)):
    min_week=28
    max_week=52
    step=4
    temp=[]
    while min_week!=max_week: 
        step=4
        spam = random.randrange(min_week, min_week+step)
        temp.append(spam)
        min_week = min_week+step
    data[id_list[i]] = temp

In [5]:
#из словаря составим серию
data = pd.Series(data)

In [6]:
data

1        [30, 34, 39, 41, 46, 49]
2        [30, 32, 36, 40, 47, 48]
3        [31, 35, 39, 42, 47, 51]
4        [31, 32, 36, 42, 44, 48]
5        [30, 35, 36, 41, 47, 48]
                   ...           
24996    [30, 32, 39, 41, 45, 48]
24997    [28, 35, 39, 43, 47, 48]
24998    [29, 33, 39, 42, 46, 50]
24999    [29, 33, 39, 42, 45, 50]
25000    [28, 32, 38, 43, 44, 51]
Length: 25000, dtype: object

In [7]:
df = pd.DataFrame({'week': data}).reset_index()

In [8]:
df.head()

Unnamed: 0,index,week
0,1,"[30, 34, 39, 41, 46, 49]"
1,2,"[30, 32, 36, 40, 47, 48]"
2,3,"[31, 35, 39, 42, 47, 51]"
3,4,"[31, 32, 36, 42, 44, 48]"
4,5,"[30, 35, 36, 41, 47, 48]"


In [9]:
df = df.rename(columns={'index': 'user_id'})

In [10]:
df.head()

Unnamed: 0,user_id,week
0,1,"[30, 34, 39, 41, 46, 49]"
1,2,"[30, 32, 36, 40, 47, 48]"
2,3,"[31, 35, 39, 42, 47, 51]"
3,4,"[31, 32, 36, 42, 44, 48]"
4,5,"[30, 35, 36, 41, 47, 48]"


In [11]:
# "стакаем" номера недель 
s = df.apply(lambda x: pd.Series(x['week']), axis=1).stack().reset_index(level=1, drop=True)

In [12]:
s.name = 'week_no'
df = df.drop('week', axis=1).join(s)

In [13]:
#проверка
df = df.reset_index()

In [14]:
df = df.drop('index', axis=1)

In [15]:
df.head(15)

Unnamed: 0,user_id,week_no
0,1,30
1,1,34
2,1,39
3,1,41
4,1,46
5,1,49
6,2,30
7,2,32
8,2,36
9,2,40


In [16]:
df.shape

(150000, 2)

__Забираем данные из СПБ биржи__

In [17]:
shares_df = pd.read_csv('spb100.csv')

In [18]:
shares_df.head()

Unnamed: 0,tickers,name_company,ISIN
0,BABA,Alibaba Group Holding Limited,US01609W1027
1,AAPL,Apple Inc.,US0378331005
2,TSLA,"Tesla, Inc.",US88160R1014
3,SPCE,"Virgin Galactic Holdings, Inc.",US92766K1060
4,AMZN,"Amazon.com, Inc.",US0231351067


In [19]:
isin_tickers = shares_df.ISIN.unique()

Отфильтруем по ISIN только те, которые торгуются на USA. 

In [20]:
shares_df = shares_df.loc[shares_df['ISIN'].str.contains('US', case=False)]
shares_df = shares_df.loc[~shares_df['tickers'].str.contains("@", case=False)]

In [21]:
shares_df.head()

Unnamed: 0,tickers,name_company,ISIN
0,BABA,Alibaba Group Holding Limited,US01609W1027
1,AAPL,Apple Inc.,US0378331005
2,TSLA,"Tesla, Inc.",US88160R1014
3,SPCE,"Virgin Galactic Holdings, Inc.",US92766K1060
4,AMZN,"Amazon.com, Inc.",US0231351067


Удалим _RDS A_ из списка тикеров, потому что он уже не торгуется

In [22]:
shares_df = shares_df.drop(shares_df[shares_df.tickers == 'RDS A'].index)

In [23]:
shares_df

Unnamed: 0,tickers,name_company,ISIN
0,BABA,Alibaba Group Holding Limited,US01609W1027
1,AAPL,Apple Inc.,US0378331005
2,TSLA,"Tesla, Inc.",US88160R1014
3,SPCE,"Virgin Galactic Holdings, Inc.",US92766K1060
4,AMZN,"Amazon.com, Inc.",US0231351067
...,...,...,...
95,FDX,FedEx Corporation,US31428X1063
96,PINS,"Pinterest, Inc. ClassA",US72352L1061
97,OKE,"ONEOK, Inc.",US6826801036
98,TSN,"Tyson Foods, Inc. Class A",US9024941034


In [24]:
shares_df.shape

(91, 3)

__"Приклеим" сразу же данные из finviz: цену акции, ee id__

In [25]:
finviz_df = pd.read_csv('finviz_shares.csv')

In [26]:
finviz_df_ticker = finviz_df[['ticker', 'id_share', 'Price']]

In [27]:
finviz_df_ticker = finviz_df_ticker.rename(columns={'ticker': 'tickers'})

In [28]:
pd.merge(shares_df, finviz_df_ticker, how="inner", on="tickers")

Unnamed: 0,tickers,name_company,ISIN,id_share,Price
0,BABA,Alibaba Group Holding Limited,US01609W1027,706,123.98
1,AAPL,Apple Inc.,US0378331005,4,172.12
2,TSLA,"Tesla, Inc.",US88160R1014,7664,904.55
3,SPCE,"Virgin Galactic Holdings, Inc.",US92766K1060,7111,8.85
4,AMZN,"Amazon.com, Inc.",US0231351067,388,3180.07
...,...,...,...,...,...
85,FDX,FedEx Corporation,US31428X1063,2757,240.41
86,PINS,"Pinterest, Inc. ClassA",US72352L1061,5996,25.55
87,OKE,"ONEOK, Inc.",US6826801036,5604,63.02
88,TSN,"Tyson Foods, Inc. Class A",US9024941034,7661,97.88


In [29]:
shares_spb = pd.merge(shares_df, finviz_df_ticker, how="inner", on="tickers")

In [30]:
shares_spb.to_csv('popular_spb.csv', encoding='utf-8', index=False)

In [31]:
shares_spb.head()

Unnamed: 0,tickers,name_company,ISIN,id_share,Price
0,BABA,Alibaba Group Holding Limited,US01609W1027,706,123.98
1,AAPL,Apple Inc.,US0378331005,4,172.12
2,TSLA,"Tesla, Inc.",US88160R1014,7664,904.55
3,SPCE,"Virgin Galactic Holdings, Inc.",US92766K1060,7111,8.85
4,AMZN,"Amazon.com, Inc.",US0231351067,388,3180.07


_UPD: и все-таки с spb акциями не получается хуже. Лучше взять всю выборку из finviz_

In [32]:
#создаем два листа со списками акций: из finviz, из spb
id_shares_finviz = finviz_df.id_share.to_list()
id_shares_spb = shares_spb.id_share.to_list()

In [33]:
df_finviz = pd.read_csv('finviz_shares.csv')

In [34]:
# акции из s&p500
sp500_df = df_finviz[(df_finviz['Index'] == 'S&P 500') | (df_finviz['Index'] == 'DJIA S&P500')]

In [35]:
sp500_df

Unnamed: 0,ticker,id_share,industry,Index,P/E,EPS (ttm),Insider Own,Perf Week,Forward P/E,EPS next Y,...,Market Cap_full_count,Income_full_count,Sales_full_count,Shs Outstand_full_count,Shs Float_full_count,Avg Volume_full_count,52_w_range_low,52_w_range_high,Volatility_low_%,Volatility_high_%
0,AAP,1,Consumer Cyclical,S&P 500,22.54,9.88,0.10,-4.20,16.78,11.37,...,1.423000e+13,6.464000e+08,1.097000e+13,6.285000e+07,6.208000e+07,679290.0,151.01,244.55,2.73,2.73
3,AAPL,4,Technology,DJIA S&P500,28.58,6.02,0.07,-0.45,26.29,6.42,...,2.817210e+15,1.005600e+14,3.783200e+14,1.639000e+13,1.631000e+13,101620000.0,116.21,182.94,1.87,2.63
6,AAL,7,Industrials,S&P 500,,-3.12,0.50,12.91,8.95,200.50,...,1.220000e+13,-1.993000e+09,2.988000e+13,6.485600e+08,6.409500e+08,39640000.0,14.90,26.09,4.41,4.95
18,ABMD,19,Healthcare,S&P 500,106.08,2.90,1.40,5.71,66.52,5.87,...,1.373000e+13,1.330000e+08,1.000000e+12,4.544000e+07,4.454000e+07,361530.0,261.27,379.30,4.82,4.73
27,ABC,28,Healthcare,S&P 500,18.38,7.70,0.20,2.65,12.25,7.22,...,2.988000e+13,1.610000e+12,2.211000e+14,2.085600e+08,1.497100e+08,1050000.0,100.71,143.56,2.52,2.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8497,XRAY,8498,Healthcare,S&P 500,28.61,1.90,16.40,3.25,17.01,10.30,...,1.194000e+13,4.175000e+08,4.250000e+12,2.186000e+08,2.178500e+08,1740000.0,48.13,69.54,2.37,2.74
8505,YUM,8506,Consumer Cyclical,S&P 500,23.94,5.18,0.16,-0.96,25.11,8.85,...,3.642000e+13,1.580000e+12,6.440000e+12,2.960000e+08,2.926700e+08,1590000.0,101.94,139.85,2.58,2.34
8534,ZBH,8535,Healthcare,S&P 500,29.28,4.06,0.10,-2.06,14.99,7.23,...,2.450000e+13,8.492000e+08,7.880000e+12,2.088000e+08,1.974100e+08,1920000.0,108.47,180.36,4.95,3.29
8536,ZBRA,8537,Technology,S&P 500,29.76,15.68,0.50,-8.20,23.68,7.48,...,2.519000e+13,8.450000e+08,5.470000e+12,5.342000e+07,5.317000e+07,344490.0,422.19,615.00,3.32,3.48


In [36]:
sp500_df.industry.unique()

array(['Consumer Cyclical', 'Technology', 'Industrials', 'Healthcare',
       'Consumer Defensive', 'Utilities', 'Financial', 'Basic Materials',
       'Real Estate', 'Energy', 'Communication Services'], dtype=object)

In [37]:
industrials_shares = sp500_df[sp500_df['industry'] == 'Industrials'].id_share.to_list()

In [38]:
# lists из id_share тикеров s&p500 по секторам экономики
consumer_shares = sp500_df[sp500_df['industry'] == 'Consumer Cyclical'].id_share.to_list()
technology_share = sp500_df[sp500_df['industry'] == 'Technology'].id_share.to_list()
healthcare_share = sp500_df[sp500_df['industry'] == 'Healthcare'].id_share.to_list()
defensive_share = sp500_df[sp500_df['industry'] == 'Consumer Defensive'].id_share.to_list()
utils_share = sp500_df[sp500_df['industry'] == 'Utilities'].id_share.to_list()
financial_share = sp500_df[sp500_df['industry'] == 'Financial'].id_share.to_list()
materials_share = sp500_df[sp500_df['industry'] == 'Basic Materials'].id_share.to_list()
estate_share = sp500_df[sp500_df['industry'] == 'Real Estate'].id_share.to_list()
energy_share = sp500_df[sp500_df['industry'] == 'Energy'].id_share.to_list()
serv_share = sp500_df[sp500_df['industry'] == 'Communication Services'].id_share.to_list()

__Идея по выбору тикеров__

1) В начале мы предположили, что юзеры совершают покупки как минимум раз в 4 недели. Кол-во тикеров к покупке выбирается случайно, в диапазоне от 1 до 6 (k = random.randint(1, 7)).

2) Если k <= 2, то выбираем из списка spb100

3) Если 2<k<5: 
   - 2 тикера из популярных отраслей s&p500 (финансы, здравоохранение, технологии)
   - 2 тикера из непопулярынх отраслей s&p500 (consumer_shares, defensive_share, utils_share, materials_share, 
     estate_share, energy_share, serv_share)
     
4) Если 5<=k<=6: 
   - k-3 тикера из популярных отраслей s&p500 (финансы, здравоохранение, технологии)
   - 2 тикерa из непопулярынх отраслей s&p500
   - 1 любой тикер из любой отрасли не из s&p500

In [39]:
# если идея выше не сработает, то попробовать воспользоваться биномиальным распределением 
# (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binom.html)

In [40]:
list_1 = [technology_share, healthcare_share, financial_share]

In [41]:
list_2 = [consumer_shares, defensive_share, utils_share, materials_share, estate_share, energy_share, serv_share]

In [42]:
popular_share_sp = [] 
unpop_share_sp = []

In [43]:
# сливаем популярные id-ки акций
for feat in list_1: 
    popular_share_sp.extend(feat)

In [44]:
# сливаем непопулярные id-ки акций
for unfeat in list_2: 
    unpop_share_sp.extend(unfeat)

In [45]:
# акции не из s&p500
non_sp500_list = df_finviz[(df_finviz['Index'] == '-')].id_share.to_list()

In [46]:
len(non_sp500_list)

8056

In [47]:
data_share=dict()
week_amount = 6

In [49]:
for i in range(len(id_list)): 
    general_shares = []
    for j in range(week_amount): 
        amount_share = random.randint(1, 7)
        
        if amount_share <= 2: 
            
            shares_pop = random.sample(id_shares_spb, k=amount_share)
            
            general_shares.append(shares_pop)
            
        elif (amount_share > 2) and (amount_share < 5): 
            
            shares_pop = random.sample(popular_share_sp, k=2)
            shares_unpop = random.sample(unpop_share_sp, k=amount_share-2)
            shares_pop.extend(shares_unpop)
            
            general_shares.append(shares_pop)

            
        else:
            
            shares_pop = random.sample(popular_share_sp, k=2)
            shares_unpop = random.sample(unpop_share_sp, k=amount_share-3)
            shares_pop.extend(shares_unpop)
            any_share = random.sample(non_sp500_list, k=1)
            shares_pop.extend(any_share)
            
            general_shares.append(shares_pop)

    data_share[id_list[i]] = general_shares

In [50]:
#data_share

In [51]:
#из словаря составим серию
data_share = pd.Series(data_share)

In [52]:
data_share_df = pd.DataFrame({'shares': data_share}).reset_index()

In [53]:
data_share_df

Unnamed: 0,index,shares
0,1,"[[7681, 8498, 5529, 5351, 1174, 2026, 5907], [..."
1,2,"[[8498, 1703, 6711], [2661], [6743, 5817, 6241..."
2,3,"[[7658, 2240, 2146, 4764, 6441], [417, 1770, 5..."
3,4,"[[3428], [2995], [2806, 1982, 4741, 8157, 89],..."
4,5,"[[4060, 7662], [354], [1701, 4882, 1546, 5158]..."
...,...,...
24995,24996,"[[4533, 4841, 3894], [1462, 1793, 5297], [5913..."
24996,24997,"[[1505], [8181, 4737, 767, 2114, 1677], [5788,..."
24997,24998,"[[7937, 3325], [1982, 6184, 1443, 1290], [4301..."
24998,24999,"[[1429, 6400, 3673, 2473, 6044], [6013, 19, 23..."


In [63]:
# "стакаем" акции
f = data_share_df.apply(lambda x: pd.Series(x['shares']), axis=1).stack().reset_index(level=-1, drop=True)
f.name = 'shares_deal'

In [65]:
data_share_df = data_share_df.drop('shares', axis=1).join(f)

In [66]:
data_share_df = data_share_df.reset_index()

In [67]:
data_share_df.shape

(150000, 4)

In [70]:
data_share_df.head(30)

Unnamed: 0,level_0,index,shares_deal
0,0,1,"[7681, 8498, 5529, 5351, 1174, 2026, 5907]"
1,0,1,"[5511, 3289, 7661, 8398]"
2,0,1,"[1274, 3627, 1833, 1, 2387, 7291]"
3,0,1,"[2661, 354]"
4,0,1,"[4383, 349, 2173, 7362, 4671, 7162]"
5,0,1,"[7463, 5423, 1855, 5931]"
6,1,2,"[8498, 1703, 6711]"
7,1,2,[2661]
8,1,2,"[6743, 5817, 6241, 8276, 4412, 2396]"
9,1,2,"[95, 5270, 775, 4768, 6509, 4789, 4888]"


In [71]:
temp_list = pd.Series(data_share_df['shares_deal'])

In [72]:
df = pd.concat([df, temp_list], axis=1)

In [73]:
df['shares_deal'].isna().sum()

0

In [74]:
file_name = 'interactions.csv'

In [75]:
df.to_csv(file_name, encoding='utf-8', index=False)