In [106]:
import numpy as np
import pandas as pd
import matplotlib as plt
import bs4 as bs
import requests
from lxml import html
from pprint import pprint
import random

__Дизайн взаимодействий user-item__


`1)` Работаем пока только с иностранными компаниями (USA)

`2)` Облигации и ETF тоже вычеркиваем из списка покупок

`3)` Будем выбирать по несколько акций (3-5) в месяц из списка популярных

`4)` Начальные условия: пусть раз в неделю пользователь совершает одну сделку на покупку актива из списка `spb_list` 

`5)` Временные рамки: 24 недели. Гарантируется, что пользователь сделает хотя бы одну покупку за 4 недели.

__Фиксация покупок ценных бумаг по неделям__

In [107]:
# Synthetic user ids 1..100000 (inclusive).
id_list = np.arange(1, 100_001)

In [108]:
# Maps user id -> list of week numbers in which that user makes a purchase.
data = {}

In [109]:
# Generate the week numbers in which each user buys securities: the span
# [FIRST_WEEK, LAST_WEEK) is cut into consecutive windows of WINDOW_WEEKS weeks
# and one week is drawn uniformly at random from every window.
# NOTE: no random seed is set in the visible cells, so every run yields different weeks.
FIRST_WEEK = 28   # first week of the simulated period (inclusive)
LAST_WEEK = 52    # end of the simulated period (exclusive)
WINDOW_WEEKS = 4  # exactly one purchase week is drawn per window of this length

for user_id in id_list:
    data[user_id] = [
        # range() yields the window starts and always terminates; a
        # `while start != end` loop would spin forever if the span were not a
        # multiple of WINDOW_WEEKS. The end is clamped so the last window
        # never reaches past LAST_WEEK.
        random.randrange(window_start, min(window_start + WINDOW_WEEKS, LAST_WEEK))
        for window_start in range(FIRST_WEEK, LAST_WEEK, WINDOW_WEEKS)
    ]

In [110]:
# Turn the {user id: purchase weeks} dict into a Series indexed by user id.
data = pd.Series(data=data)

In [111]:
data

1         [31, 33, 39, 42, 44, 49]
2         [31, 33, 36, 41, 46, 48]
3         [30, 33, 39, 40, 46, 50]
4         [31, 33, 39, 40, 45, 51]
5         [28, 32, 38, 42, 46, 48]
                    ...           
99996     [29, 34, 37, 43, 45, 50]
99997     [29, 34, 38, 40, 47, 51]
99998     [28, 33, 36, 42, 45, 48]
99999     [31, 32, 39, 43, 46, 51]
100000    [29, 35, 38, 43, 44, 51]
Length: 100000, dtype: object

In [112]:
# One row per user: the week lists go to `week_no`, and reset_index() moves
# the user ids out of the index into a regular column named `index`.
week_frame = pd.DataFrame({'week_no': data})
df = week_frame.reset_index()

In [113]:
df.head()

Unnamed: 0,index,week_no
0,1,"[31, 33, 39, 42, 44, 49]"
1,2,"[31, 33, 36, 41, 46, 48]"
2,3,"[30, 33, 39, 40, 46, 50]"
3,4,"[31, 33, 39, 40, 45, 51]"
4,5,"[28, 32, 38, 42, 46, 48]"


In [114]:
# Give the user-id column a meaningful name.
df = df.rename({'index': 'id'}, axis='columns')

In [115]:
df.head()

Unnamed: 0,id,week_no
0,1,"[31, 33, 39, 42, 44, 49]"
1,2,"[31, 33, 36, 41, 46, 48]"
2,3,"[30, 33, 39, 40, 46, 50]"
3,4,"[31, 33, 39, 40, 45, 51]"
4,5,"[28, 32, 38, 42, 46, 48]"


In [116]:
# "Stack" the week numbers: one entry per (user, purchase week). Every exploded
# value keeps the row label of the `df` row it came from, so the result can be
# joined back onto `df` by index.
# Series.explode replaces the per-row `pd.Series(...)` + stack() construction,
# which creates one Series object per user and is very slow for 100k users.
# explode() yields object dtype, hence the cast back to integers.
s = df['week_no'].explode().astype('int64')

In [117]:
# Name the exploded weeks and swap them in for the list-valued `week_no` column;
# the join on the row label repeats each user row once per purchase week.
# NOTE(review): 'weak_deal' looks like a typo for 'week_deal'; kept as-is because
# it is the column name written to the exported CSV.
s = s.rename('weak_deal')
df = df.drop(columns=['week_no']).join(s)

In [118]:
# Sanity check: preview the first rows of the exploded frame.
df.head(n=15)

Unnamed: 0,id,weak_deal
0,1,31
0,1,33
0,1,39
0,1,42
0,1,44
0,1,49
1,2,31
1,2,33
1,2,36
1,2,41


__Забираем данные из СПБ биржи__

In [14]:
# Load the ticker listing; the path is relative to the notebook's working directory.
shares_df = pd.read_csv(filepath_or_buffer='spb100.csv')

In [15]:
shares_df.head()

Unnamed: 0,tickers,name_company,ISIN
0,BABA,Alibaba Group Holding Limited,US01609W1027
1,AAPL,Apple Inc.,US0378331005
2,TSLA,"Tesla, Inc.",US88160R1014
3,SPCE,"Virgin Galactic Holdings, Inc.",US92766K1060
4,AMZN,"Amazon.com, Inc.",US0231351067


In [16]:
# Distinct ISIN codes present in the listing (not referenced again in the visible cells).
isin_tickers = shares_df['ISIN'].unique()

Отфильтруем по ISIN только те бумаги, которые торгуются в США.

In [18]:
# Keep only securities with a US ISIN. The first two characters of an ISIN are
# its country code, so test the prefix instead of searching for "US" anywhere
# in the string (which would also match codes that merely contain "us").
# The comparison stays case-insensitive; rows with a missing ISIN are dropped.
is_us_isin = shares_df['ISIN'].str.upper().str.startswith('US', na=False)
shares_df = shares_df.loc[is_us_isin]

# Drop tickers that contain "@". The character is matched literally rather than
# as a regular expression; rows with a missing ticker are dropped as well.
has_at_sign = shares_df['tickers'].str.contains('@', regex=False, na=True)
shares_df = shares_df.loc[~has_at_sign]

In [19]:
shares_df.head()

Unnamed: 0,tickers,name_company,ISIN
0,BABA,Alibaba Group Holding Limited,US01609W1027
1,AAPL,Apple Inc.,US0378331005
2,TSLA,"Tesla, Inc.",US88160R1014
3,SPCE,"Virgin Galactic Holdings, Inc.",US92766K1060
4,AMZN,"Amazon.com, Inc.",US0231351067


In [20]:
shares_df.shape

(92, 3)

In [21]:
# Plain Python list of the remaining ticker symbols.
us_tickers = shares_df['tickers'].tolist()

In [22]:
# Strip surrounding whitespace (e.g. stray newline characters) from every ticker, just in case.
us_tickers = [ticker.strip() for ticker in us_tickers]

In [23]:
# Pick how many stocks to buy this week: an integer from 1 to 6 inclusive.
# (The per-user loop further down draws its own count; `k` is not referenced there.)
k = random.randrange(1, 7)

In [24]:
k

3

In [25]:
# Number of purchase weeks (ticker baskets) generated per user.
week_amount = 6

In [26]:
# Maps user id -> list of ticker baskets, one basket per purchase week.
data_share = {}

In [27]:
# For every user draw `week_amount` baskets. Each basket holds 1-6 tickers
# sampled without replacement from `us_tickers`; the basket size is drawn
# first, then the tickers, once per week.
for user_id in id_list:
    data_share[user_id] = [
        random.sample(us_tickers, k=random.randint(1, 6))
        for _ in range(week_amount)
    ]

In [None]:
#data_share

In [28]:
# Turn the {user id: baskets} dict into a Series indexed by user id.
data_share = pd.Series(data=data_share)

In [29]:
# One row per user: the baskets go to `shares`, and reset_index() moves the
# user ids out of the index into a regular column named `index`.
shares_frame = pd.DataFrame({'shares': data_share})
data_share_df = shares_frame.reset_index()

In [30]:
data_share_df.head()

Unnamed: 0,index,shares
0,1,"[[MSTR, M, TWTR, BIDU], [LMT, VALE, QCOM, CLOV..."
1,2,"[[TTE, CRM], [WISH, VZ, BIDU, MOMO], [AMD, BA]..."
2,3,"[[TTE, KO, WISH], [OXY, IBM, NVDA, HOOD], [T, ..."
3,4,"[[VRTX], [T, VZ], [FB, GOOGL, INTC], [ET, EHTH..."
4,5,"[[TCS, BMY, TDOC, PINS, UAL, JPM], [XOM, MOMO,..."


In [31]:
# "Stack" the baskets: one entry per (user, purchase week), each entry being
# that week's list of tickers. explode() unpacks only the outer list and keeps
# the row label of the source row, so the result can be joined back by index.
# It replaces the per-row `pd.Series(...)` + stack() construction, which
# creates one Series object per user and is very slow for 100k users.
f = data_share_df['shares'].explode()
f.name = 'shares_deal'

In [32]:
# Swap the nested `shares` column for the exploded baskets; the join on the
# row label repeats each user row once per basket.
data_share_df = data_share_df.drop(columns=['shares']).join(f)

In [38]:
data_share_df.head()

Unnamed: 0,index,shares_deal
0,1,"[MSTR, M, TWTR, BIDU]"
0,1,"[LMT, VALE, QCOM, CLOV, SPCE, VIPS]"
0,1,[LMT]
0,1,"[ET, DISCA]"
0,1,"[OXY, MRNA, BYND]"


In [39]:
data_share_df.shape

(600000, 2)

In [103]:
# Baskets as a standalone Series, ready to be attached to the purchase-week frame.
temp_list = pd.Series(data=data_share_df.loc[:, 'shares_deal'])

In [119]:
# Attach each basket to its purchase week. Both objects were built by walking
# `id_list` in order with the same number of rows per user, so rows are paired
# by position rather than by label: the index holds repeated labels, and a
# label-based concat only works while both indexes happen to be identical.
# assign() also overwrites `shares_deal` on a re-run instead of appending a
# duplicate column.
if len(temp_list) != len(df):
    raise ValueError(
        f"cannot pair baskets with purchase weeks: {len(temp_list)} baskets vs {len(df)} weeks"
    )
df = df.assign(shares_deal=temp_list.to_numpy())

In [122]:
df.head(15)

Unnamed: 0,id,weak_deal,shares_deal
0,1,31,"[MSTR, M, TWTR, BIDU]"
0,1,33,"[LMT, VALE, QCOM, CLOV, SPCE, VIPS]"
0,1,39,[LMT]
0,1,42,"[ET, DISCA]"
0,1,44,"[OXY, MRNA, BYND]"
0,1,49,"[OKE, SPCE, XOM]"
1,2,31,"[TTE, CRM]"
1,2,33,"[WISH, VZ, BIDU, MOMO]"
1,2,36,"[AMD, BA]"
1,2,41,"[VRTX, TSN, FB, BYND, CNK, QCOM]"


In [123]:
# Output path for the generated interactions table (relative to the working directory).
file_name = "interactions.csv"

In [124]:
# Write the interactions table as UTF-8 CSV without the (repeated) row index.
df.to_csv(path_or_buf=file_name, index=False, encoding='utf-8')