In [41]:
import numpy as np
import pandas as pd
import copy
from sklearn.preprocessing import MinMaxScaler
import nltk
import json

In [21]:
# Load the minute-level price table; the first CSV column is used as the index.
df_stock = pd.read_csv('price_minute.csv', header=0, index_col=0)
# Break the 't' timestamp string on the space into separate 'date' and 'time' columns.
df_stock[['date', 'time']] = df_stock['t'].str.split(' ', expand=True)
# 't' is now redundant; 'n' is not referenced by any later visible cell.
df_stock = df_stock.drop(columns=['t', 'n'])

In [22]:
# Preview the first rows to check the date/time split and the remaining price columns.
df_stock.head()

Unnamed: 0,v,vw,o,c,h,l,date,time
2182,314,1646.0508,1644.77,1644.77,1644.77,1644.77,2019-01-08,09:00:00
2181,507,1649.7491,1650.0,1650.0,1650.0,1650.0,2019-01-08,09:01:00
2180,320,1649.992,1650.0,1649.96,1650.0,1649.96,2019-01-08,09:02:00
2179,166,1650.0018,1650.0,1650.0,1650.0,1650.0,2019-01-08,09:12:00
2178,110,1650.9591,1650.96,1650.96,1650.96,1650.96,2019-01-08,09:19:00


In [23]:
# Load the news archive. For an .npz file np.load returns a lazy NpzFile: every
# `df_news[key]` access reads that array from disk again, so bind the arrays to
# local names before indexing them repeatedly (e.g. inside a loop).
# NOTE(review): allow_pickle=True unpickles object arrays, which can execute
# arbitrary code — only use it with a data_text.npz that comes from a trusted source.
df_news = np.load('data_text.npz', allow_pickle=True)

In [26]:
def gather_pos(pos_i):
    """Return the 'upos' tag of each token record in the first element of ``pos_i``.

    Args:
        pos_i: Indexable container whose first element is an iterable of
            dict-like token records, each with a 'upos' key. Elements after the
            first are ignored.

    Returns:
        list: The 'upos' values, in token order.
    """
    return [token['upos'] for token in pos_i[0]]

def process_stock(df, window=10):
    """Turn a slice of minute bars into fixed-size feature vectors.

    The 'date' and 'time' columns are dropped and the remaining columns are
    treated as numeric features. The rows are cut into consecutive,
    non-overlapping windows of ``window`` rows (starting at row 1; row 0 only
    serves as history for the first window). For each window the feature vector
    is the concatenation of:

    * the window's values min-max scaled per column (scaler fitted on every row
      of ``df``), flattened row by row, and
    * the relative change of each value against the row before it,
      ``(x[t] - x[t-1]) / (x[t-1] + 1e-6)``, flattened the same way.

    Args:
        df: DataFrame with 'date' and 'time' columns plus numeric feature columns.
        window: Number of rows per sample. Defaults to 10, the original
            hard-coded window length.

    Returns:
        numpy.ndarray of shape ``(n_samples, 2 * window * n_features)``, or
        ``None`` when ``df`` has fewer than ``window + 2`` rows and therefore
        cannot produce a single sample. (Previously only an empty ``df`` gave
        ``None``; 1..window+1 rows gave an empty array that callers checking
        ``is None`` did not skip.)
    """
    values = df.drop(['date', 'time'], axis=1).to_numpy()
    # The first sample needs rows 0..window, and range() below excludes
    # len(values), so at least window + 2 rows are required.
    if len(values) < window + 2:
        return None

    # MinMaxScaler copies its input by default, so `values` stays unscaled.
    scaled = MinMaxScaler().fit_transform(values)

    samples = []
    for i in range(window + 1, len(values), window):
        # `history` is the same window shifted back by one row.
        history = values[i - window - 1:i - 1].reshape(-1)
        chunk = values[i - window:i].reshape(-1)
        chunk_scaled = scaled[i - window:i].reshape(-1)
        # The small epsilon keeps the division finite when a previous value is 0.
        change = (chunk - history) / (history + 1e-6)
        samples.append(np.concatenate((chunk_scaled, change), axis=0))
    return np.array(samples)

In [58]:
# Materialise both arrays once: `df_news` is a lazily loaded .npz archive, so
# indexing `df_news['data']` / `df_news['pos']` inside the loop would re-read and
# unpickle the whole array on every iteration.
news_items, pos_items = df_news['data'], df_news['pos']

data = []
for i, news in enumerate(news_items):
    # 'pub_time' is split on whitespace into date and time; everything from '+'
    # on is dropped (presumably a UTC offset — TODO confirm the remaining clock
    # time is in the same timezone as the price timestamps).
    pub_parts = news['pub_time'].split()
    date, time = pub_parts[0], pub_parts[1].split('+')[0]
    # Price rows on the publication date at or after the publication time
    # (string comparison on the 'time' column).
    same_day_after = (df_stock['date'] == date) & (df_stock['time'] >= time)
    stock = process_stock(df_stock.loc[same_day_after])
    if stock is None:
        # No usable price data for this article: skip it before doing any POS work.
        continue
    pos = gather_pos(pos_items[i])
    # .tolist() makes the feature array JSON-serialisable.
    data.append({'news': news['text'], 'pos': pos, 'stock': stock.tolist()})
    

In [59]:
# Persist the assembled records (news text, POS tags, stock features) as JSON.
with open('data.json', mode='w') as json_file:
    json.dump(data, json_file)

In [28]:

# Show only the first record: displaying the whole list would embed every article
# and its full feature matrix in the notebook output.
data[:1]

[{'news': 'WASHINGTON, Oct 6 (Reuters) - The U.S. House of Representatives antitrust subcommittee’s report on the business practices of four large tech companies is expected to be broken up into three reports instead of one, according to two sources with direct knowledge of the matter, as bipartisanship among committee members appeared to break down. The report from a Judiciary Committee panel focuses on how the business practices of Facebook, Alphabet’s Google , Amazon and Apple hurt rivals. It is likely to be broken up into a majority Democrat-led staff report and two others from Republican members on the panel, the sources said.\n\nThe highly anticipated report is expected to be released this week. (Reporting by Nandita Bose and Diane Bartz in Washington Editing by Chizu Nomiyama)',
  'pos': ['NOUN',
   'ADJ',
   'NOUN',
   'ADP',
   'PROPN',
   'PROPN',
   'ADV',
   'PART',
   'AUX',
   'VERB',
   'ADP',
   'NUM',
   'NOUN',
   'PUNCT',
   'NOUN'],
  'stock': array([[ 7.50507849e-0

In [42]:
# NOTE(review): the preprocessing loop assigns `stock` the return value of
# process_stock (an array or None), but the output captured below is a DataFrame
# and this cell's execution count is lower than the loop's — the output appears
# to predate the current code. Re-run after Restart & Run All.
stock

Unnamed: 0,v,vw,o,c,h,l,date,time
53659,11042,3130.3495,3130.7000,3132.45,3132.45,3129.0000,2020-10-06,15:52:00
53658,6256,3132.3779,3131.8825,3131.98,3133.67,3131.3700,2020-10-06,15:53:00
53657,4098,3132.7889,3131.7100,3132.45,3133.51,3131.3844,2020-10-06,15:54:00
53656,4754,3132.2282,3133.0000,3132.47,3133.00,3131.2100,2020-10-06,15:55:00
53655,7227,3133.4775,3132.9900,3135.00,3135.00,3131.7211,2020-10-06,15:56:00
...,...,...,...,...,...,...,...,...
53352,204,3093.2222,3092.0000,3092.00,3092.00,3092.0000,2020-10-06,23:39:00
53351,516,3098.3002,3099.0000,3099.00,3099.00,3099.0000,2020-10-06,23:49:00
53350,962,3099.8020,3100.0000,3100.00,3100.00,3100.0000,2020-10-06,23:55:00
53349,282,3099.8145,3100.0000,3100.00,3100.00,3100.0000,2020-10-06,23:56:00


In [16]:
# Same pub_time parsing as the preprocessing loop, applied to the first article only:
# date is the first whitespace-separated field, time is the second with any '+…' suffix removed.
first_pub_time = df_news['data'][0]['pub_time'].split()
d, t = first_pub_time[0], first_pub_time[1].split('+')[0]

In [29]:
# Price rows on the first article's publication date, from its publication time onwards.
on_pub_date = df_stock['date'] == d
at_or_after_pub = df_stock['time'] >= t
dd = df_stock.loc[on_pub_date & at_or_after_pub]

In [30]:
# Inspect the filtered price rows for the first article (pandas truncates the display).
dd

Unnamed: 0,v,vw,o,c,h,l,date,time
53659,11042,3130.3495,3130.7000,3132.45,3132.45,3129.0000,2020-10-06,15:52:00
53658,6256,3132.3779,3131.8825,3131.98,3133.67,3131.3700,2020-10-06,15:53:00
53657,4098,3132.7889,3131.7100,3132.45,3133.51,3131.3844,2020-10-06,15:54:00
53656,4754,3132.2282,3133.0000,3132.47,3133.00,3131.2100,2020-10-06,15:55:00
53655,7227,3133.4775,3132.9900,3135.00,3135.00,3131.7211,2020-10-06,15:56:00
...,...,...,...,...,...,...,...,...
53352,204,3093.2222,3092.0000,3092.00,3092.00,3092.0000,2020-10-06,23:39:00
53351,516,3098.3002,3099.0000,3099.00,3099.00,3099.0000,2020-10-06,23:49:00
53350,962,3099.8020,3100.0000,3100.00,3100.00,3100.0000,2020-10-06,23:55:00
53349,282,3099.8145,3100.0000,3100.00,3100.00,3100.0000,2020-10-06,23:56:00


In [38]:
# Inspect the token records in the first element of the first article's POS
# annotation (presumably its first sentence — TODO confirm); this is the list
# whose 'upos' fields gather_pos collects.
df_news['pos'][0][0]

[{'id': 1,
  'text': 'House',
  'upos': 'NOUN',
  'xpos': 'NN',
  'feats': 'Number=Sing',
  'misc': 'start_char=0|end_char=5'},
 {'id': 2,
  'text': 'antitrust',
  'upos': 'ADJ',
  'xpos': 'JJ',
  'feats': 'Degree=Pos',
  'misc': 'start_char=6|end_char=15'},
 {'id': 3,
  'text': 'probe',
  'upos': 'NOUN',
  'xpos': 'NN',
  'feats': 'Number=Sing',
  'misc': 'start_char=16|end_char=21'},
 {'id': 4,
  'text': 'on',
  'upos': 'ADP',
  'xpos': 'IN',
  'misc': 'start_char=22|end_char=24'},
 {'id': 5,
  'text': 'Big',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'misc': 'start_char=25|end_char=28'},
 {'id': 6,
  'text': 'Tech',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'misc': 'start_char=29|end_char=33'},
 {'id': 7,
  'text': 'likely',
  'upos': 'ADV',
  'xpos': 'RB',
  'misc': 'start_char=34|end_char=40'},
 {'id': 8,
  'text': 'to',
  'upos': 'PART',
  'xpos': 'TO',
  'misc': 'start_char=41|end_char=43'},
 {'id': 9,
  'text': 'be',
  'upos': 'AUX',
 