In [9]:
import os   
import glob
import json
import numpy as np
import pandas as pd
import multiprocessing
from joblib import Parallel, delayed

pd.options.display.max_rows = 300

### Goals of this notebook: 
1. First fill seconds for each time id until 600 secs and attempt to write to file for entire book
2. Then denormalize prices with multipliers for each stock and time id (maybe write to file also or just for each stock...)
3. After second filling, use time chains to extend each time id to include previous seconds and reindex seconds of time id
    a. Best to keep it in OG format in case we do not / or want to include more features

In [2]:
DENORM = pd.read_csv("denormalize_multipliers.csv")
BETA = pd.read_csv("all_times.csv")

def fill_seconds(df):
    df = df.reset_index(drop=True)
    index_range = pd.Index(range(600), name='seconds_in_bucket')
    df = df.set_index('seconds_in_bucket').reindex(index_range)
    
    # Forward fill & back fill seconds
    df = df.ffill().reset_index()
    return df.bfill().iloc[:600]

def denormalize(df):
    global DENORM

    df = df.merge(DENORM, how="left")
    df['ask_price1'] *= df['multiplier']
    df['ask_price2'] *= df['multiplier']
    df['bid_price1'] *= df['multiplier']
    df['bid_price2'] *= df['multiplier']
    
    return df.drop(['multiplier'], axis=1)

### Denorm and Fill One Stock

In [3]:
df = pd.read_csv("individual_book_train/stock_0.csv")
denorm = denormalize(df)
dfill = denorm.groupby("time_id").apply(fill_seconds)
dfill = dfill.reset_index(drop=True).astype({'time_id': int, 'stock_id': int})
dfill

Unnamed: 0,seconds_in_bucket,time_id,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,0,5,194.074070,194.244438,194.064031,194.254477,3.0,226.0,2.0,100.0,0
1,1,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
2,2,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
3,3,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
4,4,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
...,...,...,...,...,...,...,...,...,...,...,...
2297995,595,32767,208.408205,208.508225,208.388205,208.548225,92.0,90.0,26.0,28.0,0
2297996,596,32767,208.408205,208.508225,208.388205,208.548225,92.0,90.0,26.0,28.0,0
2297997,597,32767,208.408205,208.508225,208.388205,208.548225,92.0,90.0,26.0,28.0,0
2297998,598,32767,208.408205,208.508225,208.388205,208.548225,92.0,90.0,26.0,28.0,0


### Parallelize for All Stocks

In [4]:
def denorm_fill(sid, df):
    file = f"individual_book_train/stock_{sid}.csv" 
    df = pd.concat([df, denormalize(pd.read_csv(file))])
    return df.groupby("time_id").apply(fill_seconds).reset_index(drop=True).astype({'time_id': int, 'stock_id': int})

def denorm_fill_stocks():
    ret = pd.DataFrame()
    ret = Parallel(n_jobs=-1, verbose=1) (
        delayed(denorm_fill)(sid, ret) for sid in pd.read_csv("all_times.csv").stock_id.unique()
    )  
    ret = pd.concat(ret, ignore_index=True)
    return ret

dfill_all = denorm_fill_stocks()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:  2.5min finished


In [7]:
dfill_all

Unnamed: 0,seconds_in_bucket,time_id,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,0,5,194.074070,194.244438,194.064031,194.254477,3.0,226.0,2.0,100.0,0
1,1,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
2,2,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
3,3,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
4,4,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
...,...,...,...,...,...,...,...,...,...,...,...
257359195,595,32767,194.781392,194.871618,194.771365,194.881651,101.0,100.0,1.0,100.0,126
257359196,596,32767,194.781392,194.871618,194.771365,194.881651,101.0,100.0,1.0,100.0,126
257359197,597,32767,194.781392,194.871618,194.771365,194.881651,101.0,100.0,1.0,100.0,126
257359198,598,32767,194.781392,194.871618,194.771365,194.881651,101.0,100.0,1.0,100.0,126


### Save to File

In [8]:
#dfill_all.to_hdf('dnorm_fill.h5', key='df', mode='w')

#### Reread data:

In [3]:
df = pd.read_hdf('dnorm_fill.h5', 'df') 
df

Unnamed: 0,seconds_in_bucket,time_id,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,0,5,194.074070,194.244438,194.064031,194.254477,3.0,226.0,2.0,100.0,0
1,1,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
2,2,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
3,3,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
4,4,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
...,...,...,...,...,...,...,...,...,...,...,...
257359195,595,32767,194.781392,194.871618,194.771365,194.881651,101.0,100.0,1.0,100.0,126
257359196,596,32767,194.781392,194.871618,194.771365,194.881651,101.0,100.0,1.0,100.0,126
257359197,597,32767,194.781392,194.871618,194.771365,194.881651,101.0,100.0,1.0,100.0,126
257359198,598,32767,194.781392,194.871618,194.771365,194.881651,101.0,100.0,1.0,100.0,126


### Extend timeids to include only one previous timeid for now...

* say: tid_prev2 -> tid_prev -> tid_curr
* only include tid_prev -> tid_curr
* use chains dict to find time ordered tid-pairs
* replace tids with extended tids for all stocks

In [40]:
df0 = df[df['stock_id']==0]
df05 = df0[df0['time_id']==5]
df05

Unnamed: 0,seconds_in_bucket,time_id,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,0,5,194.074070,194.244438,194.064031,194.254477,3.0,226.0,2.0,100.0,0
1,1,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
2,2,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
3,3,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
4,4,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
...,...,...,...,...,...,...,...,...,...,...,...
595,595,5,194.404806,194.525078,194.384767,194.535097,100.0,3.0,26.0,3.0,0
596,596,5,194.404806,194.525078,194.384767,194.535097,100.0,3.0,26.0,3.0,0
597,597,5,194.404806,194.525078,194.384767,194.535097,100.0,3.0,26.0,3.0,0
598,598,5,194.404806,194.525078,194.384767,194.535097,100.0,3.0,26.0,3.0,0


### Parallelize TimeID Extend

In [84]:
def chain_time(df, time_id, end = 77):  # default is just one extended timeid
    ret = df.query(f'time_id == {time_id}')
    if time_id in chains:
        for i, prev_id in enumerate(chains[time_id]):  # chains[5]: 1205, 31471
            if i == end:     
                return ret
            pre = df.query(f'time_id == {prev_id}') 
            ret = pd.concat([pre, ret], axis=0, ignore_index=True)
            ret['time_id'] = time_id
            ret['seconds_in_bucket'] = np.arange(0, (600)*(i+2), 1, dtype=int)
            
    return ret
    
def extend_times(df, end=77):
    ret = pd.DataFrame()
    ret = Parallel(n_jobs=-1, verbose=1)(
        delayed(chain_time)(df, time_id, end) for time_id in df.time_id.unique()
    )
    ret = pd.concat(ret, ignore_index=True)
    
    return ret

def get_chains_dict(filename="chains.json"):   
    with open(filename) as f:
        return {int(chain[-1]): [int(c) for c in chain[-2::-1]] for chain in json.load(f)} 

chains = get_chains_dict()

### Test functions on Stock 0 

In [105]:
df0 = df[df['stock_id'] == 0]

chain_time(df0, 5, end = 3)

Unnamed: 0,seconds_in_bucket,time_id,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,0,5,191.024379,191.164541,190.974321,191.174558,100.0,7.0,300.0,1.0,0
1,1,5,191.024379,191.164541,190.974321,191.174558,100.0,7.0,300.0,1.0,0
2,2,5,191.024379,191.174558,190.984321,191.184570,100.0,1.0,100.0,110.0,0
3,3,5,191.024379,191.174558,190.984321,191.184570,100.0,1.0,100.0,110.0,0
4,4,5,191.024379,191.174558,190.984321,191.184570,100.0,1.0,100.0,110.0,0
...,...,...,...,...,...,...,...,...,...,...,...
2395,2395,5,194.404806,194.525078,194.384767,194.535097,100.0,3.0,26.0,3.0,0
2396,2396,5,194.404806,194.525078,194.384767,194.535097,100.0,3.0,26.0,3.0,0
2397,2397,5,194.404806,194.525078,194.384767,194.535097,100.0,3.0,26.0,3.0,0
2398,2398,5,194.404806,194.525078,194.384767,194.535097,100.0,3.0,26.0,3.0,0


In [106]:
x = extend_times(df0, end=0)
x.reset_index(drop=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 3830 out of 3830 | elapsed:   45.2s finished


Unnamed: 0,seconds_in_bucket,time_id,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,0,5,194.074070,194.244438,194.064031,194.254477,3.0,226.0,2.0,100.0,0
1,1,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
2,2,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
3,3,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
4,4,5,194.074070,194.244438,194.064031,194.254477,3.0,100.0,2.0,100.0,0
...,...,...,...,...,...,...,...,...,...,...,...
2297995,595,32767,208.408205,208.508225,208.388205,208.548225,92.0,90.0,26.0,28.0,0
2297996,596,32767,208.408205,208.508225,208.388205,208.548225,92.0,90.0,26.0,28.0,0
2297997,597,32767,208.408205,208.508225,208.388205,208.548225,92.0,90.0,26.0,28.0,0
2297998,598,32767,208.408205,208.508225,208.388205,208.548225,92.0,90.0,26.0,28.0,0


In [115]:
%%time
x.to_hdf("test_s0.h5", key='df', mode='w')

Wall time: 247 ms


In [116]:
%%time
x.to_feather("test_s0.fth")

Wall time: 46.3 ms


### Extend All Stock Time IDs

#### Extend by One TimeID

In [None]:
for stock_id in df.stock_id.unique()[14:]:
    print(f"Stock {stock_id} extending...")
    # default will extend stocks by max possible chain length
    extended_df = extend_times(df[df['stock_id']==stock_id], end=0)
    extended_df.to_feather(f"extend_one/stock_{stock_id}.fth")   # feat

Stock 15 extending...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done 3830 out of 3830 | elapsed:   43.2s finished


Stock 16 extending...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 2404 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done 3830 out of 3830 | elapsed:   32.9s finished


Stock 17 extending...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 2404 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done 3830 out of 3830 | elapsed:   32.0s finished


Stock 18 extending...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 2404 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 3830 out of 3830 | elapsed:   32.5s finished


Stock 19 extending...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 2404 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done 3830 out of 3830 | elapsed:   34.7s finished


Stock 20 extending...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 2404 tasks      | elapsed:   20.1s


#### Extend by Two TimeIDs

In [None]:
for stock_id in df.stock_id.unique()[:]:
    print(f"Stock {stock_id} extending...")
    # default will extend stocks by max possible chain length
    extended_df = extend_times(df[df['stock_id']==stock_id], end=1)
    extended_df.to_feather(f"extend_two/stock_{stock_id}.fth")   

#### Extend by Three TimeIDs

In [None]:
for stock_id in df.stock_id.unique()[:]:
    print(f"Stock {stock_id} extending...")
    # default will extend stocks by max possible chain length
    extended_df = extend_times(df[df['stock_id']==stock_id], end=2)
    extended_df.to_feather(f"extend_three/stock_{stock_id}.fth")   

#### Extend to Max Chain Length

In [None]:
for stock_id in df.stock_id.unique()[6:]:
    print(f"Stock {stock_id} extending...")
    # default will extend stocks by max possible chain length
    extended_df = extend_times(df[df['stock_id']==stock_id])
    extended_df.to_feather(f"extend_all/stock_{stock_id}.fth")   # feather is even faster than hd5, can also use parquet