In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import dask.dataframe as dd

### Concat using time_id

In [2]:
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# row order of every downstream frame reproducible across machines and runs.
files = sorted(glob.glob(os.path.join("./individual_book_train", "*.csv")))

def _fill_missing_seconds(book, n_seconds=600):
    """Expand ``book`` to exactly one row per second in ``range(n_seconds)``.

    Every second takes the values of the latest observed row at or before it.
    Seconds that precede the first observation have nothing to carry forward,
    so they take the first observed row instead. Rows whose ``seconds`` is
    ``>= n_seconds`` are never selected and therefore drop out.

    ``book`` must be non-empty and have a ``seconds`` column; the result keeps
    the column order of ``book`` and carries a fresh 0..n_seconds-1 index.
    """
    # searchsorted needs ascending input; a stable sort keeps file order on ties.
    book = book.sort_values("seconds", kind="mergesort")
    target = np.arange(n_seconds)

    # Position of the last observation with seconds <= target (-1 if none yet).
    source = np.searchsorted(book["seconds"].to_numpy(), target, side="right") - 1
    # Leading gap: fall back to the first observation instead of indexing row -1.
    source = np.clip(source, 0, None)

    return book.iloc[source].reset_index(drop=True).assign(seconds=target)


def concat_stocks(files, time_id = 5, n_seconds = 600):
    """Build a per-second WAP table for one ``time_id`` across all stock files.

    Parameters
    ----------
    files : iterable of str
        Paths of per-stock book CSVs. Each file must provide the columns
        ``time_id``, ``seconds_in_bucket``, ``stock_id``, ``bid_price1``,
        ``ask_price1``, ``bid_size1`` and ``ask_size1``.
    time_id : int, default 5
        Bucket to extract from every file.
    n_seconds : int, default 600
        Number of seconds per bucket; each stock yields one row for every
        second in ``range(n_seconds)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id`` (int), ``wap`` and ``seconds`` (int). Stocks are
        stacked in the order of ``files`` and each stock's rows are labelled
        0..n_seconds-1, so index labels repeat across stocks. A file with no
        rows for ``time_id`` contributes nothing.
    """
    frames = []

    for f in files:

        # LAZY READ: only the rows of the requested bucket are materialised
        df = dd.read_csv(f)

        # KEEP ONLY THE ROWS BELONGING TO time_id
        df = df[df['time_id'] == time_id].compute()

        # Nothing recorded for this bucket: there is no value to carry forward.
        if df.empty:
            continue

        # COMPUTE WAP COLUMN
        wap = (df["bid_price1"] * df["ask_size1"] + df["ask_price1"] * df["bid_size1"]) \
              / (df["bid_size1"] + df["ask_size1"])

        d = pd.DataFrame({"stock_id": df["stock_id"], "wap": wap, "seconds": df["seconds_in_bucket"]})

        ## FILLING ALL SECONDS WITH STATS FROM SECOND BEFORE (NO MISSING SECONDS)
        # Done positionally, so it does not depend on the row labels the
        # filtered frame inherited from the CSV.
        frames.append(_fill_missing_seconds(d, n_seconds))

    if not frames:
        return pd.DataFrame({
            "stock_id": pd.Series(dtype="int"),
            "wap": pd.Series(dtype="float"),
            "seconds": pd.Series(dtype="int"),
        })

    # One concat at the end instead of re-copying the accumulated frame per file.
    return pd.concat(frames).astype({'stock_id': 'int', 'seconds': 'int'})

### Creating dataframe (may take some time)
- All stocks concatenated by time id

In [3]:
# One row per (stock, second) for the chosen bucket; reading every file takes a while.
s = concat_stocks(files, time_id=5)

# Bare last expression so the notebook renders the (truncated) frame.
s

Unnamed: 0,stock_id,wap,seconds
0,0,1.001434,0
1,0,1.001448,1
2,0,1.001448,2
3,0,1.001448,3
4,0,1.001448,4
...,...,...,...
595,99,1.004193,595
596,99,1.004193,596
597,99,1.004092,597
598,99,1.004211,598


## Computing Beta

In [4]:
def compute_beta(d):
    """Add a constant ``beta`` column to ``d`` and return ``d``.

    ``d`` must carry the columns ``wap``, ``wap_mean``, ``market_mean_seconds``
    and ``market_mean``. Beta is the covariance term between the stock
    deviation (``wap - wap_mean``) and the market deviation
    (``market_mean_seconds - market_mean``), divided by the variance term of
    the market deviation; both terms are averaged over ``len(d)`` rows.

    The frame is modified in place: the same object that was passed in is
    returned with the extra column.
    """
    n_rows = len(d)
    market_dev = d['market_mean_seconds'] - d['market_mean']
    stock_dev = d['wap'] - d['wap_mean']

    covariance = (stock_dev * market_dev / n_rows).sum()
    market_variance = (market_dev ** 2 / n_rows).sum()

    # A single scalar, broadcast onto every row of the frame.
    d['beta'] = covariance / market_variance
    return d

In [5]:
# Mean WAP of each stock, repeated on every row of that stock
s['wap_mean'] = s.groupby(['stock_id'])['wap'].transform('mean')

# Mean WAP across all stocks at each second, repeated on every row of that second
s['market_mean_seconds'] = s.groupby(['seconds'])['wap'].transform('mean')

# Mean WAP over every row (one scalar, repeated on every row)
s['market_mean'] = s['wap'].mean()

# Compute beta one stock at a time.
# The groups are iterated explicitly instead of going through groupby.apply:
# what apply hands to the function and how it labels the result differ between
# pandas versions (group keys added to the index, grouping column possibly
# left out), which can break the later s['stock_id'] lookup. Iteration always
# yields the full group, and concat keeps the original row labels, so the
# result is the same frame layout on every version, ordered by stock_id.
# Each group is copied because compute_beta writes its column in place.
s = pd.concat([compute_beta(group.copy()) for _, group in s.groupby('stock_id')])

In [6]:
# Take one row per stock so each beta stays paired with its own stock_id.
# Calling .unique() on the two columns separately only lines up when every
# stock has a distinct beta; two equal (or NaN) betas would shorten one array
# and either raise or pair betas with the wrong stocks.
stock_betas = (
    s[['stock_id', 'beta']]
    .drop_duplicates(subset='stock_id')
    .reset_index(drop=True)
)

stock_betas

Unnamed: 0,stock_id,beta
0,0,1.208072
1,1,2.201069
2,2,0.267023
3,3,-0.110631
4,4,0.793787
...,...,...
107,122,1.940053
108,123,0.617386
109,124,0.034549
110,125,1.301675
