# Runtime: ~3 hrs
## Extracting beta, dom & spread for each stock across all time_ids

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import dask.dataframe as dd

### Functions

* `compute_beta`: Computes the beta value from the stock WAP, mean(stock WAP), mean(all stock WAPs) and mean(all stock WAPs per second)
* `extract_features`: Returns a dataframe with beta, mean(depth of market) & mean(spread) for all stocks at a selected time_id

In [2]:
def compute_beta(d):
    """
    Attach a ``beta`` column to ``d`` and return ``d``.

    Beta is the mean product of the stock deviations (``wap - wap_mean``)
    and the market deviations (``market_mean_seconds - market_mean``),
    divided by the mean squared market deviation. The single resulting
    value is written to every row.

    ``d`` is modified in place and the same object is returned.
    """
    n_rows = len(d)
    stock_dev = d['wap'] - d['wap_mean']
    market_dev = d['market_mean_seconds'] - d['market_mean']

    # Each term is divided by the row count before summing (population form).
    covariance = (stock_dev * market_dev / n_rows).sum()
    market_variance = (market_dev ** 2 / n_rows).sum()

    d['beta'] = covariance / market_variance
    return d

def _fill_missing_seconds(d, n_seconds = 600):
    """
    Return a copy of ``d`` with exactly one row per second in ``0 .. n_seconds - 1``.

    Each missing second is a copy of the most recent observed row (forward
    fill). Seconds that precede the first observation copy the first observed
    row instead. ``d`` must contain at least one row and a ``seconds`` column.
    Rows whose ``seconds`` value is >= ``n_seconds`` are not kept.
    """
    d = d.sort_values('seconds', kind='stable').reset_index(drop=True)
    target_seconds = np.arange(n_seconds)

    # Position of the last observed row at or before each target second;
    # -1 means the target second precedes the first observation.
    source_rows = np.searchsorted(d['seconds'].to_numpy(), target_seconds, side='right') - 1
    # Leading gap: fall back to the first observed row.
    source_rows = np.clip(source_rows, 0, None)

    return d.iloc[source_rows].reset_index(drop=True).assign(seconds=target_seconds)


def _summarise_stocks(stocks):
    """
    Collapse per-second rows into one row per stock: beta, mean dom, mean spread.

    Beta is the mean product of (stock wap - stock mean wap) and
    (market wap for that second - overall market wap), divided by the mean
    squared market deviation, taken over the stock's rows. Stocks appear in
    order of first appearance in ``stocks``.
    """
    # A unique index keeps the Series arithmetic below aligned row-for-row.
    stocks = stocks.reset_index(drop=True)
    by_stock = stocks.groupby('stock_id', sort=False)

    # Deviation of each row's wap from that stock's mean wap.
    stock_dev = stocks['wap'] - by_stock['wap'].transform('mean')

    # Market wap per second (mean across stocks) minus the overall mean wap.
    market_per_second = stocks.groupby('seconds')['wap'].transform('mean')
    market_dev = market_per_second - stocks['wap'].mean()

    # Grouped means replace groupby.apply, whose output index layout is
    # pandas-version dependent.
    covariance = (stock_dev * market_dev).groupby(stocks['stock_id'], sort=False).mean()
    market_variance = (market_dev ** 2).groupby(stocks['stock_id'], sort=False).mean()

    # All columns are indexed by stock_id, so values stay paired with their
    # stock even when two stocks share a value or a value is NaN.
    summary = pd.DataFrame({
        'beta': covariance / market_variance,
        'dom': by_stock['dom'].mean(),
        'spread': by_stock['spread'].mean(),
    })
    return summary.rename_axis('stock_id').reset_index()


def extract_features(files, time_id = 5, n_seconds = 600):
    """
    Returns dataframe with beta, mean(depth of market) & mean(spread) for all stocks @ a selected time_id

    Args:
        files: iterable of CSV paths, one per stock. Each file is read for the
            columns time_id, stock_id, seconds_in_bucket, bid_price1/2,
            ask_price1/2, bid_size1/2 and ask_size1/2.
        time_id: the time bucket to extract.
        n_seconds: number of seconds in a bucket; every stock is expanded to
            one row per second in ``0 .. n_seconds - 1`` before aggregating.

    Returns:
        DataFrame with columns stock_id, beta, dom, spread and one row per
        stock that has at least one row for ``time_id``. Stocks with no rows
        for ``time_id`` are skipped; if none remain the frame is empty.
    """
    per_stock_frames = []

    # For each stock...
    for f in files:
        # Lazy dask read; only the rows of the requested time_id are
        # materialised by compute().
        df = dd.read_csv(f)
        df = df[df['time_id'] == time_id].compute()

        # Nothing to forward-fill from when the stock has no rows in this bucket.
        if df.empty:
            continue

        # Weighted average price of the best bid/ask, weighted by opposite-side size.
        wap = (df["bid_price1"] * df["ask_size1"] + df["ask_price1"] * df["bid_size1"]) \
              / (df["bid_size1"] + df["ask_size1"])

        # Depth of market: price * size summed over the first two bid and ask levels.
        dom = df['bid_price1'] * df['bid_size1'] + df['bid_price2'] * df['bid_size2'] \
              + df['ask_price1'] * df['ask_size1'] + df['ask_price2'] * df['ask_size2']

        # Relative best bid/ask spread.
        spread = df['ask_price1'] / df['bid_price1'] - 1

        d = pd.DataFrame({"stock_id": df["stock_id"], "wap": wap, "dom": dom, "spread": spread,
                          "seconds": df["seconds_in_bucket"]})

        # One row per second, so every stock contributes to every per-second market mean.
        per_stock_frames.append(_fill_missing_seconds(d, n_seconds))

    if not per_stock_frames:
        return pd.DataFrame({
            'stock_id': pd.Series(dtype='int'),
            'beta': pd.Series(dtype='float'),
            'dom': pd.Series(dtype='float'),
            'spread': pd.Series(dtype='float'),
        })

    stocks = pd.concat(per_stock_frames, ignore_index=True).astype({'stock_id':'int', 'seconds':'int'})

    return _summarise_stocks(stocks)

### Compute beta, mean dom & mean spread for each time_id

In [None]:
# Sorted so the stock order is reproducible between runs (glob order is arbitrary).
files = sorted(glob.glob(os.path.join("./individual_book_train", "*.csv")))
if not files:
    raise FileNotFoundError("no CSV files found in ./individual_book_train")

# All unique time_ids from the first file (assumption: all time_ids are shared across stocks).
# Only the time_id column is parsed.
time_ids = pd.read_csv(files[0], usecols=['time_id'])['time_id'].unique()

# One frame of per-stock features per time_id, each tagged with its time_id so
# rows stay attributable once the frames are stacked.
features_by_time_id = [
    extract_features(files, time_id).assign(time_id=time_id)
    for time_id in time_ids
]

# Single concat instead of growing the frame inside the loop; rows get a fresh unique index.
stocks = pd.concat(features_by_time_id, ignore_index=True).astype({'stock_id':'int'})

stocks

### Across all time_ids, compute the mean beta, mean dom & mean spread for each stock

In [None]:
# Averaging beta, dom & spread across time_ids for each stock.
# transform('mean') returns one value per row in the original row order, so each
# row receives the mean of its own stock. Assigning the stock_id-indexed result of
# .mean() would instead be aligned against the row labels.
stocks['beta_mean'] = stocks.groupby('stock_id', sort=False)['beta'].transform('mean')
stocks['dom_mean'] = stocks.groupby('stock_id', sort=False)['dom'].transform('mean')
stocks['spread_mean'] = stocks.groupby('stock_id', sort=False)['spread'].transform('mean')

# Storing averaged beta, dom & spread in a dataframe with one row per stock.
# The grouped mean keeps every value paired with its stock_id; stocks appear in
# order of first appearance in `stocks`.
stock_final = (
    stocks.groupby('stock_id', sort=False)[['beta', 'dom', 'spread']]
    .mean()
    .reset_index()
)

stock_final

### Write all extracted features to file

#### Contains averaged beta, dom & spread for all stocks across all time_ids:

In [None]:
# Persist the per-stock averages; the DataFrame index is written as the first column.
stock_final.to_csv(path_or_buf="beta_dom_spread.csv")

#### Contains beta, dom & spread for all stocks for each time_id:

In [None]:
# Persist the per-time_id features; the DataFrame index is written as the first column.
stocks.to_csv(path_or_buf="beta_dom_spread_by_timeid.csv")