In [2]:
%reset
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
from sklearn import linear_model, svm, kernel_ridge
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 100

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


## Utility functions to load, clean, substract mean of data, split datasets, print the classification rate

In [17]:
def load_data():
    # load data from csv file (stratpy not available)
    prices_raw = pd.read_csv("./Prices_raw.csv")

    # parse timestamps correctly
    for t in [u'date' , u'ebsMarketUpdateTime', u'feedHandlerPublishTime', u'feedHandlerReceiveTime', u'eventCaptureTime']:
        prices_raw[t] = pd.to_datetime(prices_raw[t])
    
    return prices_raw

def clean_data(prices_raw):
    prices = prices_raw[['date','bid','ask','bid2','ask2','bid3','ask3','bidSize1','askSize1','bidSize2','askSize2','bidSize3','askSize3']]

    prices['bid'] = prices['bid'].replace(0,np.NaN)
    prices['ask'] = prices['ask'].replace(0,np.NaN)
    prices['bid2'] = prices['bid2'].replace(0,np.NaN)
    prices['ask2'] = prices['ask2'].replace(0,np.NaN)

    prices['mid'] = 0.5*(prices['bid'] + prices['ask'])
    prices.index = prices_raw.feedHandlerReceiveTime

    return prices 
    # columns: feedHandlerReceiveTime | bid, ask, bid2, ask2, bid3, ask3, bidSize1, askSize1, bidSize2, askSize2, bidSize3, askSize3, paid, given, mid

def whiten_data(prices):
    mean_price = np.round(prices['mid'].mean(), 4)
    block_size = 1e6

    prices[['bid', 'ask', 'bid2', 'ask2', 'bid3', 'ask3', 'mid']] -= mean_price
    prices[['bidSize1', 'askSize1', 'bidSize2', 'askSize2', 'bidSize3', 'askSize3']] /= block_size
    
    return prices

def split_test_train(prices, dep_var):
    # IN = row indices for train, OUT = for test
    OUT = (prices.date == '2017.09.29') | (prices.date == '2017.09.28') 
    OUT = OUT | (prices.date == '2017.09.27') 
    IN = ~OUT

    X_train = prices[IN]
    X_train.drop(['date'], 1, inplace = True) # drop the date in order to keep a multivariate input
    cols = list(X_train.columns)
    X_train = np.array(X_train.values)
    y_train = np.array(dep_var[IN]['nextMidVariation'].values)
    
    X_test = prices[OUT]
    X_test.drop(['date'], 1, inplace = True)
    X_test = np.array(X_test.values)
    y_test = np.array(dep_var[OUT]['nextMidVariation'].values)

    y_train[y_train<0] = -1
    y_train[y_train>0] = 1
    y_test[y_test<0] = -1
    y_test[y_test>0] = 1

    return X_train, y_train, np.array(dep_var[IN]['nextMidVariation'].values), X_test, y_test, np.array(dep_var[OUT]['nextMidVariation'].values), cols

def classif_correct_rate(estim, truth):
    return 1.0 - np.linalg.norm(np.sign(estim) - truth, ord = 1) / (2 * estim.shape[0])

def print_statistics(y_train, y_test):
    print('Prior dataset statistics')
    print('train: +1: %.3f, -1: %.3f'   % (np.sum(y_train[y_train > 0]) / y_train.shape[0], np.sum(y_train[y_train < 0]) / y_train.shape[0]))
    print('test:  +1: %.3f, -1: %.3f\n' % (np.sum(y_test[y_test > 0]) / y_test.shape[0], np.sum(y_test[y_test < 0]) / y_test.shape[0]))

## Features engineering: add columns containing moving averages etc. 

In [20]:
def features_engineering(prices):
    # spread 
    prices['spread'] = prices['ask'] - prices['bid']

    # book pressure feature
    prices['bp'] = prices['mid'] - (prices['bidSize1']*prices['bid'] + prices['askSize1']*prices['ask'])/(prices['bidSize1']+prices['askSize1'])
    prices['bp_with2'] = prices['mid'] - (prices['bidSize1']*prices['bid'] + prices['askSize1']*prices['ask']
                                                     + prices['askSize2']*prices['ask2'] + prices['askSize3']*prices['ask3']
                                                     + prices['bidSize2']*prices['bid2'] + prices['bidSize3']*prices['bid3'])/(prices['bidSize1']+prices['askSize1']+prices['bidSize2']+prices['askSize2']+prices['bidSize3']+prices['askSize3'])


    prices['weekday'] = prices.index.weekday
    
    # volatility
    vol_lookbacks = [1e3, 1e5, 1e7, 1e9]
    for lookback in vol_lookbacks:
        prices['mid_vol_%d_ms' % lookback] = prices['mid'].rolling('%dms' % lookback).std()
        prices['mid_vol_%d_ms' % lookback].ffill()

    # column name over which we build moving averages
    columns = ['bid','ask','bid2','ask2','bidSize1','askSize1','bidSize2','askSize2','mid','spread', 'bp', 'bp_with2']
    columns = ['bidSize1','askSize1','mid','spread','bp_with2']
    
    #moving averages over last n rows
    row_intervals = [1, 5, 10, 320, 1280]
    for window in row_intervals:
        for feature in columns:
            prices['%s_ma_%d_row' % (feature, window)] = prices[feature].rolling(window, min_periods=1).mean()
            prices['mid_vol_%d_ms' % lookback].ffill()

    # moving averages over last n milliseconds
    time_intervals = [20, 80, 1000, 16000] 
    for time_window in time_intervals:
        for feature in columns:
            prices['%s_ma_%d_ms' % (feature, time_window)] = prices[feature].rolling('%ds' % time_window, min_periods=1).mean()
            prices['%s_ma_%d_ms' % (feature, time_window)]
    
    # columns over which we'll build delta /deltadelta signals
    for col in ['spread', 'bid', 'ask']:
        if col in columns:
            columns.remove(col)
    ma_row_columns = ['%s_ma_%d_row' % (feature, window) for feature in columns for window in row_intervals]
    ma_ms_columns = ['%s_ma_%d_ms' % (feature, time_window) for feature in columns for time_window in time_intervals]
    ma_columns = ma_row_columns + ma_ms_columns

    # columns to differentiate once
    ma_columns += ['mid']
    delta_column_names = ['delta_' + col for col in ma_columns]
    prices[delta_column_names] = prices[ma_columns] - prices[ma_columns].shift(1)
    # columns to differentiate twice
    delta_delta_column_names = ['delta_' + col for col in delta_column_names]
    prices[delta_delta_column_names] = prices[delta_column_names] - prices[delta_column_names].shift(1)

    # drop first two rows since they're nan for the delta_delta
    prices = prices.iloc[2:]

    prices['mid_diff_interval'] = (prices['delta_mid'] != 0).cumsum()

    old_n_rows = prices.shape[0]
    prices.dropna(inplace=True)
    print 'Dropped %d out of %d rows containing NaNs' % (old_n_rows - prices.shape[0], old_n_rows)
    old_n_rows = prices.shape[0]
    prices = prices[(np.abs(stats.zscore(prices['delta_mid'])) < 5)]
    print 'Dropped %d out of %d rows with extreme z-score' % (old_n_rows - prices.shape[0], old_n_rows)

    ######### create feature to learn, ie next move (not to be used as covariates!)
    prices['midDiff'] = prices['mid'].diff()
    prices['nextMidDiff'] = prices['midDiff'].shift(-1)
    prices['nextMidVariation'] = prices['nextMidDiff'].replace(to_replace=0, method='bfill')
    # drop nans again (there may be new nan's in nextMidVariation?)
    old_n_rows = prices.shape[0]
    prices.dropna(inplace=True)
    print 'Dropped %d out of %d rows containing NaNs\n' % (old_n_rows - prices.shape[0], old_n_rows)

    mid_look_ahead = prices[['nextMidVariation']]
    # drop variables which should not be used as covariates
    # prices.drop(['midDiff'], 1, inplace = True)
    prices.drop(['nextMidDiff'], 1, inplace = True)
    prices.drop(['nextMidVariation'], 1, inplace = True)

    return prices, mid_look_ahead

# Main function

In [21]:
prices_raw = load_data()
prices = clean_data(prices_raw)
prices = whiten_data(prices)
prices, dep_var = features_engineering(prices)
X_train, y_train, y_train_value, X_test, y_test, y_test_value, cols = split_test_train(prices, dep_var)

print_statistics(y_train, y_test)

Dropped 87792 out of 231152 rows containing NaNs
Dropped 982 out of 143360 rows with extreme z-score
Dropped 5 out of 142378 rows containing NaNs

Prior dataset statistics
train: +1: 0.512, -1: -0.488
test:  +1: 0.494, -1: -0.506



In [22]:
y_prev_train = X_train[:, cols.index('midDiff')]
y_prev_test = X_test[:, cols.index('midDiff')]

In [25]:
np.sign(y_prev_test), y_test

(array([-1.,  1.,  1., ...,  0.,  1.,  0.]),
 array([ 1.,  1., -1., ...,  1., -1., -1.]))