In [14]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# load data

In [2]:
# Directory that holds train.csv and the book/trade parquet datasets.
# Set the OPTIVER_DATA_DIR environment variable to use another location;
# without it the original local path is used, so existing runs are unchanged.
data_dir = os.environ.get('OPTIVER_DATA_DIR', '/home/lzhao/data/tmp/optiver')

In [3]:
# Training table (columns stock_id, time_id, target).
train_df = pd.read_csv(
    os.path.join(data_dir, 'train.csv')
)

# Order-book and trade data for the stock_id=0 partition only, used as a
# worked example for the feature code.
book_example = pd.read_parquet(
    os.path.join(data_dir, 'book_train.parquet/stock_id=0')
)
trade_example = pd.read_parquet(
    os.path.join(data_dir, 'trade_train.parquet/stock_id=0')
)

In [15]:
%%time
# Show the first five rows of the training table; the %%time cell magic
# above reports how long the cell took to run.
train_df.head(5)

CPU times: user 148 µs, sys: 100 µs, total: 248 µs
Wall time: 268 µs


Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [5]:
# Keep only the order-book rows that belong to time bucket 5, then display them.
book_test = book_example.loc[book_example['time_id'] == 5]
book_test

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,1.001422,1.002301,1.001370,1.002353,3,226,2,100
1,5,1,1.001422,1.002301,1.001370,1.002353,3,100,2,100
2,5,5,1.001422,1.002301,1.001370,1.002405,3,100,2,100
3,5,6,1.001422,1.002301,1.001370,1.002405,3,126,2,100
4,5,7,1.001422,1.002301,1.001370,1.002405,3,126,2,100
...,...,...,...,...,...,...,...,...,...,...
297,5,585,1.003129,1.003749,1.003025,1.003801,100,3,26,3
298,5,586,1.003129,1.003749,1.002612,1.003801,100,3,2,3
299,5,587,1.003129,1.003749,1.003025,1.003801,100,3,26,3
300,5,588,1.003129,1.003749,1.002612,1.003801,100,3,2,3


# Feature engineering

In [6]:
# Compute log returns
def log_return(list_stock_prices):
    '''Return the step-to-step change of the log prices.

    Relies on log(s2 / s1) = log(s2) - log(s1): the prices are logged first
    and then differenced. The logged input must offer ``.diff()`` (e.g. a
    pandas Series); the first element of the result is NaN because it has
    no predecessor.
    '''
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

# Compute realized volatility
def realized_volatility(series_log_return):
    '''Return the square root of the sum of squared log returns.

    For a pandas Series the sum skips NaN entries, so the leading NaN
    produced by differencing does not turn the result into NaN.
    '''
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

# Compute the weighted average price (WAP)
def calc_wap(df, level=1):
    '''Return the size-weighted average price of one order-book level.

    Each side's price is weighted by the size quoted on the opposite side:

        (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)

    Parameters
    ----------
    df : column-indexable table (e.g. a pandas DataFrame) containing
        ``bid_price{level}``, ``ask_price{level}``, ``bid_size{level}`` and
        ``ask_size{level}``.
    level : int, default 1
        Order-book level whose columns are read. The default reproduces the
        original level-1 behaviour.

    Returns
    -------
    The element-wise WAP. Rows whose two sizes sum to zero are not
    special-cased.
    '''
    bid_price = df[f'bid_price{level}']
    ask_price = df[f'ask_price{level}']
    bid_size = df[f'bid_size{level}']
    ask_size = df[f'ask_size{level}']
    return (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)


def calc_wap2(df):
    '''Return the WAP of the second order-book level, i.e. ``calc_wap(df, level=2)``.'''
    return calc_wap(df, level=2)


In [7]:
# Level-1 WAP and its log return, computed separately inside each time_id.
# transform() returns a result aligned to the frame's own row index, so it can
# be assigned straight back as a column. groupby(...).apply(...) prepends
# time_id to the result index on pandas >= 2.0, which makes the assignment fail.
book_example['wap'] = calc_wap(book_example)
book_example['log_return'] = book_example.groupby('time_id')['wap'].transform(log_return)

# Same two features for the second order-book level.
book_example['wap2'] = calc_wap2(book_example)
book_example['log_return2'] = book_example.groupby('time_id')['wap2'].transform(log_return)

# Absolute gap between the level-1 and level-2 WAP.
book_example['wap_balance'] = np.abs(book_example.wap - book_example.wap2)

# Level-1 bid/ask spread relative to the mid price.
book_example['price_spread'] = (book_example['ask_price1'] - book_example['bid_price1']) / ((book_example['ask_price1'] + book_example['bid_price1']) / 2)
# Price distance between level 1 and level 2 on each side. ask_spread is
# level 1 minus level 2, so it is negative whenever ask_price2 > ask_price1.
book_example['bid_spread'] = book_example['bid_price1'] - book_example['bid_price2']
book_example['ask_spread'] = book_example['ask_price1'] - book_example['ask_price2']
# Total quoted size over both levels and both sides.
book_example['total_volume'] = book_example['ask_size1'] + book_example['ask_size2'] + book_example['bid_size1'] + book_example['bid_size2']

# Absolute difference between total ask size and total bid size.
book_example['volume_imbalance'] = np.abs((book_example['ask_size1'] + book_example['ask_size2']) - (book_example['bid_size1'] + book_example['bid_size2']))

book_example.head(5)


Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,wap,log_return,wap2,log_return2,wap_balance,price_spread,bid_spread,ask_spread,total_volume,volume_imbalance
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100,1.001434,,1.00139,,4.4e-05,0.000878,5.2e-05,-5.2e-05,331,321
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100,1.001448,1.4e-05,1.00139,0.0,5.8e-05,0.000878,5.2e-05,-5.2e-05,205,195
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100,1.001448,0.0,1.001391,1e-06,5.7e-05,0.000878,5.2e-05,-0.000103,205,195
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100,1.001443,-5e-06,1.001391,0.0,5.2e-05,0.000878,5.2e-05,-0.000103,231,221
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100,1.001443,0.0,1.001391,0.0,5.2e-05,0.000878,5.2e-05,-0.000103,231,221


In [8]:
# Aggregations applied to each per-row feature when collapsing a time_id
# to a single row: column name -> list of aggregations.
# The mean is requested by name ('mean') rather than as np.mean: pandas >= 2.1
# warns when NumPy reducers are passed to agg(), and the string yields the
# same values and the same '<column>_mean' labels after flattening.
feature_dict = {
    'log_return': [realized_volatility],
    'log_return2': [realized_volatility],
    'wap_balance': ['mean'],
    'price_spread': ['mean'],
    'ask_spread': ['mean'],
    'bid_spread': ['mean'],
    'volume_imbalance': ['mean'],
    'total_volume': ['mean'],
    'wap': ['mean'],
}

In [9]:
# Aggregate every feature over the full bucket: one row per time_id, with
# time_id moved back from the index into a regular column.
df_feature = (
    book_example
    .groupby('time_id')
    .agg(feature_dict)
    .reset_index()
)

In [10]:
# Flatten the (column, aggregation) label pairs produced by agg() into single
# names such as 'wap_mean'; the group key ('time_id', '') becomes 'time_id_'.
# Labels that are already plain strings are kept as they are, so running this
# cell a second time does not join their individual characters with '_'.
df_feature.columns = [
    '_'.join(col) if isinstance(col, tuple) else col
    for col in df_feature.columns
]

In [11]:
# Lengths, in seconds, of the trailing windows that get their own feature set.
last_seconds = [300]

for second in last_seconds:
    # 600 is used as the bucket length, so a window covering the last `second`
    # seconds starts at this offset. The new columns are suffixed with the
    # start offset (e.g. '_300'), not with the window length.
    window_start = 600 - second
    suffix = '_' + str(window_start)
    window_key = 'time_id_' + suffix

    # Same aggregations as for the full bucket, restricted to the window.
    df_feature_sec = (
        book_example
        .query(f'seconds_in_bucket >= {window_start}')
        .groupby('time_id')
        .agg(feature_dict)
        .reset_index()
    )
    df_feature_sec.columns = ['_'.join(col) for col in df_feature_sec.columns]
    df_feature_sec = df_feature_sec.add_suffix(suffix)

    # If this cell already ran, df_feature still carries this window's columns;
    # remove them first so the merge does not create '_x' / '_y' duplicates.
    stale_columns = df_feature.columns.intersection(df_feature_sec.columns)
    df_feature = df_feature.drop(columns=stale_columns)

    # Left join keeps every time_id even if it has no rows inside the window;
    # the duplicated key column from the right side is dropped afterwards.
    df_feature = pd.merge(df_feature, df_feature_sec, how='left', left_on='time_id_', right_on=window_key)
    df_feature = df_feature.drop(columns=[window_key])


