In [124]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import fastparquet

In [None]:
# Read the row-level trades table (parquet, via the fastparquet engine) and the
# baseline table (CSV, using its "Unnamed: 0" column as the index).
trades = pd.read_parquet(
    "clean_trades.parquet",
    engine="fastparquet",
)
df = pd.read_csv(
    "baseline2.csv",
    index_col="Unnamed: 0",
)

In [None]:
# Number of distinct stock_id values present in the baseline table.
len(df["stock_id"].unique())

In [None]:
# Preview the first five rows of the trades table.
trades.head(n=5)

In [None]:
def calculate_price_mult(group):
    """Estimate a price multiplier from the spacing of a group's prices.

    The smallest gap between consecutive distinct prices in ``group`` is
    divided into 0.01 (presumably a one-cent price increment -- confirm).
    Every adjacent pair of sorted unique prices is inspected, not only the two
    lowest, so a group whose two lowest prices sit several increments apart
    does not produce an under-sized multiplier.

    Parameters
    ----------
    group : pandas.Series
        Prices belonging to one group.

    Returns
    -------
    float
        ``0.01 / smallest_gap``, or ``np.nan`` when the group holds fewer than
        two distinct non-missing prices.
    """
    # Drop missing prices explicitly so a NaN can never end up inside a gap.
    unique_vals = np.sort(group.dropna().unique())
    if len(unique_vals) < 2:
        return np.nan
    # Gaps between neighbouring sorted unique prices; the smallest one is the
    # tightest observable price step.
    min_gap = np.diff(unique_vals).min()
    return 0.01 / min_gap if min_gap > 0 else np.nan

# Compute one multiplier per (stock_id, time_id) group from bid_price1 and
# broadcast it back onto every row of that group.
bid_by_bucket = trades.groupby(['stock_id', 'time_id'])['bid_price1']
trades['price_mult'] = bid_by_bucket.transform(calculate_price_mult)

# Rows whose group produced NaN fall back to the mean multiplier of their stock.
stock_means = (
    trades
    .groupby('stock_id')['price_mult']
    .transform('mean')
)
trades['price_mult'] = trades['price_mult'].fillna(value=stock_means)

# Size-weighted average bid price over both book levels, using the current and
# the "_before" snapshot columns together.
bid_notional = (
    trades['bid_price1'] * trades['bid_size1']
    + trades['bid_price2'] * trades['bid_size2']
    + trades['bid_price1_before'] * trades['bid_size1_before']
    + trades['bid_price2_before'] * trades['bid_size2_before']
)
bid_depth = (
    trades['bid_size1'] + trades['bid_size1_before']
    + trades['bid_size2'] + trades['bid_size2_before']
)

# Same size-weighted average on the ask side.
ask_notional = (
    trades['ask_price1'] * trades['ask_size1']
    + trades['ask_price2'] * trades['ask_size2']
    + trades['ask_price1_before'] * trades['ask_size1_before']
    + trades['ask_price2_before'] * trades['ask_size2_before']
)
ask_depth = (
    trades['ask_size1'] + trades['ask_size1_before']
    + trades['ask_size2'] + trades['ask_size2_before']
)

# Weighted ask minus weighted bid, doubled and divided by the sum of the two
# level-1 prices (i.e. divided by their average).
weighted_gap = ask_notional / ask_depth - bid_notional / bid_depth
trades['w_spread'] = 2 * weighted_gap / (trades['ask_price1'] + trades['bid_price1'])

In [None]:
# Sum of all eight size columns: bid and ask, levels 1 and 2, current and
# "_before" snapshots.
trades['total_book_size'] = (
    trades['bid_size1']
    + trades['bid_size1_before']
    + trades['bid_size2']
    + trades['bid_size2_before']
    + trades['ask_size1']
    + trades['ask_size1_before']
    + trades['ask_size2']
    + trades['ask_size2_before']
)

In [None]:
# Row-wise product of `size` and `trade` (presumably `trade` carries the trade
# direction -- confirm), stored both as-is and as an absolute value.
size_times_trade = trades['size'] * trades['trade']
trades['product_shift'] = size_times_trade
trades['product_shift_abs'] = size_times_trade.abs()

In [None]:
# Absolute change between the summed current sizes and the summed "_before"
# sizes, expressed as a fraction of total_book_size.
depth_current = (
    trades['ask_size1'] + trades['bid_size1']
    + trades['ask_size2'] + trades['bid_size2']
)
depth_before = (
    trades['ask_size1_before'] + trades['ask_size2_before']
    + trades['bid_size1_before'] + trades['bid_size2_before']
)
trades['book_density_abs_impact'] = (
    (depth_current - depth_before) / trades['total_book_size']
).abs()

In [None]:
# Collapse the row-level trade features to one row per (stock_id, time_id).
gb = trades.groupby(['stock_id', 'time_id'])
bucket_features = gb.agg({
    'price_mult': 'median',
    'product_shift': 'mean',
    'product_shift_abs': 'sum',
    'w_spread': 'mean',
    'total_book_size': 'sum',
    'book_density_abs_impact': 'mean',
}).reset_index()

# Join on the bucket keys explicitly. Without `on`, merge() joins on every
# column the two frames share, so re-running this cell (df is reassigned here)
# would silently join on the float feature columns as well. Dropping any stale
# copies of the feature columns first keeps the cell idempotent.
feature_cols = bucket_features.columns.difference(['stock_id', 'time_id'])
df = df.drop(columns=feature_cols, errors='ignore').merge(
    bucket_features,
    on=['stock_id', 'time_id'],
    how='inner',
)

In [None]:
# Summed absolute product_shift per bucket relative to the bucket's `size`.
df['shift_ratio'] = df['product_shift_abs'].div(df['size'])

In [None]:
# Natural log of `size` scaled by the bucket's price multiplier.
scaled_size = df['size'].mul(df['price_mult'])
df['log_volume'] = np.log(scaled_size)

In [None]:
# Scatter of the mean product_shift per bucket against the target.
df.plot(kind='scatter', x='product_shift', y='target')

In [None]:
# Scatter of shift_ratio against the target.
df.plot(kind='scatter', x='shift_ratio', y='target')

In [None]:
# Correlation of every column with the target column.
df.corr()['target']

In [None]:
# Scatter of the mean book_density_abs_impact per bucket against the target.
df.plot(kind='scatter', x='book_density_abs_impact', y='target')

In [None]:
# Scatter of log_volume against the target.
df.plot(kind='scatter', x='log_volume', y='target')

In [None]:
# Persist the enriched feature table (index included) for the next step.
df.to_csv(path_or_buf="baseline3.csv")

time_id                       0
seconds_in_bucket             0
price                         0
size                          0
order_count                   0
stock_id                      0
bid_price1                    0
ask_price1                    0
bid_price2                    0
ask_price2                    0
bid_size1                     0
ask_size1                     0
bid_size2                     0
ask_size2                     0
bid_price1_before             0
ask_price1_before             0
bid_price2_before             0
ask_price2_before             0
bid_size1_before              0
ask_size1_before              0
bid_size2_before              0
ask_size2_before              0
time_diff                     0
trade                         0
mid                           0
spread                        0
w_spread                      0
total_book_size               0
product_shift                 0
product_shift_abs             0
book_density_abs_impact       0
price_mu

array([ 9936,  3002, 15818, 31819,  2803, 16999, 17428, 22780, 32053,
       18940, 30443,   337,  6324,  6679,   438,   650,   908,  1000,
        1016,  1070,  1128,  1255,  1712,  1822,  1826,  2102,  2117,
        2219,  2254,  2267,  2436,  2709,  2758,  2891,  2908,  2992,
        2999,  3922,  4470,  4661,  4714,  4754,  4867,  5063,  5139,
        5171,  5218,  5424,  5676,  5743,  5777,  5853,  5975,  6016,
        6523,  6742,  6819,  6904,  7055,  7460,  7567,  7593,  7758,
        7854,  8014,  8192,  8196,  8435,  8590,  9207,  9311,  9518,
        9606, 10206, 10450, 10941, 10946, 10985, 11143, 11389, 11559,
       11786, 11869, 12061, 12279, 12395, 12428, 12436, 12444, 12577,
       12633, 12996, 13762, 13960, 13989, 14065, 14093, 14246, 14273,
       14311, 14356, 14749, 14769, 14913, 15516, 15730, 15807, 15858,
       15934, 16320, 16519, 16560, 16601, 16611, 16614, 16629, 16706,
       16733, 17305, 17429, 17841, 17957, 18029, 18495, 18597, 18628,
       18908, 18912,

index
0           16.136262
1           16.158761
2           16.151262
3           16.153761
4           16.159595
              ...    
38382735    73.893440
38382736    73.903440
38382737    73.883444
38382738    73.883444
38382739    73.873444
Length: 38287937, dtype: float64