In [1]:
import pandas as pd 
import numpy as np
import public_timeseries_testing_util as optiver2023
from torch.nn.utils.rnn import pack_padded_sequence, pack_sequence, unpack_sequence, unpad_sequence
import torch
from tqdm.notebook import trange,tqdm
import torch.nn as nn 
import torch.optim as optim
import wandb
import torch_classes
from model_saver import model_saver_wandb as model_saver
import training_testing
from itertools import combinations
import gc
from sklearn.decomposition import PCA

In [2]:
# Build the environment object exposed by `public_timeseries_testing_util`
# (imported as `optiver2023`) and take its test iterator.
# NOTE(review): `iter_test` is not consumed by any cell shown in this notebook.
env = optiver2023.make_env()
iter_test = env.iter_test()

In [3]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
# (Other GPUs would be addressed as "cuda:1", "cuda:2", ...)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on the GPU" if device.type == "cuda" else "Running on the CPU")

Running on the GPU


In [4]:
# Load the training data and keep only rows with seconds_in_bucket <= 290,
# renumbering the index from zero afterwards.
train = (
    pd.read_csv('train.csv')
    .query('seconds_in_bucket <= 290')
    .reset_index(drop=True)
)
train.head()  # not the last expression of the cell, so nothing is rendered for it
# Rows per trading day.
train.date_id.value_counts()

date_id
480    6000
353    6000
363    6000
362    6000
360    6000
       ... 
4      5760
2      5730
1      5730
3      5730
0      5730
Name: count, Length: 481, dtype: int64

In [5]:
# Row count per `seconds_in_bucket` value that survived the `<= 290` filter.
train.seconds_in_bucket.value_counts()

seconds_in_bucket
0      95236
10     95236
280    95236
270    95236
260    95236
250    95236
240    95236
230    95236
220    95236
210    95236
200    95236
190    95236
180    95236
170    95236
160    95236
150    95236
140    95236
130    95236
120    95236
110    95236
100    95236
90     95236
80     95236
70     95236
60     95236
50     95236
40     95236
30     95236
20     95236
290    95236
Name: count, dtype: int64

In [6]:
def imbalance_calculator(x):
    """Return a copy of ``x`` extended with size and price imbalance features.

    Added columns, in order:
      * ``imb_s1`` / ``imb_s2``: normalised bid/ask and imbalance/matched size gaps.
      * ``{a}_{b}_imb`` for every pair of price columns, where ``a`` is listed
        after ``b``: ``(a - b) / (a + b + 0.001)``.
      * ``{a}_{b}_{c}_imb2`` for every triple of price columns (latest-listed
        first): ``(max - mid) / (mid - min + 0.001)`` computed row-wise.

    Infinite values anywhere in the result are replaced by 0. ``x`` itself is
    not modified.
    """
    result = x.copy()

    result['imb_s1'] = x.eval('(bid_size - ask_size) / (bid_size + ask_size)')
    result['imb_s2'] = x.eval('(imbalance_size - matched_size) / (matched_size + imbalance_size)')

    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    n_prices = len(prices)

    # Pairwise features: the later-listed price always comes first in the name.
    for i in range(1, n_prices):
        a = prices[i]
        for j in range(i):
            b = prices[j]
            result[f'{a}_{b}_imb'] = x.eval(f'({a} - {b}) / ({a} + {b} + 0.001)')

    # Triple features: indices strictly decreasing (i > j > k).
    for i in range(2, n_prices):
        a = prices[i]
        for j in range(1, i):
            b = prices[j]
            for k in range(j):
                c = prices[k]
                trio = x[[a, b, c]]
                highest = trio.max(axis=1)
                lowest = trio.min(axis=1)
                # Whatever is left after removing min and max is the middle value.
                middle = trio.sum(axis=1) - lowest - highest

                result[f'{a}_{b}_{c}_imb2'] = (highest - middle) / (middle - lowest + 0.001)

    return result.replace([np.inf, -np.inf], 0)



In [7]:
# Per-stock median-volume lookup table.
# The CSV is read without an index column, so the row position becomes the index,
# which is then labelled "stock_id" (assumes rows are ordered by stock_id - TODO confirm).
median_vol = (
    pd.read_csv("archive/MedianVolV2.csv")
    .loc[:, ['overall_medvol', "first5min_medvol", "last5min_medvol"]]
    .rename_axis("stock_id")
)

In [8]:
# Per-stock size statistics, indexed by stock_id.
# Group once and reuse the grouping for both statistics.
sizes_by_stock = train.groupby('stock_id')
# Typical (median) bid size plus typical ask size for each stock.
median_sizes = sizes_by_stock['bid_size'].median() + sizes_by_stock['ask_size'].median()
# Spread of the sizes: this previously repeated the median expression, making
# `std_sizes` an exact duplicate of `median_sizes`.
std_sizes = sizes_by_stock['bid_size'].std() + sizes_by_stock['ask_size'].std()

In [9]:
def feat_eng(df):
    """Build the engineered feature frame from a raw order-book frame.

    Relies on the notebook-level ``median_vol`` (frame indexed by stock_id) and
    ``std_sizes`` (Series indexed by stock_id).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows; must contain ``stock_id``, the size columns (``bid_size``,
        ``ask_size``, ``imbalance_size``, ``matched_size``) and the price columns
        ``reference_price``, ``ask_price``, ``bid_price`` and ``wap``.
        ``row_id`` / ``time_id`` are discarded if present.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the remaining raw
        columns, the merged median-volume columns and the derived features.
        Infinite values are replaced by NaN.
    """
    cols = [c for c in df.columns if c not in ['row_id', 'time_id']]
    df = df[cols]
    # merge() returns a fresh frame, so the assignments below never touch the caller's data.
    df = df.merge(median_vol, how = "left", left_on = "stock_id", right_index = True)

    # Both operands must come from `df`: using the global `train` here silently
    # misaligned (or NaN-filled) the feature for any frame other than `train` itself.
    df['bid_plus_ask_sizes'] = df['bid_size'] + df['ask_size']
    df['std_size'] = df['stock_id'].map(std_sizes.to_dict())
    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']

    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    # Notional value resting on each side of the book, and their difference.
    df['ask_x_size'] = df.eval('ask_size*ask_price')
    df['bid_x_size'] = df.eval('bid_size*bid_price')

    df['ask_minus_bid'] = df['ask_x_size'] - df['bid_x_size']

    df["bid_size_over_ask_size"] = df["bid_size"].div(df["ask_size"])
    df["bid_price_over_ask_price"] = df["bid_price"].div(df["ask_price"])

    prices = ['reference_price', 'ask_price', 'bid_price', 'wap']

    # Pairwise difference, product and normalised imbalance for every price pair.
    for c in combinations(prices, 2):

        df[f'{c[0]}_minus_{c[1]}'] = (df[f'{c[0]}'] - df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_times_{c[1]}'] = (df[f'{c[0]}'] * df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_{c[1]}_imb'] = df.eval(f'({c[0]}-{c[1]})/({c[0]}+{c[1]})')

    # For every price triple: (max - mid) / (mid - min), computed row-wise.
    for c in combinations(prices, 3):

        max_ = df[list(c)].max(axis=1)
        min_ = df[list(c)].min(axis=1)
        # Whatever is left after removing min and max is the middle value.
        mid_ = df[list(c)].sum(axis=1)-min_-max_

        df[f'{c[0]}_{c[1]}_{c[2]}_imb2'] = (max_-mid_)/(mid_-min_)

    # Features that are computed above but excluded from the final frame.
    df = df.drop(columns=[
        'std_size',
        'bid_size_over_ask_size',
        'ask_price_bid_price_imb',
    ])

    gc.collect()

    # Division by zero above yields +/-inf; mark those entries as missing.
    # `np.nan` is the canonical spelling (the upper-case alias is not portable to NumPy 2).
    df = df.replace([np.inf, -np.inf], np.nan)

    return df

In [10]:
# Keep the raw target as a NumPy array, then build the feature frame.
# feat_eng only strips 'row_id'/'time_id', so X still carries the 'target' column.
y = train['target'].values
X = feat_eng(train)

In [11]:
# Number of missing values in each column of the engineered frame.
nulls = X.isna().sum(axis = 0)

In [12]:
# train = imbalance_calculator(train)

In [13]:
# Collapse the raw price columns into a single principal component.
# `prices` is derived from `train`, so it only contains raw columns whose name
# includes 'price'; missing prices are filled with 1 before fitting.
# NOTE(review): the PCA is fitted on every row of X, including any rows that are
# later held out for validation - confirm this is intended.
prices = [c for c in train.columns if 'price' in c]
pca_prices = PCA(n_components=1)
X['pca_prices'] = pca_prices.fit_transform(X[prices].fillna(1))

In [14]:
# Display the engineered feature frame (the notebook truncates rows and columns).
X

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_price_times_wap,ask_price_wap_imb,bid_price_minus_wap,bid_price_times_wap,bid_price_wap_imb,reference_price_ask_price_bid_price_imb2,reference_price_ask_price_wap_imb2,reference_price_bid_price_wap_imb2,ask_price_bid_price_wap_imb2,pca_prices
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,...,1.000026,0.000013,-0.000188,0.999812,-9.400884e-05,-1.927541e+12,0.138298,-1.693353e+12,0.138298,-0.000206
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,...,1.000660,0.000330,-0.000104,0.999896,-5.200270e-05,-6.881500e+12,6.346154,9.367487e+11,6.346154,0.000256
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,...,1.000298,0.000149,-0.000597,0.999403,-2.985891e-04,4.664557e+00,0.678815,2.778481e+00,0.499162,-0.000430
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,...,1.000214,0.000107,-0.000001,0.999999,-5.000003e-07,2.500000e-01,0.251462,1.710000e+02,214.000000,0.000219
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,...,1.000016,0.000008,-0.000606,0.999394,-3.030918e-04,3.507246e+00,0.034188,3.391304e+00,0.026403,-0.000614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2857075,195,480,290,6099963.33,-1,1.000317,18633850.25,,,1.000201,...,1.000608,0.000013,-0.000090,1.000492,-4.498893e-05,0.000000e+00,0.000000,2.888889e-01,0.288889,0.000479
2857076,196,480,290,67949.56,-1,0.999357,7936446.96,,,0.999871,...,1.000105,0.000076,-0.000105,0.999847,-5.250402e-05,5.019455e-01,0.247173,2.042802e-01,1.457143,-0.000379
2857077,197,480,290,3772211.05,-1,0.996538,7660852.97,,,0.996445,...,0.994375,0.000293,-0.000446,0.993347,-2.237455e-04,1.008602e+01,1.657224,3.795699e+00,1.311659,-0.005513
2857078,198,480,290,3224772.38,-1,0.999210,84569369.22,,,0.998729,...,0.997808,0.000067,-0.000108,0.997567,-5.406580e-05,9.958506e-01,1.804511,3.453704e+00,1.231481,-0.001785


In [15]:
# List every column of X to check the engineered feature names.
X.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'overall_medvol', 'first5min_medvol',
       'last5min_medvol', 'bid_plus_ask_sizes', 'imbalance_ratio', 'imb_s1',
       'imb_s2', 'ask_x_size', 'bid_x_size', 'ask_minus_bid',
       'bid_price_over_ask_price', 'reference_price_minus_ask_price',
       'reference_price_times_ask_price', 'reference_price_ask_price_imb',
       'reference_price_minus_bid_price', 'reference_price_times_bid_price',
       'reference_price_bid_price_imb', 'reference_price_minus_wap',
       'reference_price_times_wap', 'reference_price_wap_imb',
       'ask_price_minus_bid_price', 'ask_price_times_bid_price',
       'ask_price_minus_wap', 'ask_price_times_wap', 'ask_price_wap_imb',
       'bid_price_minus_wap', 'bid_price_times_wap', 'bid_price_wap_imb',
       're

In [16]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so that the returned values sum to zero.

    Each element is reduced by an amount proportional to ``sqrt(volumes)``;
    the proportionality factor is chosen so the adjustments add up to
    ``sum(prices)``.

    Idea taken from https://github.com/gotoConversion/goto_conversion/
    """
    weights = np.sqrt(volumes)
    # Adjustment per unit of weight that removes the whole price total.
    per_unit = np.sum(prices) / np.sum(weights)
    return prices - weights * per_unit

In [17]:
import importlib

In [18]:
# Re-import torch_classes so edits to the module are picked up, then wrap X in a
# TradingData object and have it build its batches.
# NOTE(review): `hidden_size` is assigned here but not read by any other cell shown;
# the training config defines its own value.
importlib.reload(torch_classes)
trading_data = torch_classes.TradingData(X)
hidden_size = 64
trading_data.generate_batches()

 10%|▉         | 19/200 [00:09<01:05,  2.77it/s]

Missing Targets for day=438,for stock_id=19, Excluding


 50%|█████     | 101/200 [00:46<00:35,  2.81it/s]

Missing Targets for day=328,for stock_id=101, Excluding


 66%|██████▌   | 131/200 [01:00<00:24,  2.77it/s]

Missing Targets for day=35,for stock_id=131, Excluding


 79%|███████▉  | 158/200 [01:10<00:14,  2.88it/s]

Missing Targets for day=388,for stock_id=158, Excluding


100%|██████████| 200/200 [01:29<00:00,  2.24it/s]


Length of train: 385, Length of test 96


100%|██████████| 385/385 [00:00<00:00, 9158.28it/s]
100%|██████████| 95/95 [00:00<00:00, 10545.70it/s]


In [19]:
# Number of items in the first entry of `train_batches`.
len(trading_data.train_batches[0])

191

In [20]:
# Sanity check: print the length of any innermost entry of the training batches
# that is not 42 (the same number is passed to GRUNetV2 as its first argument in
# model_pipeline). No output means every entry has length 42.
for x in trading_data.train_batches:
    for t1 in x:
        for t in t1:
            if len(t)!=42:
                print(len(t))
        

In [21]:
# Columns packed into the per-row 'stats' vector (42 entries, order matters).
stat_cols = [
    # raw order-book / auction fields
    'seconds_in_bucket',
    'imbalance_size',
    'imbalance_buy_sell_flag',
    'reference_price',
    'matched_size',
    'bid_price',
    'bid_size',
    'ask_price',
    'ask_size',
    'wap',
    # per-stock median volumes
    'overall_medvol',
    'first5min_medvol',
    'last5min_medvol',
    # size-based features
    'bid_plus_ask_sizes',
    'imbalance_ratio',
    'imb_s1',
    'imb_s2',
    'ask_x_size',
    'bid_x_size',
    'ask_minus_bid',
    'bid_price_over_ask_price',
    # pairwise price features
    'reference_price_minus_ask_price',
    'reference_price_times_ask_price',
    'reference_price_ask_price_imb',
    'reference_price_minus_bid_price',
    'reference_price_times_bid_price',
    'reference_price_bid_price_imb',
    'reference_price_minus_wap',
    'reference_price_times_wap',
    'reference_price_wap_imb',
    'ask_price_minus_bid_price',
    'ask_price_times_bid_price',
    'ask_price_minus_wap',
    'ask_price_times_wap',
    'ask_price_wap_imb',
    'bid_price_minus_wap',
    'bid_price_times_wap',
    'bid_price_wap_imb',
    # three-way price features
    'reference_price_ask_price_bid_price_imb2',
    'reference_price_ask_price_wap_imb2',
    'reference_price_bid_price_wap_imb2',
    'ask_price_bid_price_wap_imb2',
]

In [22]:
# Pack the stat_cols values of every row (NaN replaced by -1) into one Python list
# per row and store them in a single 'stats' column.
# The Series is built from a plain list, so it gets a default 0..n-1 index; the
# assignment aligns on index and therefore relies on X using the same index.
X['stats'] = pd.Series(X[stat_cols].fillna(-1).values.tolist())

In [35]:
# Reload the project modules so local edits take effect without restarting the kernel.
importlib.reload(torch_classes)
importlib.reload(training_testing)

<module 'training_testing' from 'c:\\Users\\Nick\\Documents\\GitHub\\OptiverKaggle\\training_testing.py'>

In [29]:
def model_pipeline(trading_df:torch_classes.TradingData, config:dict, input_size:int=42):
    """Train a GRUNetV2 on ``trading_df`` inside a Weights & Biases run.

    Parameters
    ----------
    trading_df : torch_classes.TradingData
        Batched data; its hidden state is reset to match the model before training.
    config : dict
        Must provide ``'learning_rate'``, ``'hidden_size'`` and ``'num_layers'``.
        It is handed to ``wandb.init`` and read back through ``wandb.config``.
    input_size : int, default 42
        Number of input features per time step passed to ``GRUNetV2``.

    Returns
    -------
    The trained model, placed on the notebook-level ``device``.
    """
    with wandb.init(project="Optviver", config=config,save_code=True):
        # Read hyper-parameters back from the run so the model, the hidden state
        # and the logged config can never disagree (e.g. when a sweep overrides them).
        config = wandb.config
        wandb.define_metric("val_epoch_loss_l1", summary="min")
        wandb.define_metric("epoch_l1_loss", summary="min")

        # Use the configured layer count and the detected device instead of the
        # previously hard-coded `num_layers=2` / 'cuda:0'.
        model = torch_classes.GRUNetV2(
            input_size, config['hidden_size'], num_layers=config['num_layers']
        ).to(device)
        print(model)

        optimizer = optim.RMSprop(model.parameters(), lr=config['learning_rate'])
        trading_df.reset_hidden(config['hidden_size'], config['num_layers'])
        criterion = nn.SmoothL1Loss()

        training_testing.train_model(trading_df,model,config,optimizer,criterion)

    return model



In [40]:
# Reload training_testing so local edits take effect without restarting the kernel.
importlib.reload(training_testing)

<module 'training_testing' from 'c:\\Users\\Nick\\Documents\\GitHub\\OptiverKaggle\\training_testing.py'>

In [37]:
# Fixed hyper-parameters for a single (non-sweep) training run.
config_static = {
    'learning_rate': 1e-5,
    'hidden_size': 16,
    'num_layers': 2,
}

In [41]:
# Run the training pipeline with the static config and keep the model it returns.
model = model_pipeline(trading_data, config_static)

GRUNetV2(
  (gru): GRU(42, 16, num_layers=2, dropout=0.3)
  (relu0): ReLU()
  (batch_norm): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  (fc0): Linear(in_features=16, out_features=16, bias=True)
  (rl1): ReLU()
  (drop1): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=16, out_features=1, bias=True)
  (rl2): ReLU()
  (drop2): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=256, out_features=64, bias=True)
  (rl3): ReLU()
  (drop3): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=64, out_features=8, bias=True)
)


  0%|          | 0/10000 [00:00<?, ?it/s]

   stock  day  time    target      pred
0      0    0     0 -2.340078 -0.164086
1      1    0     0 -8.130074  0.149417
2      2    0     0  0.790358  0.154441
3      3    0     0  6.059408 -0.131441
4      4    0     0  0.499487  0.093122
5      5    0     0  5.190372  0.051417
6      6    0     0 -4.439950  0.004878
7      7    0     0 -3.280044 -0.060886
8      8    0     0  0.419617 -0.208988
9      9    0     0  1.959801 -0.163524
created path
   stock  day  time    target      pred
0      0    0     0 -2.340078 -0.553760
1      1    0     0 -8.130074 -0.117797
2      2    0     0  0.790358  0.184142
3      3    0     0  6.059408 -0.419055
4      4    0     0  0.499487  0.144455
5      5    0     0  5.190372  0.144241
6      6    0     0 -4.439950  0.131035
7      7    0     0 -3.280044 -0.222564
8      8    0     0  0.419617 -0.466315
9      9    0     0  1.959801 -0.289751
   stock  day  time    target      pred
0      0    0     0 -2.340078 -0.946524
1      1    0     0 -8.1300

In [None]:
# Release cached GPU memory that PyTorch is holding but no longer using.
torch.cuda.empty_cache()

In [None]:
# Call reset_hidden with the static config's hidden size and layer count
# (presumably re-initialises the stored hidden state - confirm in torch_classes).
trading_data.reset_hidden(config_static['hidden_size'], config_static['num_layers'])