In [22]:
import pandas as pd 
import numpy as np
import utils.public_timeseries_testing_util as optiver2023
from torch.nn.utils.rnn import pack_padded_sequence, pack_sequence, unpack_sequence, unpad_sequence
import torch
from tqdm.notebook import trange,tqdm
import torch.nn as nn 
import torch.optim as optim
import wandb
import utils.torch_classes as torch_classes
from utils.model_saver import model_saver_wandb as model_saver
import utils.training_testing_double 
from itertools import combinations
import importlib
import gc
from utils.consts import STATS_COLS
import lightgbm as lgb

In [2]:
env = optiver2023.make_env()
iter_test = env.iter_test()

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc.
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")

Running on the GPU


In [10]:
model = torch_classes.GRUNetV4(22,128,num_layers=2,target_size=5)
model_loc = f"models/curious-dew-316/curious-dew-316_20.pt"
model_data = torch.load(model_loc,map_location=torch.device('cpu'))
print(model_data['model_state_dict'].keys())
model.load_state_dict(model_data['model_state_dict'], strict=True)

odict_keys(['gru.weight_ih_l0', 'gru.weight_hh_l0', 'gru.bias_ih_l0', 'gru.bias_hh_l0', 'gru.weight_ih_l1', 'gru.weight_hh_l1', 'gru.bias_ih_l1', 'gru.bias_hh_l1', 'batch_norm.weight', 'batch_norm.bias', 'batch_norm.running_mean', 'batch_norm.running_var', 'batch_norm.num_batches_tracked', 'layer_norm.weight', 'layer_norm.bias', 'layer_norm2.weight', 'layer_norm2.bias', 'fc0.weight', 'fc0.bias', 'fc1.weight', 'fc1.bias', 'fc_final.weight', 'fc_final.bias', 'fc_reg0.weight', 'fc_reg0.bias', 'fc_reg1.weight', 'fc_reg1.bias', 'fc_reg2.weight', 'fc_reg2.bias'])


<All keys matched successfully>

In [24]:
train = pd.read_csv('data/train.csv')
train.head()
train.date_id.value_counts()

date_id
480    11000
353    11000
363    11000
362    11000
360    11000
       ...  
4      10560
2      10505
1      10505
3      10505
0      10505
Name: count, Length: 481, dtype: int64

In [25]:
train.to_feather('train.fth')

In [13]:
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]

In [14]:
weights_df = pd.DataFrame(data=list(zip(range(0,201),weights)),columns=['stock_id','index_weight'])
train = train.merge(weights_df,on='stock_id')

In [15]:
median_vol = pd.read_csv("archive/MedianVolV2.csv")
median_vol.index.name = "stock_id"
median_vol = median_vol[['overall_medvol', "first5min_medvol", "last5min_medvol"]]
median_sizes = train.groupby('stock_id')['bid_size'].median() + train.groupby('stock_id')['ask_size'].median()
std_sizes = train.groupby('stock_id')['bid_size'].median() + train.groupby('stock_id')['ask_size'].median() 

In [16]:
trading_data = torch_classes.TradingData()

In [17]:
trading_data.fill_hidden_states_for_test(model_data['db_train'])
# trading_data.reset_hidden(64,device='cpu')

0
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.

In [18]:
def feat_eng(df):
    
    cols = [c for c in df.columns if c not in ['row_id', 'time_id']]
    df = df[cols]
    df = df.merge(median_vol, how = "left", left_on = "stock_id", right_index = True)
    
    df['bid_plus_ask_sizes'] = df['bid_size'] + train['ask_size']
#     df['median_size'] = df['stock_id'].map(median_sizes.to_dict())
    df['std_size'] = df['stock_id'].map(std_sizes.to_dict())
#     df['high_volume'] = np.where(df['bid_plus_ask_sizes'] > df['median_size'], 1, 0) 
    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']
    
    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    df['ask_x_size'] = df.eval('ask_size*ask_price')
    df['bid_x_size'] = df.eval('bid_size*bid_price')
        
    df['ask_minus_bid'] = df['ask_x_size'] - df['bid_x_size'] 
    
    df["bid_size_over_ask_size"] = df["bid_size"].div(df["ask_size"])
    df["bid_price_over_ask_price"] = df["bid_price"].div(df["ask_price"])
    
    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    for c in combinations(prices, 2):
        
        df[f'{c[0]}_minus_{c[1]}'] = (df[f'{c[0]}'] - df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_times_{c[1]}'] = (df[f'{c[0]}'] * df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_{c[1]}_imb'] = df.eval(f'({c[0]}-{c[1]})/({c[0]}+{c[1]})')

    for c in combinations(prices, 3):
        
        max_ = df[list(c)].max(axis=1)
        min_ = df[list(c)].min(axis=1)
        mid_ = df[list(c)].sum(axis=1)-min_-max_

        df[f'{c[0]}_{c[1]}_{c[2]}_imb2'] = (max_-mid_)/(mid_-min_)
    
        
    df.drop(columns=[
        # 'date_id', 
        'reference_price_far_price_imb',
        'reference_price_minus_near_price',
        'reference_price_near_price_imb',
        'far_price_near_price_imb',
        'far_price_ask_price_imb',
        'far_price_bid_price_imb',
        'far_price_minus_wap',
        'std_size',
        'bid_size_over_ask_size',
        'ask_price_bid_price_imb',
        'near_price_times_wap'
    ], inplace=True)
        
    gc.collect()

    df.replace([np.inf, -np.inf], 0, inplace=True)
    
    return df

In [19]:
def generate_prev_race(df_in, df_g, rolling_window=10, factor=''):
    df = df_in.copy()
    original_cols = df_in.columns
    df[f'initial_wap'] = df_g['wap_calc'].transform('first')
    df[f'initial_bid_size'] = df_g['bid_size'].transform('first')
    df[f'initial_ask_size'] = df_g['ask_size'].transform('first')
    return(df)

In [20]:
def generate_index(df_in, df_g, rolling_window=10, factor=''):
    df = df_in.copy()
    df[f'index_wap'] = df_g['wap_weighted'].transform('mean')
    return(df)

def generate_index_2(df_in, df_g, rolling_window=10, factor=''):
    df = df_in.copy()
    df[f'index_wap_init'] = df_g['index_wap'].transform('first')
    return(df)


In [21]:
def variable_eng(train):

    train['wap_weighted'] = train['wap']*train['index_weight']
    train_g = train.groupby(['stock_id','date_id'])
    train = generate_prev_race(train,train_g)

    train_g = train.groupby(['seconds_in_bucket','date_id'])
    train = generate_index(train,train_g)


    train['wap_move_to_init'] = train['wap_calc']/train['initial_wap']
    train_g = train.groupby(['date_id'])
    train = generate_index_2(train,train_g)

    train['index_wap_move_to_init'] = train['index_wap']/train['index_wap_init']


    return train


In [23]:
y = train["target"].values
X = feat_eng(train)
prices = [
    c for c in X.columns if ("price" in c) and ("target" not in c) and ("60" not in c)
]
print(prices)
prices = [
    c for c in X.columns if ("price" in c) and ("target" not in c) and ("60" not in c)
]
# prices = [c for c in train.columns if 'price' in c]
pca_prices = PCA(n_components=1)
X["pca_prices"] = pca_prices.fit_transform(X[prices].fillna(1))

KeyboardInterrupt: 

In [None]:
lgbm = lgb.Booster(model_file="data/lgbm_model_new_t60.lgb")
X = train
X_train = X[[c for c in X.columns if ("target" not in c) and ("60" not in c)]].drop(
    columns=["delta_wap", "date_id"]
)
lgbm_preds = lgbm.predict(X_train)
X["lgbm_preds"] = lgbm_preds

del pca_prices

In [15]:
def gen_preds(test, trading_data:torch_classes.TradingData, model:torch_classes.GRUNet):
    test['stats']  = pd.Series(test[STATS_COLS].fillna(-1).values.tolist())
    stock_ids = test.stock_id.unique().tolist()
    stocks = [trading_data.stocksDict[x] for x in stock_ids]
    hidden = torch.stack([trading_data.stocksDict[x].hidden for x in stock_ids]).transpose(0,1)
    
    X = [torch.tensor(x) for x in test['stats'].tolist()]
    
    Xstacked = torch.stack(X)
    
    Xviewed = pack_sequence(Xstacked.view(-1,1,12))
    
    output,hidden_out = model(Xviewed,hidden)
    
    hidden_out = hidden_out.transpose(0,1)
    
    [setattr(obj, 'hidden', val.detach()) for obj, val in zip(stocks,hidden_out)]
    
    output = output.flatten().tolist()
    
    return output

In [16]:
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    if counter == 0:
        print(test.head(3))
        print(revealed_targets.head(3))
        print(sample_prediction.head(3))
    print(counter)
    test['wap_calc'] = (test['bid_price']*test['ask_size']+test['ask_price']*test['bid_size'])/(test['ask_size']+test['bid_size'])
    test = test.merge(weights_df,on='stock_id')
    test = variable_eng(test)
    test = feat_eng(test)
    test = X['pca_prices'] = pca_prices.transform(X[prices].fillna(1))
    preditcions = gen_preds(test,trading_data,model)
    
    sample_prediction['pred'] = preditcions
    test_df = test
    # print(preditcions)
    # break
    env.predict(sample_prediction)
    counter += 1

   Unnamed: 0  stock_id  date_id  seconds_in_bucket  imbalance_size  \
0     4192980         0      386                  0      2555434.64   
1     4192981         1      386                  0       274697.82   
2     4192982         2      386                  0       415532.77   

   imbalance_buy_sell_flag  reference_price  matched_size  far_price  \
0                       -1         1.000032    9966697.63        NaN   
1                        1         0.999714    1324398.60        NaN   
2                        1         0.999826    2463160.51        NaN   

   near_price  bid_price  bid_size  ask_price  ask_size  wap    target  \
0         NaN   0.999746  18840.60   1.000032   2408.10  1.0 -2.340078   
1         NaN   0.999370  71153.73   1.000002    174.08  1.0 -8.130074   
2         NaN   0.999323  46315.74   1.000328  22484.74  1.0  0.790358   

    row_id                                              stats  
0  386_0_0  [0.0, 2555434.64, -1.0, 1.000032, 9966697.63, ...  
1

KeyError: "['pca_prices'] not in index"

In [10]:
# for i in iter_test:
    
#     test_df = i[0]
#     print(test_df)
#     test_df['stats']  = pd.Series(test_df[stats_cols].fillna(-1).values.tolist())
#     stock_ids = test_df.stock_id.unique()
#     stocks = [trading_data.stocksDict[x] for x in stock_ids] 
#     hidden = torch.stack([trading_data.stocksDict[x].hidden for x in stock_ids]).transpose(0,1)
#     X = [torch.tensor(x) for x in test_df['stats'].tolist()]
#     X = torch.stack(X)
#     X = pack_sequence(X.view(-1,1,12))

    
#     model.eval()
#     output,hidden = model(X,hidden)
#     [setattr(obj, 'hidden', val.detach()) for obj, val in zip(stocks,hidden)]
#     print(output)
#     test_df['taget'] = output.flatten().tolist()
#     # test_df['actual'] = test_df['target']
#     env.predict(test_df)
    

In [347]:
x = test_df

In [356]:
test_df = x.head(5)

In [354]:
test_df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,row_id,stats
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0_0_0,"[0.0, 3180602.69, 1.0, 0.999812, 13380276.64, ..."


In [596]:
for i,data in test_df.iterrows():
    hidden = torch.stack([trading_data.stocksDict[data['stock_id']].hidden])
    print(hidden)
    hidden = torch.stack([trading_data.stocksDict[data['stock_id']].hidden]).transpose(0,1)
    X = torch.tensor(data['stats']).view(-1,12,1)
    model.eval()
    output,hidden  = model(X,hidden,test=True)
    trading_data.stocksDict[data['stock_id']].hidden = hidden[0]
    print(hidden)
    print(output.flatten().item())

tensor([[[-0.8575,  0.7445, -0.8014,  0.7816, -0.5938, -0.7823, -0.7967,
          -0.5086,  0.5378, -0.0737, -0.5103, -0.5201, -0.2461, -0.4879,
          -0.3435, -0.1746, -0.1386,  0.0818, -0.7184, -0.3409, -0.6650,
           0.9997, -0.7376, -0.4719,  0.5599,  0.6262,  0.6931, -0.9579,
          -0.6052,  0.0718,  0.6607,  0.6729,  0.0845, -0.4968, -0.6447,
          -0.9867,  0.6466, -0.9892, -0.7533,  0.7768, -0.3308, -0.0147,
           0.9768,  0.5592,  0.0995, -0.1919,  0.9054,  0.9997, -0.0796,
          -0.7784, -0.9714, -0.4716, -0.5839,  0.9843, -0.7999,  0.9457,
           0.4040, -1.0000, -0.4622, -0.3323,  0.7707,  0.3389,  0.3091,
          -0.8781]]], grad_fn=<StackBackward0>)
tensor([[[-0.8575,  0.7445, -0.8015,  0.7816, -0.5938, -0.7823, -0.7968,
          -0.5089,  0.5378, -0.0737, -0.5103, -0.5201, -0.2461, -0.4878,
          -0.3442, -0.1746, -0.1390,  0.0821, -0.7184, -0.3408, -0.6650,
           0.9997, -0.7375, -0.4719,  0.5599,  0.6262,  0.6941, -0.9582,
   

In [350]:
test_df['stats']  = pd.Series(test_df[stats_cols].fillna(-1).values.tolist())
stock_ids = test_df.stock_id.unique()
hidden = torch.stack([trading_data.stocksDict[x].hidden for x in stock_ids]).transpose(0,1)
X = [torch.tensor(x) for x in test_df['stats'].tolist()]
# X = pack_sequence(X)

In [31]:
# X = torch.stack(X)

In [352]:
X = [torch.tensor(x) for x in test_df['stats'].tolist()]
X = torch.stack(X).view(-1,12,1)
# X = pack_sequence(X)
model.eval()
output,hidden  = model(X,hidden,test=True)
output

tensor([[[-2.1240],
         [ 1.0552],
         [-2.5545],
         [ 2.2515],
         [-1.9348],
         [ 3.7566],
         [-0.6949],
         [-2.2269],
         [-2.6248],
         [ 6.1143]]], grad_fn=<ViewBackward0>)

In [56]:
X.shape

torch.Size([1, 12, 10])

In [43]:
hidden

tensor([[[-0.1262, -0.1012, -0.2037,  0.1888,  0.0976, -0.2847, -0.5807,
          -0.5896, -0.0159,  0.2682, -0.5628,  0.1121, -0.1300,  0.2515,
           0.1864,  0.0224,  0.0196, -0.1425, -0.6340, -0.5149, -0.6020,
          -0.2689, -0.0297, -0.6029,  0.4218,  0.0321,  0.1283, -0.3335,
           0.2974, -0.4182,  0.1575,  0.3770, -0.1936, -0.5595,  0.2087,
          -0.1358,  0.1713,  0.1705,  0.0112, -0.2112, -0.5883, -0.2184,
           0.6552,  0.0014, -0.0707, -0.0010,  0.4840,  0.5455, -0.3142,
          -0.4542, -0.2142, -0.2478, -0.2938, -0.1563, -0.5573,  0.7317,
          -0.0501, -0.3461, -0.5378,  0.3539, -0.1586,  0.4133,  0.1239,
          -0.1093]]])

In [40]:
X

tensor([[[ 0.0000e+00,  3.1806e+06,  1.0000e+00,  9.9981e-01,  1.3380e+07,
          -1.0000e+00, -1.0000e+00,  9.9981e-01,  6.0652e+04,  1.0000e+00,
           8.4930e+03,  1.0000e+00]]])

In [None]:
import importlib

In [None]:
importlib.reload(torch_classes)
trading_data = torch_classes.TradingData(train)
hidden_size = 64
# trading_data.generate_batches()

In [None]:
def model_pipeline(trading_df:torch_classes.TradingData, config:dict):
    with wandb.init(project="Optviver", config=config,save_code=True):
        wandb.define_metric("val_epoch_loss_l1", summary="min")
        wandb.define_metric("epoch_l1_loss", summary="min")
        model = torch_classes.GRUNet(12,hidden_size).to('cuda:0')
        config = wandb.config
        optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'])
        trading_df.reset_hidden(config['hidden_size'])
        criterion = nn.MSELoss()
        
        training_testing.train_model(trading_df,model,config,optimizer,criterion)

    return(model)



In [363]:
importlib.reload(training_testing)

<module 'training_testing' from 'c:\\Users\\Nick\\Documents\\GitHub\\OptiverKaggle\\training_testing.py'>

In [None]:
config_static = {'learning_rate':0.0001, 'hidden_size':64}

In [None]:
model = model_pipeline(trading_data, config_static)