#### Optiver - Trading at the Close (Kaggle competition)
##### https://www.kaggle.com/competitions/optiver-trading-at-the-close

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch.optim as optim
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import random


In [2]:
# Pick the best available accelerator instead of assuming Apple-silicon MPS,
# so the notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Seed every RNG used in this notebook: `random`, NumPy and PyTorch
# (random_split, DataLoader shuffling, dropout and weight init all draw from torch).
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Last expression of the cell so the selected device is actually displayed.
device

In [3]:
# Load the raw competition training data.
df_raw = pd.read_csv('data/train.csv')

# Fraction of missing values per column (mean of the boolean NaN mask).
df_raw.isna().mean(axis=0)

stock_id                   0.000000
date_id                    0.000000
seconds_in_bucket          0.000000
imbalance_size             0.000042
imbalance_buy_sell_flag    0.000000
reference_price            0.000042
matched_size               0.000042
far_price                  0.552568
near_price                 0.545474
bid_price                  0.000042
bid_size                   0.000000
ask_price                  0.000042
ask_size                   0.000000
wap                        0.000042
target                     0.000017
time_id                    0.000000
row_id                     0.000000
dtype: float64

In [4]:
def add_info_columns(raw_df):
    """Return a copy of ``raw_df`` with imputed prices and derived features.

    The input frame is not modified. Missing values in the six price columns
    are replaced by 1.0, then the following columns are appended:

    * ``imbalance_ratio`` - ``imbalance_size / matched_size``
    * ``imbalance``       - ``imbalance_size * imbalance_buy_sell_flag``
    * ``imbl_size1``      - ``(bid_size - ask_size) / (bid_size + ask_size)``
    * ``imbl_size2``      - ``(imbalance_size - matched_size) / (imbalance_size + matched_size)``
    * ``bid_size_diff`` / ``ask_size_diff`` - row-to-row change of the size
      within each ``(stock_id, date_id)`` group; NaN on the first row of a group.

    The raw imbalance columns and ``stock_id`` are kept as they are (no
    one-hot encoding is applied).
    """
    price_cols = ["reference_price", "far_price", "near_price", "bid_price", "ask_price", "wap"]
    group_keys = ["stock_id", "date_id"]

    out = raw_df.copy()
    out[price_cols] = out[price_cols].fillna(1.0)

    imb, matched = out["imbalance_size"], out["matched_size"]
    bid, ask = out["bid_size"], out["ask_size"]

    out["imbalance_ratio"] = imb / matched
    out["imbalance"] = imb * out["imbalance_buy_sell_flag"]
    out["imbl_size1"] = (bid - ask) / (bid + ask)
    out["imbl_size2"] = (imb - matched) / (imb + matched)

    # Differences are taken in the frame's existing row order within each group.
    per_stock_day = out.groupby(group_keys)
    out["bid_size_diff"] = per_stock_day["bid_size"].diff()
    out["ask_size_diff"] = per_stock_day["ask_size"].diff()

    return out

In [5]:
# Build the engineered feature frame and zero-fill every value that is still
# missing (e.g. the first diff of each group); df_raw itself is left untouched.
df = add_info_columns(df_raw).fillna(0)
df.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,wap,target,time_id,row_id,imbalance_ratio,imbalance,imbl_size1,imbl_size2,bid_size_diff,ask_size_diff
0,0,0,0,3180602.69,1,0.999812,13380276.64,1.0,1.0,0.999812,...,1.0,-3.029704,0,0_0_0,0.237708,3180602.69,0.75434,-0.61589,0.0,0.0
1,1,0,0,166603.91,-1,0.999896,1642214.25,1.0,1.0,0.999896,...,1.0,-5.519986,0,0_0_1,0.101451,-166603.91,-0.728751,-0.815787,0.0,0.0
2,2,0,0,302879.87,-1,0.999561,1819368.03,1.0,1.0,0.999403,...,1.0,-8.38995,0,0_0_2,0.166475,-302879.87,0.332935,-0.714567,0.0,0.0
3,3,0,0,11917682.27,-1,1.000171,18389745.62,1.0,1.0,0.999999,...,1.0,-4.0102,0,0_0_3,0.648061,-11917682.27,-0.99034,-0.213547,0.0,0.0
4,4,0,0,447549.96,-1,0.999532,17860614.95,1.0,1.0,0.999394,...,1.0,-7.349849,0,0_0_4,0.025058,-447549.96,0.948687,-0.951109,0.0,0.0


In [6]:
# Columns that must not be used as model inputs: identifiers and the label.
# The exclusion set has to hold plain strings. Nesting the names in lists
# (e.g. ['target']) never matches a column name, which would leave the target
# and the id columns inside the feature matrix.
non_feature_cols = {'row_id', 'time_id', 'date_id', 'target'}

# Feature columns, in the frame's column order (19 columns for the frame built
# above, matching the network's input width).
x_cols = [c for c in df.columns if c not in non_feature_cols]
y_cols = ['target']

In [7]:
# Per-feature statistics used by normalize_features (computed over all rows of df).
feature_frame = df[x_cols]
means = feature_frame.mean(axis=0)
stds = feature_frame.std(axis=0)

In [8]:
def normalize_features(x):
    """Standardize ``x`` with the module-level ``means`` and ``stds``.

    A small constant (1e-8) is added to the standard deviation so that
    constant columns do not cause a division by zero.
    """
    centered = x - means
    scale = stds + 1e-8
    return centered / scale

In [9]:
def get_xy(df):
    """Split ``df`` into a normalized feature array and a target array.

    Features are the ``x_cols`` columns passed through ``normalize_features``;
    targets are the ``y_cols`` columns, unchanged. Both are returned as
    NumPy arrays (``.values``).
    """
    features = normalize_features(df[x_cols])
    targets = df[y_cols]
    return features.values, targets.values

In [10]:
# NumPy feature/target arrays for the whole frame.
x, y = get_xy(df)

# Random 80/20 hold-out split with a fixed seed.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = splits

In [11]:

def get_dataloaders(df, batch_size=128):
    """Build train/test ``DataLoader`` objects from ``df``.

    The frame is converted with ``get_xy``, moved to ``device`` as tensors and
    randomly split 80/20. The training loader shuffles; both loaders drop the
    last incomplete batch. The test batch size is capped at the size of the
    test split.

    Returns:
        tuple: ``(train_dataloader, test_dataloader)``.
    """
    features, targets = get_xy(df)

    dataset = TensorDataset(
        torch.Tensor(features).to(device),
        torch.Tensor(targets).to(device),
    )
    train_part, test_part = random_split(dataset, [0.8, 0.2])

    eval_batch_size = min(batch_size, len(test_part))
    train_loader = DataLoader(train_part, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_part, batch_size=eval_batch_size, drop_last=True)
    return (train_loader, test_loader)

In [12]:
# Batch size spelled out explicitly; 128 is also get_dataloaders' default.
train_dataloader, test_dataloader = get_dataloaders(df, batch_size=128)

In [13]:
# Training hyper-parameters: number of epochs, batch size and initial learning rate.
config = dict(epoch=80, Batch_size=128, lr=0.001)

In [14]:
class DNN(nn.Module):
    """Fully connected regressor: three hidden ReLU layers and one linear output.

    Dropout is applied after the first and second hidden layers.

    Args:
        in_features: Number of input features (default 19).
        hidden_sizes: Widths of the three hidden layers (default ``(128, 32, 16)``).
        dropout: Dropout probability for both dropout layers (default 0.5).
    """

    def __init__(self, in_features=19, hidden_sizes=(128, 32, 16), dropout=0.5):
        super().__init__()
        h1, h2, h3 = hidden_sizes
        # Each layer's input width is derived from the previous layer's output
        # width so the chain can never be mismatched.
        self.fc1 = nn.Linear(in_features=in_features, out_features=h1)
        self.relu1 = nn.ReLU()
        self.DropOut1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(in_features=h1, out_features=h2)
        self.relu2 = nn.ReLU()
        self.Dropout2 = nn.Dropout(dropout)
        self.fc3 = nn.Linear(in_features=h2, out_features=h3)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(in_features=h3, out_features=1)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to a ``(batch, 1)`` prediction."""
        # Every layer's output must be fed to the next one; calling a layer
        # without using its result leaves that layer out of the computation.
        x = self.DropOut1(self.relu1(self.fc1(x)))
        x = self.Dropout2(self.relu2(self.fc2(x)))
        x = self.relu3(self.fc3(x))
        return self.fc4(x)


model = DNN()


In [15]:
def init_weights(m):  # He initialization
    """Apply He (Kaiming) normal initialization to a Linear or Conv2d layer.

    Intended for use with ``Module.apply``. Only modules whose type is exactly
    ``nn.Linear`` or ``nn.Conv2d`` are touched; every other module is left
    unchanged. The bias is zeroed when the layer has one.
    """
    if type(m) in (nn.Linear, nn.Conv2d):
        torch.nn.init.kaiming_normal_(m.weight)
        # Layers created with bias=False have m.bias set to None.
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

# Initialize every layer, then move the network to the selected device
# (both calls return the module itself).
model = model.apply(init_weights).to(device)

# Mean absolute error as the training and validation loss.
loss_function = nn.L1Loss()

optimizer = optim.Adam(model.parameters(), lr=config['lr'])

# Halve the learning rate when the monitored loss has not improved for 5 epochs.
# NOTE(review): `verbose` is deprecated in recent PyTorch releases - confirm
# against the installed version.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
    verbose=True,
)

In [16]:
def train_loop(dataloader,model,loss_function,optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(x, y)`` batches that supports ``len()``.
        model: Network to optimize; it is switched to training mode.
        loss_function: Callable ``loss_function(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer bound to ``model``'s parameters.

    Returns:
        float: Mean of the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader yields no batches")

    model.train()
    train_loss = 0.0
    # Iterate over the batches directly. Using config['Batch_size'] as the loop
    # target would overwrite the configured batch size with the batch index.
    for x, y in dataloader:
        optimizer.zero_grad()

        pred = model(x)
        loss = loss_function(pred, y)

        # Gradients come from the loss tensor; nn.Module has no backward().
        loss.backward()
        optimizer.step()

        # .item() detaches the value so the autograd graph of every batch is
        # not kept alive for the whole epoch.
        train_loss += loss.item()
    return train_loss / num_batches

def valid_loop(dataloader,model,loss_function,lr_scheduler=None):
    """Evaluate ``model`` on ``dataloader`` and step the LR scheduler.

    Args:
        dataloader: Iterable of ``(x, y)`` batches that supports ``len()``.
        model: Network to evaluate; it is switched to evaluation mode.
        loss_function: Callable ``loss_function(pred, y)`` returning a scalar tensor.
        lr_scheduler: Optional scheduler whose ``step(metric)`` is called with
            the mean validation loss. When omitted, a module-level ``scheduler``
            is used if one exists; otherwise no scheduler is stepped.

    Returns:
        float: Mean of the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader yields no batches")

    # nn.Module provides eval(), not val(); this disables dropout for evaluation.
    model.eval()
    valid_loss = 0.0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x, y in dataloader:
            pred = model(x)
            valid_loss += loss_function(pred, y).item()
    mean_loss = valid_loss / num_batches

    # Keep the original call signature working: fall back to the notebook's
    # global scheduler when none is passed explicitly.
    if lr_scheduler is None:
        lr_scheduler = globals().get("scheduler")
    if lr_scheduler is not None:
        lr_scheduler.step(mean_loss)

    return mean_loss

In [17]:
# Per-epoch training log. It starts empty so `history` exists even if training
# is interrupted, and is rebuilt from the collected records after the loop.
history = pd.DataFrame([], columns=["epoch","train_loss","test_loss","lr"])
history_records = []

for epoch in range(config['epoch']):
    print(f"Epoch {epoch+1:>3d}",end=" ")
    train_loss = train_loop(train_dataloader, model, loss_function, optimizer)
    print(f"Train: {train_loss:>5f}", end=" ")
    test_loss = valid_loop(test_dataloader, model, loss_function)
    print(f"| validation: {test_loss:>5f}")

    # Collect plain records and build the frame once, rather than growing a
    # DataFrame row by row. `lr` is read after valid_loop has run.
    history_records.append({
        "epoch": epoch + 1,
        "train_loss": float(train_loss),
        "test_loss": float(test_loss),
        "lr": optimizer.param_groups[0]["lr"],
    })

history = pd.DataFrame(history_records, columns=["epoch","train_loss","test_loss","lr"])

Epoch   1 

: 