***
### Notes

- v4: IterativeImputer separated by f0, pytorch model trained in 2 steps
- v5: Changed IterativeImputer to SimpleImputer
- v6: Error
- v7: [train-v10] Back to IterativeImputer, standard BatchNorm1d in input layer, increased dropout rate of input layer (x2), reduced lr (1e-4) of second stage model, increased factor to 0.5 ReduceLROnPlateau.
- v8: [train-v11] Dropout of input layer (x1), lr of first step 1e-4, lr of second step 1e-4, patience set to 4, weight_decay set to 1e-4.
- v9: [train-v12] Changed optimizers to `torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.9, weight_decay=1e-5)` 
- v10: [train-v13]: Removed BN from input layer, added RobustScaler on full dataset, changed batch_size=2048 and virtual_batch_size=256, reduced lr=5e-2 and lr=1e-2 respectively, increased label_smoothing=1e-2 and early_stop_on_metric for the second stage model.
- v11: [train-v14]: Removed RobustScaler, added GBN at input layer, set virtual_batch_size=512, set weight_decay=1e-4, set patience=20 (early stopping) on second model.
- v12: [train-v15]: Changed ReduceLROnPlateau -> OneCycleLR and added LRFinder test.
- v13: [train-v16]: Second step trains on full dataset with 5 targets.
- v14: [train-v16]: Action taken on median(probs).
- v15: [train-v16]: Custom rule for action.
- v16: [train-v17]: Adds SWA for second step model, set pct_start=0.1 and patience=15 for step1.
- v17: [train-v18]: Iterative imputer set to 2 iterations.
- v18: [train-v19]: Iterative imputer set to 1 iteration & 50 features.
- v19: [train-v20]: Iterative imputer set to 1 iteration & 30 features. Hyperparams: `{'nn_width': 112, 'dropout': 0.3, 'momentum': 0.05, 'virtual_batch_size': 1024, 'weight_decay': 1e-05, 'pct_start': 0.5}`.
- v20: [train-v20]: Inference with `snn-epoch1.pt`.
- v21: [train-v20]: Inference with `snn-epoch5.pt`.
- v22: [train-v20]: Inference with `snn-epoch10.pt`.
- v23: [train-v20]: Inference with `snn-epoch15.pt`.
- v24: [train-v21]: Hyperparams: `{'nn_depth':3, 'nn_width': 112, 'dropout': 0.25, 'momentum': 0.09999999999999999, 'virtual_batch_size': 128, 'weight_decay': 0.0001, 'pct_start': 0.2}` | Inference with SWA.
- v25: [train-v21]: Inference with `snn-epoch5.pt`.
- v26: [train-v21]: Inference with `snn-epoch10.pt`.
- v27: [train-v21]: Inference with `snn-epoch15.pt`.
- v28: [train-v23]: Hyperparams: `{'nn_depth':3, 'nn_width': 96, 'dropout': 0.3, 'momentum': 0.1, 'virtual_batch_size': 256, 'weight_decay': 1e-05, 'pct_start': 0.3}` Inference with `snn-swa-epoch5.pt`.
- v29: [train-v23]: Inference with `snn-swa-epoch10.pt`.
- v30: [train-v23]: Inference with `snn-swa-epoch15.pt`.
- v31: [train-v24]: Hyperparams: `{'nn_depth': 3, 'nn_width': 96, 'dropout_input': 0.15, 'dropout_hidden': 0.3, 'dropout_output': 0.15, 'momentum':0.1, 'virtual_batch_size':256, 'weight_decay': 1e-05, 'pct_start': 0.4, 'max_lr': 0.06,}`. Inference with `snn-swa-epoch15.pt`.
- v32: [train-v24]: Inference with `snn-swa-epoch25.pt`.
- v33: [train-v24]: Inference with `snn-epoch15.pt`.
- v34: [train-v24]: Inference with `snn-epoch25.pt`.
- v35: [train-v28]: Inference with swa.
- v36: [train-v29]: SWA training with min_lr=1e-2, max_lr=1e-1, momentum=0.7 and num_epochs=15.
- v38: [train-v31]: Trained with lower regularization `dropout_input=0.2`, `dropout_hidden=0.25`, `dropout_output=0.15` and `momentum=0.8` on SWA.
- v39: [train-v31]: `get_action` takes a weighted mean of output probs (based on Pearson correlation).
- v40: [train-v40]: `get_action` based on coincidence of `resp` and `resp_4`, and also the weighted mean of output probs.
- v41: [train-v33]: Inference with swa-st.
- v42: [train-v33]: Inference with swa-lt.

In [None]:
import numpy as np 
import pandas as pd
from pathlib import Path
import pickle
import janestreet
import time
from tqdm import tqdm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

import torch
from torch import nn
from torch.utils.data import DataLoader,TensorDataset

import matplotlib.pyplot as plt
import seaborn as sns

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# cuDNN switches: all three are turned on.
# NOTE(review): deterministic=True together with benchmark=True looks
# contradictory if reproducibility is the goal -- confirm which is intended.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = True
device

***
### loading the model

In [None]:
class GBN(nn.Module):
    """
    Ghost Batch Normalization
    https://arxiv.org/abs/1705.08741

    The batch is cut along dim 0 into ceil(batch_size / virtual_batch_size)
    chunks, one shared BatchNorm1d is applied to every chunk on its own, and
    the normalized chunks are concatenated back in their original order.
    """

    def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01):
        super().__init__()

        self.input_dim = input_dim
        self.virtual_batch_size = virtual_batch_size
        # a single BatchNorm1d shared by all virtual batches
        self.bn = nn.BatchNorm1d(input_dim, momentum=momentum)

    def forward(self, x):
        n_chunks = int(np.ceil(x.shape[0] / self.virtual_batch_size))
        normalized = tuple(self.bn(piece) for piece in x.chunk(n_chunks, 0))
        return torch.cat(normalized, dim=0)

class NormalLinear(nn.Module):
    """
    Linear layer with normalized weights

    The weight is stored as two parameters: ``weights_v`` (one row per output,
    rescaled to unit L2 norm) and ``weights_m`` (one magnitude per output).
    The effective weight used in ``forward`` is ``weights_v * weights_m``.
    """
    def __init__(self, size_in, size_out, bias=True):
        super().__init__()
        self.size_in = size_in
        self.size_out = size_out

        # direction component
        direction = torch.Tensor(size_out, size_in)
        nn.init.kaiming_uniform_(direction, a=np.sqrt(5))
        self.weights_v = nn.Parameter(direction)

        # magnitude component: row norms taken before the rows are rescaled
        # to unit length at the end of __init__
        self.weights_m = nn.Parameter(torch.norm(direction, dim=1, keepdim=True))

        if not bias:
            self.register_parameter('bias', None)
        else:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(direction)
            limit = 1 / np.sqrt(fan_in)
            offset = torch.Tensor(size_out)
            nn.init.uniform_(offset, -limit, limit)
            self.bias = nn.Parameter(offset)

        self._normalize_weights()

    def _normalize_weights(self):
        # rescale every row of weights_v in place, with autograd switched off
        with torch.no_grad():
            row_norms = self.weights_v.norm(dim=1, keepdim=True)
            self.weights_v.div_(row_norms)

    def forward(self, x):
        # rows are re-normalized on every call before the weight is assembled
        self._normalize_weights()
        effective_weight = self.weights_v * self.weights_m
        return nn.functional.linear(x, effective_weight, self.bias)

In [None]:
class SNN(nn.Module):
    """
    Standard NN

    A stack of (GBN -> Dropout -> NormalLinear) stages: one input stage,
    ``nn_depth - 1`` hidden stages of width ``nn_width`` and one output stage.
    ReLU follows every stage except the output one, so ``forward`` returns the
    raw outputs of the last linear layer.
    """

    def __init__(self, input_dim, output_dim, nn_depth, nn_width,
                 dropout_input=0.2, dropout_hidden=0.3, dropout_output=0.1,
                 momentum=0.02, virtual_batch_size=128):
        super().__init__()
        gbn_kwargs = dict(virtual_batch_size=virtual_batch_size, momentum=momentum)
        n_hidden = nn_depth - 1

        # input stage
        self.bn_in = GBN(input_dim, **gbn_kwargs)
        self.dp_in = nn.Dropout(dropout_input)
        self.ln_in = NormalLinear(input_dim, nn_width, bias=False)

        # hidden stages (kept in three parallel lists)
        self.bnorms = nn.ModuleList(
            [GBN(nn_width, **gbn_kwargs) for _ in range(n_hidden)])
        self.dropouts = nn.ModuleList(
            [nn.Dropout(dropout_hidden) for _ in range(n_hidden)])
        self.linears = nn.ModuleList(
            [NormalLinear(nn_width, nn_width, bias=False) for _ in range(n_hidden)])

        # output stage
        self.bn_out = GBN(nn_width, **gbn_kwargs)
        self.dp_out = nn.Dropout(dropout_output)
        self.ln_out = NormalLinear(nn_width, output_dim, bias=False)

    def forward(self, x):
        # input stage followed by the hidden stages, each ending in ReLU
        stages = [(self.bn_in, self.dp_in, self.ln_in)]
        stages.extend(zip(self.bnorms, self.dropouts, self.linears))
        for norm, drop, linear in stages:
            x = nn.functional.relu(linear(drop(norm(x))))

        # output stage, no activation
        return self.ln_out(self.dp_out(self.bn_out(x)))

In [None]:
# Rebuild the network, wrap it in the SWA averaging container, restore the
# saved weights onto `device`, and switch to evaluation mode.
model = torch.optim.swa_utils.AveragedModel(
    SNN(input_dim=131, output_dim=5, nn_depth=3, nn_width=112,
        dropout_input=0.20, dropout_hidden=0.30, dropout_output=0.20,
        momentum=0.1, virtual_batch_size=256)
)
state_dict = torch.load("../input/janestreet-snn/snn-swa-lt.pt", map_location=device)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()

In [None]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    The ``with`` block closes the file on exit, so no explicit ``close()`` is
    needed.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(path, "rb") as file:
        return pickle.load(file)


# One fitted imputer per value of feature_0 (-1 -> "f0m1", +1 -> "f0p1").
imputer_f0m1 = _load_pickle("../input/janestreet-imputer/imputer_f0m1.pickle")
imputer_f0p1 = _load_pickle("../input/janestreet-imputer/imputer_f0p1.pickle")

***
### inference

In [None]:
# Names of the 130 feature columns: feature_0 ... feature_129.
input_columns = list(map("feature_{}".format, range(130)))

def prepare_input(test_df, imputer_f0m1, iputer_f0p1, input_columns):
    """
    Build the model input row for a single-row ``test_df``.

    The result holds the columns ``input_columns[1:]`` (everything except the
    first entry, i.e. without ``feature_0``) followed by a two-column one-hot
    encoding of ``feature_0``: ``[1, 0]`` when it equals -1, ``[0, 1]``
    otherwise.

    Parameters
    ----------
    test_df : DataFrame with exactly one row, containing ``feature_0`` and the
        columns named in ``input_columns``.
    imputer_f0m1 : object with ``transform``; used when ``feature_0 == -1``.
    iputer_f0p1 : object with ``transform``; used otherwise. The misspelled
        name is kept so keyword callers keep working.
    input_columns : list of column names; the first entry is skipped.

    Returns
    -------
    Array of shape ``(1, len(input_columns) - 1 + 2)``.
    """
    # TODO: feature_0 being NaN (or any value other than -1) is currently
    # treated as the +1 case; add dedicated handling if that can occur.
    is_minus_one = test_df["feature_0"].item() == -1
    features = test_df.loc[:, input_columns[1:]]

    if is_minus_one:
        imputer, one_hot = imputer_f0m1, np.array([[1, 0]])
    else:
        # use the imputer passed as argument, not a same-named global
        imputer, one_hot = iputer_f0p1, np.array([[0, 1]])

    if np.any(np.isnan(test_df.values)):
        # at least one NaN somewhere in the row: impute the feature columns
        values = imputer.transform(features)
    else:
        values = features.values

    return np.concatenate([values, one_hot], axis=1)

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), evaluated with NumPy."""
    denominator = np.exp(-x) + 1.0
    return 1.0 / denominator

In [None]:
def get_action(probs, threshold=0.5):
    """
    Turn one row of output probabilities into a 0/1 action.

    ``probs[0, 0]`` is the main output and ``probs[0, 1:]`` are the auxiliary
    outputs; an output "votes 1" when its probability is above ``threshold``.

    - main votes 1: act unless at most one auxiliary output agrees.
    - main votes 0: act only when exactly 4 auxiliary outputs vote 1
      (the rule is written for four auxiliary outputs).

    Parameters
    ----------
    probs : 2-D array; only the first row is read.
    threshold : probability above which an output counts as a vote for 1.

    Returns
    -------
    int, 0 or 1.
    """
    main_votes_one = probs[0, 0] > threshold
    aux_votes = int(np.sum(probs[0, 1:] > threshold))

    if main_votes_one:
        # overruled when 3 or more of the 4 auxiliary outputs say 0
        return 1 if aux_votes >= 2 else 0
    # main says 0: only a unanimous auxiliary vote flips it to 1
    return 1 if aux_votes == 4 else 0

In [None]:
# Per-row wall-clock timings (seconds) and bookkeeping filled in by the loop.
imputer_times = []
model_times = []
probabilities = []
total_actions = 0

env = janestreet.make_env()

for (test_df, pred_df) in tqdm(env.iter_test()):

    if not (test_df['weight'].item() > 0):
        # rows whose weight is not positive are never acted on
        pred_df.action = 0
    else:
        # step 1: build the input row (timed separately)
        tic = time.time()
        x_input = prepare_input(test_df, imputer_f0m1, imputer_f0p1, input_columns)
        tac = time.time()
        imputer_times.append(tac - tic)

        # step 2: forward pass, probabilities and decision
        tic = time.time()
        x_input = torch.as_tensor(x_input, dtype=torch.float, device=device)
        with torch.no_grad():
            logits = model(x_input).detach().cpu().numpy()
        probs = sigmoid(logits)
        action = get_action(probs)
        pred_df.action = action
        total_actions += action
        probabilities.append(probs)
        tac = time.time()
        model_times.append(tac - tic)

    env.predict(pred_df)

In [None]:
# Share of rows on which action=1 was taken, against two denominators.
n_decisions = len(probabilities)
scaled_actions = 100*total_actions

# NOTE(review): 15219 is a hard-coded row count -- presumably the number of
# rows served by the test environment; confirm.
print("% of actions taken:", scaled_actions/15219)
# relative to the rows that actually went through the model
print("% of actions taken:", scaled_actions/n_decisions, " (weight != 0)")

In [None]:
# Stack the per-row probability arrays collected during inference.
probabilities = np.array(probabilities)
targets = ["action", "action_1", "action_2", "action_3", "action_4"]

# (seaborn `kind`, plot title) for the two plots drawn per target
plot_kinds = [
    ("hist", "Hist of model probabilities"),
    ("ecdf", "CDF of model probabilities"),
]

for i, target in enumerate(targets):

    print(f" Probabilities for: {target} ".center(60, "-"))

    for kind, title in plot_kinds:
        # displot is figure-level and creates its own figure, so no
        # plt.figure() call is made first (it would only leave an empty
        # figure behind); the size is controlled through `aspect`.
        sns.displot(probabilities[:,:,i], aspect=2., kind=kind, legend=False)
        plt.axvline(x=0.5, c="red")  # decision threshold
        plt.title(title)
        plt.grid()
        plt.show()

In [None]:
# Timing summary. Every figure is printed with two decimals (`.2f`); the
# previous `:2f` spec set a field width of 2 instead of a precision.
imputer_times = np.asarray(imputer_times)
model_times = np.asarray(model_times)
iteration_times = imputer_times + model_times

# "Total inference time" lines extrapolate the mean per-row time to 1e6 rows
# and convert seconds to hours.
# NOTE(review): 1e6 is presumably the expected size of the full test set -- confirm.
print(f"Mean time imputer: {np.mean(imputer_times)*1000:.2f} [ms]")
print(f"Total time imputer: {np.sum(imputer_times)/60:.2f} [min]")
print(f"Total inference time imputer: {np.mean(imputer_times)*1e6/3600:.2f} [hrs]")
print("-")
print(f"Mean time model: {np.mean(model_times)*1000:.2f} [ms]")
print(f"Total time model: {np.sum(model_times)/60:.2f} [min]")
print(f"Total inference time model: {np.mean(model_times)*1e6/3600:.2f} [hrs]")
print("-")
print(f"Mean time iteration: {np.mean(iteration_times)*1000:.2f} [ms]")
print(f"Total time iteration: {np.sum(iteration_times)/60:.2f} [min]")
print(f"Total inference time: {np.mean(iteration_times)*1e6/3600:.2f} [hrs]")

***