***
### Notes

- v1: First ensemble.
- v2: snn [train-v28] - cnn1d [train-v10]. Inference with swa.
- v3: snn [train-v28] - cnn1d [train-v10] - bottlenet [train-v1]. Inference with swa.
- v4: Adds calculation of correlation between models.
- v5: Inference with swa-st & majority voting.
- v6: Inference with swa-lt & majority voting.
- v7: Inference with swa-st & majority voting & `get_action`++.
- v8: Inference with swa-lt & majority voting & `get_action`++.
- v9: Inference with swa-st & averaged probabilities.
- v10: Inference with swa-lt & averaged probabilities.

In [None]:
import numpy as np 
import pandas as pd
from scipy import stats
from pathlib import Path
import pickle
import janestreet
import time
from tqdm import tqdm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset

import matplotlib.pyplot as plt
import seaborn as sns

# Global cuDNN switches for this notebook.
torch.backends.cudnn.enabled = True
# NOTE(review): `deterministic = True` combined with `benchmark = True` looks
# contradictory (benchmark mode presumably auto-selects algorithms per input
# shape) — confirm which of the two is actually intended.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: presumably there to echo the chosen device as cell output.
device

***
### loading the models

In [None]:
class GBN(nn.Module):
    """
    Ghost Batch Normalization
    https://arxiv.org/abs/1705.08741

    The batch is split along dim 0 into ceil(batch_size / virtual_batch_size)
    chunks; every chunk goes through one shared ``nn.BatchNorm1d`` and the
    normalized chunks are concatenated back in their original order.
    """

    def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01):
        super().__init__()

        self.input_dim = input_dim
        self.virtual_batch_size = virtual_batch_size
        self.bn = nn.BatchNorm1d(input_dim, momentum=momentum)

    def forward(self, x):
        n_chunks = int(np.ceil(x.shape[0] / self.virtual_batch_size))

        normalized = []
        for ghost_batch in x.chunk(n_chunks, 0):
            normalized.append(self.bn(ghost_batch))

        return torch.cat(normalized, dim=0)

class NormalLinear(nn.Module):
    """
    Linear layer with normalized weights.

    The weight matrix is stored as two parameters: ``weights_v`` (one direction
    row per output unit, rescaled in place to unit L2 norm before every forward
    pass) and ``weights_m`` (one magnitude per output unit). The matrix applied
    to the input is ``weights_v * weights_m``.
    """

    def __init__(self, size_in, size_out, bias=True):
        super().__init__()
        self.size_in = size_in
        self.size_out = size_out

        # direction rows, Kaiming-uniform initialised
        direction = torch.empty(size_out, size_in)
        nn.init.kaiming_uniform_(direction, a=np.sqrt(5))
        self.weights_v = nn.Parameter(direction)

        # magnitudes start as the row norms of the not-yet-normalized directions
        self.weights_m = nn.Parameter(torch.norm(direction, dim=1, keepdim=True))

        if not bias:
            self.register_parameter('bias', None)
        else:
            # bias drawn uniformly from [-1/sqrt(fan_in), 1/sqrt(fan_in))
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(direction)
            limit = 1 / np.sqrt(fan_in)
            bias_init = torch.empty(size_out)
            nn.init.uniform_(bias_init, -limit, limit)
            self.bias = nn.Parameter(bias_init)

        self._normalize_weights()

    def _normalize_weights(self):
        # in-place rescale, kept out of the autograd graph
        with torch.no_grad():
            row_norms = self.weights_v.norm(dim=1, keepdim=True)
            self.weights_v.div_(row_norms)

    def forward(self, x):
        self._normalize_weights()
        effective_weight = self.weights_v * self.weights_m
        return nn.functional.linear(x, effective_weight, self.bias)

In [None]:
class SNN(nn.Module):
    """
    Standard NN

    Every stage is GBN -> Dropout -> NormalLinear (without bias). The input
    stage maps ``input_dim`` to ``nn_width``, ``nn_depth - 1`` hidden stages
    keep ``nn_width``, and the output stage maps to ``output_dim``. ReLU follows
    every stage except the output one, so ``forward`` returns unactivated scores.
    """

    def __init__(self, input_dim, output_dim, nn_depth, nn_width,
                 dropout_input=0.2, dropout_hidden=0.3, dropout_output=0.1,
                 momentum=0.02, virtual_batch_size=128):
        super().__init__()
        n_hidden = nn_depth - 1
        gbn_kwargs = dict(virtual_batch_size=virtual_batch_size, momentum=momentum)

        # input stage
        self.bn_in = GBN(input_dim, **gbn_kwargs)
        self.dp_in = nn.Dropout(dropout_input)
        self.ln_in = NormalLinear(input_dim, nn_width, bias=False)

        # hidden stages (parallel lists, consumed together in forward)
        self.bnorms = nn.ModuleList([GBN(nn_width, **gbn_kwargs) for _ in range(n_hidden)])
        self.dropouts = nn.ModuleList([nn.Dropout(dropout_hidden) for _ in range(n_hidden)])
        self.linears = nn.ModuleList(
            [NormalLinear(nn_width, nn_width, bias=False) for _ in range(n_hidden)])

        # output stage
        self.bn_out = GBN(nn_width, **gbn_kwargs)
        self.dp_out = nn.Dropout(dropout_output)
        self.ln_out = NormalLinear(nn_width, output_dim, bias=False)

    def forward(self, x):
        hidden = nn.functional.relu(self.ln_in(self.dp_in(self.bn_in(x))))

        for norm, drop, linear in zip(self.bnorms, self.dropouts, self.linears):
            hidden = nn.functional.relu(linear(drop(norm(hidden))))

        return self.ln_out(self.dp_out(self.bn_out(hidden)))

In [None]:
class CNN1D(nn.Module):
    """
    1D-CNN applied to a dense expansion of the input vector.

    The input is batch-normalized, expanded by a NormalLinear layer to
    ``sign_size * cha_input`` values and reshaped to
    ``(batch, cha_input, sign_size)``. Four convolutions follow (the first and
    the fourth are grouped, and the output of the second is added back before
    the final ReLU), then average pooling, flattening and a NormalLinear head
    that produces ``output_dim`` unactivated scores.
    """

    def __init__(self, input_dim, output_dim, sign_size=16, cha_input=32, cha_hidden=32, K=2,
                 dropout_input=0.2, dropout_hidden=0.2, dropout_output=0.2):
        super().__init__()

        # hyperparameters and derived sizes
        self.cha_input = cha_input
        self.cha_hidden = cha_hidden
        self.K = K
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden
        self.dropout_output = dropout_output
        self.hidden_size = sign_size * cha_input
        self.sign_size1 = sign_size
        self.sign_size2 = sign_size // 2
        self.output_size = (sign_size // 4) * cha_hidden

        # dense expansion of the raw features
        self.batch_norm1 = nn.BatchNorm1d(input_dim)
        self.dropout1 = nn.Dropout(dropout_input)
        self.dense1 = NormalLinear(input_dim, self.hidden_size, bias=False)

        # 1st conv layer (grouped: K filters per input channel) + pooling to sign_size2
        self.batch_norm_c1 = nn.BatchNorm1d(cha_input)
        self.conv1 = nn.Conv1d(cha_input, cha_input * K, kernel_size=5, stride=1,
                               padding=2, groups=cha_input, bias=False)
        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size=self.sign_size2)

        # 2nd conv layer
        self.batch_norm_c2 = nn.BatchNorm1d(cha_input * K)
        self.dropout_c2 = nn.Dropout(dropout_hidden)
        self.conv2 = nn.Conv1d(cha_input * K, cha_hidden, kernel_size=3, stride=1,
                               padding=1, bias=False)

        # 3rd conv layer
        self.batch_norm_c3 = nn.BatchNorm1d(cha_hidden)
        self.dropout_c3 = nn.Dropout(dropout_hidden)
        self.conv3 = nn.Conv1d(cha_hidden, cha_hidden, kernel_size=3, stride=1,
                               padding=1, bias=False)

        # 4th conv layer (one group per channel) + pooling
        self.batch_norm_c4 = nn.BatchNorm1d(cha_hidden)
        self.conv4 = nn.Conv1d(cha_hidden, cha_hidden, kernel_size=5, stride=1,
                               padding=2, groups=cha_hidden, bias=False)
        self.avg_po_c4 = nn.AvgPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        # output head
        self.batch_norm2 = nn.BatchNorm1d(self.output_size)
        self.dropout2 = nn.Dropout(dropout_output)
        self.dense2 = NormalLinear(self.output_size, output_dim, bias=False)

    def forward(self, x):
        # dense expansion, then fold into (batch, cha_input, sign_size1)
        x = self.dropout1(self.batch_norm1(x))
        x = F.celu(self.dense1(x))
        x = x.reshape(x.shape[0], self.cha_input, self.sign_size1)

        # conv 1 + adaptive pooling
        x = F.relu(self.conv1(self.batch_norm_c1(x)))
        x = self.ave_po_c1(x)

        # conv 2; its activation is kept for the skip connection below
        x = self.dropout_c2(self.batch_norm_c2(x))
        shortcut = F.relu(self.conv2(x))

        # conv 3
        x = self.dropout_c3(self.batch_norm_c3(shortcut))
        x = F.relu(self.conv3(x))

        # conv 4, skip connection added before the activation
        x = self.conv4(self.batch_norm_c4(x))
        x = F.relu(x + shortcut)

        # pool, flatten and project to the outputs
        x = self.flt(self.avg_po_c4(x))
        x = self.dropout2(self.batch_norm2(x))
        return self.dense2(x)

In [None]:
class BottleNet(nn.Module):
    """
    NN with bottleneck input

    An encoder stage maps ``input_dim`` to ``encode_dim``; a decoder stage maps
    the encoding back to ``input_dim``. The encoding also feeds a stack of
    GBN -> Dropout -> NormalLinear stages (ReLU after all but the last) that
    ends in ``output_dim`` unactivated scores. ``forward`` returns the pair
    ``(scores, decoded_input)``.
    """

    def __init__(self, input_dim, output_dim, encode_dim, nn_depth, nn_width,
                 dropout_ae=0.1, dropout_input=0.2, dropout_hidden=0.3,
                 dropout_output=0.1, momentum=0.02, virtual_batch_size=128):
        super().__init__()
        n_hidden = nn_depth - 1
        gbn_kwargs = dict(virtual_batch_size=virtual_batch_size, momentum=momentum)

        # encoder: input_dim -> encode_dim
        self.bn_encoder = GBN(input_dim, **gbn_kwargs)
        self.dp_encoder = nn.Dropout(dropout_ae)
        self.ln_encoder = NormalLinear(input_dim, encode_dim, bias=False)

        # decoder: encode_dim -> input_dim
        self.bn_decoder = GBN(encode_dim, **gbn_kwargs)
        self.dp_decoder = nn.Dropout(dropout_ae)
        self.ln_decoder = NormalLinear(encode_dim, input_dim, bias=False)

        # input stage of the predictor: encode_dim -> nn_width
        self.bn_in = GBN(encode_dim, **gbn_kwargs)
        self.dp_in = nn.Dropout(dropout_input)
        self.ln_in = NormalLinear(encode_dim, nn_width, bias=False)

        # hidden stages (parallel lists, consumed together in forward)
        self.bnorms = nn.ModuleList([GBN(nn_width, **gbn_kwargs) for _ in range(n_hidden)])
        self.dropouts = nn.ModuleList([nn.Dropout(dropout_hidden) for _ in range(n_hidden)])
        self.linears = nn.ModuleList(
            [NormalLinear(nn_width, nn_width, bias=False) for _ in range(n_hidden)])

        # output stage: nn_width -> output_dim
        self.bn_out = GBN(nn_width, **gbn_kwargs)
        self.dp_out = nn.Dropout(dropout_output)
        self.ln_out = NormalLinear(nn_width, output_dim, bias=False)

    def encoder(self, x):
        encoded = self.ln_encoder(self.dp_encoder(self.bn_encoder(x)))
        return nn.functional.relu(encoded)

    def decoder(self, x):
        decoded = self.ln_decoder(self.dp_decoder(self.bn_decoder(x)))
        return nn.functional.relu(decoded)

    def forward(self, x):
        # the decoder runs before the predictor, same call order as the encoder output is used
        x_encoded = self.encoder(x)
        x_decoded = self.decoder(x_encoded)

        hidden = nn.functional.relu(self.ln_in(self.dp_in(self.bn_in(x_encoded))))

        for norm, drop, linear in zip(self.bnorms, self.dropouts, self.linears):
            hidden = nn.functional.relu(linear(drop(norm(hidden))))

        scores = self.ln_out(self.dp_out(self.bn_out(hidden)))

        return scores, x_decoded

In [None]:
# Instantiate the SNN architecture (131 inputs, 5 outputs).
# NOTE(review): hyperparameters presumably mirror the training run — confirm.
model_snn = SNN(input_dim=131, output_dim=5, nn_depth=3, nn_width=112, 
                dropout_input=0.20, dropout_hidden=0.30, dropout_output=0.20, 
                momentum=0.1, virtual_batch_size=256)
# Wrap in AveragedModel before loading: presumably the checkpoint was saved from
# an SWA AveragedModel, so its state-dict keys match this wrapper — confirm.
model_snn = torch.optim.swa_utils.AveragedModel(model_snn)
# map_location places the stored tensors on the selected device.
model_snn.load_state_dict(torch.load("../input/janestreet-snn/snn-swa-lt.pt", map_location=device))
model_snn = model_snn.to(device)
# Evaluation mode for inference.
model_snn.eval()

In [None]:
# Instantiate the CNN1D architecture (131 inputs, 5 outputs).
# NOTE(review): hyperparameters presumably mirror the training run — confirm.
model_cnn1d = CNN1D(input_dim=131, output_dim=5,
                    sign_size=8, cha_input=32, cha_hidden=64, K=2,
                    dropout_input=0.25, dropout_hidden=0.3, dropout_output=0.15)
# Wrap in AveragedModel before loading: presumably the checkpoint was saved from
# an SWA AveragedModel, so its state-dict keys match this wrapper — confirm.
model_cnn1d = torch.optim.swa_utils.AveragedModel(model_cnn1d)
# map_location places the stored tensors on the selected device.
model_cnn1d.load_state_dict(torch.load("../input/janestreet-cnn1d/cnn1d-swa-lt.pt", map_location=device))
model_cnn1d = model_cnn1d.to(device)
# Evaluation mode for inference.
model_cnn1d.eval()

In [None]:
# Instantiate the BottleNet architecture (131 inputs, 5 outputs, 72-dim encoding).
# NOTE(review): hyperparameters presumably mirror the training run — confirm.
model_bottle = BottleNet(input_dim=131, output_dim=5,
                         encode_dim=72, nn_depth=3, nn_width=96, dropout_ae=0.15,
                         dropout_input=0.20, dropout_hidden=0.3, dropout_output=0.15,
                         momentum=0.1, virtual_batch_size=256)  
# Wrap in AveragedModel before loading: presumably the checkpoint was saved from
# an SWA AveragedModel, so its state-dict keys match this wrapper — confirm.
model_bottle = torch.optim.swa_utils.AveragedModel(model_bottle)
# map_location places the stored tensors on the selected device.
model_bottle.load_state_dict(torch.load("../input/janestreet-bottlenet/bottlenet-swa-lt.pt", map_location=device))
model_bottle = model_bottle.to(device)
# Evaluation mode for inference.
model_bottle.eval()

In [None]:
# Load the two pickled imputers: one is applied to rows with feature_0 == -1
# (f0m1), the other to the remaining rows (f0p1).
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
# The `with` block closes each file on exit, so no explicit close() is needed.
with open("../input/janestreet-imputer/imputer_f0m1.pickle", "rb") as file:
    imputer_f0m1 = pickle.load(file)

with open("../input/janestreet-imputer/imputer_f0p1.pickle", "rb") as file:
    imputer_f0p1 = pickle.load(file)

***
### inference

In [None]:
input_columns = [f"feature_{i}" for i in range(130)]

def prepare_input(test_df, imputer_f0m1, iputer_f0p1, input_columns):
    """
    Build the model input for a single-row dataframe.

    The result is the values of ``input_columns[1:]`` (imputed when the row
    contains any NaN) followed by a two-column one-hot encoding of
    ``feature_0``: ``[1, 0]`` for -1 and ``[0, 1]`` for +1.

    Parameters
    ----------
    test_df : single-row pandas DataFrame with numeric values, containing
        ``feature_0`` and every column of ``input_columns[1:]``.
    imputer_f0m1 : object with ``transform``; used when feature_0 == -1.
    iputer_f0p1 : object with ``transform``; used otherwise. (The parameter name
        is kept, typo included, for backward compatibility.)
    input_columns : list of feature column names; the first one is skipped.

    Returns
    -------
    2-D numpy array with one row and ``len(input_columns) - 1 + 2`` columns.

    Raises
    ------
    ValueError
        If the row has no NaN and ``feature_0`` is neither -1 nor 1.
    """
    # todo: add case when feature_0 is nan
    feature_0 = test_df["feature_0"].item()
    features = test_df.loc[:, input_columns[1:]]
    has_nan = np.any(np.isnan(test_df.values))

    if feature_0 == -1:
        imputer, one_hot = imputer_f0m1, [[1, 0]]
    elif feature_0 == 1 or has_nan:
        # rows with a NaN and any feature_0 other than -1 (including NaN)
        # are treated as the +1 case, as before
        imputer, one_hot = iputer_f0p1, [[0, 1]]
    else:
        # previously this path hit an UnboundLocalError at the return statement
        raise ValueError(f"unexpected feature_0 value: {feature_0!r} (expected -1 or 1)")

    # use the imputer passed as argument rather than a module-level global
    values = imputer.transform(features) if has_nan else features.values
    return np.concatenate([values, np.array(one_hot)], axis=1)

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with numpy."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator

def get_action(probs, threshold=0.5):
    """
    Turn one row of probabilities into a binary action.

    Column 0 is the main vote and columns 1: are the other votes; a vote is
    positive when its probability is strictly above ``threshold``.

    - Main vote positive: act (1) unless at most one other vote is positive.
    - Main vote not positive: do not act (0) unless exactly 4 other votes are
      positive.

    Parameters
    ----------
    probs : 2-D array; only row 0 is read.
    threshold : cut-off applied to every column (previously ignored in favour
        of a hard-coded 0.5).

    Returns
    -------
    int, 0 or 1.
    """
    other_actions = np.sum(probs[0, 1:] > threshold)

    if probs[0, 0] > threshold:
        # if 3 or more other actions say action=0 -> take action=0
        return 0 if other_actions <= 1 else 1

    # if 4 other actions say action=1 -> take action=1
    return 1 if other_actions == 4 else 0

In [None]:
# Per-row wall-clock timings (seconds) for preprocessing and for each model;
# one entry is appended for every row that is actually scored.
imputer_times = list()
snn_times = list()
cnn1d_times = list()
bottle_times = list()
# Running count of rows for which action == 1.
total_actions = 0

# Per-row probabilities of each model, kept for the correlation analysis.
snn_probs = list()
cnn1d_probs = list()
bottle_probs = list()

env = janestreet.make_env()

# NOTE(review): the loop reads one row at a time (`.item()` is used on the
# weight column) — presumably guaranteed by the janestreet API; confirm.
for (test_df, pred_df) in tqdm(env.iter_test()):
    
    # Only rows with positive weight are scored; all others get action 0.
    if test_df['weight'].item() > 0:
        tic = time.time()
        x_input = prepare_input(test_df, imputer_f0m1, imputer_f0p1, input_columns)
        x_input = torch.as_tensor(x_input, dtype=torch.float, device=device)
        tac = time.time()
        imputer_times.append(tac-tic)
        
        # inference for snn
        tic = time.time()
        with torch.no_grad():
            # model output is passed through sigmoid to get probabilities
            probs_snn = sigmoid(model_snn(x_input).detach().cpu().numpy())
        tac = time.time()
        snn_probs.append(probs_snn)
        snn_times.append(tac-tic)
        
        # inference for cnn1d
        tic = time.time()
        with torch.no_grad():
            probs_cnn1d = sigmoid(model_cnn1d(x_input).detach().cpu().numpy())
        tac = time.time()
        cnn1d_probs.append(probs_cnn1d)
        cnn1d_times.append(tac-tic)
        
        # inference for bottlenet
        tic = time.time()
        with torch.no_grad():
            # BottleNet returns (scores, decoded input); only the scores are used
            probs_bottle = sigmoid(model_bottle(x_input)[0].detach().cpu().numpy())
        tac = time.time()
        bottle_probs.append(probs_bottle)
        bottle_times.append(tac-tic)

        # averaged probabilities
        probs = np.mean([probs_snn,probs_cnn1d,probs_bottle], axis=0)
        action = get_action(probs)
        pred_df.action = action
        total_actions += action
        
    else:
        pred_df.action = 0
        
    # Submit the prediction for this row back to the environment.
    env.predict(pred_df)

In [None]:
# Number of rows that were actually scored (one timing entry per scored row).
n_decisions = len(imputer_times)

# NOTE(review): 15219 is a hard-coded denominator — presumably the total row
# count of the test set this was developed against; confirm, or count rows in
# the inference loop instead.
print("% of actions taken:", 100*total_actions/15219)
# Same ratio relative to the scored rows only.
# NOTE(review): raises ZeroDivisionError when no row was scored.
print("% of actions taken:", 100*total_actions/n_decisions, (" (weight != 0)"))

In [None]:
# Pairwise Pearson correlation between the probabilities predicted by the models.
# Each per-model list of per-row arrays is stacked into a single array first.
snn_probs, cnn1d_probs, bottle_probs = (
    np.asarray(p) for p in (snn_probs, cnn1d_probs, bottle_probs))
all_probs = [snn_probs, cnn1d_probs, bottle_probs]

# first output column only (resp)
corrs1 = np.empty((3,3))
for i, probs_i in enumerate(all_probs):
    for j, probs_j in enumerate(all_probs):
        corrs1[i,j] = stats.pearsonr(probs_i[:,:,0].ravel(), probs_j[:,:,0].ravel())[0]
print("Correlations with resp")
print(corrs1)

# mean of the per-column correlations over all 5 output columns (resp_*)
corrs2 = np.empty((3,3))
for i, probs_i in enumerate(all_probs):
    for j, probs_j in enumerate(all_probs):
        per_column = [
            stats.pearsonr(probs_i[:,:,k].ravel(), probs_j[:,:,k].ravel())[0]
            for k in range(5)]
        corrs2[i,j] = np.mean(per_column)
print("\nCorrelations with resp*")
print(corrs2)

In [None]:
# Timing report. All *_times hold seconds per scored row.
imputer_times = np.asarray(imputer_times)
snn_times = np.asarray(snn_times)
cnn1d_times = np.asarray(cnn1d_times)
bottle_times = np.asarray(bottle_times)

# Time of one full iteration: preprocessing plus ALL three models
# (bottlenet was previously left out of the totals).
iteration_times = imputer_times + snn_times + cnn1d_times + bottle_times

# "Total inference time" lines extrapolate the mean per-row time to 1e6 rows and
# convert to hours — presumably the expected size of the hidden test set; confirm.
# Format spec is `.2f` (two decimals); the former `2f` only set a minimum width.
print(f"Mean time imputer: {np.mean(imputer_times)*1000:.2f} [ms]")
print(f"Total time imputer: {np.sum(imputer_times)/60:.2f} [min]")
print(f"Total inference time imputer: {np.mean(imputer_times)*1e6/3600} [hrs]")
print("-")
print(f"Mean time snn: {np.mean(snn_times)*1000:.2f} [ms]")
print(f"Total time snn: {np.sum(snn_times)/60} [min]")
print(f"Total inference time snn: {np.mean(snn_times)*1e6/3600} [hrs]")
print("-")
print(f"Mean time cnn1d: {np.mean(cnn1d_times)*1000:.2f} [ms]")
print(f"Total time cnn1d: {np.sum(cnn1d_times)/60} [min]")
print(f"Total inference time cnn1d: {np.mean(cnn1d_times)*1e6/3600} [hrs]")
print("-")
print(f"Mean time bottlenet: {np.mean(bottle_times)*1000:.2f} [ms]")
print(f"Total time bottlenet: {np.sum(bottle_times)/60} [min]")
print(f"Total inference time bottlenet: {np.mean(bottle_times)*1e6/3600} [hrs]")
print("-")
print(f"Mean time iteration: {np.mean(iteration_times)*1000:.2f} [ms]")
print(f"Total time iteration: {np.sum(iteration_times)/60:.2f} [min]")
print(f"Total inference time: {np.mean(iteration_times)*1e6/3600} [hrs]")

***