***
### Notes

- v1: First version with channels 64x8 and 128x4.
- v2: [train-v3]: Reduces weight_decay to 1e-4.
- v3: [train-v4]: IterativeImputer set to 1 iteration & 50 features.
- v4: Failed.
- v5: [train-v6]: Hyperparams: `{'sign_size': 22, 'cha_1': 16, 'cha_2': 32, 'cha_3': 32, 'input_dim': 131, 'output_dim': 5, 'dropout_in1': 0.25, 'dropout_in2': 0.4, 'dropout_mid': 0.4, 'dropout_out': 0.25, 'weight_decay': 0.0001, 'pct_start': 0.1, 'max_lr': 0.07}`.
- v6: [train-v9]: Major refactoring of model structure (depthwise convolution, higher regularization, lower lr, swa). Inference with `cnn1d-swa-epoch10.pt`.
- v7: Failed.
- v8: [train-v9]: Inference with `cnn1d-epoch10.pt`.
- v9: [train-v10]: Hyperparams: `sign_size=16, cha_input=32, cha_hidden=48, K=2, dropout_input=0.2, dropout_hidden=0.25, dropout_output=0.25, weight_decay=1e-4, max_lr=1e-2, pct_start=0.1`. Inference with swa.
- v10: [train-v13]: Inference with swa-st.
- v11: [train-v13]: Inference with swa-lt.

In [None]:
import numpy as np 
import pandas as pd
from pathlib import Path
import pickle
import janestreet
import time
from tqdm import tqdm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset

import matplotlib.pyplot as plt
import seaborn as sns

# cuDNN backend flags: all three are switched on for this notebook.
# NOTE(review): `benchmark = True` lets cuDNN auto-tune its convolution
# algorithms, while PyTorch's reproducibility notes suggest
# `benchmark = False` when deterministic runs are wanted -- confirm that
# combining it with `deterministic = True` is intentional.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = True

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

***
### loading the model

In [None]:
class NormalLinear(nn.Module):
    """
    Linear layer with normalized weights.

    The effective weight matrix is ``weights_m * weights_v / ||weights_v||``
    where the norm is taken per output row (over the input dimension).
    ``weights_v`` (shape ``(size_out, size_in)``) carries the direction and
    ``weights_m`` (shape ``(size_out, 1)``) the magnitude of every row.

    Parameters
    ----------
    size_in : number of input features.
    size_out : number of output features.
    bias : when True a learnable bias of shape ``(size_out,)`` is added,
        otherwise the ``bias`` parameter is registered as ``None``.
    """
    def __init__(self, size_in, size_out, bias=True):
        super().__init__()
        self.size_in = size_in
        self.size_out = size_out

        # direction of the weights
        direction = torch.Tensor(size_out, size_in)
        nn.init.kaiming_uniform_(direction, a=np.sqrt(5))
        self.weights_v = nn.Parameter(direction)

        # magnitude of the weights: starts as the row norms of the direction,
        # so the effective weights equal `weights_v` right after construction
        row_norms = torch.norm(direction, dim=1, keepdim=True)
        self.weights_m = nn.Parameter(row_norms.clone().detach())

        if not bias:
            self.register_parameter('bias', None)
            return

        # bias drawn uniformly from [-1/sqrt(fan_in), 1/sqrt(fan_in)]
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(direction)
        limit = 1 / np.sqrt(fan_in)
        offset = torch.Tensor(size_out)
        nn.init.uniform_(offset, -limit, limit)
        self.bias = nn.Parameter(offset)

    def _compute_weights(self):
        """Return the effective weight matrix (unit-norm rows scaled by `weights_m`)."""
        unit_rows = torch.div(self.weights_v, torch.norm(self.weights_v, dim=1, keepdim=True))
        return self.weights_m * unit_rows

    def forward(self, x):
        """Apply the affine map defined by the effective weights and the optional bias."""
        return nn.functional.linear(x, self._compute_weights(), self.bias)


class CNN1D(nn.Module):
    """
    1D convolutional network applied to a flat feature vector.

    The input is projected by a `NormalLinear` layer to
    ``sign_size * cha_input`` values, reshaped to ``cha_input`` channels of
    length ``sign_size`` and passed through four convolutions (the first and
    the last are grouped per channel) with a residual connection around the
    third and fourth ones, before a final `NormalLinear` output layer.

    Parameters
    ----------
    input_dim : number of input features.
    output_dim : number of outputs produced by the last layer.
    sign_size : length of the 1D signal built from the input projection.
    cha_input : number of channels of that signal.
    cha_hidden : number of channels after the second convolution.
    K : channel multiplier of the first (grouped) convolution.
    dropout_input, dropout_hidden, dropout_output : dropout probabilities
        used before the first dense layer, before the 2nd/3rd convolutions
        and before the last dense layer respectively.
    """
    def __init__(self, input_dim, output_dim, sign_size=16, cha_input=32, cha_hidden=32, K=2,
                 dropout_input=0.2, dropout_hidden=0.2, dropout_output=0.2):
        super().__init__()

        # sizes derived from the hyperparameters
        self.hidden_size = sign_size * cha_input
        self.sign_size1 = sign_size
        self.sign_size2 = sign_size // 2
        self.output_size = (sign_size // 4) * cha_hidden

        # hyperparameters kept for reference
        self.cha_input = cha_input
        self.cha_hidden = cha_hidden
        self.K = K
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden
        self.dropout_output = dropout_output

        cha_expanded = cha_input * K

        # input projection
        self.batch_norm1 = nn.BatchNorm1d(input_dim)
        self.dropout1 = nn.Dropout(dropout_input)
        self.dense1 = NormalLinear(input_dim, self.hidden_size, bias=False)

        # 1st conv layer: one group per input channel, K filters per group
        self.batch_norm_c1 = nn.BatchNorm1d(cha_input)
        self.conv1 = nn.Conv1d(cha_input, cha_expanded, kernel_size=5, stride=1, padding=2,
                               groups=cha_input, bias=False)

        # pooling down to half of the signal length
        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size=self.sign_size2)

        # 2nd conv layer
        self.batch_norm_c2 = nn.BatchNorm1d(cha_expanded)
        self.dropout_c2 = nn.Dropout(dropout_hidden)
        self.conv2 = nn.Conv1d(cha_expanded, cha_hidden, kernel_size=3, stride=1, padding=1, bias=False)

        # 3rd conv layer
        self.batch_norm_c3 = nn.BatchNorm1d(cha_hidden)
        self.dropout_c3 = nn.Dropout(dropout_hidden)
        self.conv3 = nn.Conv1d(cha_hidden, cha_hidden, kernel_size=3, stride=1, padding=1, bias=False)

        # 4th conv layer: one group per channel
        self.batch_norm_c4 = nn.BatchNorm1d(cha_hidden)
        self.conv4 = nn.Conv1d(cha_hidden, cha_hidden, kernel_size=5, stride=1, padding=2,
                               groups=cha_hidden, bias=False)

        self.avg_po_c4 = nn.AvgPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        # output head
        self.batch_norm2 = nn.BatchNorm1d(self.output_size)
        self.dropout2 = nn.Dropout(dropout_output)
        self.dense2 = NormalLinear(self.output_size, output_dim, bias=False)

    def forward(self, x):
        """Map a batch of feature vectors to `output_dim` raw (pre-sigmoid) outputs."""
        # dense projection, then view the result as (batch, cha_input, sign_size)
        x = self.dropout1(self.batch_norm1(x))
        x = F.celu(self.dense1(x))
        x = x.reshape(x.shape[0], self.cha_input, self.sign_size1)

        # 1st conv + pooling
        x = F.relu(self.conv1(self.batch_norm_c1(x)))
        x = self.ave_po_c1(x)

        # 2nd conv; its activation is also the shortcut of the residual block
        shortcut = F.relu(self.conv2(self.dropout_c2(self.batch_norm_c2(x))))

        # 3rd and 4th conv, shortcut added before the last activation
        x = F.relu(self.conv3(self.dropout_c3(self.batch_norm_c3(shortcut))))
        x = self.conv4(self.batch_norm_c4(x))
        x = F.relu(x + shortcut)

        # pooling + flatten to (batch, output_size)
        x = self.flt(self.avg_po_c4(x))

        # output head
        return self.dense2(self.dropout2(self.batch_norm2(x)))

In [None]:
# Architecture hyperparameters of the checkpoint loaded below.
# input_dim=131 matches the 129 features (feature_1..feature_129) plus the
# 2-value one-hot encoding of feature_0 built by `prepare_input`.
cnn_hparams = dict(
    input_dim=131, output_dim=5,
    sign_size=8, cha_input=32, cha_hidden=64, K=2,
    dropout_input=0.25, dropout_hidden=0.3, dropout_output=0.15,
)
checkpoint_path = "../input/janestreet-cnn1d/cnn1d-swa-lt.pt"

# The weights are stored as an SWA `AveragedModel`, so the network has to be
# wrapped the same way before the state dict can be loaded.
model = torch.optim.swa_utils.AveragedModel(CNN1D(**cnn_hparams))
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model = model.to(device)
# inference mode: dropout disabled, batch-norm uses its running statistics
model.eval()

In [None]:
# Load the two fitted imputers used by `prepare_input`: `imputer_f0m1` for
# rows with feature_0 == -1 and `imputer_f0p1` for the remaining rows.
# NOTE: unpickling executes code stored in the file -- only load artefacts
# that come from a trusted source.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open("../input/janestreet-imputer/imputer_f0m1.pickle", "rb") as file:
    imputer_f0m1 = pickle.load(file)

with open("../input/janestreet-imputer/imputer_f0p1.pickle", "rb") as file:
    imputer_f0p1 = pickle.load(file)

***
### inference

In [None]:
input_columns = [f"feature_{i}" for i in range(130)]

def prepare_input(test_df, imputer_f0m1, iputer_f0p1, input_columns):
    """
    Build the model input row from a single-row test dataframe.

    The output is the concatenation of the columns `input_columns[1:]`
    (imputed when the row contains a NaN) with a one-hot encoding of
    `feature_0`: ``[1, 0]`` when it equals -1 and ``[0, 1]`` otherwise.

    Parameters
    ----------
    test_df : single-row DataFrame holding at least "feature_0" and the
        columns listed in `input_columns[1:]`.
    imputer_f0m1 : object with a `transform` method, used when feature_0 == -1.
    iputer_f0p1 : object with a `transform` method, used for every other row.
        (The misspelled name is kept so existing callers keep working.)
    input_columns : feature column names; the first entry is skipped.

    Returns
    -------
    2D array with one row and ``len(input_columns) - 1 + 2`` columns.
    """
    # todo: add case when feature_0 is nan
    # (currently a NaN or any value other than -1 is handled like +1)
    if test_df["feature_0"].item() == -1:
        imputer, side_one_hot = imputer_f0m1, [[1, 0]]
    else:
        # use the imputer passed by the caller, not a module-level global
        imputer, side_one_hot = iputer_f0p1, [[0, 1]]

    features = test_df.loc[:, input_columns[1:]]
    if np.any(np.isnan(test_df.values)):
        # in case there is a nan anywhere in the row: impute the features
        features = imputer.transform(features)
    else:
        # in case there is no nan values: use the raw features
        features = features.values

    return np.concatenate([features, np.array(side_one_hot)], axis=1)

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise to `x`."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator

In [None]:
def get_action(probs, threshold=0.5):
    """
    Turn the model probabilities into a single 0/1 action.

    `probs[0, 0]` is the probability of the main target and `probs[0, 1:]`
    the probabilities of the other targets. Each probability votes for
    action=1 when it is strictly above `threshold`.

    - Main target votes 1: the action is 1 unless at most one of the other
      targets agrees, in which case it is overruled to 0.
    - Main target votes 0: the action is 0 unless 4 other targets vote 1,
      in which case it is overruled to 1.

    Parameters
    ----------
    probs : 2D array whose first row holds the probabilities.
    threshold : decision threshold applied to every probability.

    Returns
    -------
    int, 0 or 1.
    """
    # the threshold argument is honoured for both the main and the other targets
    other_actions = int(np.sum(probs[0, 1:] > threshold))

    if probs[0, 0] > threshold:
        # overruled when at most one other target also says action=1
        return 0 if other_actions <= 1 else 1

    # overruled only when 4 other targets say action=1
    return 1 if other_actions == 4 else 0

In [None]:
# Per-row bookkeeping, filled only for rows with a positive weight.
imputer_times = []   # seconds spent in prepare_input
model_times = []     # seconds spent in the forward pass + decision
total_actions = 0    # number of rows where action=1 was taken
probabilities = []   # model probabilities, one (1, 5) array per row

env = janestreet.make_env()

for test_df, pred_df in tqdm(env.iter_test()):

    if not test_df['weight'].item() > 0:
        # rows without positive weight: never take the action
        pred_df.action = 0
        env.predict(pred_df)
        continue

    # feature preparation (imputation + one-hot of feature_0)
    started = time.time()
    x_input = prepare_input(test_df, imputer_f0m1, imputer_f0p1, input_columns)
    imputer_times.append(time.time() - started)

    # forward pass and decision
    started = time.time()
    x_input = torch.as_tensor(x_input, dtype=torch.float, device=device)
    with torch.no_grad():
        logits = model(x_input)
    probs = sigmoid(logits.detach().cpu().numpy())
    action = get_action(probs)
    pred_df.action = action
    total_actions += action
    probabilities.append(probs)
    model_times.append(time.time() - started)

    env.predict(pred_df)

In [None]:
# number of rows for which the model was actually evaluated
n_decisions = len(probabilities)

# NOTE(review): 15219 is hard-coded; presumably the total number of rows
# served by the test iterator -- confirm before reusing on other data.
share_of_all_rows = 100 * total_actions / 15219
print("% of actions taken:", share_of_all_rows)

share_of_decisions = 100 * total_actions / n_decisions
print("% of actions taken:", share_of_decisions, " (weight != 0)")

In [None]:
# Stack the per-row probability arrays; indexed below as [row, 0, target].
probabilities = np.array(probabilities)
targets = ["action", "action_1", "action_2", "action_3", "action_4"]

for i, target in enumerate(targets):

    print(f" Probabilities for: {target} ".center(60, "-"))

    # sns.displot is a figure-level function that creates its own figure, so
    # no plt.figure() call is made here (it would only add an empty figure).
    for kind, title in (("hist", "Hist of model probabilities"),
                        ("ecdf", "CDF of model probabilities")):
        sns.displot(probabilities[:,:,i], aspect=2., kind=kind, legend=False)
        # decision threshold
        plt.axvline(x=0.5, c="red")
        plt.title(title)
        plt.grid()
        plt.show()

In [None]:
# Timing summary. Every value is printed with two decimals (`:.2f`).
# The "Total inference time" lines extrapolate the mean per-row time to
# 1e6 rows and express the result in hours.
imputer_times = np.asarray(imputer_times)
model_times = np.asarray(model_times)
# both lists get one entry per evaluated row, so they add element-wise
iteration_times = imputer_times + model_times

print(f"Mean time imputer: {np.mean(imputer_times)*1000:.2f} [ms]")
print(f"Total time imputer: {np.sum(imputer_times)/60:.2f} [min]")
print(f"Total inference time imputer: {np.mean(imputer_times)*1e6/3600:.2f} [hrs]")
print("-")
print(f"Mean time model: {np.mean(model_times)*1000:.2f} [ms]")
print(f"Total time model: {np.sum(model_times)/60:.2f} [min]")
print(f"Total inference time model: {np.mean(model_times)*1e6/3600:.2f} [hrs]")
print("-")
print(f"Mean time iteration: {np.mean(iteration_times)*1000:.2f} [ms]")
print(f"Total time iteration: {np.sum(iteration_times)/60:.2f} [min]")
print(f"Total inference time: {np.mean(iteration_times)*1e6/3600:.2f} [hrs]")

***