In [None]:
!pip install ../input/torchlrfinder/torch_lr_finder-0.2.1-py3-none-any.whl
!pip install ../input/torchsummary/torchsummary-1.5.1-py3-none-any.whl

***
### Notes

- v9: Changed IterativeImputer to SimpleImputer
- v10: Back to IterativeImputer, standard BatchNorm1d in input layer, increased dropout rate of input layer (x2), reduced lr (1e-4) of second stage model, increased factor to 0.5 ReduceLROnPlateau.
- v11: Dropout of input layer (x1), lr of first step 1e-4, lr of second step 1e-4, patience set to 4, weight_decay set to 1e-4.
- v12: Changed optimizers to `torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.9, weight_decay=1e-5)`
- v13: Removed BN from input layer, added RobustScaler on full dataset, changed `batch_size=2048` and `virtual_batch_size=256`, reduced `lr=5e-2` and `lr=1e-2` respectively, increased `label_smoothing=1e-2` and `early_stop_on_metric` for the second stage model.
- v14: Removed RobustScaler, added GBN at input layer, set `virtual_batch_size=512`, set `weight_decay=1e-4`, set `patience=20` (early stopping) on second model.
- v15: Changed ReduceLROnPlateau -> OneCycleLR and added LRFinder test.
- v16: Second step trains on full dataset with 5 targets.
- v17: Adds SWA for second step model, set `pct_start=0.1` and `patience=15` for step1. 
- v18: Iterative imputer set to 2 iterations.
- v19: Iterative imputer set to 1 iteration & 50 features.
- v20: Iterative imputer set to 1 iteration & 30 features. Hyperparams: `{'nn_width': 112, 'dropout': 0.3, 'momentum': 0.05, 'virtual_batch_size': 1024, 'weight_decay': 1e-05, 'pct_start': 0.5}` & `weight_decay=1e-5` - `pct_start=0.5`.
- v21: Hyperparams: `{'nn_depth':3, 'nn_width': 112, 'dropout': 0.25, 'momentum': 0.09999999999999999, 'virtual_batch_size': 128, 'weight_decay': 0.0001, 'pct_start': 0.2}`
- v23: Hyperparams: `{'nn_depth':3, 'nn_width': 96, 'dropout': 0.3, 'momentum': 0.1, 'virtual_batch_size': 256, 'weight_decay': 1e-05, 'pct_start': 0.3}`
- v24: Hyperparams: `{'nn_depth': 3, 'nn_width': 96, 'dropout_input': 0.15, 'dropout_hidden': 0.3, 'dropout_output': 0.15, 'momentum':0.1, 'virtual_batch_size':256, 'weight_decay': 1e-05, 'pct_start': 0.4, 'max_lr': 0.06,}`. 
- v26: Visualizing the effect of SWA on validation set.
- v27: Hyperparams: `{'nn_width': 96, 'dropout_input': 0.15, 'dropout_hidden': 0.4, 'dropout_output': 0.15, 'weight_decay': 1e-05, 'pct_start': 0.3, 'max_lr': 0.1}`.
- v28: Hyperparams: `nn_depth=3, nn_width=112, dropout_input=0.25, dropout_hidden=0.25, dropout_output=0.2, momentum=0.1, virtual_batch_size=256`, `weight_decay=1e-4`.
- v29: SWA training with min_lr=1e-2, max_lr=1e-1, momentum=0.7 and num_epochs=15.
- v30: Trained with sharp SWA on 2nd stage.
- v31: Trained with lower regularization `dropout_input=0.2, dropout_hidden=0.25, dropout_output=0.15` and `momentum=0.8` on SWA.

In [None]:
import copy
import numpy as np 
import pandas as pd
from pathlib import Path
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from category_encoders.one_hot import OneHotEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset,TensorDataset,DataLoader
from torch_lr_finder import LRFinder
from torchsummary import summary

# custom modules
import sys
sys.path.append("../usr/lib/janestreet_torch_utils")
from janestreet_torch_utils import Monitor, train_step, valid_step

def set_seed(seed):
    """
    Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator and all CUDA devices), and asks cuDNN to pick
    deterministic algorithms.

    Parameters
    ----------
    seed : int
        Seed shared by all generators.
    """
    # local import: `random` is only needed here
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # the CUDA calls are silently ignored when no GPU is available
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    
set_seed(2)

# cuDNN configuration for reproducible runs.
# benchmark mode lets cuDNN auto-tune (and possibly change) the algorithm it
# picks from run to run, which works against `deterministic = True`; it is
# therefore switched off.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# use the GPU when present, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
def utility_score(date, weight, resp, action):
    """
    Fast computation of utility score

    The per-date P&L is ``Pi = sum(weight * resp * action)`` over the rows of
    that date; ``t = sum(Pi) / sqrt(sum(Pi**2)) * sqrt(250 / n_dates)`` and the
    utility is ``clip(t, 0, 6) * sum(Pi)``.

    Parameters
    ----------
    date, weight, resp, action : np.ndarray
        1-d arrays of equal length. ``date`` is cast to int and must be
        non-negative (it is used as the bin index of ``np.bincount``).

    Returns
    -------
    float
        The NEGATIVE utility, so that a lower value is better. Returns 0.0
        when every daily P&L is zero (e.g. no trade is taken or the input is
        empty) instead of the NaN produced by a 0/0 division.
    """
    date = date.astype(int)
    count_i = len(np.unique(date))
    Pi = np.bincount(date, weight * resp * action)
    total_pnl = np.sum(Pi)
    pnl_norm = np.sqrt(np.sum(Pi ** 2))
    # all-zero daily P&L: the ratio below is undefined, the utility is zero
    if pnl_norm == 0:
        return 0.0
    t = total_pnl / pnl_norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total_pnl
    return -u

In [None]:
def cat_encoder(X):
    """
    Fast one-hot encoding of feature_0

    Adds two indicator columns to ``X`` IN PLACE (no copy of the frame is
    made): ``feature_00`` is 1 where ``feature_0 == -1`` and ``feature_01`` is
    1 where ``feature_0 == 1``; both are 0 elsewhere.

    The flags are computed row by row with a vectorised comparison, so they
    stay correct even when the index of ``X`` holds duplicate labels.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain a ``feature_0`` column. The first column is dropped from
        the result by position, so ``feature_0`` is expected to be the first
        column.

    Returns
    -------
    pd.DataFrame
        ``X`` without its first column, with the two indicator columns last.
    """
    X["feature_00"] = (X["feature_0"] == -1).astype("int64")
    X["feature_01"] = (X["feature_0"] == 1).astype("int64")

    # positional drop of the first column (feature_0)
    return X.iloc[:, 1:]

In [None]:
def _plot_loss_and_metric(ax_loss, x, loss, metric, prefix, title):
    """
    Draw one panel: loss on the left y-axis, metric on a twin right y-axis.
    """
    ax_metric = ax_loss.twinx()
    loss_line, = ax_loss.plot(x, loss, 'go-', label=f"{prefix}_loss")
    metric_line, = ax_metric.plot(x, metric, 'ro-', label=f"{prefix}_metric")

    # one legend holding both series; it is attached to the twin axes because
    # those are drawn on top of the loss axes
    ax_metric.legend(handles=[loss_line, metric_line], loc="best")

    ax_loss.set_xlabel('epochs')
    ax_loss.set_ylabel('loss')
    ax_metric.set_ylabel('metric')
    ax_loss.set_title(title)
    ax_loss.grid()


def show_metrics(monitor):
    """
    Plot the per-epoch learning curves stored in ``monitor``.

    Two side-by-side panels are drawn, "Training" and "Validation"; each one
    shows the loss (green, left axis) and the metric (red, right axis).

    Parameters
    ----------
    monitor
        Object exposing the sequences ``train_loss``, ``train_metric``,
        ``valid_loss`` and ``valid_metric``. The x-axis length is taken from
        ``train_loss``, so all four are expected to have the same length.
    """
    x = np.arange(len(monitor.train_loss))

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(21, 7))

    _plot_loss_and_metric(axes[0], x, monitor.train_loss, monitor.train_metric,
                          prefix="train", title="Training")
    _plot_loss_and_metric(axes[1], x, monitor.valid_loss, monitor.valid_metric,
                          prefix="valid", title="Validation")

    plt.show()

***
### preparing the data

In [None]:
# location of the preprocessed parquet files
root = Path("../input/janestreet-preprocessing")

# training rows and the features table
train = pd.read_parquet(root.joinpath("train.parquet"))
features = pd.read_parquet(root.joinpath("features.parquet"))

# column dtypes and memory footprint of the training frame
train.info()

In [None]:
# keep only rows after date 85 that carry a positive weight
train = train.query("date > 85 and weight > 0").reset_index(drop=True)

# column groups
resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4']
w_cols = ["w", "w1", "w2", "w3", "w4"]
input_features = [col for col in train.columns if "feature" in col]

# model inputs
X_dset = train[input_features].copy()
# binary targets: 1 when the corresponding response is strictly positive
y_dset = train[resp_cols].gt(0).astype(int)
# one sample-weight column per target
w_dset = train[w_cols].copy()
# date / weight / resp triplet consumed by the utility metric
dwr_dset = train[["date", "weight", "resp"]].copy()

In [None]:
%%time

# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load imputer files that come from a trusted source.
# The `with` block closes the file on exit, no explicit close() is needed.
with open("../input/janestreet-imputer/imputer_f0m1.pickle", "rb") as file:
    imputer_f0m1 = pickle.load(file)

with open("../input/janestreet-imputer/imputer_f0p1.pickle", "rb") as file:
    imputer_f0p1 = pickle.load(file)

# One imputer per value of feature_0. Each one fills every input feature
# except the first entry of `input_features`
# (presumably feature_0 itself -- TODO confirm the column order).
idx_f0m1 = X_dset.query("feature_0 == -1").index
X_dset.loc[idx_f0m1, input_features[1:]] = imputer_f0m1.transform(X_dset.loc[idx_f0m1, input_features[1:]])

idx_f0p1 = X_dset.query("feature_0 ==  1").index
X_dset.loc[idx_f0p1, input_features[1:]] = imputer_f0p1.transform(X_dset.loc[idx_f0p1, input_features[1:]])

In [None]:
# replace feature_0 by its two indicator columns, then refresh the feature
# list so it matches the new column layout of X_dset
X_dset = cat_encoder(X_dset)
input_features = list(X_dset.columns)

***
### model definition

In [None]:
class BCELabelSmoothing(nn.Module):
    """
    Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5:
    ``target * (1 - label_smoothing) + 0.5 * label_smoothing``.
    """

    def __init__(self, label_smoothing=0.0):
        super().__init__()
        self.bce_loss = torch.nn.functional.binary_cross_entropy_with_logits
        self.label_smoothing = label_smoothing

    def forward(self, prediction, target, weight=None):
        """
        Parameters
        ----------
        prediction : torch.Tensor
            Raw logits.
        target : torch.Tensor
            Targets, same shape as ``prediction``.
        weight : torch.Tensor, optional
            Per-element weights. When given, the result is the weighted sum of
            the element losses divided by the sum of the weights; otherwise it
            is the plain mean.

        Returns
        -------
        torch.Tensor
            Scalar loss.
        """
        eps = self.label_smoothing
        target_smooth = target*(1.0 - eps) + 0.5*eps

        if weight is not None:
            weighted_total = self.bce_loss(prediction, target_smooth, weight, reduction="sum")
            return weighted_total / weight.sum()

        return self.bce_loss(prediction, target_smooth, reduction="mean")


# loss instance shared by every training stage of the notebook
bce_loss = BCELabelSmoothing(label_smoothing=1e-2)

In [None]:
class GBN(nn.Module):
    """
    Ghost Batch Normalization
    https://arxiv.org/abs/1705.08741

    While training, the incoming batch is split into chunks of at most
    ``virtual_batch_size`` rows and a single shared ``BatchNorm1d`` is applied
    to each chunk separately, so every chunk is normalised with its own
    statistics.

    Parameters
    ----------
    input_dim : int
        Number of features.
    virtual_batch_size : int
        Upper bound on the number of rows per chunk.
    momentum : float
        Momentum of the running statistics of the wrapped ``BatchNorm1d``.
    """

    def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01):
        super(GBN, self).__init__()

        self.input_dim = input_dim
        self.virtual_batch_size = virtual_batch_size
        self.bn = nn.BatchNorm1d(self.input_dim, momentum=momentum)

    def forward(self, x):
        # In eval mode BatchNorm normalises each row with the running
        # statistics, so splitting the batch cannot change the result: skip
        # the split/concat (validation feeds one very large batch). The same
        # shortcut covers batches that fit in a single virtual batch, and it
        # avoids asking `chunk` for zero chunks on an empty batch.
        if not self.bn.training or x.shape[0] <= self.virtual_batch_size:
            return self.bn(x)

        n_chunks = int(np.ceil(x.shape[0] / self.virtual_batch_size))
        res = [self.bn(x_) for x_ in x.chunk(n_chunks, 0)]

        return torch.cat(res, dim=0)

    
class NormalLinear(nn.Module):
    """
    Linear layer with normalized weights

    The effective weight of each output unit is
    ``weights_m * weights_v / ||weights_v||``: ``weights_v`` carries the
    direction and ``weights_m`` the magnitude of each row. ``weights_m`` starts
    at the row norms of ``weights_v``, so the effective weight initially equals
    ``weights_v``.
    """
    def __init__(self, size_in, size_out, bias=True):
        super().__init__()
        self.size_in = size_in
        self.size_out = size_out

        # direction: one row per output unit, kaiming-uniform initialised
        direction = torch.empty(size_out, size_in)
        nn.init.kaiming_uniform_(direction, a=np.sqrt(5))
        self.weights_v = nn.Parameter(direction)

        # magnitude: independent copy of the initial row norms
        row_norms = direction.norm(dim=1, keepdim=True)
        self.weights_m = nn.Parameter(row_norms.clone().detach())

        if not bias:
            self.register_parameter('bias', None)
            return

        # bias drawn uniformly in +-1/sqrt(fan_in); for this 2-d weight the
        # fan-in is the number of input features
        bound = 1 / np.sqrt(size_in)
        bias_v = torch.empty(size_out)
        nn.init.uniform_(bias_v, -bound, bound)
        self.bias = nn.Parameter(bias_v)

    def _compute_weights(self):
        """Return the effective (size_out, size_in) weight matrix."""
        unit_rows = self.weights_v / self.weights_v.norm(dim=1, keepdim=True)
        return self.weights_m * unit_rows

    def forward(self, x):
        return nn.functional.linear(x, self._compute_weights(), self.bias)

In [None]:
class SNN(nn.Module):
    """
    Standard NN

    A feed-forward stack of (ghost batch-norm -> dropout -> normalized linear)
    triplets: one input triplet, ``nn_depth - 1`` hidden triplets and one
    output triplet. ReLU follows every triplet except the output one, so the
    network returns raw logits.

    Parameters
    ----------
    input_dim, output_dim : int
        Number of input features / output logits.
    nn_depth : int
        Number of ``nn_width``-wide layers; ``nn_depth - 1`` hidden triplets
        are created.
    nn_width : int
        Width of the hidden layers.
    dropout_input, dropout_hidden, dropout_output : float
        Dropout probability of the input, hidden and output triplets.
    momentum, virtual_batch_size
        Forwarded to every GBN layer.
    """

    def __init__(self, input_dim, output_dim, nn_depth, nn_width, 
                 dropout_input=0.2, dropout_hidden=0.3, dropout_output=0.1, 
                 momentum=0.02, virtual_batch_size=128):
        super().__init__()

        n_hidden = nn_depth - 1
        gbn_kwargs = dict(virtual_batch_size=virtual_batch_size, momentum=momentum)

        # input triplet
        self.bn_in = GBN(input_dim, **gbn_kwargs)
        self.dp_in = nn.Dropout(dropout_input)
        self.ln_in = NormalLinear(input_dim, nn_width, bias=False)

        # hidden triplets, kept in three parallel lists
        self.bnorms = nn.ModuleList([GBN(nn_width, **gbn_kwargs) for _ in range(n_hidden)])
        self.dropouts = nn.ModuleList([nn.Dropout(dropout_hidden) for _ in range(n_hidden)])
        self.linears = nn.ModuleList(
            [NormalLinear(nn_width, nn_width, bias=False) for _ in range(n_hidden)])

        # output triplet
        self.bn_out = GBN(nn_width, **gbn_kwargs)
        self.dp_out = nn.Dropout(dropout_output)
        self.ln_out = NormalLinear(nn_width, output_dim, bias=False)

    def forward(self, x):
        x = torch.relu(self.ln_in(self.dp_in(self.bn_in(x))))

        for bn_layer, dp_layer, ln_layer in zip(self.bnorms, self.dropouts, self.linears):
            x = torch.relu(ln_layer(dp_layer(bn_layer(x))))

        # no activation on the output: logits
        return self.ln_out(self.dp_out(self.bn_out(x)))

***
### model training: 1st step

In [None]:
# chronological split: dates before 450 for training, the rest for validation
train_idx = train.query("date < 450").index
valid_idx = train.query("date >= 450").index

# every item is the tuple (inputs, targets, sample weights, [date, weight, resp])
train_dset = TensorDataset(*(torch.tensor(frame.loc[train_idx].values, dtype=torch.float)
                             for frame in (X_dset, y_dset, w_dset, dwr_dset)))

valid_dset = TensorDataset(*(torch.tensor(frame.loc[valid_idx].values, dtype=torch.float)
                             for frame in (X_dset, y_dset, w_dset, dwr_dset)))

dataset_sizes = {'train': len(train_dset), 'valid': len(valid_dset)}
train_dataloader = DataLoader(train_dset, batch_size=2048, shuffle=True, num_workers=2)
# the whole validation set is served as one single batch
valid_dataloader = DataLoader(valid_dset, batch_size=len(valid_dset), shuffle=False, num_workers=2)

# len(dataloader) also counts the final partial batch, which is what the
# schedulers receive as steps_per_epoch
print("Number of step per epoch:", len(train_dataloader))

In [None]:
# Learning-rate range test on a throw-away model: the test trains `_model`
# in place, so the model used for the real training is created separately.
_model = SNN(input_dim=len(input_features), output_dim=len(resp_cols), nn_depth=3, nn_width=112, 
             dropout_input=0.20, dropout_hidden=0.30, dropout_output=0.20, 
             momentum=0.1, virtual_batch_size=256)
_model = _model.to(device)

_optimizer = torch.optim.SGD(_model.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4)
# run on the detected device rather than a hard-coded "cuda", so the cell also
# works on a CPU-only machine
lr_finder = LRFinder(_model, _optimizer, bce_loss, device=device)
# exponential sweep of the learning rate from 1e-4 to 10
# (num_iter is presumably two epochs of 652 steps -- TODO confirm against the
# number of steps per epoch printed above)
lr_finder.range_test(train_dataloader, start_lr=1e-4, end_lr=1e1, num_iter=652*2, step_mode="exp")
lr_finder.plot(show_lr=1e-2)
plt.show()

# NOTE: the sweep used to be executed a second time right here without
# resetting the model; that run started from weights already perturbed by the
# first sweep and has been removed.

In [None]:
# hyper-parameters of the network trained in both stages
snn_kwargs = dict(
    input_dim=len(input_features),
    output_dim=len(resp_cols),
    nn_depth=3,
    nn_width=112,
    dropout_input=0.20,
    dropout_hidden=0.30,
    dropout_output=0.20,
    momentum=0.1,
    virtual_batch_size=256,
)

model = SNN(**snn_kwargs).to(device)

# layer-by-layer overview of the network
summary(model, input_size=(len(input_features),))

In [None]:
# one snapshot of the model per epoch, averaged later on (SWA)
models_history = list()

# single source of truth: the scheduler and the monitor must agree on the
# number of epochs
NUM_EPOCHS_STEP1 = 50

# the lr given to SGD is only a starting value: OneCycleLR takes over and
# drives the learning rate (and, with cycle_momentum, the momentum)
optimizer = torch.optim.SGD(model.parameters(), lr=2e-2, momentum=0.9, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, 
    max_lr=2e-2,
    epochs=NUM_EPOCHS_STEP1,
    pct_start=0.2,
    anneal_strategy='cos', 
    cycle_momentum=True, 
    base_momentum=0.8, 
    max_momentum=0.9, 
    div_factor=1e1,
    final_div_factor=1e0,
    steps_per_epoch=len(train_dataloader))

monitor = Monitor(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    patience=10,
    metric_fn=utility_score,
    experiment_name='SNN',
    num_epochs=NUM_EPOCHS_STEP1,
    dataset_sizes=dataset_sizes,
    early_stop_on_metric=False,
    lower_is_better=True)

for epoch in monitor.iter_epochs:
    train_step(model, train_dataloader, optimizer, monitor, bce_loss, scheduler=scheduler, clip_value=None)
    # the value returned by valid_step is used as the early-stopping flag
    early_stop = valid_step(model, valid_dataloader, optimizer, monitor, bce_loss)
    # snapshot is taken before leaving the loop, so the last epoch is kept too
    models_history.append(copy.deepcopy(model))
    if early_stop: break

In [None]:
# learning curves (loss and utility metric per epoch) of the first stage
show_metrics(monitor)

In [None]:
# write the weights of every epoch snapshot to disk; file names are 1-based
for epoch_number, snapshot in enumerate(models_history, start=1):
    torch.save(snapshot.state_dict(), f"./snn-step1-epoch{epoch_number}.pt")

In [None]:
# SWA around optimal on validation data
swa_model = torch.optim.swa_utils.AveragedModel(model)

# NOTE(review): the 11 looks tied to patience=10 of the first-stage monitor
# (the last 11 snapshots would start at the best epoch when early stopping
# fired) -- confirm, and keep both values in sync.
best_epoch = len(models_history) - 11

# running average of the weights of the selected snapshots
for snapshot in models_history[best_epoch:]:
    swa_model.update_parameters(snapshot)

# recompute the batch-norm statistics for the averaged weights with one pass
# over the training data
torch.optim.swa_utils.update_bn(train_dataloader, swa_model, device=device)

In [None]:
# puts state_dict of SWA in the format of original model
state_dict = OrderedDict()
state_dict_swa = swa_model.state_dict()
# prefix that the SWA wrapper puts in front of every key of the wrapped model
swa_prefix = "module."

for key,params in state_dict_swa.items():
    # bookkeeping counter of the SWA wrapper, not a weight of the model
    if key == "n_averaged": continue
    # strip the wrapper prefix only at the start of the key, so that a
    # "module." substring appearing later in a key is left untouched
    if key.startswith(swa_prefix):
        key = key[len(swa_prefix):]
    state_dict[key] = params
    
# restore model to best averaged state
model.load_state_dict(state_dict)

***
### model training: 2nd step

In [None]:
# NOTE: train_idx is recomputed but not used in this stage, which trains on
# every row (the validation rows included).
train_idx = train.query("date < 450").index
valid_idx = train.query("date >= 450").index

# full dataset; every item is (inputs, targets, sample weights, [date, weight, resp])
train_dset = TensorDataset(*(torch.tensor(frame.loc[:].values, dtype=torch.float)
                             for frame in (X_dset, y_dset, w_dset, dwr_dset)))

# these rows are also part of train_dset, so validation figures of this stage
# are not out-of-sample
valid_dset = TensorDataset(*(torch.tensor(frame.loc[valid_idx].values, dtype=torch.float)
                             for frame in (X_dset, y_dset, w_dset, dwr_dset)))

dataset_sizes = {'train': len(train_dset), 'valid': len(valid_dset)}
train_dataloader = DataLoader(train_dset, batch_size=2048, shuffle=True, num_workers=2)
# the whole validation set is served as one single batch
valid_dataloader = DataLoader(valid_dset, batch_size=len(valid_dset), shuffle=False, num_workers=2)

# len(dataloader) also counts the final partial batch
print("Number of step per epoch:", len(train_dataloader))

In [None]:
# Learning-rate range test for the second stage. It trains `model` in place;
# the weights are reloaded from `state_dict` afterwards.
_optimizer = torch.optim.SGD(model.parameters(), lr=2e-3, momentum=0.8, weight_decay=1e-4)
# run on the detected device rather than a hard-coded "cuda"
lr_finder = LRFinder(model, _optimizer, bce_loss, device=device)
lr_finder.range_test(train_dataloader, start_lr=1e-4, end_lr=1e1, num_iter=652*4, step_mode="exp")
lr_finder.plot(show_lr=1e-2)
plt.show()

In [None]:
# restore to the SWA-averaged weights of the first stage: the LR range test
# above trained `model` in place
model.load_state_dict(state_dict)
# new optimizer instance for the second stage (separate from the range-test one)
optimizer = torch.optim.SGD(model.parameters(), lr=2e-3, momentum=0.8, weight_decay=1e-4)

In [None]:
# SWA learning-rate schedule: cosine annealing towards swa_lr=7e-3 over the
# first 5 scheduler steps; it is stepped once per epoch in the loop below
scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=7e-3, anneal_strategy="cos", anneal_epochs=5)

# one snapshot of the model per epoch, averaged afterwards
models_history = list()

# patience equals num_epochs here, and the value returned by valid_step is
# ignored below, so this stage always runs its 35 epochs
monitor = Monitor(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    patience=35,
    metric_fn=utility_score,
    experiment_name=f'SNN',
    num_epochs=35,
    dataset_sizes=dataset_sizes,
    early_stop_on_metric=False,
    lower_is_better=True)

for epoch in monitor.iter_epochs:
    # scheduler=None: the scheduler is not handed to train_step, it is stepped
    # explicitly at the end of each epoch instead
    train_step(model, train_dataloader, optimizer, monitor, bce_loss, scheduler=None, clip_value=None)    
    # NOTE: in this stage the validation rows are part of the training set
    valid_step(model, valid_dataloader, optimizer, monitor, bce_loss)
    scheduler.step()
    
    models_history.append(copy.deepcopy(model))
    
    # save models along the path of swa
    torch.save(model.state_dict(), f"./snn-epoch{epoch+1}.pt")


In [None]:
# learning curves (loss and utility metric per epoch) of the second stage
show_metrics(monitor)

In [None]:
# SWA short training: average the snapshots of models_history[5:20]
start_epoch = 5
end_epoch = 20

swa_model_st = torch.optim.swa_utils.AveragedModel(model).to(device)

# running average of the weights over the selected window
for snapshot in models_history[start_epoch:end_epoch]:
    swa_model_st.update_parameters(snapshot)

# recompute the batch-norm statistics for the averaged weights
torch.optim.swa_utils.update_bn(train_dataloader, swa_model_st, device=device)

# save final model for inference
# (state_dict of the SWA wrapper: keys carry the "module." prefix and an extra
# "n_averaged" entry)
torch.save(swa_model_st.state_dict(), "./snn-swa-st.pt")

In [None]:
# SWA long training: average the snapshots of models_history[5:35]
start_epoch = 5
end_epoch = 35

swa_model_lt = torch.optim.swa_utils.AveragedModel(model).to(device)

# running average of the weights over the selected window
for snapshot in models_history[start_epoch:end_epoch]:
    swa_model_lt.update_parameters(snapshot)

# recompute the batch-norm statistics for the averaged weights
torch.optim.swa_utils.update_bn(train_dataloader, swa_model_lt, device=device)

# save final model for inference
# (state_dict of the SWA wrapper: keys carry the "module." prefix and an extra
# "n_averaged" entry)
torch.save(swa_model_lt.state_dict(), "./snn-swa-lt.pt")

***