In many notebooks, models predict the actions corresponding to ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp'] and then take the mean or median of those predictions. To improve the utility score, this notebook instead uses a linear combination of the predictions and optimizes its weights with CMA-ES.

The Covariance Matrix Adaptation Evolution Strategy (CMA-ES) is a stochastic method for black-box continuous optimization, i.e. for problems where the gradient of the objective function is not available, as is the case for the utility score in this competition. CMA-ES maintains a Gaussian distribution over the search space and iteratively updates its parameters (the mean vector and the covariance matrix) so that samples drawn from the distribution achieve better objective function values.

↓ Illustration of CMA-ES optimizing a simple two-dimensional function.

<img src="https://upload.wikimedia.org/wikipedia/commons/d/d8/Concept_of_directional_optimization_in_CMA-ES_algorithm.png" width="500px">

https://en.wikipedia.org/wiki/CMA-ES

In [None]:
from typing import List, Tuple
import numpy as np
import pandas as pd
import gc
import os
import warnings
from dataclasses import dataclass
from tqdm.notebook import tqdm
from random import choices
from matplotlib import pyplot as plt
import seaborn as sns
import random
import pickle
import itertools
from cmaes import CMA
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow import keras

In [None]:
def seed_everything(seed: int):
    """Seed the NumPy, Python ``random`` and TensorFlow generators with ``seed``."""
    # Same order as before: NumPy first, then ``random``, then TensorFlow.
    for set_seed in (np.random.seed, random.seed, tf.random.set_seed):
        set_seed(seed)
    

@dataclass
class Config:
    """Hyper-parameters and run switches shared by the notebook cells."""
    n_fold: int  # number of date-based CV folds (passed to DateKFold; also the number of saved models loaded)
    dropouts: List[float]  # dropout rates for create_model: first entry follows the input, the rest pair with n_units
    n_units: List[int]  # width of each hidden Dense layer in create_model
    lr: float  # learning rate given to the RectifiedAdam optimizer
    ls: float  # label_smoothing given to the BinaryCrossentropy loss
    wd: float  # weight_decay given to the RectifiedAdam optimizer
    patience: int  # EarlyStopping patience; ReduceLROnPlateau uses half of it
    batch_size: int  # batch size for model.fit
    epochs: int  # maximum number of epochs for model.fit
    seed: int  # random seed; NOTE(review): no visible cell passes it to seed_everything — confirm
    train: bool  # when True the training cell fits and saves one model per fold

### Configuration
Set 'train=True' when you want to train the models.

In [None]:
# Settings used by every cell below.
config = Config(
    n_fold=4,
    dropouts=[0.1, 0.5, 0.5],  # input dropout, then one rate per hidden layer
    n_units=[256, 256],  # two hidden layers
    lr=1e-3,
    ls=1e-2,
    wd=1e-5,
    patience=10,
    batch_size=4096,
    epochs=300,
    seed=36,
    train=False  # False: skip training and load the saved models instead
)
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')
# Apply seaborn's default plot styling.
sns.set()

In [None]:
class DateKFold:
    """K-fold splitter that keeps every date entirely inside one fold.

    The sorted unique values of ``data['date']`` are cut into ``n_fold``
    consecutive blocks with ``np.array_split``. Fold ``i`` uses block ``i`` as
    validation data and all other blocks as training data.

    Iterating over an instance yields ``(train_df, valid_df)`` once per fold.
    The instance is its own iterator, so only one iteration can be in progress
    at a time; every ``iter()`` call restarts from the first fold.
    """

    # Index of the fold the iterator currently points at (-1 = not started).
    __i: int = -1

    def __init__(self, data: pd.DataFrame, n_fold: int):
        """
        :param data: frame with a 'date' column
        :param n_fold: number of folds
        """
        self.data = data
        self.n_fold = n_fold
        self.uni_date = np.unique(data['date'].values)
        self.date_block = np.array_split(self.uni_date, n_fold)
        # (indices of the training blocks, index of the validation block) per fold
        self.block_idxs = [(np.delete(np.arange(n_fold), i), i) for i in range(n_fold)]

    def __len__(self):
        return self.n_fold

    def __iter__(self):
        self.__i = -1
        return self

    def __next__(self):
        self.__i += 1
        if self.__i < 0 or self.n_fold <= self.__i:
            raise StopIteration()
        return self.split()

    def __dates_for_fold(self, fold: int) -> Tuple[List[int], List[int]]:
        """Return (training dates, validation dates) of fold ``fold``."""
        train_blocks, valid_block = self.block_idxs[fold]
        tr_date = np.hstack([self.date_block[j] for j in train_blocks]).tolist()
        va_date = self.date_block[valid_block].tolist()
        return tr_date, va_date

    def __split_date(self) -> Tuple[List[int], List[int]]:
        """Return (training dates, validation dates) of the current fold."""
        return self.__dates_for_fold(self.__i)

    def split(self):
        """Return ``(train_df, valid_df)`` for the fold the iterator points at.

        Rows are selected with ``Series.isin`` instead of formatting the date
        list into a ``query`` string, which avoids building and parsing a long
        expression on every call. Row order and index are preserved.
        """
        tr_date, va_date = self.__split_date()
        dates = self.data['date']
        return self.data[dates.isin(tr_date)], self.data[dates.isin(va_date)]

    def plot(self):
        """Scatter-plot which dates are train/valid in each fold.

        The fold dates are looked up by explicit index, so plotting does not
        move the iterator cursor.
        """
        fig, ax = plt.subplots(1, 1, figsize=(15, 4))
        for i in range(self.n_fold):
            tr_date, va_date = self.__dates_for_fold(i)
            # Only the first fold carries labels so the legend has one entry per colour.
            ax.scatter(tr_date, [i] * len(tr_date), label='train' if i == 0 else None, color='blue')
            ax.scatter(va_date, [i] * len(va_date), label='valid' if i == 0 else None, color='red')
        ax.set_title('K-fold by date')
        ax.set_ylabel('fold')
        ax.set_xlabel('date')
        ax.set_yticks(list(range(self.n_fold)))
        ax.legend()
        plt.show()

In [None]:
class UtilityScoreCallback(keras.callbacks.Callback):
    """Keras callback that prints and records the utility score after each epoch.

    The score is computed on the frame given at construction time, using the
    row-wise mean of the model outputs thresholded at 0.5 as the action.
    """

    def __init__(self, df: pd.DataFrame, features: List[str]):
        """
        :param df: frame with 'weight', 'resp', 'date' and the feature columns
        :param features: names of the columns fed to the model
        """
        super().__init__()
        self.__date = df['date'].values
        self.__X = df[features].values
        # The weighted return per row and the number of distinct dates do not
        # change between epochs, so they are computed once here.
        self.__rtn = df['weight'].values * df['resp'].values
        self.__count_i = len(np.unique(self.__date))
        # Defined up front so get_scores() works even before training starts.
        self.scores = list()

    def on_train_begin(self, logs=None):
        # Start every fit() call with an empty history.
        self.scores = list()

    def on_epoch_end(self, epoch, logs=None):
        preds = self.model(self.__X, training=False).numpy()
        us = self.bincount(preds=preds.mean(axis=1))
        self.scores.append(us)
        print(f"\nEpoch {epoch + 1}: utility score = {us}")

    def bincount(self, preds: np.ndarray) -> float:
        """
        Compute the utility score for the stored frame.

        preds: np.ndarray predictions for 'action' (resp > 0)
        Returns 0 when the score is NaN (e.g. no trade is taken at all).
        """
        action = np.where(preds > 0.5, 1, 0)
        # Sum of weight * resp * action per date.
        Pi = np.bincount(self.__date, self.__rtn * action)
        t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / self.__count_i)
        u = np.clip(t, 0, 6) * np.sum(Pi)
        u = 0 if np.isnan(u) else u
        return u

    def get_scores(self) -> List[float]:
        """Return the utility scores recorded so far, one per finished epoch."""
        return self.scores
    

def create_model(input_dim: int, output_dim: int, n_units: List[int], dropouts: List[float], lr: float, ls: float, wd: float):
    """Build and compile the MLP used to predict the action columns.

    Architecture: BatchNorm -> Dropout(dropouts[0]) -> for each hidden width
    [Dense -> BatchNorm -> swish -> Dropout] -> Dense(output_dim, sigmoid).

    :param input_dim: number of input features
    :param output_dim: number of sigmoid outputs
    :param n_units: width of each hidden Dense layer
    :param dropouts: dropout rates; ``dropouts[0]`` follows the input and
        ``dropouts[1:]`` pair with ``n_units`` (extra entries are ignored)
    :param lr: learning rate of the RectifiedAdam optimizer
    :param ls: label smoothing of the binary cross-entropy loss
    :param wd: weight decay of the RectifiedAdam optimizer
    :return: the compiled Keras model
    :raises ValueError: if ``dropouts`` has fewer than ``len(n_units) + 1`` entries
    """
    # zip() below would silently drop hidden layers when too few rates are given.
    if len(dropouts) < len(n_units) + 1:
        raise ValueError(
            f'dropouts needs at least {len(n_units) + 1} entries '
            f'(one for the input plus one per hidden layer), got {len(dropouts)}'
        )

    inputs = Input(shape=(input_dim,))
    x = BatchNormalization()(inputs)
    x = Dropout(dropouts[0])(x)

    for unit, dropout in zip(n_units, dropouts[1:]):
        x = Dense(unit)(x)
        x = BatchNormalization()(x)
        x = Activation(tf.keras.activations.swish)(x)
        x = Dropout(dropout)(x)

    outputs = Dense(output_dim, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=lr, weight_decay=wd),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=ls),
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )
    return model

In [None]:
%%time

# Load the training data.
train = pd.read_feather('../input/janestreetmarketprediction/train.feather')
# Keep only rows after date 86 that have a non-zero weight.
train = train.query('date > 86 & weight != 0').reset_index(drop=True)
# Fill missing values with the column means of the filtered frame.
# NOTE(review): the means are taken over all rows, i.e. before the CV split.
train.fillna(train.mean(), inplace=True)
# Weighted return per row; this is what utility_score sums per date.
train['rtn'] = train['weight'] * train['resp']

# Column lists are taken from the frame as it looks at this point, so the
# order of the statements below matters.
features = [c for c in train.columns if 'feature' in c]
resp_cols = [c for c in train.columns if 'resp' in c]

# One binary target per resp column: 1 when that resp is positive.
for i, c in enumerate(resp_cols):
    train[f'action_{i}'] = (train[c] > 0).astype('int8')

action_cols = [c for c in train.columns if 'action' in c]

### Cross Validation
In this notebook, I used a simple K-Fold (K=4).

In [None]:
# Date-based splitter reused by every later cell; plot() shows which dates
# are train/valid in each fold.
dkf = DateKFold(data=train, n_fold=config.n_fold)
dkf.plot()

In [None]:
# Train and save one model per fold (only when config.train is set).
if config.train:
    for fold, (tr_df, va_df) in enumerate(dkf):
        print(f'fold {fold}')

        # Called `utility_cb` so the callback instance does not rebind the
        # notebook-level name `utility_score`, which is the scoring function.
        utility_cb = UtilityScoreCallback(df=va_df, features=features)
        model = create_model(
            input_dim=len(features), 
            output_dim=len(action_cols), 
            n_units=config.n_units,
            dropouts=config.dropouts,
            lr=config.lr,
            ls=config.ls,
            wd=config.wd,
        )

        model.fit(
            tr_df[features].values,
            tr_df[action_cols].values,
            validation_data=(va_df[features].values, va_df[action_cols].values),
            epochs=config.epochs,
            batch_size=config.batch_size,
            verbose=1,
            callbacks=[
                utility_cb,
                EarlyStopping(monitor='val_loss', min_delta=0, patience=config.patience, verbose=1, mode='min'),
                # `factor` was misspelled as `foctor`, so the intended 0.2 was never applied.
                ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=int(config.patience * 0.5), min_lr=1e-5, verbose=1),
            ]
        )

        model.save(f'./model-f{fold}.h5')

### Load Models
The utility score on each fold's validation data is shown below.

In [None]:
def utility_score(action: np.ndarray, df: pd.DataFrame) -> float:
    """Return the utility score of ``action`` on the rows of ``df``.

    ``df`` must provide 'date' (non-negative integers) and 'rtn' columns.
    The per-date sums of ``rtn * action`` are combined into ``t``, which is
    clipped to [0, 6] and multiplied by the total. A NaN result (for example
    when every per-date sum is zero) is reported as 0.
    """
    dates = df['date'].values
    returns = df['rtn'].values

    n_dates = len(np.unique(dates))
    daily = np.bincount(dates, returns * action)
    total = np.sum(daily)

    t = total / np.sqrt(np.sum(daily ** 2)) * np.sqrt(250 / n_dates)
    score = np.clip(t, 0, 6) * total
    if np.isnan(score):
        return 0
    return score

In [None]:
# Load the saved model of every fold.
models = [keras.models.load_model(f'../input/janestreetmarketprediction/model-f{fold}.h5') for fold in range(config.n_fold)]

# Predict each fold's validation rows with the model of the same fold index and
# keep the raw outputs (one column per action) for the weight search below.
preds_list = list()
for fold, (_, va_df) in enumerate(dkf):
    preds = models[fold](va_df[features].values, training=False).numpy()
    preds_list.append(preds)
    # Baseline: act when the mean of the outputs exceeds 0.5.
    print(f'fold {fold + 1}/{config.n_fold}: utility score = {utility_score(action=(np.mean(preds, axis=1) > 0.5).astype(int), df=va_df)}')

In [None]:
class LinearCombination:
    """Weighted sum of prediction columns, optionally thresholded at 0.5."""

    def __init__(self, w_init: np.ndarray):
        # Current weights; callers overwrite ``w`` directly while searching.
        self.w = w_init

    def __call__(self, X: np.ndarray, step: bool = True) -> np.ndarray:
        """
        Combine the columns of ``X`` with the current weights.

        :param X: np.ndarray shape=(n_data, n_dim)
        :param step: if True return 1 where the combination exceeds 0.5 and 0
            elsewhere; if False return the raw combination
        :return np.ndarray shape=(n_data, )
        """
        combined = np.dot(X, self.w)
        if not step:
            return combined
        return (combined > 0.5).astype(int)


def split_tr_val_fold(preds_list: List[np.ndarray], fold: int, dkf: "DateKFold") -> Tuple[List[Tuple[pd.DataFrame, np.ndarray]], Tuple[pd.DataFrame, np.ndarray]]:
    """Split the per-fold (validation frame, predictions) pairs for the weight search.

    ``dkf`` is iterated together with ``preds_list``; the validation frame of
    each fold is paired with that fold's predictions.

    :param preds_list: predictions per fold, in fold order
    :param fold: index of the fold to hold out
    :param dkf: iterable yielding (train_df, valid_df) per fold
    :return: (list of pairs from all other folds, pair of the held-out fold)
    :raises ValueError: if ``fold`` does not match any fold index
    """
    tr_set = list()
    va_set = None
    for i, ((_, va_df), preds) in enumerate(zip(dkf, preds_list)):
        if fold == i:
            va_set = (va_df, preds)
        else:
            tr_set.append((va_df, preds))
    # Fail here instead of handing a None validation set to the caller.
    if va_set is None:
        raise ValueError(f'fold {fold} is out of range for {len(tr_set)} available folds')
    return tr_set, va_set


def train_fold_i(dkf: DateKFold, preds_list: List[np.ndarray], fold: int, n_iters: int = 50, sigma=1.) -> LinearCombination:
    """Search combination weights for one held-out fold and report the result.

    The predictions of all other folds are used as the training set of the
    search, and fold ``fold`` as its validation set. The validation utility
    score of the plain mean and of the returned weights are printed.

    :param dkf: date-based fold splitter
    :param preds_list: predictions per fold, in fold order
    :param fold: index of the fold used for validation
    :param n_iters: maximum number of CMA-ES generations
    :param sigma: initial step size of CMA-ES
    :return: LinearCombination holding the selected weights
    """
    tr_set, va_set = split_tr_val_fold(preds_list=preds_list, fold=fold, dkf=dkf)
    va_df, va_preds = va_set

    ordinary = utility_score(action=(np.mean(va_preds, axis=1) > 0.5).astype(int), df=va_df)

    # Start from equal weights (i.e. the plain mean). The count follows the
    # number of prediction columns instead of being fixed to 5.
    n_outputs = va_preds.shape[1]
    lc = LinearCombination(w_init=np.array([1.0 / n_outputs] * n_outputs))
    lc = optimize_utility_score(
        train_set=tr_set, 
        valid_set=va_set,
        lc=lc, 
        step=True, 
        n_iters=n_iters, 
        sigma=sigma
    )
    opt = utility_score(action=lc(X=va_preds, step=True), df=va_df)
    
    print('### valid')
    print(f'{ordinary} ⇨ {opt}')
    return lc


def optimize_utility_score(train_set: List[Tuple[pd.DataFrame, np.ndarray]], valid_set: Tuple[pd.DataFrame, np.ndarray],
                           lc: LinearCombination, step: bool, patience: int = 5, n_iters: int = 50, sigma: float = 1.):
    """Search the weights of ``lc`` with CMA-ES to maximise the utility score.

    Candidates are scored by the summed utility score over ``train_set``.
    Whenever a candidate sets a new best training score it is also scored on
    ``valid_set``; the candidate with the best validation score among those is
    kept. The search stops after ``n_iters`` generations, or at the end of the
    generation in which more than ``patience`` such checks in a row failed to
    improve the validation score.

    :param train_set: (frame, predictions) pairs used as the search objective
    :param valid_set: (frame, predictions) pair used to select the weights
    :param lc: combination whose ``w`` is the initial mean; modified in place
    :param step: forwarded to ``lc`` (threshold the combination at 0.5)
    :param patience: allowed number of consecutive non-improving validation checks
    :param n_iters: maximum number of generations
    :param sigma: initial step size of CMA-ES
    :return: ``lc`` with ``w`` set to the selected weights; if no candidate was
        ever recorded (e.g. ``n_iters=0``) the initial weights are kept
    """
    def train_score() -> float:
        # Summed utility score of the current lc.w over all training folds.
        return np.sum([utility_score(action=lc(X=X, step=step), df=df) for df, X in train_set])

    # Remembered so lc is never returned with w=None.
    init_w = lc.w

    optimizer = CMA(mean=lc.w, sigma=sigma, seed=2020)
    init_value = train_score()

    best = np.inf  # best (lowest) negated training score seen so far
    val_best = -np.inf
    wait = 0
    best_x = None
    should_stop = False
    for generation in tqdm(range(n_iters)):
        solutions = list()
        for _ in range(optimizer.population_size):

            x = optimizer.ask()
            lc.w = x
            # CMA-ES minimises, so the score is negated.
            value = -train_score()
            solutions.append((x, value))

            if best > value:
                best = value
                val_values = utility_score(action=lc(X=valid_set[1], step=step), df=valid_set[0]) # warning: using the original utility score
                print(f'train best={-best}, valid value={val_values}')
                if val_best < val_values:
                    best_x = x
                    val_best = val_values
                    wait = 0
                else:
                    wait += 1
                    if wait > patience:
                        print(f'Early Stopping at {generation}: score={val_best}')
                        # Only flag here: the generation is finished first so
                        # tell() still receives the whole population.
                        should_stop = True
        
        optimizer.tell(solutions)
        if should_stop:
            break
    
    print("\n### train")
    print(f'f0={init_value} ⇨ fbest={-best}')
    print(f'best_x={best_x}')
    
    lc.w = init_w if best_x is None else best_x
    return lc

### Optimize the weights using the CMA-ES
The initial mean vector is set to [0.2, 0.2, 0.2, 0.2, 0.2], which is equivalent to taking the mean of the predictions.
The scale of the distribution ('sigma') and the number of iterations are set to 0.1 and 50, respectively.

The algorithm terminates when it reaches 50 iterations, or when a new best training score fails to improve the utility score on the validation data more than 5 times in a row (patience = 5).

In [None]:
# One weight search per fold: fold i is the validation fold, the others form the search objective.
lcs = [train_fold_i(dkf=dkf, preds_list=preds_list, fold=i, n_iters=50, sigma=0.1) for i in range(config.n_fold)]

### the weights

In [None]:
# Plot the selected weight of every action column across the folds.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
for i, c in enumerate(resp_cols):
    # Weight i belongs to action_i, which was derived from resp column c.
    ax.plot([lc.w[i] for lc in lcs], label=f'action ({c})')
ax.set_xticks(list(range(config.n_fold)))
ax.set_xlabel('fold')
ax.set_ylabel('weight')
ax.legend()
plt.show()

### We can see the utility scores improve on every fold!!!

In [None]:
# Compare, per fold, the validation utility score of the plain mean with that
# of the linear combination selected for the same fold.
for fold, (_, va_df) in enumerate(dkf):
    action_mean = utility_score(action=(np.mean(preds_list[fold], axis=1) > 0.5).astype(int), df=va_df)
    action_lc = utility_score(action=lcs[fold](X=preds_list[fold], step=True), df=va_df)
    print(f'fold {fold + 1}/{config.n_fold}: utility score = {action_mean} ⇨ {action_lc}')