In [None]:
import os
from collections import OrderedDict

import pandas as pd
import sklearn as sk
import numpy as np
import datamol as dm
import optuna
import matplotlib.pyplot as plt
from optuna.samplers import TPESampler

from torch import nn
import torch

from skorch import NeuralNet
from skorch.dataset import Dataset
from skorch.callbacks import EarlyStopping

from spacecutter.models import OrdinalLogisticMultiTaskModel
from spacecutter.losses import MultiTaskCumulativeLinkLoss
from spacecutter.callbacks import AscensionCallback

from utils import train_data, to_model_format

In [None]:
# Root directory of the project data. Set the POLARIS_ADMET_DIR environment variable
# to point the notebook at a different checkout; when it is unset (or empty) the
# original author's local path is used, so existing behaviour is unchanged.
proj_dir = os.environ.get('POLARIS_ADMET_DIR') or (
    '/Users/robertarbon/Library/CloudStorage/GoogleDrive-robert.arbon@gmail.com/My Drive/Polaris_ASAP_competition/polaris_challenge/admet'
)

In [None]:
# The raw files use 'Molecule Name' / 'LogMDR1-MDCKII'; the rest of the notebook
# uses the dotted spellings, so every frame is renamed the same way on load.
column_aliases = {'Molecule Name': 'Molecule.Name', 'LogMDR1-MDCKII': 'LogMDR1.MDCKII'}

# Full training table, read here only to recover the CXSMILES column, which the
# feature files below no longer carry.
df_smiles = pd.read_csv(f'{proj_dir}/data/train_admet_all.csv').rename(columns=column_aliases)
smiles_lookup = df_smiles.loc[:, ['Molecule.Name', 'CXSMILES']]

# Imputed training data, with SMILES re-attached by molecule name.
df_imp = (
    pd.read_csv(f'{proj_dir}/dm_features/ordinal_data_split_2/train_admet_split2_log_pmm_imputed.csv')
    .rename(columns=column_aliases)
    .merge(smiles_lookup, on='Molecule.Name', how='left')
)

# Non-imputed validation data, with SMILES re-attached by molecule name.
df_val = (
    pd.read_csv(f'{proj_dir}/dm_features/ordinal_data_split_2/train_admet_split2_features.csv')
    .rename(columns=column_aliases)
    .merge(smiles_lookup, on='Molecule.Name', how='left')
)

In [None]:
# --- Search configuration ---
random_state = 92038745                 # seed given to the Optuna sampler
features = ['chemberta', 'chem_prop']   # feature sets requested from train_data
patience = 100                          # early-stopping patience read by objective()
storage_name = "sqlite:///mtl_ordinal.db"
# Study names encode the feature sets and the patience, joined by underscores.
study_name = "_".join(features) + f"_{patience}"

# --- Trial budget ---
n_startup_trials = 20
n_model_trials = 100
n_total_trials = n_model_trials + n_startup_trials

# --- Data in the layout the model expects ---
train, val = train_data(
    df_imp,
    imp_ix=1,
    df_val=df_val,
    n_cuts=None,
    features=features,
    proj_dir=proj_dir,
    remove_nans=False,
)
X, y, train_ix, val_ix, config = to_model_format(train, val)

def objective(trial):
    """Train one multi-task ordinal model and score it for the Optuna study.

    Reads the notebook-level names ``config``, ``train``, ``val``, ``X``, ``y``,
    ``train_ix``, ``val_ix`` and ``patience``; none of them are modified.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``weight_deacy`` (log-uniform in [1e-6, 1]), ``backbone_depth``
        (integer 1-5) and ``head_depth`` (integer 1-3).

    Returns
    -------
    tuple
        ``(val_mae, abs(train_mae - val_mae))``. Each MAE is the mean absolute
        difference between the bin value looked up for the predicted class and the
        ``'original'`` target values, taken over the rows in ``val_ix`` /
        ``train_ix`` and skipping entries where the difference is NaN.
    """
    # The misspelt key "weight_deacy" is intentional: the study is reloaded from
    # storage with load_if_exists=True, and renaming the key would record new trials
    # under a different parameter name from the ones already stored.
    weight_decay = trial.suggest_float("weight_deacy", low=1e-6, high=1, log=True)
    backbone_depth = trial.suggest_int("backbone_depth", 1, 5)
    head_depth = trial.suggest_int("head_depth", 1, 3)

    n_features = config['n_features']

    # Backbone: `backbone_depth` blocks of Linear(n_features -> n_features) + ReLU.
    backbone = []
    for i in range(backbone_depth):
        backbone.append((f"Backbone_FC_{i}", nn.Linear(n_features, n_features)))
        backbone.append((f"Backbone_ReLU_{i}", nn.ReLU()))
    backbone = nn.Sequential(OrderedDict(backbone))

    # Head: `head_depth` Linear + ReLU blocks; only the last Linear narrows to 1 unit.
    # NOTE(review): the final one-unit Linear is also followed by a ReLU, so the head
    # output is non-negative. Left unchanged so results stay comparable with the
    # trials already stored - confirm this is intended.
    head = []
    for i in range(head_depth):
        out_dim = n_features if i < head_depth - 1 else 1
        head.append((f"Head_FC_{i}", nn.Linear(n_features, out_dim)))
        head.append((f"Head_ReLU_{i}", nn.ReLU()))
    head = nn.Sequential(OrderedDict(head))

    model = NeuralNet(
        module=OrdinalLogisticMultiTaskModel,
        module__backbone=backbone,
        module__head=head,
        module__n_classes=config['n_classes_per_task'],
        criterion=MultiTaskCumulativeLinkLoss,
        criterion__n_tasks=config['n_tasks'],
        criterion__n_classes_per_task=config['n_classes_per_task'],
        criterion__loss_reduction='inv_num_classes',
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=weight_decay,
        # Fixed split: X/y hold train and validation rows together, and the
        # precomputed index arrays select each part.
        train_split=lambda ds, y: (torch.utils.data.Subset(ds, train_ix),
                                    torch.utils.data.Subset(ds, val_ix)),
        callbacks=[
            ('ascension', AscensionCallback()),
            ('early_stopping', EarlyStopping(threshold=0.0001, load_best=True,
                                            patience=patience))
        ],
        verbose=0,
        # Batch size equals the number of training indices.
        batch_size=train_ix.shape[0],
        max_epochs=1000,
    )

    model.fit(X, y)

    # Predict for every row (train + val) in one pass. Gradients are not needed for
    # scoring, so autograd is switched off to avoid building a graph over the
    # whole data set.
    mod = model.module_
    mod.eval()
    with torch.no_grad():
        task_outputs = mod(torch.as_tensor(X))
    # One 2-D array per task; argmax over axis 1 picks the predicted class index.
    y_pred_list = [x.detach().cpu().numpy() for x in task_outputs]
    y_preds_ord = np.concatenate([np.argmax(x, axis=1).reshape(-1, 1) for x in y_pred_list], axis=1)

    # Convert class indices to continuous values through each target's `bins` lookup.
    # argmax always yields integers, so no NaN handling is needed at this point.
    y_pred_cont = []
    for i, target in enumerate(config['targets']):
        bins = train[1][target]['bins']
        y_pred_cont.append(np.array([bins[k] for k in y_preds_ord[:, i]]).reshape(-1, 1))
    y_pred_cont = np.concatenate(y_pred_cont, axis=1)

    # True values are stacked train-first, then val, one column per target.
    y_true_train_cont = np.concatenate([train[1][targ]['original'].reshape(-1, 1) for targ in config['targets']], axis=1)
    y_true_val_cont = np.concatenate([val[1][targ]['original'].reshape(-1, 1) for targ in config['targets']], axis=1)
    y_true_cont = np.concatenate([y_true_train_cont, y_true_val_cont], axis=0)

    diff = np.abs(y_pred_cont - y_true_cont)

    # Row masks as column vectors so they broadcast across the target columns;
    # NaN differences are excluded from both means.
    row_ids = np.arange(diff.shape[0])
    train_mask = np.isin(row_ids, train_ix).reshape(-1, 1)
    val_mask = np.isin(row_ids, val_ix).reshape(-1, 1)
    observed = ~np.isnan(diff)
    train_mae = np.mean(diff, where=observed & train_mask)
    val_mae = np.mean(diff, where=observed & val_mask)
    return val_mae, np.abs(train_mae - val_mae)


# Two-objective search: minimise validation MAE and the train/validation MAE gap.
sampler = TPESampler(n_startup_trials=n_startup_trials, seed=random_state, multivariate=True)
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    directions=['minimize', 'minimize'],
    # The sampler was previously constructed but never passed in, so the seed,
    # n_startup_trials and multivariate settings above had no effect and Optuna
    # used its default sampler instead.
    sampler=sampler,
    load_if_exists=True,
)

# With load_if_exists=True an existing study is resumed, so every re-run of this
# cell adds another n_total_trials trials to it.
study.optimize(objective, n_trials=n_total_trials)


In [None]:
# from optuna.visualization import plot_optimization_history, plot_param_importances, plot_pareto_front, plot_slice

# plot_pareto_front(study, include_dominated_trials=True)


## Compare all trials

In [None]:
# Collect the trials of every study in the storage into one DataFrame, tagging each
# row with the feature set and early-stopping patience encoded in its study name.
#
# Loop-local names are deliberately different from `study_name`, `patience`,
# `features` and `study` in the search cell: `objective` reads the global `patience`,
# so overwriting it here would silently change any later re-run of the search.
trials_storage = "sqlite:///mtl_ordinal.db"
trial_frames = []
for stored_name in optuna.study.get_all_study_names(storage=trials_storage):
    stored_study = optuna.study.load_study(storage=trials_storage, study_name=stored_name)
    # Naming convention: "<features>_100" was run with patience 100; any other
    # name is treated as a patience-10 study whose whole name is the feature set.
    if stored_name.endswith('_100'):
        study_patience = 100
        study_features = stored_name[:-len('_100')]
    else:
        study_patience = 10
        study_features = stored_name
    print(stored_name, study_patience, study_features)
    trial_frames.append(
        stored_study.trials_dataframe()
        .rename(columns={'values_0': 'MAE(val.)', 'values_1': '|MAE(val.) - MAE(train)|'})
        .assign(patience=study_patience, features=study_features)
    )
# ignore_index avoids repeated row labels, since each study's frame starts at 0.
df = pd.concat(trial_frames, ignore_index=True)
df.shape




In [None]:
# import plotly.express as px
# fig = px.scatter(df, x="MAE(val.)", y="|MAE(val.) - MAE(train)|", 
#                  color='features', symbol='patience', 
#                  custom_data=['params_backbone_depth', 
#                               'params_head_depth', 
#                               'params_weight_deacy', 
#                               'features'], 
#                  )
# fig.update_traces(
#     hovertemplate="<br>".join([
#         "MAE: %{x:4.2f}",
#         "Overfitting: %{y:4.2f}",
#         "Features:%{customdata[3]}",
#         "backbone: %{customdata[0]:d}",
#         "head: %{customdata[1]:d}",
#         "weight decay: %{customdata[2]:4.1e}",
#     ])
# )
# fig.show()

Optimum parameters: 

- features = 'chem_prop' + 'chemberta'
- weight decay = 9.9e-5
- backbone depth, head depth = 1, 1
