In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as f

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc, roc_curve, average_precision_score

import optuna

%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
properties = ["RecordID", "Time"]
categorical = ["Gender", "MechVent"]
ordinal = ["GCS"]
target = "In-hospital_death"

#### Load Data

In [13]:
train = pd.read_parquet("../../data/set-a-filled.parquet") #TODO for Pascal
train = train.drop(columns=["ICUType"])
train

Unnamed: 0,Time,RecordID,Age,Gender,Height,Weight,HCT,HR,MechVent,WBC,...,NIMAP,Glucose,ALP,Lactate,Na,RespRate,Temp,SysABP,DiasABP,In-hospital_death
0,00:00,136463,61.0,1.0,152.4,86.0,31.583625,87.248902,-1.0,12.64227,...,76.411843,136.79973,107.206897,2.206357,138.958957,19.877148,37.017982,118.078988,59.249764,0
1,01:00,136463,61.0,1.0,152.4,86.0,31.583625,75.000000,1.0,12.64227,...,76.411843,136.79973,107.206897,2.206357,138.958957,19.877148,35.200000,109.250000,56.000000,0
2,02:00,136463,61.0,1.0,152.4,86.0,32.800000,80.000000,1.0,12.64227,...,76.411843,136.79973,107.206897,2.206357,138.958957,19.877148,35.300000,112.000000,58.250000,0
3,03:00,136463,61.0,1.0,152.4,86.0,32.800000,86.000000,1.0,12.64227,...,76.411843,136.79973,107.206897,2.206357,138.958957,19.877148,35.700000,104.750000,54.500000,0
4,04:00,136463,61.0,1.0,152.4,86.0,32.800000,85.000000,1.0,12.64227,...,83.330000,136.79973,107.206897,2.206357,138.958957,19.877148,35.700000,99.500000,63.250000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195995,44:00,136897,49.0,1.0,182.9,105.0,25.600000,57.000000,1.0,12.80000,...,76.411843,117.00000,107.206897,0.900000,134.000000,19.877148,36.900000,159.666667,81.000000,1
195996,45:00,136897,49.0,1.0,182.9,105.0,25.600000,52.000000,1.0,12.80000,...,76.411843,117.00000,107.206897,0.900000,134.000000,19.877148,37.000000,164.000000,84.000000,1
195997,46:00,136897,49.0,1.0,182.9,105.0,25.600000,54.000000,1.0,12.80000,...,76.411843,117.00000,107.206897,0.900000,134.000000,19.877148,37.000000,165.000000,85.000000,1
195998,47:00,136897,49.0,1.0,182.9,105.0,25.600000,58.000000,1.0,12.80000,...,76.411843,117.00000,107.206897,0.900000,134.000000,19.877148,37.000000,159.000000,77.000000,1


In [14]:
val = pd.read_parquet("../../data/set-b-filled.parquet") #TODO for Pascal
val = val.drop(columns=["ICUType"])
val

Unnamed: 0,Time,RecordID,Age,Gender,Height,Weight,HCT,HR,MechVent,WBC,...,NIMAP,Glucose,ALP,Lactate,Na,RespRate,Temp,SysABP,DiasABP,In-hospital_death
0,00:00,144801,67.0,1.0,-1.0,102.0,31.583625,87.248902,-1.0,12.64227,...,76.411843,136.79973,107.206897,2.206357,138.958957,19.877148,37.017982,118.078988,59.249764,0
1,01:00,144801,67.0,1.0,-1.0,102.0,31.583625,92.000000,1.0,12.64227,...,89.000000,136.79973,107.206897,2.206357,138.958957,19.877148,37.017982,118.078988,59.249764,0
2,02:00,144801,67.0,1.0,-1.0,102.0,34.600000,84.000000,1.0,23.10000,...,77.000000,168.00000,72.000000,2.206357,139.000000,19.877148,37.017982,127.000000,52.000000,0
3,03:00,144801,67.0,1.0,-1.0,102.0,34.600000,85.000000,1.0,23.10000,...,67.000000,168.00000,72.000000,2.206357,139.000000,19.877148,37.017982,108.000000,47.000000,0
4,04:00,144801,67.0,1.0,-1.0,102.0,34.600000,83.000000,1.0,23.10000,...,75.000000,168.00000,72.000000,2.206357,139.000000,19.877148,38.100000,117.000000,50.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195995,44:00,148029,52.0,1.0,182.9,108.6,27.000000,83.000000,1.0,12.70000,...,75.330000,118.00000,107.206897,2.206357,137.000000,19.877148,37.600000,103.000000,55.000000,0
195996,45:00,148029,52.0,1.0,182.9,108.6,27.000000,81.000000,1.0,12.70000,...,71.000000,118.00000,107.206897,2.206357,137.000000,19.877148,37.600000,103.000000,54.000000,0
195997,46:00,148029,52.0,1.0,182.9,108.6,26.600000,84.000000,1.0,9.40000,...,71.000000,119.00000,107.206897,2.206357,133.000000,19.877148,37.600000,100.000000,54.000000,0
195998,47:00,148029,52.0,1.0,182.9,108.6,26.600000,81.000000,1.0,9.40000,...,71.000000,119.00000,107.206897,2.206357,133.000000,19.877148,37.600000,101.000000,53.000000,0


In [15]:
train = pd.get_dummies(train, columns=categorical, dtype=float)
train

Unnamed: 0,Time,RecordID,Age,Height,Weight,HCT,HR,WBC,Platelets,Mg,...,RespRate,Temp,SysABP,DiasABP,In-hospital_death,Gender_-1.0,Gender_0.0,Gender_1.0,MechVent_-1.0,MechVent_1.0
0,00:00,136463,61.0,152.4,86.0,31.583625,87.248902,12.64227,205.438863,2.030821,...,19.877148,37.017982,118.078988,59.249764,0,0.0,0.0,1.0,1.0,0.0
1,01:00,136463,61.0,152.4,86.0,31.583625,75.000000,12.64227,205.438863,2.030821,...,19.877148,35.200000,109.250000,56.000000,0,0.0,0.0,1.0,0.0,1.0
2,02:00,136463,61.0,152.4,86.0,32.800000,80.000000,12.64227,143.000000,2.030821,...,19.877148,35.300000,112.000000,58.250000,0,0.0,0.0,1.0,0.0,1.0
3,03:00,136463,61.0,152.4,86.0,32.800000,86.000000,12.64227,143.000000,2.030821,...,19.877148,35.700000,104.750000,54.500000,0,0.0,0.0,1.0,0.0,1.0
4,04:00,136463,61.0,152.4,86.0,32.800000,85.000000,12.64227,143.000000,2.030821,...,19.877148,35.700000,99.500000,63.250000,0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195995,44:00,136897,49.0,182.9,105.0,25.600000,57.000000,12.80000,180.000000,1.800000,...,19.877148,36.900000,159.666667,81.000000,1,0.0,0.0,1.0,0.0,1.0
195996,45:00,136897,49.0,182.9,105.0,25.600000,52.000000,12.80000,180.000000,1.800000,...,19.877148,37.000000,164.000000,84.000000,1,0.0,0.0,1.0,0.0,1.0
195997,46:00,136897,49.0,182.9,105.0,25.600000,54.000000,12.80000,180.000000,1.800000,...,19.877148,37.000000,165.000000,85.000000,1,0.0,0.0,1.0,0.0,1.0
195998,47:00,136897,49.0,182.9,105.0,25.600000,58.000000,12.80000,180.000000,1.800000,...,19.877148,37.000000,159.000000,77.000000,1,0.0,0.0,1.0,0.0,1.0


In [16]:
one_hot_columns = train.loc[:, target:].columns[1:].to_list()
cont_features = train[train.columns.difference(properties + [target] + one_hot_columns)]

scaler = StandardScaler().fit(cont_features.to_numpy())
scaled_cont_features = scaler.transform(cont_features.to_numpy())
scaled_cont_features = pd.DataFrame(scaled_cont_features, columns=cont_features.columns)
scaled_cont_features[properties + [target] + one_hot_columns] = train[properties + [target] + one_hot_columns]
train = scaled_cont_features[properties + one_hot_columns + cont_features.columns.to_list() + [target]]
train

Unnamed: 0,RecordID,Time,Gender_-1.0,Gender_0.0,Gender_1.0,MechVent_-1.0,MechVent_1.0,ALP,ALT,AST,...,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,In-hospital_death
0,136463,00:00,0.0,0.0,1.0,1.0,0.0,2.223182e-16,-6.685649e-17,0.0,...,6.707814e-15,2.014173e-15,0.000000,-4.438207e-16,0.0,0.000000,2.738935e-16,0.306013,0.000000,0
1,136463,01:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.0,...,1.110962e+00,-4.171250e-01,-1.623199,-4.438207e-16,0.0,2.674900,2.738935e-16,0.306013,-0.016426,0
2,136463,02:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.0,...,1.110962e+00,-2.872014e-01,-1.533913,-4.438207e-16,0.0,0.597106,2.738935e-16,0.306013,-0.016426,0
3,136463,03:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.0,...,1.110962e+00,-6.297272e-01,-1.176770,-4.438207e-16,0.0,0.597106,2.738935e-16,0.306013,-0.016426,0
4,136463,04:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.0,...,1.110962e+00,-8.777631e-01,-1.176770,-4.438207e-16,0.0,0.102393,2.738935e-16,0.306013,-0.016426,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195995,136897,44:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.0,...,-7.771199e-01,1.964807e+00,-0.105341,-2.621947e+00,0.0,-0.441791,2.432006e-02,0.903420,-0.021638,1
195996,136897,45:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.0,...,-7.771199e-01,2.169535e+00,-0.016055,-2.621947e+00,0.0,-0.441791,2.432006e-02,0.903420,-0.021638,1
195997,136897,46:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.0,...,-7.771199e-01,2.216780e+00,-0.016055,-2.621947e+00,0.0,-0.293377,2.432006e-02,0.903420,-0.021638,1
195998,136897,47:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.0,...,-7.771199e-01,1.933311e+00,-0.016055,-2.621947e+00,0.0,-0.095492,2.432006e-02,0.903420,-0.021638,1


In [17]:
train["Features"] = train.drop(columns=["RecordID", "Time", "In-hospital_death"]).apply(lambda row: list(row), axis=1)
train = train[["RecordID", "Time", "Features", "In-hospital_death"]]
train_dataset = train.groupby(["RecordID", "In-hospital_death"]).Features.apply(list).reset_index()
train_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Features"] = train.drop(columns=["RecordID", "Time", "In-hospital_death"]).apply(lambda row: list(row), axis=1)


Unnamed: 0,RecordID,In-hospital_death,Features
0,132539,0,"[[0.0, 1.0, 0.0, 1.0, 0.0, 2.223182107728116e-..."
1,132540,0,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
2,132541,0,"[[0.0, 1.0, 0.0, 1.0, 0.0, 2.223182107728116e-..."
3,132543,0,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
4,132545,0,"[[0.0, 1.0, 0.0, 1.0, 0.0, 2.223182107728116e-..."
...,...,...,...
3995,142665,0,"[[0.0, 1.0, 0.0, 1.0, 0.0, 2.223182107728116e-..."
3996,142667,0,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
3997,142670,0,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
3998,142671,1,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."


In [18]:
val = pd.get_dummies(val, columns=categorical, dtype=float)
val

Unnamed: 0,Time,RecordID,Age,Height,Weight,HCT,HR,WBC,Platelets,Mg,...,RespRate,Temp,SysABP,DiasABP,In-hospital_death,Gender_-1.0,Gender_0.0,Gender_1.0,MechVent_-1.0,MechVent_1.0
0,00:00,144801,67.0,-1.0,102.0,31.583625,87.248902,12.64227,205.438863,2.030821,...,19.877148,37.017982,118.078988,59.249764,0,0.0,0.0,1.0,1.0,0.0
1,01:00,144801,67.0,-1.0,102.0,31.583625,92.000000,12.64227,205.438863,2.030821,...,19.877148,37.017982,118.078988,59.249764,0,0.0,0.0,1.0,0.0,1.0
2,02:00,144801,67.0,-1.0,102.0,34.600000,84.000000,23.10000,284.000000,2.200000,...,19.877148,37.017982,127.000000,52.000000,0,0.0,0.0,1.0,0.0,1.0
3,03:00,144801,67.0,-1.0,102.0,34.600000,85.000000,23.10000,284.000000,2.200000,...,19.877148,37.017982,108.000000,47.000000,0,0.0,0.0,1.0,0.0,1.0
4,04:00,144801,67.0,-1.0,102.0,34.600000,83.000000,23.10000,284.000000,2.200000,...,19.877148,38.100000,117.000000,50.000000,0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195995,44:00,148029,52.0,182.9,108.6,27.000000,83.000000,12.70000,172.000000,1.500000,...,19.877148,37.600000,103.000000,55.000000,0,0.0,0.0,1.0,0.0,1.0
195996,45:00,148029,52.0,182.9,108.6,27.000000,81.000000,12.70000,172.000000,1.500000,...,19.877148,37.600000,103.000000,54.000000,0,0.0,0.0,1.0,0.0,1.0
195997,46:00,148029,52.0,182.9,108.6,26.600000,84.000000,9.40000,109.000000,1.500000,...,19.877148,37.600000,100.000000,54.000000,0,0.0,0.0,1.0,0.0,1.0
195998,47:00,148029,52.0,182.9,108.6,26.600000,81.000000,9.40000,109.000000,1.500000,...,19.877148,37.600000,101.000000,53.000000,0,0.0,0.0,1.0,0.0,1.0


In [19]:
one_hot_columns = val.loc[:, target:].columns[1:].to_list()
cont_features = val[val.columns.difference(properties + [target] + one_hot_columns)]

scaled_cont_features = scaler.transform(cont_features.to_numpy())
scaled_cont_features = pd.DataFrame(scaled_cont_features, columns=cont_features.columns)
scaled_cont_features[properties + [target] + one_hot_columns] = val[properties + [target] + one_hot_columns]
val = scaled_cont_features[properties + one_hot_columns + cont_features.columns.to_list() + [target]]
val

Unnamed: 0,RecordID,Time,Gender_-1.0,Gender_0.0,Gender_1.0,MechVent_-1.0,MechVent_1.0,ALP,ALT,AST,...,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,In-hospital_death
0,144801,00:00,0.0,0.0,1.0,1.0,0.0,2.223182e-16,-6.685649e-17,0.000000,...,6.707814e-15,2.014173e-15,0.000000,-4.438207e-16,0.0,0.000000,2.738935e-16,0.809092,0.000000,0
1,144801,01:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.000000,...,6.707814e-15,2.014173e-15,0.000000,-4.438207e-16,0.0,-0.629782,2.738935e-16,0.809092,0.000000,0
2,144801,02:00,0.0,0.0,1.0,0.0,1.0,-5.507856e-01,-3.491180e-01,-0.363683,...,6.707814e-15,4.214726e-01,0.000000,-4.438207e-16,0.0,-0.392320,1.612460e+00,0.809092,-0.011214,0
3,144801,03:00,0.0,0.0,1.0,0.0,1.0,-5.507856e-01,-3.491180e-01,-0.363683,...,6.707814e-15,-4.761811e-01,0.000000,-4.438207e-16,0.0,-0.590205,1.612460e+00,0.809092,-0.011214,0
4,144801,04:00,0.0,0.0,1.0,0.0,1.0,-5.507856e-01,-3.491180e-01,-0.363683,...,6.707814e-15,-5.097672e-02,0.966088,-4.438207e-16,0.0,-0.540734,1.612460e+00,0.809092,-0.011214,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195995,148029,44:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.000000,...,6.389415e-01,-7.124058e-01,0.519659,-4.438207e-16,0.0,-0.243906,8.901229e-03,1.016613,-0.021638,0
195996,148029,45:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.000000,...,6.389415e-01,-7.124058e-01,0.519659,-4.438207e-16,0.0,-0.144963,8.901229e-03,1.016613,-0.021638,0
195997,148029,46:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.000000,...,6.389415e-01,-8.541406e-01,0.519659,-4.438207e-16,0.0,-0.194435,-4.999203e-01,1.016613,-0.021638,0
195998,148029,47:00,0.0,0.0,1.0,0.0,1.0,2.223182e-16,-6.685649e-17,0.000000,...,6.389415e-01,-8.068957e-01,0.519659,-4.438207e-16,0.0,-0.194435,-4.999203e-01,1.016613,-0.021638,0


In [20]:
val["Features"] = val.drop(columns=["RecordID", "Time", "In-hospital_death"]).apply(lambda row: list(row), axis=1)
val = val[["RecordID", "Time", "Features", "In-hospital_death"]]
val_dataset = val.groupby(["RecordID", "In-hospital_death"]).Features.apply(list).reset_index()
val_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val["Features"] = val.drop(columns=["RecordID", "Time", "In-hospital_death"]).apply(lambda row: list(row), axis=1)


Unnamed: 0,RecordID,In-hospital_death,Features
0,142675,1,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
1,142676,0,"[[0.0, 1.0, 0.0, 1.0, 0.0, 2.223182107728116e-..."
2,142680,1,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
3,142683,0,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
4,142688,0,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
...,...,...,...
3995,152849,0,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
3996,152851,0,"[[0.0, 0.0, 1.0, 1.0, 0.0, 2.223182107728116e-..."
3997,152858,0,"[[0.0, 1.0, 0.0, 1.0, 0.0, 2.223182107728116e-..."
3998,152862,0,"[[0.0, 1.0, 0.0, 1.0, 0.0, 2.223182107728116e-..."


In [21]:
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler, WeightedRandomSampler

training_dataset = TensorDataset(
    torch.stack(train_dataset.apply(lambda row: torch.from_numpy(np.array(row["Features"]).astype(np.float32)), axis=1).tolist()),
    torch.tensor(train_dataset["In-hospital_death"], dtype=torch.long)
)

class_sample_count = train_dataset["In-hospital_death"].value_counts().to_numpy()

weight = 1.0 / class_sample_count
samples_weight = torch.from_numpy(weight[train_dataset["In-hospital_death"].to_numpy()]).double()
samples_weight

sampler = WeightedRandomSampler(samples_weight, len(samples_weight))


val_dataset = TensorDataset(
    torch.stack(val_dataset.apply(lambda row: torch.from_numpy(np.array(row["Features"]).astype(np.float32)), axis=1).tolist()),
    torch.tensor(val_dataset["In-hospital_death"], dtype=torch.long)
)

#### Positional Encoding

In [12]:
import math

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 512) -> None:
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        positional_encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        positional_encoding[:, 0::2] = torch.sin(position * div_term)
        positional_encoding[:, 1::2] = torch.cos(position * div_term)
        positional_encoding = positional_encoding.unsqueeze(0).transpose(0, 1)
        
        # positional encoding is not a trainable
        self.register_buffer('positional_encoding', positional_encoding)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.positional_encoding[:x.size(dim=0), :]
        return self.dropout(x)

In [13]:
class TimeSeriesTransformer(nn.Module):
    
    def __init__(self, 
                 input_dim: int, 
                 num_classes: int, 
                 n_heads: int, 
                 d_model: int, 
                 num_layers: int,
                 dim_feedforward: int, 
                 dropout: float = 0.1, 
                 max_len=512
        ) -> None:
        
        super().__init__()
        
        self.input_projection = nn.Linear(input_dim, d_model)
        
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout, max_len=max_len)
        
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=f.selu,
        )
        
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        
        self.dropout = nn.Dropout(dropout)
        
        # outputs logits, there is no need for softmax (or log softmax), because CrossEntropyLoss
        # does this internally. If probabilities are needed, add f.log_softmax and use NLLoss.
        self.output_layer = nn.Linear(d_model * max_len, num_classes)
        
    def forward(self, x: torch.Tensor, padding_mask=None) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): (batch_size, seq_len, input_dim)
            padding_masks: (batch_size, seq_len) boolean tensor, 1 means keep vector at this position, 0 means padding
        """
        
        # Transformer expects (seq_len, batch_size, d_model)
        x = x.permute(1, 0, 2)
        
        # print(x.shape)
        
        # Project input into d_model dimensional space
        x = self.input_projection(x)
        
        # print(x.shape)
        
        # Add positional encoding
        x = self.positional_encoding(x)
       
        # print("positional encoding", x.shape)
         
        # Transformer encoder
        #if padding_mask is not None:
        #    padding_mask = padding_mask.bool()
        x = self.transformer_encoder(x)#, src_key_padding_mask=~padding_mask)

        # print("transformer encoder", x.shape)
       
        x = f.selu(x)
        x = x.permute(1, 0, 2)
        x = self.dropout(x)
        
        # print("dropout", x.shape)
        
        # Output
        output = x #* padding_mask.unsqueeze(-1)  # zero-out padding embeddings
        output = output.reshape(output.shape[0], -1)  # (batch_size, seq_length * d_model)
        
        # print("before output", output.shape)
        output = self.output_layer(output)  # (batch_size, num_classes)
        return output

In [14]:
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [15]:
# Model parameters
BATCH_SIZE = 64
SEQ_LENGTH = train_dataset["Features"].apply(len).max()
N_FEATURES = train_dataset["Features"].apply(lambda x: len(x[0])).max()

hyperparams = {
    "n_trials": 10,
    "batch_size": [16, 32, 64],
    "epochs": [10, 20, 50],
    "learning_rate": [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2],
    "weight_decay":  [1e-2, 1e-3, 5e-5],
    "d_model": [32, 64, 128],
    "n_heads": [2, 4, 8],
    "num_layers": [2, 3, 4],
    "dim_feedforward": [64, 128, 256],
}

In [16]:
def train_model(n_epochs: int, train_dataloader, model, optimizer, criterion) -> None:
    model.train()
    for epoch in range(n_epochs):
        for i, batch in enumerate(train_dataloader):
        
            x, y = batch
            x = x.to(device)
            y = y.to(device)
            
            pred = model(x)
            loss = criterion(pred, y)
                    
            loss.backward()
            optimizer.step()
            optimizer.zero_grad() 


def validate_model(model, val_dataloader) -> float:
    model.eval()
    val_pred, val_true = [], []
    with torch.no_grad():
        for batch in val_dataloader:
            x, y = batch
            x = x.to(device)
            y = y.to(device)
            
            logit = model(x).cpu()
                       
            val_pred.extend(torch.argmax(logit.cpu(), dim=-1))
            val_true.extend(y.cpu())

    fpr, tpr, thresholds = roc_curve(np.array(val_true), np.array(val_pred))
    return auc(fpr, tpr)
    

def objective(trial: optuna.Trial) -> float:
    
    # define hyperparameter search space from the given configuration
    batch_size = trial.suggest_categorical("batch_size", hyperparams["batch_size"])
    epochs = trial.suggest_categorical("epochs", hyperparams["epochs"])
    lr = trial.suggest_categorical("learning_rate", hyperparams["learning_rate"])
    decay = trial.suggest_categorical("weight_decay", hyperparams["weight_decay"])
    d_model = trial.suggest_categorical("d_model", hyperparams["d_model"])
    n_heads = trial.suggest_categorical("n_heads", hyperparams["n_heads"])
    num_layers = trial.suggest_categorical("num_layers", hyperparams["num_layers"])
    dim_feedforward = trial.suggest_categorical("dim_feedforward", hyperparams["dim_feedforward"])
    
    # create dataloaders
    train_dataloader = DataLoader(training_dataset, sampler=sampler, batch_size=batch_size)
    val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
    
    # create the model
    model = TimeSeriesTransformer(
        input_dim=N_FEATURES,
        num_classes=train_dataset["In-hospital_death"].nunique(),
        d_model=d_model,
        n_heads=n_heads,
        num_layers=num_layers,
        dim_feedforward=dim_feedforward,
        max_len=SEQ_LENGTH
    )

    model.to(device)

    # define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=decay)

    # train model
    train_model(epochs, train_dataloader, model, optimizer, criterion)
    
    return validate_model(model, val_dataloader)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=hyperparams["n_trials"], show_progress_bar=True, n_jobs=-1)

# print the top 10 best configurations
study_results: pd.DataFrame = study.trials_dataframe(attrs=["duration", "params", "value"])
study_results = study_results.rename({"value": "score"}, axis=1).sort_values("score", ascending=False)
study_results.head(10)

[I 2025-04-06 23:30:40,767] A new study created in memory with name: no-name-f472432d-0fda-4058-8ff1-e530ca998067
Best trial: 0. Best value: 0.761023:  10%|█         | 1/10 [02:40<24:07, 160.80s/it]

[I 2025-04-06 23:33:21,594] Trial 0 finished with value: 0.7610230145441413 and parameters: {'batch_size': 64, 'epochs': 10, 'learning_rate': 0.0001, 'weight_decay': 0.01, 'd_model': 128, 'n_heads': 4, 'num_layers': 3, 'dim_feedforward': 64}. Best is trial 0 with value: 0.7610230145441413.


Best trial: 0. Best value: 0.761023:  20%|██        | 2/10 [04:42<18:23, 137.92s/it]

[I 2025-04-06 23:35:23,503] Trial 4 finished with value: 0.5 and parameters: {'batch_size': 32, 'epochs': 10, 'learning_rate': 0.05, 'weight_decay': 0.001, 'd_model': 128, 'n_heads': 2, 'num_layers': 4, 'dim_feedforward': 64}. Best is trial 0 with value: 0.7610230145441413.
[I 2025-04-06 23:35:23,511] Trial 9 finished with value: 0.7532995173840245 and parameters: {'batch_size': 32, 'epochs': 10, 'learning_rate': 1e-05, 'weight_decay': 5e-05, 'd_model': 64, 'n_heads': 4, 'num_layers': 4, 'dim_feedforward': 256}. Best is trial 0 with value: 0.7610230145441413.


Best trial: 0. Best value: 0.761023:  40%|████      | 4/10 [04:43<05:10, 51.69s/it] 

[I 2025-04-06 23:35:24,269] Trial 8 finished with value: 0.5713828425096031 and parameters: {'batch_size': 64, 'epochs': 20, 'learning_rate': 0.01, 'weight_decay': 5e-05, 'd_model': 32, 'n_heads': 2, 'num_layers': 3, 'dim_feedforward': 64}. Best is trial 0 with value: 0.7610230145441413.


Best trial: 0. Best value: 0.761023:  50%|█████     | 5/10 [08:14<08:25, 101.11s/it]

[I 2025-04-06 23:38:55,717] Trial 1 finished with value: 0.6937666207032404 and parameters: {'batch_size': 64, 'epochs': 50, 'learning_rate': 0.0005, 'weight_decay': 0.01, 'd_model': 32, 'n_heads': 2, 'num_layers': 3, 'dim_feedforward': 64}. Best is trial 0 with value: 0.7610230145441413.


Best trial: 0. Best value: 0.761023:  60%|██████    | 6/10 [11:42<08:55, 133.82s/it]

[I 2025-04-06 23:42:23,555] Trial 6 finished with value: 0.7496942611379231 and parameters: {'batch_size': 16, 'epochs': 20, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'd_model': 128, 'n_heads': 8, 'num_layers': 4, 'dim_feedforward': 256}. Best is trial 0 with value: 0.7610230145441413.


Best trial: 0. Best value: 0.761023:  70%|███████   | 7/10 [12:32<05:24, 108.19s/it]

[I 2025-04-06 23:43:13,204] Trial 7 finished with value: 0.5411577858760956 and parameters: {'batch_size': 32, 'epochs': 50, 'learning_rate': 0.005, 'weight_decay': 5e-05, 'd_model': 128, 'n_heads': 4, 'num_layers': 2, 'dim_feedforward': 128}. Best is trial 0 with value: 0.7610230145441413.


Best trial: 0. Best value: 0.761023:  80%|████████  | 8/10 [17:08<05:18, 159.15s/it]

[I 2025-04-06 23:47:49,513] Trial 3 finished with value: 0.5594528710725895 and parameters: {'batch_size': 16, 'epochs': 50, 'learning_rate': 0.01, 'weight_decay': 0.01, 'd_model': 64, 'n_heads': 4, 'num_layers': 2, 'dim_feedforward': 64}. Best is trial 0 with value: 0.7610230145441413.
[I 2025-04-06 23:47:49,515] Trial 5 finished with value: 0.5 and parameters: {'batch_size': 16, 'epochs': 50, 'learning_rate': 0.01, 'weight_decay': 0.001, 'd_model': 32, 'n_heads': 8, 'num_layers': 2, 'dim_feedforward': 256}. Best is trial 0 with value: 0.7610230145441413.


Best trial: 0. Best value: 0.761023: 100%|██████████| 10/10 [17:24<00:00, 104.49s/it]

[I 2025-04-06 23:48:05,712] Trial 2 finished with value: 0.7454611116582948 and parameters: {'batch_size': 16, 'epochs': 50, 'learning_rate': 1e-05, 'weight_decay': 0.001, 'd_model': 64, 'n_heads': 4, 'num_layers': 3, 'dim_feedforward': 64}. Best is trial 0 with value: 0.7610230145441413.





Unnamed: 0,duration,params_batch_size,params_d_model,params_dim_feedforward,params_epochs,params_learning_rate,params_n_heads,params_num_layers,params_weight_decay,score
0,0 days 00:02:40.796240,64,128,64,10,0.0001,4,3,0.01,0.761023
9,0 days 00:04:42.699191,32,64,256,10,1e-05,4,4,5e-05,0.7533
6,0 days 00:11:42.743670,16,128,256,20,5e-05,8,4,0.01,0.749694
2,0 days 00:17:24.907855,16,64,64,50,1e-05,4,3,0.001,0.745461
1,0 days 00:08:14.914957,64,32,64,50,0.0005,2,3,0.01,0.693767
8,0 days 00:04:43.458091,64,32,64,20,0.01,2,3,5e-05,0.571383
3,0 days 00:17:08.707201,16,64,64,50,0.01,4,2,0.01,0.559453
7,0 days 00:12:32.393709,32,128,128,50,0.005,4,2,5e-05,0.541158
4,0 days 00:04:42.696817,32,128,64,10,0.05,2,4,0.001,0.5
5,0 days 00:17:08.707030,16,32,256,50,0.01,8,2,0.001,0.5


In [17]:
study_results.iloc[0]

duration                  0 days 00:02:40.796240
params_batch_size                             64
params_d_model                               128
params_dim_feedforward                        64
params_epochs                                 10
params_learning_rate                      0.0001
params_n_heads                                 4
params_num_layers                              3
params_weight_decay                         0.01
score                                   0.761023
Name: 0, dtype: object

In [18]:
best_params = study_results.iloc[0]

BATCH_SIZE = int(best_params.params_batch_size)

train_dataloader = DataLoader(training_dataset, sampler=sampler, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)

In [19]:
BATCH_SIZE = int(best_params.params_batch_size)
D_MODEL = int(best_params.params_d_model)
DIM_FEEDFORWARD = int(best_params.params_dim_feedforward)
NUM_LAYERS = int(best_params.params_num_layers)
N_HEADS = int(best_params.params_n_heads)

model = TimeSeriesTransformer(
    input_dim=N_FEATURES,
    num_classes=train_dataset["In-hospital_death"].nunique(),
    d_model=D_MODEL,
    n_heads=N_HEADS,
    num_layers=NUM_LAYERS,
    dim_feedforward=DIM_FEEDFORWARD,
    max_len=SEQ_LENGTH
)

model.to(device)



TimeSeriesTransformer(
  (input_projection): Linear(in_features=43, out_features=128, bias=True)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=64, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=64, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (output_layer): Linear(in_features=6272, out_feat

In [20]:
# Train the model
N_EPOCHS = int(best_params.params_epochs)
LR = float(best_params.params_learning_rate)
DECAY = float(best_params.params_weight_decay)
 
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=DECAY)

n_batches = len(train_dataloader)
n_examples = len(train_dataloader.dataset) 
    
for epoch in range(N_EPOCHS):
    print(f"Epoch {epoch}:")
    model.train()
    for i, batch in enumerate(train_dataloader):
       
        x, y = batch
        x = x.to(device)
        y = y.to(device)
        
        pred = model(x)
        loss = criterion(pred, y)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad() 
        
        # if i % 10 == 0 or i == n_batches - 1:
        #     current, loss = i * BATCH_SIZE, loss.cpu().mean().item()
        #     print(f"loss: {loss:>7f} [{current:>5d}/{n_examples:>5d}]")
        
    
    model.eval()
    val_pred, val_true = [], []
    with torch.no_grad():
        for batch in val_dataloader:
            x, y = batch
            x = x.to(device)
            y = y.to(device)
            
            logit = model(x).cpu()
            probs = f.sigmoid(logit)
            
            val_pred.extend(torch.argmax(logit.cpu(), dim=-1))
            val_true.extend(y.cpu())

    fpr, tpr, thresholds = roc_curve(np.array(val_true), np.array(val_pred) )
    print(auc(fpr, tpr))

Epoch 0:
0.7414187924751305
Epoch 1:
0.7248186086214256
Epoch 2:
0.7041391706884664
Epoch 3:
0.7552119242260087
Epoch 4:
0.7634709773794282
Epoch 5:
0.755640779408385
Epoch 6:
0.7613697921796514
Epoch 7:
0.751370695032667
Epoch 8:
0.7606680291539446
Epoch 9:
0.7665858202830034


In [None]:
test = pd.read_parquet("../../data/set-b-filled.parquet") #TODO for Pascal
test = test.drop(columns=["ICUType"])
test = pd.get_dummies(test, columns=categorical, dtype=float)

one_hot_columns = test.loc[:, target:].columns[1:].to_list()
cont_features = test[test.columns.difference(properties + [target] + one_hot_columns)]

scaled_cont_features = scaler.transform(cont_features.to_numpy())
scaled_cont_features = pd.DataFrame(scaled_cont_features, columns=cont_features.columns)
scaled_cont_features[properties + [target] + one_hot_columns] = test[properties + [target] + one_hot_columns]
test = scaled_cont_features[properties + one_hot_columns + cont_features.columns.to_list() + [target]]


test["Features"] = test.drop(columns=["RecordID", "Time", "In-hospital_death"]).apply(lambda row: list(row), axis=1)
test = test[["RecordID", "Time", "Features", "In-hospital_death"]]
test_dataset = test.groupby(["RecordID", "In-hospital_death"]).Features.apply(list).reset_index()

test_dataset = TensorDataset(
    torch.stack(test_dataset.apply(lambda row: torch.from_numpy(np.array(row["Features"]).astype(np.float32)), axis=1).tolist()),
    torch.tensor(test_dataset["In-hospital_death"], dtype=torch.long)
)

test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=BATCH_SIZE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["Features"] = test.drop(columns=["RecordID", "Time", "In-hospital_death"]).apply(lambda row: list(row), axis=1)


In [22]:
model.eval()
val_pred, val_true = [], []
with torch.no_grad():
    for batch in test_dataloader:
        x, y = batch
        x = x.to(device)
        y = y.to(device)
        
        logit = model(x).cpu()        
        val_pred.extend(torch.argmax(logit.cpu(), dim=-1))
        val_true.extend(y.cpu())

fpr, tpr, thresholds = roc_curve(np.array(val_true), np.array(val_pred))
print(f"AUC on test: {auc(fpr, tpr)}")

auprc = average_precision_score(np.array(val_true), np.array(val_pred))
print(f"AuPRC on test: {auprc}")

AUC on test: 0.7665858202830034
AuPRC on test: 0.29762755556988
