In [352]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
import torch
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.logging import TensorBoardLogger
import os
from pytorch_lightning.callbacks import ModelCheckpoint

In [49]:
# Both input CSVs are read from ./data, relative to the working directory.
data_path = Path('./data')
train, test = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ('train.csv', 'data_for_test.csv')
)

In [50]:
# Prefer the second GPU ('cuda:1') when the host has one. Fall back to the
# first GPU on single-GPU hosts and to the CPU when CUDA is unavailable, so
# tensors are never sent to a device ordinal that does not exist.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

In [53]:
# Keep only the text after the last '-' in `key` and store it as an integer.
# Going through astype(str) keeps this cell safe to re-run: once `key` is
# already numeric the split leaves the value unchanged instead of raising.
train['key'] = train['key'].astype(str).str.rsplit('-', n=1).str[-1].astype(int)

In [54]:
# Keep only the text after the last '-' in `key` and store it as an integer.
# Going through astype(str) keeps this cell safe to re-run: once `key` is
# already numeric the split leaves the value unchanged instead of raising.
test['key'] = test['key'].astype(str).str.rsplit('-', n=1).str[-1].astype(int)

In [55]:
# Inspect the smallest and largest key present in the test set.
test['key'].min(), test['key'].max()

(1, 73)

In [56]:
# Drop training rows whose key exceeds the largest key seen in the test set.
# The bound is read from the data instead of being hard-coded (it is 73 for the
# test file inspected above), so the filter stays correct if the file changes.
train = train.loc[train.key <= test.key.max()]

In [123]:
# Order rows by patient and then by key, and renumber the index 0..n-1 so that
# row labels coincide with row positions.
train = (
    train
    .sort_values(by=['patient_id', 'key'])
    .reset_index(drop=True)
)

In [58]:
# One row per distinct combination of the per-patient (non time-varying) columns.
static_features = (
    train.loc[:, ['patient_id', 'gender', 'age', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [59]:
# Make patient_id the row index so static features can be looked up by patient.
static_features = static_features.set_index(keys='patient_id', drop=True)

In [60]:
# Per-reading signal columns together with the identifiers used to group them.
timeseries_features = train.loc[
    :, ['patient_id', 'key', 'xx1', 'xx2', 'xx3', 'xx4', 'xx5']
]

In [61]:
# Sort by (patient_id, key) and renumber rows so index labels equal row
# positions. The result is bound to `timeseries_features` itself, so the
# sorted, re-indexed frame is the one the following cells operate on.
timeseries_features = (
    timeseries_features
    .sort_values(['patient_id', 'key'])
    .reset_index(drop=True)
)

In [94]:
# Map every (patient_id, key) group to the row positions it occupies, then
# tabulate the first and last position of each group.
d = timeseries_features.groupby(['patient_id', 'key'], as_index=False).indices
l = pd.DataFrame(
    [
        [*group_key, min(row_positions), max(row_positions)]
        for group_key, row_positions in d.items()
    ],
    columns=['patient_id', 'key', 'min_index', 'max_index'],
)

In [116]:
# Replace each group's start with the smallest start of its patient, so every
# window begins at that patient's first row and ends at the group's last row.
l['min_index'] = l.groupby(by='patient_id')['min_index'].transform('min')

In [133]:
# Distinct target values per (patient_id, key).
target_df = (
    train.loc[:, ['patient_id', 'key', 'y_mean_MAP', 'y_mean_HR']]
    .drop_duplicates()
)

In [136]:
# Attach the regression targets to each window's start/end row bounds.
util_df = l.merge(target_df, on=['patient_id', 'key'])

In [129]:
# Check that the window table and the raw time series are already ordered by
# (patient_id, key); windows are addressed as contiguous row ranges, so any
# other ordering would mix readings from different groups.
assert l.equals(l.sort_values(['patient_id', 'key']))
assert timeseries_features.equals(
    timeseries_features.sort_values(['patient_id', 'key'])
)

In [142]:
# Show the first window record (row bounds plus targets).
util_df.iloc[0, :]

patient_id     0.000000
key           28.000000
min_index      0.000000
max_index     29.000000
y_mean_MAP    86.426667
y_mean_HR     79.130000
Name: 0, dtype: float64

In [143]:
# Preview the first five time-series rows.
timeseries_features.head(5)

Unnamed: 0,patient_id,key,xx1,xx2,xx3,xx4,xx5
0,0,28,82.0,100.0,124.0,69.0,89.0
1,0,28,82.0,100.0,121.0,67.0,87.0
2,0,28,81.0,100.0,118.0,66.0,85.0
3,0,28,81.0,100.0,117.0,66.0,85.0
4,0,28,81.0,100.0,116.0,67.0,85.0


In [336]:
class HeartVariableLength(Dataset):
    """Variable-length time-series windows paired with a two-value target.

    Each item is described by one row of ``util_df``: ``min_index`` and
    ``max_index`` are the first and last row *positions* (inclusive) of the
    window inside ``timeseries``, and ``y_mean_MAP`` / ``y_mean_HR`` are the
    regression targets.

    All lookups are positional (``iloc``), so the dataset behaves the same
    whatever index labels ``util_df`` or ``timeseries`` happen to carry.

    Args:
        util_df: DataFrame with at least the columns ``min_index``,
            ``max_index``, ``y_mean_MAP`` and ``y_mean_HR``.
        timeseries: DataFrame holding the full time series, with at least the
            columns ``xx1`` .. ``xx5``.
    """

    FEATURE_COLUMNS = ['xx1', 'xx2', 'xx3', 'xx4', 'xx5']
    TARGET_COLUMNS = ['y_mean_MAP', 'y_mean_HR']

    def __init__(self, util_df, timeseries):
        self.util_df = util_df
        self.timeseries = timeseries

    def __len__(self):
        """Number of windows, i.e. number of rows in ``util_df``."""
        return len(self.util_df)

    def __getitem__(self, idx):
        """Return ``(inp, target)`` for the ``idx``-th window.

        Returns:
            inp: float32 tensor of shape ``(window_length, 5)``.
            target: float32 tensor of shape ``(1, 2)`` ordered as
                ``(y_mean_MAP, y_mean_HR)``.
        """
        row = self.util_df.iloc[idx]

        # A row that mixes int and float columns comes back as floats, so the
        # bounds are cast to int before slicing. The stop bound is +1 because
        # max_index is inclusive while positional slices are half-open.
        start = int(row['min_index'])
        stop = int(row['max_index']) + 1

        window = self.timeseries.iloc[start:stop][self.FEATURE_COLUMNS]
        target_values = row[self.TARGET_COLUMNS].values.astype('float64')

        inp = torch.tensor(window.values).float()
        target = torch.tensor(target_values).reshape(1, 2).float()
        return inp, target

    
            

In [None]:
def r2_squared_loss_function(preds, target, eps=1e-5):
    '''
    Negative mean R^2 score, usable as a loss (lower is better).

    Argument order matters: predictions first, then targets.

    For every column the score is ``1 - MSE(preds, target) / Var(target)``,
    where both statistics are taken over dim 0 (the batch dimension). The
    per-column scores are averaged and the sign is flipped so that a perfect
    fit yields -1.

    preds  : tensor of predictions, same shape as ``target``.
    target : tensor of ground-truth values; dim 0 is the batch dimension.
    eps    : small constant added to the target variance so a constant target
             column does not cause a division by zero. The default equals the
             previously hard-coded 10e-6.

    Returns a scalar tensor.
    '''
    target_mean = torch.mean(target, 0)

    # Variance of the targets around their own batch mean (the R^2 baseline).
    target_variance = torch.mean((target - target_mean) ** 2, 0)
    residual_mse = torch.mean((preds - target) ** 2, 0)

    r2_per_column = 1 - residual_mse / (target_variance + eps)

    return -torch.mean(r2_per_column)


In [337]:
# Dataset of per-(patient, key) windows drawn from the full time series.
heart_dataset = HeartVariableLength(
    util_df=util_df,
    timeseries=timeseries_features,
)

In [365]:
# Hand-built sample batch: the first 32 dataset items, for trying out the collate function.
batch = list(map(heart_dataset.__getitem__, range(32)))

In [421]:
def variable_length_collate(batch):
    """Collate variable-length ``(sequence, label)`` samples into padded tensors.

    Args:
        batch: list of ``(sequence, label)`` pairs, where ``sequence`` is a
            tensor whose first dimension is the sequence length and ``label``
            is a tensor that can be concatenated along dim 0.

    Returns:
        ``(sequences_padded, labels)``, both moved to the module-level
        ``device``. Samples are ordered from longest to shortest sequence;
        shorter sequences are zero-padded at the end up to the longest length,
        and ``labels`` follows the same order.
    """
    # Longest sequence first; labels are taken from the same ordering so the
    # rows of both returned tensors stay aligned.
    ordered = sorted(batch, key=lambda sample: sample[0].shape[0], reverse=True)

    sequences_padded = torch.nn.utils.rnn.pad_sequence(
        [sample[0] for sample in ordered], batch_first=True
    )
    labels = torch.cat([sample[1] for sample in ordered])

    return sequences_padded.to(device), labels.to(device)

In [410]:
# Try the collate function on the hand-built sample batch.
sequences_padded, labels = variable_length_collate(batch)

In [411]:
# Number of windows per training batch.
BATCH_SIZE = 32

In [413]:
# Shuffled training loader; padding of unequal-length windows happens in the collate function.
dataloader_heart = DataLoader(
    dataset=heart_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=variable_length_collate,
)

In [414]:
class Lstm1(pl.LightningModule):
    """Two-layer LSTM regressor producing two outputs per input sequence.

    Architecture: LSTM(5 -> hidden_size, 2 layers, unidirectional, batch first)
    -> Linear(hidden_size -> hidden_size // 2) -> ReLU -> BatchNorm1d
    -> Linear(hidden_size // 2 -> 2).

    Args:
        hidden_size: Width of the LSTM hidden state.
        lr: Learning rate passed to Adam in ``configure_optimizers``. Defaults
            to 0.2, the value that was previously hard-coded.
    """

    def __init__(self, hidden_size, lr=0.2):
        super(Lstm1, self).__init__()
        # NOTE(review): 0.2 is kept as the default for backward compatibility,
        # but it is far above Adam's usual range - confirm it is intended.
        self.lr = lr

        self.rnn = nn.LSTM(input_size=5, hidden_size=hidden_size,
                           num_layers=2, batch_first=True,
                           bidirectional=False)

        self.fc1 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(num_features=hidden_size // 2)

        self.fc2 = nn.Linear(hidden_size // 2, 2)

    def forward(self, sequences_padded):
        """Map a padded batch ``(batch, time, 5)`` to predictions ``(batch, 2)``."""
        output, _ = self.rnn(sequences_padded)

        # NOTE(review): this reads the last *padded* time step. The collate
        # function zero-pads shorter sequences, so for those rows the LSTM has
        # also consumed the padding. Consider passing sequence lengths and
        # using pack_padded_sequence, or gathering the output at length - 1.
        output = output[:, -1, :]

        output = self.fc1(output)
        output = self.relu1(output)
        output = self.bn1(output)

        output = self.fc2(output)

        return output

    def training_step(self, batch, batch_idx):
        """Compute the negative-R^2 loss for one ``(sequences, targets)`` batch."""
        # REQUIRED
        sequences_padded, y = batch
        y_hat = self.forward(sequences_padded)
        loss = r2_squared_loss_function(y_hat, y)
        progress_metrics = {'train_loss': loss}

        return {'loss': loss, 'progress_bar': progress_metrics}

    def configure_optimizers(self):
        """Adam over all parameters, using the learning rate given at construction."""
        # REQUIRED
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    @pl.data_loader
    def train_dataloader(self):
        """Return the module-level training DataLoader."""
        # REQUIRED
        return dataloader_heart

In [418]:
# Trainer built with library defaults (no arguments are passed).
trainer = pl.Trainer()

In [419]:
# Build the network with a 128-wide hidden state and place it on the chosen device.
model = Lstm1(hidden_size=128).to(device)

In [420]:
# Start training; the module supplies its own optimizer and training dataloader.
trainer.fit(model)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
INFO:root:
    Name         Type Params
0    rnn         LSTM  201 K
1    fc1       Linear    8 K
2  relu1         ReLU    0  
3    bn1  BatchNorm1d  128  
4    fc2       Linear  130  


Epoch 1:   0%|          | 5/1056 [00:36<2:13:08,  7.60s/batch, batch_idx=4, loss=26.369, train_loss=21.1, v_num=27]

KeyboardInterrupt: 