In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

import argparse
import logging
from tqdm import tqdm
import numpy as np
import pandas as pd
import logging
import os
import math
import copy
import matplotlib.pyplot as plt
import optuna
from datetime import datetime
import random
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
import copy

import scipy.stats

%matplotlib inline

import sys
sys.path.append('..')
import time

from src.data.data_loader import MIMICDataset,import_data
from src.utils import setup_logger
from src.training.training_nn import *
from src.utils import seed_everything
from src.data.data_scaler import PreProcess
from data.feature_sets import all_features

In [2]:
def glc_transform(x):
    """Map glucose values onto a log scale centred at 140.

    Strictly positive entries become ``log(value) - log(140)``; entries that
    are zero or negative are passed through unchanged. The caller's object is
    never modified.

    Args:
        x: Container supporting ``.copy()``, comparison with a scalar and
            boolean-mask assignment (e.g. a NumPy array or pandas Series).

    Returns:
        A new object of the same container type as ``x``. Integer input is
        promoted to ``float64`` first; floating-point input keeps its dtype.
    """
    dtype_kind = getattr(getattr(x, "dtype", None), "kind", None)
    if dtype_kind in ("i", "u"):
        # Writing logarithms back into an integer buffer would silently
        # truncate them, so work on a float copy instead.
        x = x.astype(np.float64)
    else:
        x = x.copy()
    positive = x > 0
    x[positive] = np.log(x[positive]) - np.log(140)
    return x

def glc_invtransform(x):
    """Undo the log-scale shift applied by ``glc_transform``.

    Computes ``exp(x + log(140))`` element-wise. The result is always a newly
    allocated value, so the argument is left untouched; plain Python scalars
    are accepted as well as arrays.

    Note:
        Every entry goes through the exponential, including the ones
        ``glc_transform`` passes through unchanged (non-positive raw values),
        so those are not restored: an input of 0 comes back as 140.

    Args:
        x: Scalar or array-like of transformed values.

    Returns:
        ``exp(x + log(140))`` with the shape of ``x``.
    """
    return np.exp(x + np.log(140))


# Short alias for the inverse transform.
ginv = glc_invtransform

In [3]:
import torch
from torch.utils.data import Dataset,DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def import_data(path):
    """Load the analysis table from a CSV file.

    Args:
        path: Location of the CSV file, handed straight to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame holding the file's contents, unfiltered.

    Note:
        A previous revision dropped every ICU stay whose rows were all masked
        (``sum(msk) == number of rows``). That filter was commented out,
        leaving behind a per-stay loop that scanned the whole frame without
        any effect; the dead loop has been removed. Reinstate the filter here
        if fully-masked stays need to be excluded again.
    """
    return pd.read_csv(path)

# Names of the time columns that MIMICDataset.load_data slices out as `dt`.
TIME_VARS = ["timer","timer_dt"]

def pad_numpy(vec, pad, val="zeros"):
    """Extend ``vec`` along its first axis to ``pad`` rows with a constant fill.

    Args:
        vec: NumPy array to extend; trailing dimensions are preserved.
        pad: Target length of axis 0. Must be at least ``vec.shape[0]``.
        val: ``"zeros"`` (default) or ``"ones"``, selecting the fill value.

    Returns:
        A new array of shape ``(pad, *vec.shape[1:])``. The filler rows are
        created as float64, so the result dtype follows NumPy's promotion
        rules for ``vec.dtype`` and float64.

    Raises:
        ValueError: If ``val`` is not a recognised fill name, or if ``pad`` is
            smaller than the current length of ``vec``.
    """
    fillers = {"zeros": np.zeros, "ones": np.ones}
    if val not in fillers:
        # Previously an unknown name fell through to `return out` and failed
        # with an UnboundLocalError; report the actual problem instead.
        raise ValueError(f"val must be 'zeros' or 'ones', got {val!r}")

    n_missing = pad - vec.shape[0]
    if n_missing < 0:
        raise ValueError(
            f"pad ({pad}) is smaller than the length of vec ({vec.shape[0]})"
        )

    filler_shape = (n_missing,) + tuple(vec.shape[1:])
    return np.concatenate([vec, fillers[val](filler_shape)], axis=0)

def collate_fn_padd(batch):
    '''
    Collate a batch of variable-length samples into padded, batch-first tensors.

    Each sample is indexed positionally as ``(x, y, msk, dt, msk0)``. Every
    field is converted with ``torch.Tensor`` by hand (the ToTensor transform
    assumes it takes in images rather than arbitrary tensors) and then padded
    along the sequence axis to the longest sample in the batch.

    Padding values: 0 for ``x``, ``y`` and ``dt``; 1 for ``msk`` and ``msk0``.
    NOTE(review): padding the masks with 1 presumably marks padded steps as
    masked-out - confirm against the loss/training code.

    Args:
        batch: sequence of samples, each supporting ``sample[0]`` .. ``sample[4]``.

    Returns:
        Tuple ``(x, y, msk, dt, msk0)`` of padded tensors with the batch
        dimension first.
    '''
    # Fill value per field position, in the order (x, y, msk, dt, msk0).
    fill_values = (0, 0, 1, 0, 1)
    padded = []
    for position, fill in enumerate(fill_values):
        sequences = [torch.Tensor(sample[position]) for sample in batch]
        padded.append(
            torch.nn.utils.rnn.pad_sequence(
                sequences, batch_first=True, padding_value=fill
            )
        )
    x, y, msk, dt, msk0 = padded
    return x, y, msk, dt, msk0

TIME_VARS = ["timer","timer_dt"]

class MIMICDataset(Dataset):
    """
    Per-ICU-stay sequence dataset built from a long-format DataFrame.

    Rows are grouped by ``icustay_id``; each stay becomes one sample made of
    five NumPy arrays, all sharing the stay's row count as first dimension.

    Args:
        df: DataFrame containing the columns ``icustay_id``, ``glc_dt``,
            ``msk``, ``msk0``, every name in ``TIME_VARS`` and every name in
            ``features``.
        features: column name(s) selected as model inputs ``X``.
        pad: maximum number of leading rows kept per stay; ``-1`` (default)
            keeps all rows. Despite the name, this truncates, it does not pad.
        verbose: when True, print progress messages while loading.

    Example:
        >>> ds = MIMICDataset(df, ["glc"], verbose=False)
        >>> X, y, msk, dt, msk0 = ds[0]
    """
    def __init__(self,df,features,pad=-1,verbose=True):
        self.pad = pad
        (self.X, self.y, self.msk,
         self.dt, self.msk0, self.seqlen) = self.load_data(df, features, verbose=verbose)

    def __len__(self):
        """Number of ICU stays (one sample per stay)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(X, y, msk, dt, msk0)`` for stay ``idx``.

        ``X``, ``y`` and ``dt`` are float32, ``msk`` and ``msk0`` are int32.
        ``astype`` hands back fresh copies, so callers cannot alter the arrays
        stored on the dataset.
        """
        X = self.X[idx].astype(np.float32)
        y = self.y[idx].astype(np.float32)
        msk = self.msk[idx].astype(np.int32)
        dt = self.dt[idx].astype(np.float32)
        msk0 = self.msk0[idx].astype(np.int32)
        return X,y,msk,dt,msk0

    def load_data(self,df,features,verbose):
        """Split ``df`` into per-stay arrays.

        Stays are emitted in order of first appearance in ``df`` and rows keep
        their original order within a stay. Rows whose ``icustay_id`` is
        missing are skipped by the grouping.

        Args:
            df: source DataFrame (see the class docstring for required columns).
            features: column name(s) used for ``X``.
            verbose: print progress messages when True.

        Returns:
            Six parallel lists, one entry per stay:
            ``X`` (float32), ``y`` taken from ``glc_dt`` (float32),
            ``msk`` (int32), ``dt`` taken from ``TIME_VARS`` (float32),
            ``msk0`` (int32) and the stay's row count.
        """
        # No exclusion rule is applied in this loader, so the counters that
        # the verbose summary reports always stay at zero.
        n_excl_pt = 0
        n_excl_rws = 0
        X_list, y_list, msk_list, dt_list, msk0_list, seqlen_list = [], [], [], [], [], []
        if verbose:
            print("reconfiguring data...")
        # A single groupby pass replaces one full boolean scan of `df` per
        # stay; sort=False keeps stays in first-appearance order.
        for _, df_id in df.groupby("icustay_id", sort=False):
            if self.pad != -1:
                df_id = df_id.iloc[0:self.pad]
            X_list.append(np.asarray(df_id.loc[:,features]).astype(np.float32))
            y_list.append(np.asarray(df_id.loc[:,"glc_dt"]).astype(np.float32))
            msk_list.append(np.asarray(df_id.loc[:,"msk"]).astype(np.int32))
            dt_list.append(np.asarray(df_id.loc[:,TIME_VARS]).astype(np.float32))
            msk0_list.append(np.asarray(df_id.loc[:,"msk0"]).astype(np.int32))
            seqlen_list.append(df_id.shape[0])
        if verbose:
            print("excluded patients:",n_excl_pt)
            print("excluded rows:",n_excl_rws)
        return X_list,y_list,msk_list,dt_list,msk0_list,seqlen_list

In [4]:
# Load the treatment-only analysis table and hold out 10% of the ICU stays
# for validation. The split is made on stay ids rather than rows, so all rows
# of one stay land on the same side.
df = pd.read_csv('../data/treatment_only_analysis.csv')
SPLIT_SEED = 0  # fixed so the split is identical after a kernel restart
train_ids, valid_ids = train_test_split(
    df.icustay_id.unique(), test_size=0.1, random_state=SPLIT_SEED
)
df_train = df.loc[df.icustay_id.isin(train_ids)]
df_valid = df.loc[df.icustay_id.isin(valid_ids)]

In [5]:
# FEATURES = all_features()
# NFEATURES = len(all_features())
# preproc = PreProcess(FEATURES,QuantileTransformer())
# preproc.fit(df_train)
# df_train = preproc.transform(df_train)
# df_valid = preproc.transform(df_valid)

In [6]:
# dl_train = DataLoader(MIMICDataset(df_train,FEATURES),batch_size=64,pin_memory=False,collate_fn=collate_fn_padd)
# dl_valid = DataLoader(MIMICDataset(df_valid,FEATURES),batch_size=128,pin_memory=False,collate_fn=collate_fn_padd)
# dataloaders = {'train':dl_train,'validation':dl_valid}

In [7]:
# Show the loaded table; the rendered output elides the middle rows.
df

Unnamed: 0,subject_id,hadm_id,icustay_id,icu_admissiontime,icu_dischargetime,timer,timer_dt,glc,glc_dt,input_short_injection,input_short_push,input_intermediate,input_long,starttime,endtime,input_hrs,infxstop,msk0,msk
0,55973,152234,200001,0.0,73.88,2.88,20.78,118.0,72.0,0.0,0.0,0.0,0.0,,,0.0,,0,0
1,55973,152234,200001,0.0,73.88,20.78,26.88,72.0,135.0,0.0,0.0,0.0,0.0,,,0.0,,0,0
2,55973,152234,200001,0.0,73.88,26.88,36.88,135.0,106.0,0.0,0.0,0.0,0.0,,,0.0,,0,0
3,55973,152234,200001,0.0,73.88,36.88,44.88,106.0,125.0,0.0,0.0,0.0,0.0,,,0.0,,0,0
4,55973,152234,200001,0.0,73.88,44.88,50.88,125.0,101.0,0.0,0.0,0.0,0.0,,,0.0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314195,69587,158288,299998,0.0,46.60,10.88,17.72,196.0,235.0,4.0,0.0,0.0,0.0,,,0.0,,0,0
314196,69587,158288,299998,0.0,46.60,17.72,23.37,235.0,162.0,6.0,0.0,0.0,0.0,,,0.0,,0,0
314197,69587,158288,299998,0.0,46.60,23.37,29.20,162.0,137.0,3.0,0.0,0.0,0.0,,,0.0,,0,0
314198,69587,158288,299998,0.0,46.60,29.20,35.55,137.0,189.0,0.0,0.0,0.0,0.0,,,0.0,,0,0


In [11]:
# Build a PreProcess instance from the "glc" and "input_short_injection"
# column names and a QuantileTransformer.
# NOTE(review): PreProcess comes from src.data.data_scaler and is not shown
# here - confirm how it applies the scaler to the listed columns.
preproc = PreProcess(["glc","input_short_injection"],QuantileTransformer())