In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import time

from pathlib import Path
from sklearn import metrics
import random
from scipy import stats

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import torchvision

from datetime import datetime
from collections import OrderedDict

In [2]:
import pickle

In [3]:
PATH = Path("../../multi-task-romain/data/")
# PATH = Path("/data2/yinterian/multi-task-romain")

In [4]:
gap = "15min"
gap

'15min'

In [5]:
filename = "data_train_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    train = pickle.load(f)

In [6]:
filename = "data_valid_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    valid = pickle.load(f)

In [7]:
train.shape, valid.shape

((42830, 14), (5069, 14))

In [8]:
subject_id_list = np.sort(np.unique(train.subject_id.values))
id2index = {v: k+1 for k,v in enumerate(subject_id_list)}
num_subjects = len(subject_id_list)

In [9]:
num_subjects

2170

## Dataset

In [10]:
def get_mean_std_series(train):
    ss = np.concatenate(train.series.values)
    ss = ss.reshape(-1,5)
    return ss.mean(axis=0), ss.std(axis=0)

In [11]:
def get_mean_std_static(train):
    res = {}
    for name in ["age", "sapsii", "sofa"]:
        values = train[name].values
        res[name] = (values.mean(), values.std())
    res["series"] = get_mean_std_series(train)
    return res

In [12]:
norm_dict = get_mean_std_static(train)
norm_dict 

{'age': (64, 15.087455295966063),
 'sapsii': (33, 14.265492481117855),
 'sofa': (4, 3.7831641172054082),
 'series': (array([ 83.19453341,  93.64397046, 121.07613603,  58.73969887,
          78.6694367 ]),
  array([16.08727268, 17.53684697, 21.3399693 , 12.26982071, 14.36323955]))}

In [13]:
class MultiTask(Dataset):
    def __init__(self, df, norm_dict, id2index, k=20, train=True):
        """
        Args:
            df: dataframe with data
            norm_dict: mean and std of all variables to normalize
            
        """
        self.norm_dict = norm_dict
        self.df = df
        self.names = ["age", "sapsii", "sofa"] ## needs normalization
        self.names_binary = ["gender", "amine", "sedation", "ventilation"]
        self.id2index = id2index
        self.train = train
        self.df_sample = self.pick_a_sample(k)
            
    def pick_a_sample(self, k=20):
        """ Picks sample with the same number of observations per patient"""
        if not self.train: # fix seed for validation and test
            np.random.seed(3)
        sample = self.df.groupby("subject_id", group_keys=False).apply(lambda x: x.sample(min(len(x), k)))
        sample = sample.copy()
        if self.train:
            self.subject_index = [self.id2index[subject_id] for subject_id in sample.subject_id.values]
            self.random = np.random.choice(2, sample.shape[0], p=[0.1, 0.9])
            self.subject_index = self.subject_index*self.random
        return sample

    def __getitem__(self, index):
        row = self.df_sample.iloc[index,:]
        x_series = (row.series - self.norm_dict["series"][0])/self.norm_dict["series"][1]
        x_cont = [(row[name]-self.norm_dict[name][0])/self.norm_dict[name][1] for name in self.names]
        x_binary = [row[name] for name in self.names_binary]
        subject_index = 0
        if self.train:
            subject_index = self.subject_index[index]
        x_cat = np.array([row["care_unit"], subject_index])
        x_cont = np.array(x_cont + x_binary)
        return x_series, x_cont, x_cat, row["prediction_mean_HR"], row["prediction_mean_MAP"]

    def __len__(self):
        return self.df_sample.shape[0]

In [14]:
train_ds = MultiTask(train, norm_dict, id2index)
valid_ds = MultiTask(valid, norm_dict, id2index, train=False)

In [15]:
x1, x2, x3, y1, y2 = train_ds[1200]
x1, x2, x3, y1, y2

(array([[ 0.78356766,  0.24839297, -0.77207871, -0.66339183, -0.67320723],
        [ 0.57843655,  0.23698841, -0.73459037, -0.57374097, -0.57573618],
        [ 1.00734704,  0.24839297, -0.7486485 , -0.43518964, -0.49915179],
        [ 0.70897453,  0.24839297, -0.69241599, -0.51669042, -0.52700066],
        [ 0.66546187,  0.24839297, -0.75333454, -0.56559089, -0.57573618],
        [ 0.80221594,  0.24839297, -0.8704856 , -0.56559089, -0.64535836],
        [ 0.8457286 ,  0.24839297, -0.80019497, -0.56559089, -0.61750949],
        [ 0.78356766,  0.24839297, -0.69710204, -0.44333972, -0.4643407 ],
        [ 0.92653782,  0.24839297, -0.74396246, -0.45963988, -0.52003844],
        [ 1.01356314,  0.24839297, -0.77207871, -0.45963988, -0.54092509],
        [ 0.88924126,  0.24839297, -0.72053225, -0.54929074, -0.57573618],
        [ 1.02599533,  0.24839297, -0.67835787, -0.46778995, -0.47826513],
        [ 0.90167345,  0.24839297, -0.7814508 , -0.47594003, -0.53396288],
        [ 0.82086422,  0.

## Model

In [16]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [17]:
def pearsonr_ci(x,y,alpha=0.05):
    ''' calculate Pearson correlation along with the confidence interval using scipy and numpy
    Parameters
    ----------
    x, y : iterable object such as a list or np.array
      Input for correlation calculation
    alpha : float
      Significance level. 0.05 by default
    Returns
    -------
    r : float
      Pearson's correlation coefficient
    pval : float
      The corresponding p value
    lo, hi : float
      The lower and upper bound of confidence intervals
    '''
    r, p = stats.pearsonr(x,y)
    r_z = np.arctanh(r)
    se = 1/np.sqrt(x.size-3)
    z = stats.norm.ppf(1-alpha/2)
    lo_z, hi_z = r_z-z*se, r_z+z*se
    lo, hi = np.tanh((lo_z, hi_z))
    return r, lo, hi

In [18]:
def val_metrics(model, valid_dl, which_y="y1"):
    model.eval()
    total = 0
    sum_loss = 0
    y_hat = []
    ys = []
    for x_series, x_cont, x_cat, y1, y2 in valid_dl:
        batch = y1.shape[0]
        x_series = x_series.float()
        x_cont = x_cont.float()
        x_cat = x_cat.long()
        y1 = y1.float()
        y2 = y2.float()
        out = model(x_series, x_cont, x_cat)
        if which_y=="y1":
            mse_loss = F.mse_loss(out,  y1.unsqueeze(-1))
            ys.append(y1.view(-1).numpy())
        else:
            mse_loss = F.mse_loss(out, y2.unsqueeze(-1))
            ys.append(y2.view(-1).numpy())
        sum_loss += batch*(mse_loss.item())
        total += batch
        y_hat.append(out.view(-1).detach().numpy())
    
    y_hat = np.concatenate(y_hat)
    ys = np.concatenate(ys)
    #r2 = metrics.r2_score(ys, y_hat)
    r2, lo, hi =  pearsonr_ci(ys, y_hat, alpha=0.05)
    
    return sum_loss/total, r2, lo, hi

In [19]:
def train_epochs(model, train_ds, optimizer, lr=1e-3, epochs = 30, which_y="y1"):
    prev_val_r2 = 0
    for i in range(epochs):
        sum_loss = 0
        total = 0
        train_ds.pick_a_sample()
        train_dl = DataLoader(train_ds, batch_size=5000, shuffle=True)
        for x_series, x_cont, x_cat, y1, y2 in train_dl:
            model.train()
            x_series = x_series.float()
            x_cont = x_cont.float()
            x_cat = x_cat.long()
            y1 = y1.float()
            y2 = y2.float()
            out = model(x_series, x_cont, x_cat)
            if which_y=="y1":
                loss = F.mse_loss(out, y1.unsqueeze(-1))
            else:
                loss = F.mse_loss(out, y2.unsqueeze(-1))
            sum_loss += y1.shape[0] * loss.item()
            
            total += y1.shape[0]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if i % 1 == 0:
            print("iteration : ", i)
            val_loss, val_r2 , val_lo,  val_hi = val_metrics(model, valid_dl, which_y=which_y)
            print("\tTrain loss: {:.3f} \n \t valid loss: {:.3f} valid r2 {:.3f}[{:.3f}-{:.3f}]".format(
                sum_loss/total, val_loss, val_r2, val_lo, val_hi))
        if val_r2 > prev_val_r2:
            prev_val_r2 = val_r2
            if val_r2 > 0.95 :
                PATH = Path("../../multi-task-romain/2e_analyse/singletask/")
                filename = "single_model_15min_" + which_y
                path = "{0}/{1}_r2_{2:.0f}.pth".format(PATH, filename, 100*val_r2) 
                save_model(model, path)
                print(path)

In [20]:
batch_size = 5000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

Model 2

In [21]:
class EventModel2(nn.Module):
    def __init__(self, hidden_size=100):
        super(EventModel2, self).__init__()
        self.embedding1 = nn.Embedding(5, 1)
        self.embedding2 = nn.Embedding(num_subjects+1, 5)
        self.gru = nn.GRU(5, hidden_size, batch_first=True)
        self.num = hidden_size + 1 + 5 + 7
        self.linear1 = nn.Linear(self.num, self.num)
        self.out = nn.Linear(self.num, 1)
        self.bn1 = nn.BatchNorm1d(self.num)
        
    def forward(self, x_series, x_cont, x_cat):
        _, ht = self.gru(x_series)
        x_cat_1 = self.embedding1(x_cat[:,0])
        x_cat_2 = self.embedding2(x_cat[:,1])
        x = torch.cat((ht[-1], x_cat_1, x_cat_2, x_cont), 1)
        x = self.bn1(F.relu(self.linear1(x)))
        return self.out(x)

In [22]:
# model for mean_HR
model = EventModel2()

optimizer = torch.optim.Adam(model.parameters(), lr=0.05, weight_decay=1e-5)
train_epochs(model, train_ds, optimizer, epochs=15)

#optimizer = torch.optim.Adam(model.parameters(), lr=0.03, weight_decay=1e-5)
#train_epochs(model, train_ds, optimizer, epochs=5)

iteration :  0
	Train loss: 6904.949 
 	 valid loss: 7767.161 valid r2 0.793[0.779-0.806]
iteration :  1
	Train loss: 5334.744 
 	 valid loss: 6739.319 valid r2 0.935[0.930-0.940]
iteration :  2
	Train loss: 2471.856 
 	 valid loss: 739.397 valid r2 0.935[0.931-0.940]
iteration :  3
	Train loss: 265.513 
 	 valid loss: 503.243 valid r2 0.924[0.919-0.929]
iteration :  4
	Train loss: 637.682 
 	 valid loss: 859.743 valid r2 0.941[0.936-0.945]
iteration :  5
	Train loss: 367.111 
 	 valid loss: 47.789 valid r2 0.954[0.951-0.957]
../../multi-task-romain/2e_analyse/singletask/single_model_15min_y1_r2_95.pth
iteration :  6
	Train loss: 53.908 
 	 valid loss: 177.926 valid r2 0.960[0.957-0.963]
../../multi-task-romain/2e_analyse/singletask/single_model_15min_y1_r2_96.pth
iteration :  7
	Train loss: 182.477 
 	 valid loss: 196.674 valid r2 0.964[0.962-0.967]
../../multi-task-romain/2e_analyse/singletask/single_model_15min_y1_r2_96.pth
iteration :  8
	Train loss: 89.505 
 	 valid loss: 30.649 v

In [23]:
# model mean_MAP
model = EventModel2()

optimizer = torch.optim.Adam(model.parameters(), lr=0.05, weight_decay=1e-5)
train_epochs(model, train_ds, optimizer, epochs=10, which_y="y2")

#optimizer = torch.optim.Adam(model.parameters(),lr=0.03, weight_decay=1e-5)
#train_epochs(model, train_ds, optimizer, epochs=5, which_y="y2")

iteration :  0
	Train loss: 6032.056 
 	 valid loss: 5087.966 valid r2 0.919[0.914-0.925]
iteration :  1
	Train loss: 4653.722 
 	 valid loss: 2319.435 valid r2 0.873[0.864-0.881]
iteration :  2
	Train loss: 2082.864 
 	 valid loss: 120.943 valid r2 0.926[0.921-0.931]
iteration :  3
	Train loss: 214.120 
 	 valid loss: 739.675 valid r2 0.941[0.936-0.945]
iteration :  4
	Train loss: 586.011 
 	 valid loss: 968.196 valid r2 0.946[0.942-0.950]
iteration :  5
	Train loss: 303.581 
 	 valid loss: 70.093 valid r2 0.948[0.944-0.952]
iteration :  6
	Train loss: 54.749 
 	 valid loss: 108.767 valid r2 0.952[0.948-0.955]
../../multi-task-romain/2e_analyse/singletask/single_model_15min_y2_r2_95.pth
iteration :  7
	Train loss: 164.650 
 	 valid loss: 101.847 valid r2 0.952[0.949-0.956]
../../multi-task-romain/2e_analyse/singletask/single_model_15min_y2_r2_95.pth
iteration :  8
	Train loss: 76.266 
 	 valid loss: 17.104 valid r2 0.952[0.949-0.955]
iteration :  9
	Train loss: 28.231 
 	 valid loss: 

## Test 

In [25]:
PATH = Path("../../multi-task-romain/2e_analyse/singletask/")
path = PATH/"single_model_15min_y1_r2_97.pth"
model = EventModel2()
load_model(model, path)

In [26]:
filename = "../../data/data_test_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    test = pickle.load(f)
    
filename = "../../data/data_validation_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    test_larib = pickle.load(f)
test_larib["care_unit"] = 4
test.shape, test_larib.shape

((5933, 14), (1097, 13))

In [27]:
def predict_y1_one_batch(model, dl):
    for x_series, x_cont, x_cat, y1, y2 in dl:
        x_series = x_series.float()
        x_cont = x_cont.float()
        x_cat = x_cat.long()
        y1 = y1.float()
        out1 = model(x_series, x_cont, x_cat)
    return out1.detach().numpy(), y1.detach().numpy()

class MultiTask_validation(Dataset):
    def __init__(self, df, norm_dict, id2index, k=20, train=True):
        """
        Args:
            df: dataframe with data
            norm_dict: mean and std of all variables to normalize
            
        """
        self.norm_dict = norm_dict
        self.df = df
        self.names = ["age", "sapsii", "sofa"] ## needs normalization
        self.names_binary = ["gender", "amine", "sedation", "ventilation"]
        self.id2index = id2index
        self.train = train
        self.df_sample = self.pick_a_sample(k)
            
    def pick_a_sample(self, k=20):
        """ Picks sample with the same number of observations per patient"""
        if not self.train: # fix seed for validation and test
            np.random.seed(3)
# We don't want the same number of period per patient
        # sample = self.df.groupby("subject_id", group_keys=False).apply(lambda x: x.sample(k, replace=True))
        sample = self.df.copy()
        if self.train:
# 10 percent of the periods have a subject_index == 0
            self.subject_index = [self.id2index[subject_id] for subject_id in sample.subject_id.values]
            self.random = np.random.choice(2, sample.shape[0], p = [0.1, 0.9])
            self.subject_index = self.subject_index*self.random
        return sample

    def __getitem__(self, index):
        row = self.df_sample.iloc[index,:] 
        x_series = (row.series - self.norm_dict["series"][0])/self.norm_dict["series"][1]
        x_cont = [(row[name]-self.norm_dict[name][0])/self.norm_dict[name][1] for name in self.names]
        x_binary = [row[name] for name in self.names_binary]
        subject_index = 0
        if self.train:
            subject_index = self.subject_index[index]
        x_cat = np.array([row["care_unit"], subject_index])
        x_cont = np.array(x_cont + x_binary)
        return x_series, x_cont, x_cat, row["prediction_mean_HR"], row["prediction_mean_MAP"]

    def __len__(self):
        return self.df_sample.shape[0]


In [28]:
test_ds = MultiTask_validation(test, norm_dict, id2index, train=False)
test_larib_ds = MultiTask_validation(test_larib, norm_dict, id2index, train = False)

In [29]:
test_dl = DataLoader(test_ds, batch_size=8233)
test_larib_dl = DataLoader(test_larib_ds, batch_size=1597)

In [30]:
val_metrics(model, test_dl, which_y="y1")

(15.620271682739258, 0.9728872, 0.9714915258069596, 0.9742154783762002)

In [31]:
val_metrics(model, test_larib_dl, which_y="y1")

(43.04486846923828, 0.930376, 0.9219575488785043, 0.9379156578016868)

In [32]:
# HR
out1, y1 = predict_y1_one_batch(model, test_dl)
y1 = np.reshape(y1, (-1,1))
arr_hr = np.concatenate((out1, y1) , axis=1)
pd.DataFrame(arr_hr).to_csv("/home/menyssa/Recherche/Mimic-III-Yannet/resultats/2e_analyse/intern_single_obs_pred_HR_15.csv")


out1, y1 = predict_y1_one_batch(model, test_larib_dl)
y1 = np.reshape(y1, (-1,1))
arr_hr = np.concatenate((out1, y1) , axis=1)
pd.DataFrame(arr_hr).to_csv("/home/menyssa/Recherche/Mimic-III-Yannet/resultats/2e_analyse/larib_single_obs_pred_HR_15.csv")

In [33]:
path = PATH/"single_model_15min_y2_r2_95.pth"
load_model(model, path)

In [34]:
def predict_y2_one_batch(model, dl):
    for x_series, x_cont, x_cat, y1, y2 in dl:
        x_series = x_series.float()
        x_cont = x_cont.float()
        x_cat = x_cat.long()
        y2 = y2.float()
        out2 = model(x_series, x_cont, x_cat)
    return out2.detach().numpy(), y2.detach().numpy()

In [35]:
val_metrics(model, test_dl, which_y="y2")

(67.01335906982422, 0.9527012, 0.9502925766221146, 0.9549958270217882)

In [36]:
val_metrics(model, test_larib_dl, which_y="y2")

(107.82295989990234, 0.8906552, 0.8777382254642543, 0.9022785589468852)

In [37]:
out2, y2 = predict_y2_one_batch(model, test_dl)
y2 = np.reshape(y2, (-1,1))
arr_map = np.concatenate((out2, y2) , axis=1)
pd.DataFrame(arr_map).to_csv("/home/menyssa/Recherche/Mimic-III-Yannet/resultats/2e_analyse/intern_single_obs_pred_MAP_15.csv")

out2, y2 = predict_y2_one_batch(model, test_larib_dl)
y2 = np.reshape(y2, (-1,1))
arr_map = np.concatenate((out2, y2) , axis=1)
pd.DataFrame(arr_map).to_csv("/home/menyssa/Recherche/Mimic-III-Yannet/resultats/2e_analyse/larib_single_obs_pred_MAP_15.csv")

## looking at the data

In [38]:
#filename = "data_train_{gap}.pickle".format(gap="5min")
#with open(PATH/filename, 'rb') as f:
#    train5 = pickle.load(f)

In [39]:
#filename = "data_train_{gap}.pickle".format(gap="10min")
#with open(PATH/filename, 'rb') as f:
#    train10 = pickle.load(f)

In [40]:
#cols = ["subject_id", "key", "prediction_mean_HR", "prediction_mean_MAP"]
#train5_s = train5.loc[:, cols]
#train10_s = train10.loc[:, cols]

In [41]:
#train5_s.iloc[:30]

In [42]:
#train10_s.iloc[:30]