In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import time

from pathlib import Path
from sklearn import metrics
import random
from scipy import stats

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import torchvision

from datetime import datetime
from collections import OrderedDict

In [None]:
import pickle

In [None]:
PATH = Path("../../multi-task-romain/data/")
# PATH = Path("/data2/yinterian/multi-task-romain")

In [None]:
gap = "15min"
gap

In [None]:
# Load the training split for the selected gap.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only open pickles that come from a trusted source.
filename = "data_train_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    train = pickle.load(f)

In [None]:
# Load the validation split for the same gap (same trust caveat as any pickle).
filename = "data_valid_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    valid = pickle.load(f)

In [None]:
train.shape, valid.shape

In [None]:
# Sorted unique subject ids present in the training split.
subject_id_list = np.sort(np.unique(train.subject_id.values))
# Embedding indices start at 1: index 0 is what the datasets emit when the
# subject is masked during training or when train=False (validation/test).
id2index = {v: k+1 for k,v in enumerate(subject_id_list)}
num_subjects = len(subject_id_list)

In [None]:
num_subjects

## Dataset

In [None]:
def get_mean_std_series(train):
    """Column-wise mean and standard deviation of the time-series data.

    All arrays stored in ``train.series`` are concatenated and laid out as
    rows of 5 values; the statistics are then taken down each of those
    5 columns.

    Returns:
        tuple: ``(mean, std)``, each an array with 5 entries.
    """
    stacked = np.concatenate(train.series.values).reshape(-1, 5)
    column_mean = stacked.mean(axis=0)
    column_std = stacked.std(axis=0)
    return column_mean, column_std

In [None]:
def get_mean_std_static(train):
    """Gather the normalisation statistics used by the datasets.

    Returns:
        dict: maps each of "age", "sapsii" and "sofa" to a ``(mean, std)``
        pair computed over that column of ``train``, plus a "series" entry
        holding whatever ``get_mean_std_series(train)`` returns.
    """
    static_columns = ("age", "sapsii", "sofa")
    summary = {
        column: (train[column].values.mean(), train[column].values.std())
        for column in static_columns
    }
    summary["series"] = get_mean_std_series(train)
    return summary

In [None]:
norm_dict = get_mean_std_static(train)
norm_dict 

In [None]:
class MultiTask(Dataset):
    """Dataset yielding normalised inputs and the two regression targets.

    Each item is ``(x_series, x_cont, x_cat, y_hr, y_map)`` where
    ``x_series`` is the normalised series of the row, ``x_cont`` holds the
    three normalised static variables followed by the four binary flags,
    ``x_cat`` is ``[care_unit, subject_index]`` and the targets are the
    ``prediction_mean_HR`` / ``prediction_mean_MAP`` columns.
    """

    def __init__(self, df, norm_dict, id2index, k=20, train=True):
        """
        Args:
            df: dataframe with data
            norm_dict: mean and std of all variables to normalize
            id2index: maps a subject_id to its (1-based) embedding index
            k: maximum number of rows sampled per subject
            train: when True the subject index is exposed (and randomly
                masked to 0); when False it is always 0 and sampling is seeded
        """
        self.norm_dict = norm_dict
        self.df = df
        self.names = ["age", "sapsii", "sofa"] ## needs normalization
        self.names_binary = ["gender", "amine", "sedation", "ventilation"]
        self.id2index = id2index
        self.train = train
        self.df_sample = self.pick_a_sample(k)

    def pick_a_sample(self, k=20):
        """Draw at most ``k`` rows per subject and install them as the active sample.

        The sample is both stored in ``self.df_sample`` and returned, so a
        bare ``ds.pick_a_sample()`` call really resamples the dataset and the
        rows stay aligned with ``self.subject_index``.
        """
        if not self.train: # fix seed for validation and test
            np.random.seed(3)
        # Sample group by group rather than through groupby().apply(): apply on
        # frames that still contain the grouping column is deprecated in recent
        # pandas, and losing "subject_id" would break the lookup below.
        per_subject = [group.sample(min(len(group), k))
                       for _, group in self.df.groupby("subject_id")]
        sample = pd.concat(per_subject) if per_subject else self.df.iloc[0:0].copy()
        if self.train:
            subject_ids = np.array(
                [self.id2index[subject_id] for subject_id in sample.subject_id.values],
                dtype=np.int64)
            # Roughly 10% of the rows get subject index 0 (mask value 0 with p=0.1).
            self.random = np.random.choice(2, sample.shape[0], p=[0.1, 0.9])
            self.subject_index = subject_ids*self.random
        self.df_sample = sample
        return sample

    def __getitem__(self, index):
        """Return the model inputs and both targets for the sampled row ``index``."""
        row = self.df_sample.iloc[index,:]
        x_series = (row.series - self.norm_dict["series"][0])/self.norm_dict["series"][1]
        x_cont = [(row[name]-self.norm_dict[name][0])/self.norm_dict[name][1] for name in self.names]
        x_binary = [row[name] for name in self.names_binary]
        # Index 0 is used whenever the subject identity is not exposed.
        subject_index = 0
        if self.train:
            subject_index = self.subject_index[index]
        x_cat = np.array([row["care_unit"], subject_index])
        x_cont = np.array(x_cont + x_binary)
        return x_series, x_cont, x_cat, row["prediction_mean_HR"], row["prediction_mean_MAP"]

    def __len__(self):
        """Number of rows in the currently active sample."""
        return self.df_sample.shape[0]

In [None]:
# Both datasets are normalised with statistics computed on the training split.
# train=False makes the validation sample seeded and sets the subject index to 0.
train_ds = MultiTask(train, norm_dict, id2index)
valid_ds = MultiTask(valid, norm_dict, id2index, train=False)

In [None]:
x1, x2, x3, y1, y2 = train_ds[1200]
x1, x2, x3, y1, y2

## Model

In [None]:
def save_model(m, p):
    """Write the state dict of model ``m`` to ``p`` using ``torch.save``."""
    weights = m.state_dict()
    torch.save(weights, p)
    
def load_model(m, p):
    """Load the state dict stored at ``p`` into model ``m`` (in place).

    The checkpoint is mapped to CPU while it is read, so a file written on a
    GPU machine still loads on a CPU-only host; ``load_state_dict`` then
    copies the values into the model's existing parameters.
    """
    m.load_state_dict(torch.load(p, map_location="cpu"))

In [None]:
def pearsonr_ci(x,y,alpha=0.05):
    ''' calculate Pearson correlation along with the confidence interval using scipy and numpy

    The interval is built on the Fisher z-transform of r, with standard
    error 1/sqrt(n-3), so it is only meaningful for more than 3 observations.

    Parameters
    ----------
    x, y : iterable object such as a list or np.array
      Input for correlation calculation
    alpha : float
      Significance level. 0.05 by default
    Returns
    -------
    r : float
      Pearson's correlation coefficient
    lo, hi : float
      The lower and upper bound of confidence intervals
    '''
    # Accept plain lists as documented: a list has no ``.size`` attribute.
    x = np.asarray(x)
    y = np.asarray(y)
    r, _ = stats.pearsonr(x, y)  # the p-value is not part of the return value
    r_z = np.arctanh(r)
    se = 1/np.sqrt(x.size-3)
    z = stats.norm.ppf(1-alpha/2)
    lo_z, hi_z = r_z-z*se, r_z+z*se
    lo, hi = np.tanh((lo_z, hi_z))
    return r, lo, hi

In [None]:
def val_metrics(model, valid_dl, which_y="y1"):
    """Evaluate ``model`` on every batch of ``valid_dl``.

    Args:
        model: network called as ``model(x_series, x_cont, x_cat)``.
        valid_dl: iterable of ``(x_series, x_cont, x_cat, y1, y2)`` batches.
        which_y: "y1" scores against the first target, anything else
            against the second.

    Returns:
        tuple: ``(mse, r, lo, hi)`` where ``mse`` is the batch-size-weighted
        mean squared error and ``r, lo, hi`` are the Pearson correlation
        between targets and predictions with its confidence bounds from
        ``pearsonr_ci`` (callers label this value "r2", but it is Pearson r).

    Side effects:
        Leaves the model in eval mode.
    """
    model.eval()
    total = 0
    sum_loss = 0
    y_hat = []
    ys = []
    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        for x_series, x_cont, x_cat, y1, y2 in valid_dl:
            target = (y1 if which_y=="y1" else y2).float()
            batch = target.shape[0]
            out = model(x_series.float(), x_cont.float(), x_cat.long())
            mse_loss = F.mse_loss(out, target.unsqueeze(-1))
            # Weight by batch size so a smaller final batch does not skew the mean.
            sum_loss += batch*(mse_loss.item())
            total += batch
            ys.append(target.view(-1).numpy())
            y_hat.append(out.view(-1).numpy())

    y_hat = np.concatenate(y_hat)
    ys = np.concatenate(ys)
    #r2 = metrics.r2_score(ys, y_hat)
    r2, lo, hi =  pearsonr_ci(ys, y_hat, alpha=0.05)

    return sum_loss/total, r2, lo, hi

In [None]:
def train_epochs(model, train_ds, optimizer, lr=1e-3, epochs = 30, which_y="y1"):
    """Train ``model`` for ``epochs`` epochs, validating and checkpointing after each.

    Args:
        model: network called as ``model(x_series, x_cont, x_cat)``.
        train_ds: dataset exposing ``pick_a_sample()`` and ``df_sample``; it is
            resampled at the start of every epoch.
        optimizer: optimizer already bound to ``model``'s parameters.
        lr: unused; the learning rate is whatever ``optimizer`` was built with.
            Kept so existing calls keep working.
        epochs: number of passes over the (resampled) training data.
        which_y: "y1" trains on the first target, anything else on the second.

    Notes:
        Validation uses the notebook-level ``valid_dl`` loader, which must
        exist before this function is called. Whenever the validation
        correlation improves and exceeds 0.95, the weights are saved under
        ``../../multi-task-romain/2e_analyse/singletask/``.
    """
    prev_val_r2 = 0
    for i in range(epochs):
        sum_loss = 0
        total = 0
        # Install the fresh sample on the dataset. Discarding the return value
        # would leave ``df_sample`` unchanged, so every epoch would reuse the
        # same rows.
        train_ds.df_sample = train_ds.pick_a_sample()
        train_dl = DataLoader(train_ds, batch_size=5000, shuffle=True)
        # val_metrics() leaves the model in eval mode at the end of each epoch.
        model.train()
        for x_series, x_cont, x_cat, y1, y2 in train_dl:
            target = (y1 if which_y=="y1" else y2).float()
            out = model(x_series.float(), x_cont.float(), x_cat.long())
            loss = F.mse_loss(out, target.unsqueeze(-1))
            sum_loss += target.shape[0] * loss.item()
            total += target.shape[0]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("iteration : ", i)
        val_loss, val_r2 , val_lo,  val_hi = val_metrics(model, valid_dl, which_y=which_y)
        print("\tTrain loss: {:.3f} \n \t valid loss: {:.3f} valid r2 {:.3f}[{:.3f}-{:.3f}]".format(
            sum_loss/total, val_loss, val_r2, val_lo, val_hi))
        if val_r2 > prev_val_r2:
            prev_val_r2 = val_r2
            if val_r2 > 0.95 :
                save_dir = Path("../../multi-task-romain/2e_analyse/singletask/")
                # Make sure the target directory exists so a finished epoch is
                # not lost to a failing save.
                save_dir.mkdir(parents=True, exist_ok=True)
                filename = "single_model_15min_" + which_y
                path = "{0}/{1}_r2_{2:.0f}.pth".format(save_dir, filename, 100*val_r2)
                save_model(model, path)
                print(path)

In [None]:
# Loaders over the initial samples. NOTE: train_epochs builds its own training
# loader every epoch (batch size 5000) and reads only the global `valid_dl`.
batch_size = 5000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

### Model 2: GRU encoder with care-unit and subject embeddings

In [None]:
class EventModel2(nn.Module):
    """GRU encoder over the series, combined with embeddings and static features.

    The last GRU hidden state is concatenated with a 1-d care-unit embedding,
    a 5-d subject embedding and the 7 continuous/binary inputs, passed through
    one linear + ReLU + batch-norm layer, and projected to a single output.
    """

    def __init__(self, hidden_size=100, n_subjects=None):
        """
        Args:
            hidden_size: size of the GRU hidden state.
            n_subjects: number of known subjects. The subject embedding gets
                ``n_subjects + 1`` rows, with index 0 for "no subject".
                Defaults to the notebook-level ``num_subjects``.
        """
        super(EventModel2, self).__init__()
        if n_subjects is None:
            # Backward-compatible default: rely on the notebook global.
            n_subjects = num_subjects
        self.embedding1 = nn.Embedding(5, 1)  # care_unit codes 0..4
        self.embedding2 = nn.Embedding(n_subjects+1, 5)
        self.gru = nn.GRU(5, hidden_size, batch_first=True)
        # GRU state + care-unit embedding (1) + subject embedding (5) + 7 x_cont values
        self.num = hidden_size + 1 + 5 + 7
        self.linear1 = nn.Linear(self.num, self.num)
        self.out = nn.Linear(self.num, 1)
        self.bn1 = nn.BatchNorm1d(self.num)

    def forward(self, x_series, x_cont, x_cat):
        """Return one prediction per row.

        ``x_cat[:, 0]`` is looked up in the care-unit embedding and
        ``x_cat[:, 1]`` in the subject embedding.
        """
        _, ht = self.gru(x_series)
        x_cat_1 = self.embedding1(x_cat[:,0])
        x_cat_2 = self.embedding2(x_cat[:,1])
        x = torch.cat((ht[-1], x_cat_1, x_cat_2, x_cont), 1)
        x = self.bn1(F.relu(self.linear1(x)))
        return self.out(x)

In [None]:
# model for mean_HR
# A fresh network trained on the first target (which_y defaults to "y1").
model = EventModel2()

# The learning rate is set here on the optimizer.
optimizer = torch.optim.Adam(model.parameters(), lr=0.05, weight_decay=1e-5)
train_epochs(model, train_ds, optimizer, epochs=15)

#optimizer = torch.optim.Adam(model.parameters(), lr=0.03, weight_decay=1e-5)
#train_epochs(model, train_ds, optimizer, epochs=5)

In [None]:
# model mean_MAP
# Rebinds `model` to a fresh network trained on the second target ("y2");
# the HR model from the previous cell survives only through its saved checkpoints.
model = EventModel2()

optimizer = torch.optim.Adam(model.parameters(), lr=0.05, weight_decay=1e-5)
train_epochs(model, train_ds, optimizer, epochs=10, which_y="y2")

#optimizer = torch.optim.Adam(model.parameters(),lr=0.03, weight_decay=1e-5)
#train_epochs(model, train_ds, optimizer, epochs=5, which_y="y2")

## Test 

In [None]:
# NOTE: this rebinds the notebook-level PATH (previously the data directory)
# to the checkpoint directory; later cells build their paths from this value.
PATH = Path("../../multi-task-romain/2e_analyse/singletask/")
# Checkpoint name follows the pattern written by train_epochs; it must already exist on disk.
path = PATH/"single_model_15min_y1_r2_98.pth"
model = EventModel2()
load_model(model, path)

In [None]:
# PATH now points at the checkpoint directory, so "../../data/" climbs back
# up to the data directory next to it.
filename = "../../data/data_test_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    test = pickle.load(f)
    
# Second evaluation set, stored under the name "data_validation_*".
filename = "../../data/data_validation_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    test_larib = pickle.load(f)
# Every row of this set is assigned care_unit code 4 (overwrites the column in place).
test_larib["care_unit"] = 4
test.shape, test_larib.shape

In [None]:
def predict_y1_one_batch(model, dl):
    """Predict the first target (y1) for every row served by ``dl``.

    Predictions and targets from all batches are concatenated, so the result
    covers the whole loader whether it yields one batch or several.

    Args:
        model: network called as ``model(x_series, x_cont, x_cat)``.
        dl: iterable of ``(x_series, x_cont, x_cat, y1, y2)`` batches.

    Returns:
        tuple: ``(predictions, targets)`` as numpy arrays; predictions keep
        the model's output shape per batch, targets are the float y1 values.

    Raises:
        ValueError: if ``dl`` yields no batch.
    """
    # Inference mode, so batch-norm uses its running statistics.
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for x_series, x_cont, x_cat, y1, y2 in dl:
            out1 = model(x_series.float(), x_cont.float(), x_cat.long())
            predictions.append(out1.numpy())
            targets.append(y1.float().numpy())
    if not predictions:
        raise ValueError("predict_y1_one_batch received an empty data loader")
    return np.concatenate(predictions), np.concatenate(targets)

class MultiTask_validation(Dataset):
    """Dataset variant that serves every row of ``df`` without per-subject sampling.

    Items have the same layout as in ``MultiTask``:
    ``(x_series, x_cont, x_cat, prediction_mean_HR, prediction_mean_MAP)``.
    """

    def __init__(self, df, norm_dict, id2index, k=20, train=True):
        """
        Args:
            df: dataframe with data
            norm_dict: mean and std of all variables to normalize
            id2index: maps a subject_id to its embedding index
            k: forwarded to ``pick_a_sample``, which does not use it here
            train: when True the (randomly masked) subject index is exposed;
                when False it is always 0
        """
        self.norm_dict = norm_dict
        self.df = df
        self.names = ["age", "sapsii", "sofa"]  # columns that get normalised
        self.names_binary = ["gender", "amine", "sedation", "ventilation"]
        self.id2index = id2index
        self.train = train
        self.df_sample = self.pick_a_sample(k)

    def pick_a_sample(self, k=20):
        """Return a copy of the full dataframe; every period of every patient is kept."""
        frame = self.df.copy()
        if self.train:
            # About 10% of the periods end up with subject_index == 0.
            indices = [self.id2index[sid] for sid in frame.subject_id.values]
            self.random = np.random.choice(2, frame.shape[0], p = [0.1, 0.9])
            self.subject_index = indices*self.random
        else:
            # Fixed seed for validation and test.
            np.random.seed(3)
        return frame

    def __getitem__(self, index):
        """Build the model inputs and both targets for row ``index``."""
        record = self.df_sample.iloc[index,:]
        series_stats = self.norm_dict["series"]
        x_series = (record.series - series_stats[0])/series_stats[1]

        features = []
        for name in self.names:
            center, scale = self.norm_dict[name][0], self.norm_dict[name][1]
            features.append((record[name]-center)/scale)
        for name in self.names_binary:
            features.append(record[name])

        subject_index = self.subject_index[index] if self.train else 0
        x_cat = np.array([record["care_unit"], subject_index])
        x_cont = np.array(features)
        return x_series, x_cont, x_cat, record["prediction_mean_HR"], record["prediction_mean_MAP"]

    def __len__(self):
        """Number of rows served by the dataset."""
        return self.df_sample.shape[0]


In [None]:
# Test sets reuse the training normalisation statistics; with train=False the
# subject index fed to the model is always 0.
test_ds = MultiTask_validation(test, norm_dict, id2index, train=False)
test_larib_ds = MultiTask_validation(test_larib, norm_dict, id2index, train = False)

In [None]:
# NOTE(review): the batch sizes presumably equal the number of rows in each set
# so that one batch covers everything — confirm against test.shape / test_larib.shape.
test_dl = DataLoader(test_ds, batch_size=8233)
test_larib_dl = DataLoader(test_larib_ds, batch_size=1597)

In [None]:
val_metrics(model, test_dl, which_y="y1")

In [None]:
val_metrics(model, test_larib_dl, which_y="y1")

In [None]:
# HR
# Export predictions next to the observed values: column 0 is the model
# output, column 1 the target.
# NOTE(review): the output paths are absolute and user-specific; they must
# exist on the machine running the notebook.
out1, y1 = predict_y1_one_batch(model, test_dl)
y1 = np.reshape(y1, (-1,1))
arr_hr = np.concatenate((out1, y1) , axis=1)
pd.DataFrame(arr_hr).to_csv("/home/menyssa/Recherche/Mimic-III-Yannet/resultats/2e_analyse/intern_single_obs_pred_HR_15.csv")


# Same export for the second test set.
out1, y1 = predict_y1_one_batch(model, test_larib_dl)
y1 = np.reshape(y1, (-1,1))
arr_hr = np.concatenate((out1, y1) , axis=1)
pd.DataFrame(arr_hr).to_csv("/home/menyssa/Recherche/Mimic-III-Yannet/resultats/2e_analyse/larib_single_obs_pred_HR_15.csv")

In [None]:
# Swap in the weights of the MAP (y2) checkpoint, reusing the same model object.
path = PATH/"single_model_15min_y2_r2_95.pth"
load_model(model, path)

In [None]:
def predict_y2_one_batch(model, dl):
    """Predict the second target (y2) for every row served by ``dl``.

    Predictions and targets from all batches are concatenated, so the result
    covers the whole loader whether it yields one batch or several.

    Args:
        model: network called as ``model(x_series, x_cont, x_cat)``.
        dl: iterable of ``(x_series, x_cont, x_cat, y1, y2)`` batches.

    Returns:
        tuple: ``(predictions, targets)`` as numpy arrays; predictions keep
        the model's output shape per batch, targets are the float y2 values.

    Raises:
        ValueError: if ``dl`` yields no batch.
    """
    # Inference mode, so batch-norm uses its running statistics.
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for x_series, x_cont, x_cat, y1, y2 in dl:
            out2 = model(x_series.float(), x_cont.float(), x_cat.long())
            predictions.append(out2.numpy())
            targets.append(y2.float().numpy())
    if not predictions:
        raise ValueError("predict_y2_one_batch received an empty data loader")
    return np.concatenate(predictions), np.concatenate(targets)

In [None]:
val_metrics(model, test_dl, which_y="y2")

In [None]:
val_metrics(model, test_larib_dl, which_y="y2")

In [None]:
# MAP
# Export predictions next to the observed values: column 0 is the model
# output, column 1 the target.
# NOTE(review): the output paths are absolute and user-specific; they must
# exist on the machine running the notebook.
out2, y2 = predict_y2_one_batch(model, test_dl)
y2 = np.reshape(y2, (-1,1))
arr_map = np.concatenate((out2, y2) , axis=1)
pd.DataFrame(arr_map).to_csv("/home/menyssa/Recherche/Mimic-III-Yannet/resultats/2e_analyse/intern_single_obs_pred_MAP_15.csv")

# Same export for the second test set.
out2, y2 = predict_y2_one_batch(model, test_larib_dl)
y2 = np.reshape(y2, (-1,1))
arr_map = np.concatenate((out2, y2) , axis=1)
pd.DataFrame(arr_map).to_csv("/home/menyssa/Recherche/Mimic-III-Yannet/resultats/2e_analyse/larib_single_obs_pred_MAP_15.csv")

## Looking at the data

In [None]:
#filename = "data_train_{gap}.pickle".format(gap="5min")
#with open(PATH/filename, 'rb') as f:
#    train5 = pickle.load(f)

In [None]:
#filename = "data_train_{gap}.pickle".format(gap="10min")
#with open(PATH/filename, 'rb') as f:
#    train10 = pickle.load(f)

In [None]:
#cols = ["subject_id", "key", "prediction_mean_HR", "prediction_mean_MAP"]
#train5_s = train5.loc[:, cols]
#train10_s = train10.loc[:, cols]

In [None]:
#train5_s.iloc[:30]

In [None]:
#train10_s.iloc[:30]