In [1]:
import sys
sys.path.insert(0,"/work/pip")

from __future__ import absolute_import
from __future__ import print_function

import numpy as np
import argparse
import os
import imp
import re

from mimic3models.in_hospital_mortality import utils
from mimic3benchmark.readers import Reader, InHospitalMortalityReader

from mimic3models.preprocessing import Discretizer, Normalizer, DiscretizerContinuous

In [2]:
class PatientEmbeddingReader(Reader):
    """Reader that serves raw time-series windows for patient-embedding training.

    Every listfile row is parsed into a ``(timeseries filename, int, int)``
    tuple; the two integers are exposed by :meth:`read_example` as ``norm``
    and (after an offset) ``end_time``.
    """

    def __init__(self, dataset_dir, listfile=None, period_length=48.0):
        """ Reader for the patient-embedding task.

        :param dataset_dir:   Directory where timeseries files are stored.
        :param listfile:      Path to a listfile. If this parameter is left `None` then
                              `dataset_dir/listfile.csv` will be used.
        :param period_length: Total window length in hours, i.e. double the length of
                              the period from which the embedding is created.
        """
        Reader.__init__(self, dataset_dir, listfile)
        # Each listfile row has exactly three comma-separated fields.
        self._data = [line.split(',') for line in self._data]
        self._data = [(x, int(wt), int(w)) for (x, wt, w) in self._data]
        self._period_length = period_length
        # Lazily computed by get_input_dim().
        self._input_dim = None

    def _read_timeseries(self, ts_filename):
        """Read one timeseries CSV from the dataset directory.

        :param ts_filename: File name relative to the dataset directory.
        :return: Tuple ``(X, header)`` where ``X`` is a 2D array of the raw
                 (string) cell values, one row per line of the file, and
                 ``header`` is the list of column names.
        """
        ret = []
        with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile:
            header = tsfile.readline().strip().split(',')
            assert header[0] == "Hours"
            for line in tsfile:
                mas = line.strip().split(',')
                ret.append(np.array(mas))
        return (np.stack(ret), header)

    def get_input_dim(self):
        """Return the number of columns of the raw timeseries files.

        The value is taken from the first example in the listfile and cached.
        """
        if self._input_dim is None:
            # Any example would do; the first one is always available when the
            # listfile is non-empty.
            name, _norm, _end_time = self._data[0]
            (X, _header) = self._read_timeseries(name)
            self._input_dim = X.shape[1]
        return self._input_dim

    def read_example(self, index):
        """ Reads the example with given index.

        :param index: Index of the line of the listfile to read (counting starts from 0).
        :return: Dictionary with the following keys:
            X : np.array
                2D array containing all events. Each row corresponds to a moment.
                First column is the time and other columns correspond to different
                variables.
            t : float
                The reader's ``period_length``.
            norm : int
                Second field of the listfile row.
            end_time : int
                Third field of the listfile row plus 48.
            header : array of strings
                Names of the columns. The ordering of the columns is always the same.
            name: Name of the sample.
        :raises ValueError: If ``index`` is outside the listfile.
        """
        if index < 0 or index >= len(self._data):
            raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).")

        name, norm, end_time = self._data[index]
        # NOTE(review): the offset is a fixed 48 rather than self._period_length;
        # presumably the listfile stores the window start - confirm.
        end_time += 48
        t = self._period_length
        (X, header) = self._read_timeseries(name)

        return {"X": X,
                "t": t,
                "norm": norm,
                "end_time": end_time,
                "header": header,
                "name": name}

In [3]:
# Data directories.
# The processed-data root keeps its original default but can be redirected
# through the MIMIC_DATA_DIR environment variable, so the notebook can run on a
# machine with a different layout without editing this cell.
data_basedir = os.environ.get('MIMIC_DATA_DIR', '/work/MIMIC/processed_data')
data_patient_embedding = os.path.join(data_basedir, 'patient_embeddings_test')

In [4]:
# Reader over the training split; the listfile lives next to the timeseries files.
train_reader = PatientEmbeddingReader(
    dataset_dir=os.path.join(data_patient_embedding, 'train'),
    listfile=os.path.join(data_patient_embedding, 'train', 'listfile.csv'),
    period_length=48.0,
)

In [5]:
# Discretizer configuration shared by every example read below.
discretizer = DiscretizerContinuous(
    timestep=1.0,
    store_masks=False,
    impute_strategy='previous',
    start_time='zero',
)

In [6]:
# Discretize the first training example once to obtain the output column names,
# then keep the positions of every column whose name has no "->" marker.
discretizer_header = discretizer.transform(train_reader.read_example(0)["X"])[1]
cont_channels = [
    position
    for position, column_name in enumerate(discretizer_header)
    if "->" not in column_name
]

In [7]:
# Inspect which column positions were selected as continuous channels.
cont_channels

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [8]:
# Shape of a single row (index 1) of the discretized first example.
discretizer.transform(train_reader.read_example(0)["X"])[0][1].shape

(17,)

In [9]:
# Column names produced by the discretizer for the first example.
discretizer_header

['Capillary refill rate',
 'Diastolic blood pressure',
 'Fraction inspired oxygen',
 'Glascow coma scale eye opening',
 'Glascow coma scale motor response',
 'Glascow coma scale total',
 'Glascow coma scale verbal response',
 'Glucose',
 'Heart Rate',
 'Height',
 'Mean blood pressure',
 'Oxygen saturation',
 'Respiratory rate',
 'Systolic blood pressure',
 'Temperature',
 'Weight',
 'pH']

In [10]:
# Normalizer over the continuous channels, restored from a previously saved
# parameter file that sits inside the mimic3-benchmarks checkout.
normalizer = Normalizer(fields=cont_channels)
normalizer_state = os.path.join(
    "/work/mimic3-benchmarks/mimic3models/patient_embedding",
    'ptemb_ts1.0.input_str:previous.start_time:zero.normalizer',
)
normalizer.load_params(normalizer_state)

In [11]:
# Sanity check on the loaded parameters: shape of the stored `_stds` array.
normalizer._stds.shape

(17,)

In [12]:
# Raw (still string-typed, not yet discretized) values of the first training example.
train_reader.read_example(0)["X"]

array([['114.21499999999999', '', '76.0', ..., '37.444445292154946', '',
        ''],
       ['115.21499999999999', '', '83.0', ..., '', '', ''],
       ['116.21499999999999', '', '89.0', ..., '', '', ''],
       ...,
       ['260.215', '', '68.0', ..., '', '', ''],
       ['262.215', '', '80.0', ..., '37.66669845581055', '', ''],
       ['275.21500000000003', '', '', ..., '', '', '']], dtype='<U18')

In [13]:
import torch
from torch.utils import data
import copy

# PyTorch Dataset wrapper that turns PatientEmbeddingReader examples into
# source / target arrays for sequence-to-sequence training.
class PatientEmbeddingDataset(data.Dataset):
    """Map-style dataset producing masked-source / target pairs from a reader.

    Each item takes the last ``t`` rows of the discretized (and optionally
    normalized) example, splits them into a first part (``src``) and a final
    ``int(t / 2)``-row part (``tgt``), and zeroes a random subset of ``src``
    rows to build ``src_masked``.

    :param reader:       Object providing ``read_example``, ``get_number_of_examples``,
                         ``get_input_dim`` and ``_period_length``.
    :param discretizer:  Object whose ``transform(X, end=...)`` returns a sequence
                         with the discretized data at position 0.
    :param normalizer:   Optional object with ``transform(X)``; skipped when ``None``.
    :param mask_percent: Fraction of source rows to zero out.
    :param return_name:  When true, each item also carries the example name.
    """

    def __init__(self, reader, discretizer, normalizer=None, mask_percent=.15, return_name=False):
        self.reader = reader
        self.discretizer = discretizer
        self.normalizer = normalizer
        self.mask_percent = mask_percent
        self.return_name = return_name

    def __len__(self):
        """Number of examples exposed by the underlying reader."""
        return self.reader.get_number_of_examples()

    def get_input_dim(self):
        """Input dimensionality as reported by the underlying reader."""
        return self.reader.get_input_dim()

    def get_seq_length(self):
        """The reader's period length (the full window, source plus target)."""
        return self.reader._period_length

    def __getitem__(self, index):
        """Build one training sample.

        :param index: Position of the example in the reader's listfile.
        :return: Dict with ``src_masked``, ``src``, ``tgt_input``, ``tgt``
                 (2D arrays), ``norm``, ``mask`` (1.0 where the source row was
                 zeroed) and, if ``return_name`` is set, ``name``.
        """
        ret = self.reader.read_example(index)
        t = ret["t"]
        window = int(t)
        half = int(t / 2)

        X = self.discretizer.transform(ret["X"], end=ret["end_time"])[0][-window:]
        if self.normalizer is not None:
            X = self.normalizer.transform(X)

        src = np.array(X[-window:-half])
        tgt = np.array(X[-half:])

        # Draw the masked positions from the torch RNG rather than numpy's
        # global RNG: DataLoader gives every worker its own torch seed, whereas
        # numpy's state can be duplicated across workers, which would make all
        # workers emit the same masks.
        mask = np.zeros(half)
        n_masks = round(self.mask_percent * half)
        mask_ids = torch.randperm(half)[:n_masks].numpy()
        mask[mask_ids] = 1

        src_masked = src.copy()
        src_masked[(mask == 1), :] = 0

        # Decoder input is the target shifted right by one step, seeded with
        # the last source row.
        tgt_input = np.vstack((src[-1, :], tgt[:-1, :]))

        sample = {'src_masked': src_masked, 'src': src,
                  'tgt_input': tgt_input, 'tgt': tgt,
                  'norm': ret["norm"], 'mask': mask}
        if self.return_name:
            sample['name'] = ret["name"]
        return sample
        
        

In [14]:
# Training dataset; names are returned so individual samples can be traced back.
PEmbDataset = PatientEmbeddingDataset(
    reader=train_reader,
    discretizer=discretizer,
    normalizer=normalizer,
    return_name=True,
)


In [15]:
# Shuffled, multi-process batch loader over the training dataset.
training_generator = data.DataLoader(
    PEmbDataset,
    batch_size=512,
    shuffle=True,
    num_workers=12,
)

In [16]:
# Smoke test: pull a single batch and show the collated tensor shape.
# The loop variable is deliberately not called `data`: that name is the
# `torch.utils.data` module imported earlier, and rebinding it would break
# re-running the cells that use `data.Dataset` / `data.DataLoader`.
for batch in training_generator:
    print(batch['src_masked'].shape)
    break

torch.Size([512, 24, 17])
