This notebook contains the preprocessing steps. It is based on the notebook provided by Mullenbach et al. and their CAML implementation.

In [1]:
import sys
sys.path.append('../')
import datasets as ds
from dataproc import get_discharge_summaries
from dataproc import concat_and_split
from dataproc import build_vocab
from dataproc import word_embeddings
from dataproc import vocab_index_descriptions
from constants_mimic3 import MIMIC_3_DIR, DATA_DIR

import pickle
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
import ast

from collections import Counter, defaultdict
import csv
import math
import operator
from tqdm import tqdm

Let's do some data processing in a much better way, with a notebook.

First, let's define some stuff.

In [2]:
# Run-wide preprocessing settings.
notes_file = '%s/NOTEEVENTS.csv' % MIMIC_3_DIR  # raw note events downloaded from MIMIC-III
Y = 'full'           # label space: use every label available in the dataset
vocab_size = 'full'  # no cap on the vocabulary size
vocab_min = 3        # tokens found in fewer documents than this are discarded

# Data processing

## Combine diagnosis and procedure codes and reformat them

The codes in MIMIC-III are given in separate files for procedures and diagnoses, and the codes are given without periods, which might lead to collisions if we naively combine them. So we have to add the periods back in the right place.

In [3]:
# Load the two raw MIMIC-III code tables (procedures and diagnoses).
procedures_csv = '%s/PROCEDURES_ICD.csv' % MIMIC_3_DIR
diagnoses_csv = '%s/DIAGNOSES_ICD.csv' % MIMIC_3_DIR
dfproc = pd.read_csv(procedures_csv)
dfdiag = pd.read_csv(diagnoses_csv)

In [4]:
# Build the period-restored code for every row of both tables.
# The raw code is the fifth column; it is selected with `.iloc[4]` because plain
# `row[4]` on a label-indexed Series relies on pandas' deprecated integer-key
# positional fallback.
# NOTE(review): the boolean passed to ds.reformat presumably marks diagnosis codes
# (True) versus procedure codes (False) -- confirm against datasets.reformat.
dfdiag['absolute_code'] = dfdiag.apply(lambda row: str(ds.reformat(str(row.iloc[4]), True)), axis=1)
dfproc['absolute_code'] = dfproc.apply(lambda row: str(ds.reformat(str(row.iloc[4]), False)), axis=1)

In [5]:
# Stack diagnosis rows on top of procedure rows in a single table.
dfcodes = pd.concat((dfdiag, dfproc), axis=0)

In [6]:
# Write the combined table; the re-formatted code is exported under the ICD9_CODE header.
key_columns = ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM']
dfcodes.to_csv(
    '%s/ALL_CODES.csv' % MIMIC_3_DIR,
    columns=key_columns + ['absolute_code'],
    header=key_columns + ['ICD9_CODE'],
    index=False,
)

In [7]:
# optional prints for sanity checking
#pd.set_option('display.max_colwidth', None,'display.max_columns', 6)
#print(dfcodes.head(100))

## How many codes are there?

In [8]:
# Number of distinct codes in the full dataset (not just discharge summaries).
# ICD9_CODE is read as text so the codes are not re-interpreted as numbers.
df = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR, dtype={"ICD9_CODE": str})
distinct_codes = df['ICD9_CODE'].unique()
len(distinct_codes)

8994

## Tokenize and preprocess raw text

Preprocessing time!

This will:
- Select only discharge summaries and their addenda
- Remove punctuation and numeric-only tokens (e.g. `500` is removed, but `250mg` is kept)
- lowercase all tokens

In [9]:
# Read all notes, keep only the discharge summaries, tokenize them and write the
# result to disk; the call returns the name of the file it wrote.
disch_full_file = get_discharge_summaries.write_discharge_summaries(
    out_file="%s/disch_full.csv" % MIMIC_3_DIR,
)

processing notes file
writing to /Users/maltefeucht/PycharmProjects/TBM_ICD9_mimic3/mimicdata/mimic3/disch_full.csv


2083180it [01:37, 21272.39it/s]


Let's read this in and see what kind of data we're working with

In [10]:
# Load the tokenized discharge summaries written by the previous step.
df = pd.read_csv('%s/disch_full.csv' % MIMIC_3_DIR)

In [11]:
# check discharge summary file
#pd.set_option('display.max_colwidth', None,'display.max_columns', 6)
#print(df.head(5))

In [12]:
# Number of distinct admissions (HADM_ID) that have a discharge summary.
len(df['HADM_ID'].unique())

52726

In [13]:
# Vocabulary statistics over the notes: distinct tokens (types) and total token count.
# Field 4 of each itertuples() row is the fourth data column (field 0 is the index).
types = set()
num_tok = 0
for row in df.itertuples():
    tokens = row[4].split()
    types.update(tokens)
    num_tok += len(tokens)

In [14]:
# Report the vocabulary statistics computed in the previous cell.
print("Num types", len(types))
print("Num tokens", str(num_tok))

Num types 150854
Num tokens 79801387


In [15]:
# Order the notes by patient, then admission, so rows line up with the label file.
df = df.sort_values(by=['SUBJECT_ID', 'HADM_ID'])

In [16]:
# Load the combined label file and order it like the notes (SUBJECT_ID, then HADM_ID).
# ICD9_CODE is read as text, matching the earlier read of this same file: the codes
# are identifiers, and leaving the dtype to inference made this cell emit a warning
# in the recorded run (presumably pandas' mixed-type DtypeWarning).
dfl = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR, dtype={"ICD9_CODE": str})
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [17]:
# Distinct admissions in the notes versus in the label file.
len(df['HADM_ID'].unique()), len(dfl['HADM_ID'].unique())

(52726, 58976)

## Consolidate labels with set of discharge summaries

Some HADM_IDs in the label file have no discharge summary (58,976 admissions with codes versus 52,726 with a summary), so they are absent from our notes and their labels need to be filtered out.

In [18]:
# Keep only the label rows whose admission also appears in the discharge-summary notes.
hadm_ids = set(df['HADM_ID'])
# newline='' is what the csv module requires for files handed to csv.reader /
# csv.writer; without it, written rows can pick up extra blank lines on platforms
# that translate line endings.
with open('%s/ALL_CODES.csv' % MIMIC_3_DIR, 'r', newline='') as lf, \
        open('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, 'w', newline='') as of:
    r = csv.reader(lf)
    w = csv.writer(of)
    w.writerow(['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE', 'ADMITTIME', 'DISCHTIME'])
    # skip the input header
    next(r)
    for row in r:
        # input columns: ROW_ID, SUBJECT_ID, HADM_ID, SEQ_NUM, ICD9_CODE
        hadm_id = int(row[2])
        if hadm_id in hadm_ids:
            # SUBJECT_ID, HADM_ID, code; the two time columns are written empty
            w.writerow(row[1:3] + [row[-1], '', ''])

In [19]:
# Load the filtered label file written by the previous step.
# NOTE(review): the ICD9_CODE dtype is left to inference here and the recorded run
# shows a warning for this cell. Passing dtype={'ICD9_CODE': str} would keep codes as
# text, but could change the codes written back below -- confirm before changing.
filtered_codes_csv = '%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR
dfl = pd.read_csv(filtered_codes_csv, index_col=None)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [20]:
# Distinct admissions left in the filtered label file.
len(dfl['HADM_ID'].unique())

52726

In [21]:
# The filtered label file still has to be ordered by patient and admission;
# sort it and overwrite the file in place.
dfl = dfl.sort_values(by=['SUBJECT_ID', 'HADM_ID'])
dfl.to_csv('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, index=False)

## Append labels to notes in a single file

In [22]:
# The label-appending script works on files, so write the sorted notes back to disk
# first (this overwrites disch_full.csv with the sorted rows).
sorted_file = '%s/disch_full.csv' % MIMIC_3_DIR
df.to_csv(path_or_buf=sorted_file, index=False)

In [23]:
# Append each admission's codes to its note; the return value is used below as the
# path of the labeled notes file.
labeled = concat_and_split.concat_data('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, sorted_file)

CONCATENATING
0 done
10000 done
20000 done
30000 done
40000 done
50000 done


In [24]:
# Path of the labeled notes file that was just written.
print(labeled)

/Users/maltefeucht/PycharmProjects/TBM_ICD9_mimic3/mimicdata/mimic3/notes_labeled.csv


Let's sanity check the combined data we just made. Do we have all hadm id's accounted for, and the same vocab stats?

In [25]:
# Recompute types and tokens on the labeled notes to confirm nothing was lost.
# Field 3 of each itertuples() row is the third data column (field 0 is the index).
dfnl = pd.read_csv(labeled)
types = set()
num_tok = 0
for row in dfnl.itertuples():
    tokens = row[3].split()
    types.update(tokens)
    num_tok += len(tokens)

In [26]:
# These should match the statistics computed on the notes before labels were appended.
print("num types", len(types), "num tokens", num_tok)

num types 150854 num tokens 79801387


In [27]:
# Distinct admissions in the labeled notes.
print(len(dfnl['HADM_ID'].unique()))

52726


In [28]:
# (rows, columns) of the labeled notes.
print(dfnl.shape)

(52726, 4)


In [29]:
#dfnl.head()

### Create binarized labels for full (8921) codes 

In [30]:
# Drop admissions without any label, then turn each ';'-separated label string into
# a list for the binarizer.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below is not applied to a slice (avoids SettingWithCopyWarning).
dfnl = dfnl[dfnl['LABELS'].notnull()].copy()
dfnl['LABELS'] = dfnl['LABELS'].str.split(';', expand=False)
# Saved without the index; the file is read back later with a fresh 0..n-1 index.
dfnl.to_csv('%s/dfnl_intermediary.csv' % MIMIC_3_DIR, index=False)

In [31]:
# number of distinct admissions left after dropping rows without labels
print(len(dfnl['HADM_ID'].unique()))

52722


In [32]:
# (rows, columns) of the data frame after dropping rows without labels
print(dfnl.shape)

(52722, 4)


In [33]:
# sanity check labels
#dfnl['LABELS']

In [34]:
# make sure each LABELS entry is a list before feeding it to the MultiLabelBinarizer
# (spot check on one row)
type(dfnl['LABELS'].iloc[7])

list

In [35]:
# instantiate the MultiLabelBinarizer and compute the binarized label matrix y
# (one row per admission, one column per class)
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(dfnl['LABELS'])

In [36]:
# sanity check: shape of the binarized label matrix
print('The shape of the binarized label matrix is',y.shape)

The shape of the binarized label matrix is (52722, 8921)


In [37]:
# Display the binarized label matrix (numpy abbreviates the output)
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [38]:
# display the class labels in the binarizer's column order
multilabel.classes_

array(['003.0', '003.1', '003.8', ..., 'V90.81', 'V90.89', 'V91.03'],
      dtype=object)

In [39]:
# save the class labels, in the binarizer's column order, to disk
np.save('%s/FULL_LABELS' % MIMIC_3_DIR, multilabel.classes_)

In [40]:
# wrap the binarized matrix in a data frame with one column per class label
df_labels = pd.DataFrame(y, columns=multilabel.classes_)
#print(dfnl_labels.shape)
#print(dfnl.shape)
#df_labels

In [41]:
# Reload the intermediary notes file written above; reading it back yields a fresh
# default index.
dfnl_intermediary = pd.read_csv('%s/dfnl_intermediary.csv' % MIMIC_3_DIR)

In [42]:
# sanity check: the reloaded frame should have as many rows as the label matrix
dfnl_intermediary.shape

(52722, 4)

In [43]:
# Put the binarized label columns next to the notes.
# pd.concat(axis=1) aligns rows on the index, so differing indexes would silently
# produce NaN-padded rows instead of raising; fail loudly in that case.
assert dfnl_intermediary.index.equals(df_labels.index), \
    "notes and binarized labels are not row-aligned"
dfnl_labels = pd.concat([dfnl_intermediary, df_labels], axis=1)

In [44]:
# check the shape of the combined frame (note columns plus one column per class)
dfnl_labels.shape
#dfnl_labels['756.12'].iloc[52721]

(52722, 8925)

In [45]:
# Perform sanity checks
#print(dfnl_labels.iloc[52720])
#print(dfnl_labels['428.33'].iloc[52720])
#print(dfnl_labels.iloc[52721])
#dfnl_labels.head(10)

In [46]:
# write the full notes_labeled_binarized file to csv: it contains all discharge
# summaries with the binarized full-label columns appended
dfnl_labels.to_csv('%s/notes_labeled_binarized.csv' % MIMIC_3_DIR, index = False)

## Create train/dev/test splits 

In [47]:
# Split the labeled notes into train / dev / test.
base_name = "%s/disch" % MIMIC_3_DIR  # prefix used for the output files
fname = '%s/notes_labeled.csv' % MIMIC_3_DIR
tr, dv, te = concat_and_split.split_data(fname, base_name=base_name)

SPLITTING
0 read
10000 read
20000 read
30000 read
40000 read
50000 read


### Build vocabulary from training data for word2vec

In [48]:
# Build the vocabulary from the training split and write it to vocab.csv.
vname = '%s/vocab.csv' % MIMIC_3_DIR
vocab_min = 3  # same threshold as in the settings cell at the top
build_vocab.build_vocab(vocab_min, tr, vname)

reading in data...
removing rare terms
51917 terms qualify out of 140795 total
writing output


### Pre-train word embeddings for word2vec model

Let's train word embeddings on all words

In [49]:
# Train word2vec embeddings on the discharge summaries.
# NOTE(review): 100, 0, 20 are positional settings -- presumably embedding size,
# minimum count and number of iterations; confirm against dataproc.word_embeddings.
w2v_file = word_embeddings.word_embeddings('Text', 'full', '%s/disch_full.csv' % MIMIC_3_DIR, 100, 0, 20)

building word2vec vocab on /Users/maltefeucht/PycharmProjects/TBM_ICD9_mimic3/mimicdata/mimic3/disch_full.csv...
training...
writing embeddings to /Users/maltefeucht/PycharmProjects/TBM_ICD9_mimic3/mimicdata/mimic3/processed_full.w2v


### Pre-process code descriptions using the vocab

In [50]:
# Process the code descriptions with the vocabulary built above.
# NOTE(review): the first path is the vocab file; the remaining three look like
# output paths -- confirm against dataproc.vocab_index_descriptions.
vocab_index_descriptions.vocab_index_descriptions('%s/vocab.csv' % MIMIC_3_DIR,
                                                  '%s/description_vectors.vocab' % MIMIC_3_DIR, 
                                                  '%s/description_vectors_raw.npy' % MIMIC_3_DIR, 
                                                  '%s/description_vectors.csv' % MIMIC_3_DIR)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22266/22266 [00:00<00:00, 83557.20it/s]


### Pre-train word embeddings for ICD-9 codes descriptions

In [51]:
# Train word2vec embeddings on the code descriptions.
# NOTE(review): 300, 0, 30 are positional settings -- presumably embedding size,
# minimum count and number of iterations; confirm against dataproc.word_embeddings.
Labels_w2v_file = word_embeddings.word_embeddings('Label', 'full', '%s/description_vectors.csv' % MIMIC_3_DIR, 300, 0, 30)

building word2vec vocab on /Users/maltefeucht/PycharmProjects/TBM_ICD9_mimic3/mimicdata/mimic3/description_vectors.csv...
training...
writing embeddings to /Users/maltefeucht/PycharmProjects/TBM_ICD9_mimic3/mimicdata/mimic3/processed_full.w2v_labels


### Sort each data split by length for batching 

In [52]:
# For every split: add a whitespace-token count per note, sort by it, and save the
# result as <split>_full.csv.
for splt in ['train', 'dev', 'test']:
    filename = '%s/disch_%s_split.csv' % (MIMIC_3_DIR, splt)
    df = pd.read_csv(filename)
    # str() keeps non-string cells from breaking .split()
    df['length'] = df['TEXT'].map(lambda text: len(str(text).split()))
    df = df.sort_values(by=['length'])
    df.to_csv('%s/%s_full.csv' % (MIMIC_3_DIR, splt), index=False)

### Append binarized labels for train, dev, test split

In [53]:
# Reload the notes with the full binarized label columns.
binarized_full_csv = '%s/notes_labeled_binarized.csv' % MIMIC_3_DIR
df_full = pd.read_csv(binarized_full_csv)

In [54]:
# print the shape of the full binarized data frame
print(df_full.shape)

(52722, 8925)


In [55]:
# sanity check full df
#df_full.head()
#df_full.iloc[0,4]

In [56]:
# Attach the binarized label columns to every split and save <split>_full_binarized.csv.
for splt in tqdm(['train', 'dev', 'test']):
    filename = '%s/%s_full.csv' % (MIMIC_3_DIR, splt)
    df = pd.read_csv(filename)
    # Inner join on the admission id; columns present in both frames get the '_DROP'
    # suffix on the split-file side and are then filtered out.
    merged = df_full.merge(df, how='inner', on='HADM_ID', suffixes=('', '_DROP'))
    df_binarized = merged.filter(regex='^(?!.*_DROP)')
    # the merge does not keep the split file's length ordering, so sort again
    df_binarized = df_binarized.sort_values(by=['length'])
    df_binarized.to_csv('%s/%s_full_binarized.csv' % (MIMIC_3_DIR, splt), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:33<00:00, 71.31s/it]


In [57]:
# reload the binarized test split for a spot check
df_binarized_test = pd.read_csv('%s/test_full_binarized.csv' % MIMIC_3_DIR)

In [58]:
# Shape, then one hard-coded (row position, code) cell of the label matrix -- it
# printed 1 in the recorded run -- then the first row.
print(df_binarized_test.shape, '\n')
#print(df_binarized_test.iloc[1867], '\n')
print(df_binarized_test['518.0'].iloc[1867], '\n')
df_binarized_test.head(1)

(3372, 8926) 

1 



Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,LABELS,003.0,003.1,003.8,003.9,004.1,004.8,...,V88.01,V88.11,V88.12,V88.21,V90.10,V90.39,V90.81,V90.89,V91.03,length
0,98474,104128,admission date discharge date date of birth se...,"['860.4', '868.03', 'E957.1', '854.05']",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,224


In [59]:
# reload the binarized train split for a spot check
df_binarized_train = pd.read_csv('%s/train_full_binarized.csv' % MIMIC_3_DIR)

In [60]:
# Shape, then one hard-coded (row position, code) cell of the label matrix -- it
# printed 1 in the recorded run -- then the first row.
print(df_binarized_train.shape, '\n')
#print(df_binarized_train.iloc[29867], '\n')
print(df_binarized_train['788.20'].iloc[29867], '\n')
df_binarized_train.head(1)

(47719, 8926) 

1 



Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,LABELS,003.0,003.1,003.8,003.9,004.1,004.8,...,V88.01,V88.11,V88.12,V88.21,V90.10,V90.39,V90.81,V90.89,V91.03,length
0,158,169433,admission date discharge date date of birth se...,"['532.40', '493.20', 'V45.81', '412', '401.9',...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,51


In [61]:
# reload the binarized dev split for a spot check
df_binarized_dev = pd.read_csv('%s/dev_full_binarized.csv' % MIMIC_3_DIR)

In [62]:
# Shape, then one hard-coded (row position, code) cell of the label matrix -- it
# printed 1 in the recorded run -- then the first row.
print(df_binarized_dev.shape, '\n')
#print(df_binarized_dev.iloc[671], '\n')
print(df_binarized_dev['274.9'].iloc[671], '\n')
df_binarized_dev.head(1)


(1631, 8926) 

1 



Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,LABELS,003.0,003.1,003.8,003.9,004.1,004.8,...,V88.01,V88.11,V88.12,V88.21,V90.10,V90.39,V90.81,V90.89,V91.03,length
0,86006,111912,admission date discharge date date of birth se...,"['801.35', '348.4', '805.06', '807.01', '998.3...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,230


## Filter each split to the top 50 diagnosis/procedure codes

In [63]:
# From here on Y is the number of most frequent codes to keep (it was 'full' above).
Y = 50

In [64]:
# Count how many admissions carry each code, as input for the top-k selection.
# Field 4 of each itertuples() row is the fourth data column (field 0 is the index).
counts = Counter()
dfnl = pd.read_csv('%s/notes_labeled.csv' % MIMIC_3_DIR)
for row in dfnl.itertuples():
    counts.update(str(row[4]).split(';'))

In [65]:
# (code, count) pairs ordered from most to least frequent; ties keep insertion order.
codes_50 = sorted(counts.items(), key=lambda code_count: code_count[1], reverse=True)

In [66]:
# Keep only the code strings of the Y most frequent labels.
# The list is rebuilt from `counts` instead of from the previous value of `codes_50`:
# deriving it from itself meant that running this cell a second time silently
# replaced every code by its first character.
codes_50 = [code for code, _ in
            sorted(counts.items(), key=operator.itemgetter(1), reverse=True)[:Y]]

In [67]:
# The selected codes, most frequent first.
codes_50

['401.9',
 '38.93',
 '428.0',
 '427.31',
 '414.01',
 '96.04',
 '96.6',
 '584.9',
 '250.00',
 '96.71',
 '272.4',
 '518.81',
 '99.04',
 '39.61',
 '599.0',
 '530.81',
 '96.72',
 '272.0',
 '285.9',
 '88.56',
 '244.9',
 '486',
 '38.91',
 '285.1',
 '36.15',
 '276.2',
 '496',
 '99.15',
 '995.92',
 'V58.61',
 '507.0',
 '038.9',
 '88.72',
 '585.9',
 '403.90',
 '311',
 '305.1',
 '37.22',
 '412',
 '33.24',
 '39.95',
 '287.5',
 '410.71',
 '276.1',
 'V45.81',
 '424.0',
 '45.13',
 'V15.82',
 '511.9',
 '37.23']

In [68]:
# Write the selected codes, one per row, to TOP_<Y>_CODES.csv.
# newline='' is what the csv module requires for files handed to csv.writer; without
# it, rows can pick up extra blank lines on platforms that translate line endings.
with open('%s/TOP_%s_CODES.csv' % (MIMIC_3_DIR, str(Y)), 'w', newline='') as of:
    w = csv.writer(of)
    for code in codes_50:
        w.writerow([code])

### Create train, dev, test split for Top 50 Codes

In [69]:
# For each split, copy the labeled notes whose admission id is listed in that split's
# id file, keeping only labels from the top codes and dropping notes left with none.
top_codes = set(codes_50)  # built once; it does not change between rows
for splt in ['train', 'dev', 'test']:
    print(splt)
    # admission ids of this split, one per line (kept as strings, like the csv fields)
    with open('%s/%s_50_hadm_ids.csv' % (MIMIC_3_DIR, splt), 'r') as f:
        hadm_ids = {line.rstrip() for line in f}
    # newline='' is what the csv module requires for files handed to reader/writer
    with open('%s/notes_labeled.csv' % MIMIC_3_DIR, 'r', newline='') as f, \
            open('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), 'w', newline='') as of:
        r = csv.reader(f)
        w = csv.writer(of)
        # copy the header unchanged
        w.writerow(next(r))
        for row in r:
            hadm_id = row[1]
            if hadm_id not in hadm_ids:
                continue
            filtered_codes = set(row[3].split(';')).intersection(top_codes)
            if filtered_codes:
                # sorted() fixes the label order; joining the raw set gave an order
                # that could differ from one run to the next
                w.writerow(row[:3] + [';'.join(sorted(filtered_codes))])

train
dev
test


### Create full notes file for Top 50 codes

In [None]:
# Load the top-50 train split and show its (rows, columns).
df_train_50 = pd.read_csv('%s/train_50.csv' % MIMIC_3_DIR)
df_train_50.shape

In [None]:
# Load the top-50 test split and show its (rows, columns).
df_test_50 = pd.read_csv('%s/test_50.csv' % MIMIC_3_DIR)
df_test_50.shape

In [None]:
# Load the top-50 dev split and show its (rows, columns).
df_dev_50 = pd.read_csv('%s/dev_50.csv' % MIMIC_3_DIR)
df_dev_50.shape


In [None]:
# Stack the three top-50 splits (train, test, dev order) into one notes file.
# The labels are still ';'-separated strings at this point; binarization follows below.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. Row order and
# the non-reset index are the same as the chained append calls produced.
df_inter_50 = pd.concat([df_train_50, df_test_50])
df_full_50 = pd.concat([df_inter_50, df_dev_50])
df_full_50.to_csv('%s/notes_labeled_%s.csv' % (MIMIC_3_DIR, str(Y)), index=False)


In [None]:
# (rows, columns) of the combined top-50 notes.
df_full_50.shape

### Create Binarized Labels for Top (50) codes

In [None]:
# Drop rows without labels, then turn each ';'-separated label string into a list
# for the binarizer.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below is not applied to a slice (avoids SettingWithCopyWarning).
df_full_50 = df_full_50[df_full_50['LABELS'].notnull()].copy()
df_full_50['LABELS'] = df_full_50['LABELS'].str.split(';', expand=False)
# Saved without the index; the file is read back later with a fresh 0..n-1 index.
df_full_50.to_csv('%s/dfnl_50_intermediary.csv' % MIMIC_3_DIR, index=False)

In [None]:
# (rows, columns) after dropping rows without labels.
df_full_50.shape

In [None]:
# Fit a separate MultiLabelBinarizer on the top-50 label lists and compute the
# binarized label matrix.
multilabel_50 = MultiLabelBinarizer()
y_50 = multilabel_50.fit_transform(df_full_50['LABELS'])


In [None]:
# (rows, classes) of the top-50 binarized label matrix.
y_50.shape

In [None]:
# Display the top-50 binarized label matrix.
y_50

In [None]:
# Class labels in the binarizer's column order.
multilabel_50.classes_

In [None]:
# Save the top-50 class labels, in the binarizer's column order, to disk.
np.save('%s/TOP_50_LABELS' % MIMIC_3_DIR, multilabel_50.classes_)

In [None]:
# wrap the top-50 binarized matrix in a data frame with one column per class label
df_labels_50 = pd.DataFrame(y_50, columns=multilabel_50.classes_)
print(df_labels_50.shape, '\n')
#print(df_labels_50)

In [None]:
# reload the top-50 intermediary notes file written above; reading it back yields a
# fresh default index
df_intermediary_50 = pd.read_csv('%s/dfnl_50_intermediary.csv' % MIMIC_3_DIR)

In [None]:
# sanity check: should have as many rows as the top-50 label matrix
df_intermediary_50.shape

In [None]:
# Put the top-50 binarized label columns next to the reloaded notes.
# pd.concat(axis=1) aligns rows on the index, so differing indexes would silently
# produce NaN-padded rows instead of raising; fail loudly in that case.
assert df_intermediary_50.index.equals(df_labels_50.index), \
    "top-50 notes and binarized labels are not row-aligned"
dfnl_labels_50 = pd.concat([df_intermediary_50, df_labels_50], axis=1)

In [None]:
# (rows, columns) of the notes plus top-50 label columns.
dfnl_labels_50.shape

In [None]:
# Perform sanity checks
#print(dfnl_labels_50.iloc[7865])
#print(dfnl_labels_50['276.2'].iloc[7865], '\n')
#dfnl_labels_50.head(1)

In [None]:
# Write the top-50 notes with their binarized label columns to csv.
dfnl_labels_50.to_csv('%s/notes_labeled_50_binarized.csv' % MIMIC_3_DIR, index = False)

### Sort each data split by length for batching 

In [None]:
# For every top-Y split: add a whitespace-token count per note, sort by it, and
# write the result back over the same file.
for splt in ['train', 'dev', 'test']:
    filename = '%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y))
    df = pd.read_csv(filename)
    # str() keeps non-string cells from breaking .split()
    df['length'] = df['TEXT'].map(lambda text: len(str(text).split()))
    df = df.sort_values(by=['length'])
    df.to_csv('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), index=False)

### Append binarized labels for train, dev, test split

In [None]:
# Reload the top-50 notes with their binarized label columns.
binarized_50_csv = '%s/notes_labeled_50_binarized.csv' % MIMIC_3_DIR
df_full_50 = pd.read_csv(binarized_50_csv)

In [None]:
# print the shape of the reloaded df_full_50
print(df_full_50.shape)

In [None]:
# sanity check df_full_50
#df_full_50.head(1)

In [None]:
# Attach the top-50 binarized label columns to every split and save
# <split>_<Y>_binarized.csv.
for splt in tqdm(['train', 'dev', 'test']):
    filename = '%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y))
    df = pd.read_csv(filename)
    # Inner join on the admission id; columns present in both frames get the '_DROP'
    # suffix on the split-file side and are then filtered out.
    merged = df_full_50.merge(df, how='inner', on='HADM_ID', suffixes=('', '_DROP'))
    df_binarized = merged.filter(regex='^(?!.*_DROP)')
    # the merge does not keep the split file's length ordering, so sort again
    df_binarized = df_binarized.sort_values(by=['length'])
    df_binarized.to_csv('%s/%s_%s_binarized.csv' % (MIMIC_3_DIR, splt, str(Y)), index=False)

In [None]:
# reload the binarized top-50 test split for a spot check
df_binarized_50_test = pd.read_csv('%s/test_50_binarized.csv' % MIMIC_3_DIR)

In [None]:
# Shape, then one hard-coded (row position, code) cell of the label matrix, then the
# first row.
# NOTE(review): no output is recorded for this cell, so the expected value of the
# spot check is not documented here.
print(df_binarized_50_test.shape, '\n')
#print(df_binarized_50_test.iloc[8], '\n')
print(df_binarized_50_test['96.71'].iloc[8], '\n')
df_binarized_50_test.head(1)

In [None]:
# reload the binarized top-50 train split for a spot check
df_binarized_50_train = pd.read_csv('%s/train_50_binarized.csv' % MIMIC_3_DIR)

In [None]:
# Shape, then one hard-coded (row position, code) cell of the label matrix, then the
# first row.
# NOTE(review): no output is recorded for this cell, so the expected value of the
# spot check is not documented here.
print(df_binarized_50_train.shape, '\n')
#print(df_binarized_50_train.iloc[4763], '\n')
print(df_binarized_50_train['305.1'].iloc[4763], '\n') 
df_binarized_50_train.head(1)

In [None]:
# reload the binarized top-50 dev split for a spot check
df_binarized_50_dev = pd.read_csv('%s/dev_50_binarized.csv' % MIMIC_3_DIR)

In [None]:
# Shape, then one hard-coded (row position, code) cell of the label matrix, then the
# first row.
# NOTE(review): no output is recorded for this cell, so the expected value of the
# spot check is not documented here.
print(df_binarized_50_dev.shape, '\n')
#print(df_binarized_50_dev.iloc[1572], '\n')
print(df_binarized_50_dev['518.81'].iloc[1572], '\n') 
df_binarized_50_dev.head(1)