In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
df_adm = pd.read_csv('ADMISSIONS.csv', parse_dates = ['ADMITTIME', 'DISCHTIME',
       'DEATHTIME'], infer_datetime_format = True, engine = "c")

In [3]:
df_adm = df_adm[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE']]

In [4]:
df_adm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58976 entries, 0 to 58975
Data columns (total 6 columns):
SUBJECT_ID        58976 non-null int64
HADM_ID           58976 non-null int64
ADMITTIME         58976 non-null datetime64[ns]
DISCHTIME         58976 non-null datetime64[ns]
DEATHTIME         5854 non-null datetime64[ns]
ADMISSION_TYPE    58976 non-null object
dtypes: datetime64[ns](3), int64(2), object(1)
memory usage: 2.7+ MB


#### The next step is to get the next unplanned admission date if it exists. sort by subject_ID and admission date

In [5]:
df_adm = df_adm.sort_values(['SUBJECT_ID','ADMITTIME'])
df_adm = df_adm.reset_index(drop = True)

In [6]:
df_adm.tail(10)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE
58966,99973,150202,2180-11-27 02:30:00,2180-12-01 13:42:00,NaT,EMERGENCY
58967,99982,151454,2156-11-28 11:56:00,2156-12-08 13:45:00,NaT,EMERGENCY
58968,99982,112748,2157-01-05 17:27:00,2157-01-12 13:00:00,NaT,EMERGENCY
58969,99982,183791,2157-02-16 17:31:00,2157-02-22 20:36:00,NaT,EMERGENCY
58970,99983,117390,2193-04-26 11:35:00,2193-04-29 13:30:00,NaT,EMERGENCY
58971,99985,176670,2181-01-27 02:47:00,2181-02-12 17:05:00,NaT,EMERGENCY
58972,99991,151118,2184-12-24 08:30:00,2185-01-05 12:15:00,NaT,ELECTIVE
58973,99992,197084,2144-07-25 18:03:00,2144-07-28 17:56:00,NaT,EMERGENCY
58974,99995,137810,2147-02-08 08:00:00,2147-02-11 13:15:00,NaT,ELECTIVE
58975,99999,113369,2117-12-30 07:15:00,2118-01-04 16:30:00,NaT,ELECTIVE


In [7]:
#### It is the way we want it to be

In [8]:
df_adm['NEXT_ADMITTIME'] = df_adm.groupby('SUBJECT_ID').ADMITTIME.shift(-1)

In [9]:
# get the next admission type
df_adm['NEXT_ADMISSION_TYPE'] = df_adm.groupby('SUBJECT_ID').ADMISSION_TYPE.shift(-1)

In [10]:
df_adm.head(200)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE
0,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,NaT,NEWBORN,NaT,
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,NaT,
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,NaT,
3,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,NaT,NEWBORN,NaT,
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,NaT,
5,7,118037,2121-05-23 15:05:00,2121-05-27 11:57:00,NaT,NEWBORN,NaT,
6,8,159514,2117-11-20 10:22:00,2117-11-24 14:20:00,NaT,NEWBORN,NaT,
7,9,150750,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,EMERGENCY,NaT,
8,10,184167,2103-06-28 11:36:00,2103-07-06 12:10:00,NaT,NEWBORN,NaT,
9,11,194540,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,EMERGENCY,NaT,


In [11]:
#### Gain we can make sure that it is working the way we want.

In [12]:
#### we want to predict UNPLANNED re-admissions, so we should filter out the ELECTIVE next admissions.

In [13]:
# get rows where next admission is elective and replace with naT or nan

In [14]:
rows = df_adm.NEXT_ADMISSION_TYPE == 'ELECTIVE'
df_adm.loc[rows,'NEXT_ADMITTIME'] = pd.NaT
df_adm.loc[rows,'NEXT_ADMISSION_TYPE'] = np.NaN

In [15]:
df_adm.head(200)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE
0,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,NaT,NEWBORN,NaT,
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,NaT,
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,NaT,
3,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,NaT,NEWBORN,NaT,
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,NaT,
5,7,118037,2121-05-23 15:05:00,2121-05-27 11:57:00,NaT,NEWBORN,NaT,
6,8,159514,2117-11-20 10:22:00,2117-11-24 14:20:00,NaT,NEWBORN,NaT,
7,9,150750,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,EMERGENCY,NaT,
8,10,184167,2103-06-28 11:36:00,2103-07-06 12:10:00,NaT,NEWBORN,NaT,
9,11,194540,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,EMERGENCY,NaT,


In [16]:
#### For example for row # 180 the elective readmission is deleted

In [17]:
# sort by subject_ID and admission date
# it is safer to sort right before the fill in case something changed the order above
df_adm = df_adm.sort_values(['SUBJECT_ID','ADMITTIME'])
# back fill (this will take a little while)
df_adm[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']] = df_adm.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']].fillna(method = 'bfill')

In [18]:
df_adm.head(200)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE
0,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,NaT,NEWBORN,NaT,
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,NaT,
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,NaT,
3,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,NaT,NEWBORN,NaT,
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,NaT,
5,7,118037,2121-05-23 15:05:00,2121-05-27 11:57:00,NaT,NEWBORN,NaT,
6,8,159514,2117-11-20 10:22:00,2117-11-24 14:20:00,NaT,NEWBORN,NaT,
7,9,150750,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,EMERGENCY,NaT,
8,10,184167,2103-06-28 11:36:00,2103-07-06 12:10:00,NaT,NEWBORN,NaT,
9,11,194540,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,EMERGENCY,NaT,


In [19]:
df_adm['DAYS_NEXT_ADMIT']=  (df_adm.NEXT_ADMITTIME - df_adm.DISCHTIME).dt.total_seconds()/(24*60*60)

In [20]:
df_adm.head(200)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE,DAYS_NEXT_ADMIT
0,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,NaT,NEWBORN,NaT,,
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,NaT,,
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,NaT,,
3,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,NaT,NEWBORN,NaT,,
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,NaT,,
5,7,118037,2121-05-23 15:05:00,2121-05-27 11:57:00,NaT,NEWBORN,NaT,,
6,8,159514,2117-11-20 10:22:00,2117-11-24 14:20:00,NaT,NEWBORN,NaT,,
7,9,150750,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,EMERGENCY,NaT,,
8,10,184167,2103-06-28 11:36:00,2103-07-06 12:10:00,NaT,NEWBORN,NaT,,
9,11,194540,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,EMERGENCY,NaT,,


In [21]:
#### histogram of days between admissions.

In [22]:
use_cols = ['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'TEXT']

In [23]:
df_notes = pd.read_csv("NOTEEVENTS.csv",usecols= use_cols, low_memory = False, engine = "c")

In [24]:
df_notes.columns

Index(['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'TEXT'], dtype='object')

In [25]:
df_notes_dis_sum = df_notes.loc[df_notes.CATEGORY == 'Discharge summary']

In [26]:
#assert df_notes_dis_sum.duplicated(['HADM_ID']).sum() == 0

In [27]:
#### For simlicity I am going to use only the last discharge note:

In [28]:
df_notes_dis_sum_last = (df_notes_dis_sum.groupby(['SUBJECT_ID','HADM_ID']).nth(-1)).reset_index()

In [29]:
assert df_notes_dis_sum_last.duplicated(['HADM_ID']).sum() == 0, 'Multiple discharge summaries per admission'

In [30]:
df_adm_notes = pd.merge(df_adm, df_notes_dis_sum_last, on = ['SUBJECT_ID','HADM_ID'],how = 'left')

In [31]:
assert len(df_adm) == len(df_adm_notes), 'Number of rows increased'

In [32]:
df_adm_notes.TEXT.isnull().sum() / len(df_adm_notes)

0.1059753119913185

In [33]:
#### 10.6 % of the admissions are missing 

In [34]:
df_adm_notes.groupby('ADMISSION_TYPE').apply(lambda g: g.TEXT.isnull().sum())/df_adm_notes.groupby('ADMISSION_TYPE').size()

ADMISSION_TYPE
ELECTIVE     0.048663
EMERGENCY    0.037983
NEWBORN      0.536691
URGENT       0.042665
dtype: float64

In [35]:
#### 53% of the NEWBORN admissions were missing discharge summaries vs ~4% for the others. 

In [36]:
df_adm_notes_clean = df_adm_notes.loc[df_adm_notes.ADMISSION_TYPE != "NEWBORN"] 

In [37]:
df_adm_notes_clean['OUTPUT_LABEL'] = (df_adm_notes_clean.DAYS_NEXT_ADMIT < 30).astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
df_adm_notes_clean.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE,DAYS_NEXT_ADMIT,CATEGORY,TEXT,OUTPUT_LABEL
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,NaT,,,Discharge summary,Admission Date: [**2101-10-20**] Discharg...,0
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,NaT,,,Discharge summary,Admission Date: [**2191-3-16**] Discharge...,0
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,NaT,,,Discharge summary,Admission Date: [**2175-5-30**] Dischar...,0
7,9,150750,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,EMERGENCY,NaT,,,Discharge summary,"Name: [**Known lastname 10050**], [**Known fi...",0
9,11,194540,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,EMERGENCY,NaT,,,Discharge summary,Admission Date: [**2178-4-16**] ...,0


In [39]:
df_adm_notes_clean.OUTPUT_LABEL.value_counts()

0    48109
1     3004
Name: OUTPUT_LABEL, dtype: int64

In [40]:
#### This indicates that we have an imbalanced dataset, which is a common occurrence in healthcare data science.

In [41]:
# shuffle the samples

In [42]:
df_adm_notes_clean = df_adm_notes_clean.sample(n = len(df_adm_notes_clean), random_state = 42)

In [43]:
df_adm_notes_clean = df_adm_notes_clean.reset_index(drop = True)

In [44]:
# Save 30% of the data as validation and test data 
df_valid = df_adm_notes_clean.sample(frac=0.2,random_state=42)

In [45]:
# df_test = df_valid_test.sample(frac = 0.5, random_state = 42)
df_valid = df_valid_test.drop(df_test.index)
# use the rest of the data as training data
# df_train_all=df_adm_notes_clean.drop(df_valid_test.index)

NameError: name 'df_valid_test' is not defined

In [None]:
# split the training data into positive and negative
rows_pos = df_train_all.OUTPUT_LABEL == 1
df_train_pos = df_train_all.loc[rows_pos]
df_train_neg = df_train_all.loc[~rows_pos]

In [None]:
# merge the balanced data
df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos), random_state = 42)],axis = 0)

In [None]:
# merge the balanced data
df_train_DNN = pd.concat([df_train_pos, df_train_neg.sample(n = 3* len(df_train_pos), random_state = 42)],axis = 0)

In [None]:
# shuffle the order of training samples 
df_train = df_train.sample(n = len(df_train), random_state = 42).reset_index(drop = True)
df_train_DNN = df_train_DNN.sample(n = len(df_train_DNN), random_state = 42).reset_index(drop = True)

In [None]:
#### Preprocess the unstructured notes using a bag-of-words approach

In [None]:
def preprocess_text(df):
    # This function preprocesses the text by filling not a number and replacing new lines ('\n') and carriage returns ('\r')
    df.TEXT = df.TEXT.fillna(' ')
    df.TEXT = df.TEXT.str.replace('\n',' ')
    df.TEXT = df.TEXT.str.replace('\r',' ')
    return df

In [None]:
# preprocess the text to deal with known issues
df_train = preprocess_text(df_train)
df_valid = preprocess_text(df_valid)
#df_test = preprocess_text(df_test)
df_train_DNN = preprocess_text(df_train_DNN)

In [None]:
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams


# sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics

from numpy import array
from scipy.sparse import csr_matrix
from time import time

# keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard

In [None]:
### We will write our own tokenizer function to
### replace punctuation with spaces
### replace numbers with spaces
### lower case all words

In [None]:
df_train.TEXT[0]

In [None]:
import string
PS = PorterStemmer()
WL = WordNetLemmatizer()
def tokenizer_better(text):
    # tokenize the text by replacing punctuation and numbers with spaces and lowercase all words
    
    punc_list = string.punctuation+'0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    text = text.lower().translate(t)
    tokens = word_tokenize(text)
    
    tokens = [PS.stem(word) for word in tokens] ##Stemming
    tokens = [WL.lemmatize(word) for word in tokens] ##Lemmatization  try with pos ='v'
    return tokens

#### We will use the built in CountVectorizer from scikit-learn package. This vectorizer simply counts how many times each word occurs in the note. There is also a TfidfVectorizer which takes into how often words are used across all notes, but for this project let’s use the simpler one (I got similar results with the second one too).

In [None]:

vocabulary_size = 20000

### Only to find Stop-Words:

In [None]:
# Only to find Stop-Words: 


# fit our vectorizer. This will take a while depending on your computer. 
# vect = CountVectorizer(max_features = 3000, tokenizer = tokenizer_better)
# vect.fit(df_train.TEXT.values)
# X_train_tf = vect.transform(df_train.TEXT.values)

#### Finding most frequent words:

In [None]:
## sum_words = X_train_tf.sum(axis=0) 
## words_freq = [(word, sum_words[0, idx]) for word, idx in     vect.vocabulary_.items()]
## words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
## words_freq_plot = words_freq[:30]
## values = [value[0] for value in words_freq_plot]
## freq = [value[1] for value in words_freq_plot]
## fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (6,8), sharex=True)
## plt.barh(range(len(values)),freq, align = 'center')
## plt.yticks(range(len(values)),values)
## plt.title("Top 30 Frequent Words")
## plt.gca().invert_yaxis()
## plt.xlabel('Frequncy')
## plt.tight_layout()
## plt.show()


In [None]:
my_stop_words = ['the','and','to','of','was','with','a','on','in','for','name',
                 'is','patient','s','he','at','as','or','one','she','his','her','am',
                 'were','you','pt','pm','by','be','had','your','this','date',
                 'from','there','an','that','p','are','have','has','h','but','o',
                 'namepattern','which','every','also']

In [None]:
vect = CountVectorizer(max_features = 3000, 
                       tokenizer = tokenizer_better, 
                       stop_words = my_stop_words,
                       ngram_range= (1,3))
vect.fit(df_train.TEXT.values)

In [None]:
X_train_tf = vect.transform(df_train.TEXT.values)
X_valid_tf = vect.transform(df_valid.TEXT.values)

# Response:

y_train = df_train.OUTPUT_LABEL
y_valid = df_valid.OUTPUT_LABEL

In [None]:
vect_DNN = CountVectorizer(max_features = 3000,
                           tokenizer = tokenizer_better,
                           stop_words = my_stop_words,
                           ngram_range= (1,3))
vect_DNN.fit(df_train_DNN.TEXT.values)
X_train_DNN_tf = vect_DNN.transform(df_train_DNN.TEXT.values)

In [None]:
# X_train_DNN_tf = vect_DNN.transform(df_train_DNN.TEXT.values)
X_valid_DNN_tf = vect_DNN.transform(df_valid.TEXT.values)


# Response:

y_train_DNN = df_train_DNN.OUTPUT_LABEL
y_valid_DNN = y_valid

In [None]:
#### Step 3: Build a simple predictive model

In [None]:
# logistic regression

clf=LogisticRegression(C = 0.0001, penalty = 'l2', random_state = 42)
clf.fit(X_train_tf, y_train)

In [None]:
y_valid_preds = clf.predict_proba(X_valid_tf)[:, 1]

In [None]:
fpr, tpr, _ = metrics.roc_curve(np.array(y_valid), y_valid_preds)

In [None]:
auc = metrics.auc(fpr,tpr)
auc

In [None]:
y_train_binary = to_categorical(y_train)
y_valid_binary = to_categorical(y_valid)

In [None]:
X_train_Array = X_train_tf.todense()
X_valid_Array = X_valid_tf.todense()

In [None]:
X_train_DNN_Array = X_train_DNN_tf.todense()
y_train_DNN_binary = to_categorical(y_train_DNN)

In [None]:
model = Sequential()
model.add(Dense(500, input_dim = X_train_tf.shape[1] , activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(1000, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(500, activation = 'relu'))
model.add(Dense(2, activation = 'softmax'))



In [None]:
model.compile(optimizer=Adam(0.00001), loss= 'categorical_crossentropy', metrics=['accuracy'])

In [None]:
tensorboard = TensorBoard(log_dir = 'log/{}'.format(time()))

In [None]:
model.fit(X_train_Array, y_train_binary, epochs=500, batch_size=128, validation_data=(X_valid_Array, y_valid_binary), callbacks= [tensorboard])

In [None]:
y_pred = model.predict_proba(X_valid_Array)
fpr, tpr, _ = metrics.roc_curve(np.array(y_valid), y_pred[:,1])
auc = metrics.auc(fpr,tpr)
auc

### Imbalanced Data:

In [None]:
model.fit(X_train_DNN_Array, y_train_DNN_binary, epochs=500, batch_size=128, class_weight = {0:1, 1:3}, validation_data=(X_valid_Array, y_valid_binary))

In [None]:
y_pred = model.predict_proba(X_valid_Array)
fpr, tpr, _ = metrics.roc_curve(np.array(y_valid), y_pred[:,1])
auc = metrics.auc(fpr,tpr)
auc