In [47]:
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from sklearn import preprocessing

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
import pandas as pd

df_add = pd.read_csv('/content/drive/MyDrive/Proj Data/ADMISSIONS.csv') # 58976 rows of data
df_notes = pd.read_csv('/content/drive/MyDrive/Proj Data/NOTEEVENTS.csv.gz',dtype='unicode') # 2083180 rows of data
df_codes = pd.read_csv('/content/drive/MyDrive/Proj Data/DIAGNOSES_ICD.csv') # 651047 rows of data

# diagnosis dataframe


In [48]:
# including only rows of data with heart failure ICD-9 codes
hf_codes = ['39891', '40201', '40211', '40291', '40401', '40403', '40411', '40413', '40491', '40493', '4280', '4281', '42820','42821', '42822', '42823', '42830', '42831', '42832', '42833', '42840', '42841', '42842', '42843','4289']
df_codes = df_codes.loc[df_codes.ICD9_CODE.isin(hf_codes)] # 651047 -> 21274 rows of data

# list of subject_ids asociated with hf_codes
hf_pid_list = df_codes["SUBJECT_ID"].tolist() 

# admissions dataframe

In [49]:
# change to standard datetime format
df_add.ADMITTIME = pd.to_datetime(df_add.ADMITTIME)
df_add.DISCHTIME = pd.to_datetime(df_add.DISCHTIME)

# remove elective admissions- we only want urgent and emergency
df_adm = df_add.loc[df_add.ADMISSION_TYPE != 'ELECTIVE']

# sort by subject id and admittime
df_adm = df_add.sort_values(['SUBJECT_ID','ADMITTIME'])
df_adm = df_adm.reset_index(drop = True)

# add a column for next admit_time (readmissions) and readmission id
df_adm['NEXT_ADMITTIME'] = df_adm.groupby('SUBJECT_ID').ADMITTIME.shift(-1)
df_adm['NEXT_HADM_ID'] = df_adm.groupby('SUBJECT_ID').HADM_ID.shift(-1)
df_adm = df_adm.sort_values(['SUBJECT_ID','ADMITTIME'])

# Back fill. This will take a little while.
df_adm[['NEXT_ADMITTIME','NEXT_HADM_ID']] = df_adm.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_HADM_ID']].fillna(method = 'bfill')
df_adm['DAYS_TIL_NEXT_ADMIT'] = (df_adm.NEXT_ADMITTIME - df_adm.DISCHTIME).dt.total_seconds()/(24*60*60)

# notes dataframe

In [50]:
# Choosing only discharge summary clinical notes
df_notes_dis_sum = df_notes.loc[df_notes.CATEGORY == 'Discharge summary'] # 2083180 -> 59652; 

# changing type to ints so it aligns with the datatype of the other dataframes
df_notes_dis_sum['SUBJECT_ID'] = df_notes_dis_sum['SUBJECT_ID'].astype(int)
df_notes_dis_sum['HADM_ID'] = df_notes_dis_sum['HADM_ID'].astype(int)

# selecting the last discharge summary for each admission, if there are multiple
df_notes_dis_sum_last = (df_notes_dis_sum.groupby(['SUBJECT_ID','HADM_ID']).nth(-1)).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notes_dis_sum['SUBJECT_ID'] = df_notes_dis_sum['SUBJECT_ID'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notes_dis_sum['HADM_ID'] = df_notes_dis_sum['HADM_ID'].astype(int)


# merging notes, admissions, and diagnoses df

In [51]:
# first selecting admissions for subjects that have hf
df_hf_adm = df_adm.loc[df_adm.SUBJECT_ID.isin(hf_pid_list)] # now 58976 -> 51113 -> 45321 -> 14746 rows of data

# concatenating ICD-9 codes for patient admissions with multiple hf diagnoses
df_subj_concat_icd_codes = df_codes[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']].copy()
df_subj_concat_icd_codes = df_subj_concat_icd_codes.groupby(['SUBJECT_ID', 'HADM_ID'])['ICD9_CODE'].agg(' '.join).reset_index() # # 651047 -> 21274 -> 14040 rows of data

# merge the admissions and icd9-codes tables to get admissions involving hf diagnoses
df_hf_admissions = pd.merge(df_adm[['SUBJECT_ID','HADM_ID','ADMITTIME','DISCHTIME','ADMISSION_TYPE','DEATHTIME', 'NEXT_ADMITTIME', 'NEXT_HADM_ID', 'DAYS_TIL_NEXT_ADMIT']],
                        df_subj_concat_icd_codes, 
                        on = ['SUBJECT_ID', 'HADM_ID'],
                        how = 'inner')
# merge the admissions+icd-9 codes table with the discharge sumaries
df_hf_adm_notes = pd.merge(df_hf_admissions[['SUBJECT_ID','HADM_ID', 'NEXT_ADMITTIME', 'NEXT_HADM_ID', 'DAYS_TIL_NEXT_ADMIT']], df_notes_dis_sum_last, 
                        on = ['SUBJECT_ID', 'HADM_ID'],
                        how = 'inner')



In [52]:
# finally, create a labels for 30-day readmission
df_hf_adm_notes['OUTPUT_LABEL'] = (df_hf_adm_notes.DAYS_TIL_NEXT_ADMIT < 30).astype('int') # consists of ____ 30-day readmission rows, and ___ without 30-day readmission rows

# drop unnecessary columns
df_hf_adm_notes.drop(columns=['NEXT_ADMITTIME', 'NEXT_HADM_ID', 'ROW_ID', 'CHARTDATE', 'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'DAYS_TIL_NEXT_ADMIT'] )

df_hf_adm_notes['id'] = df_hf_adm_notes.index

# Preprocessing

In [54]:
# # shuffle input
df_adm_notes_merged = df_hf_adm_notes.sample(n=len(df_hf_adm_notes), random_state=42)
df_adm_notes_merged = df_adm_notes_merged.reset_index(drop=True)

# finalized dataset
df_final = df_adm_notes_merged.copy(deep=False)

Preprocessing text to remove any HTML tags, non-word characters, numbers and convert all text to lowercase and tokenzies notes 

In [122]:
import re

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', ' ', text.lower()) 
    text = re.sub(" \d+", " ", text)
    return text

#replace notes column with processed text
df_final['text2'] = df_final['TEXT'].apply(preprocessor)

# Create tokens
token_review=[]
for i in range(df_final['text2'].shape[0]):
    review = df_final['text2'][i]
    token_review.append([i for i in review.split()])

#list of lists, were each list contains the tokens for notes in each row
len(token_review)

13755

# Word2Vec and CNN Model

Train word2Vec model on tokenzied notes from dataset references: https://github.com/lsy3/clinical-notes-diagnosis-dl-nlp, https://www.kaggle.com/code/jagannathrk/word2vec-cnn-text-classification

In [57]:
from gensim.models import Word2Vec
from gensim import utils
from time import time
import random

size = 300  #change to 100 and 600 to generate vectors with those dimensions

#instantiate our  model
model_w2v = Word2Vec(min_count=10, window=5, vector_size=size, sample=1e-3, negative=5, workers=4, sg=0)

#build vocab over all reviews
model_w2v.build_vocab(token_review)

#We pass through the data set multiple times, shuffling the training reviews each time to improve accuracy.
Idx=list(range(len(token_review)))

t0 = time()
perm_sentences = [token_review[i] for i in Idx]
model_w2v.train(perm_sentences,total_examples=model_w2v.corpus_count,epochs=5)
elapsed=time() - t0
print("Time taken for Word2vec training: ", elapsed, "seconds.")

Time taken for Word2vec training:  90.208571434021 seconds.


In [120]:
df_final['text2']

0        admission date    discharge date    date of bi...
1        admission date    discharge date    date of bi...
2        admission date    discharge date    date of bi...
3        admission date    discharge date    date of bi...
4        admission date    discharge date    date of bi...
                               ...                        
13750    admission date    discharge date    date of bi...
13751    admission date    discharge date    date of bi...
13752    admission date    discharge date    date of bi...
13753    admission date    discharge date    date of bi...
13754    admission date    discharge date    date of bi...
Name: text2, Length: 13755, dtype: object

In [60]:
#Create input sequeces for notes and pad sequences 
words=list(model_w2v.wv.index_to_key)
# len(words)
len_vocab = len(words)
token = Tokenizer(len_vocab)
token.fit_on_texts(df_final['text2'])
text = token.texts_to_sequences(df_final['text2'])
text = pad_sequences(text, 75)
print(text[:2])

[[  152   920   713    48    66    29    48   507   559   277   143    47
     58   508   924   138    14  2514  1877    55    47   988     3  1352
      4   446   924   476   837   587  2771  1103   491   432  6510    49
     88   476   837   587   521   587   122   419   117   109    49    88
      6    94    33    22   101    77    20    10  2771  1103    20     6
     94    33    22   101   156   103    51   494    22   800   156   103
   1161   425    42]
 [  331     1    15     5   100    42     1 11988    22   244   712     8
     20     1   137  2531    50  3760     1   289    94    33    22   101
     33    22   101  2346    66     8    43   793     9    17   208     9
     62   638     9    17  2145     9    17   135     9   141   148   991
    610    17   141  2123   846     9  7162   172    52  1756   212   141
  11466   172    52    28   309   712    28   220   712    28   206    28
    143   425    42]]


In [61]:
#Create input sequeces for output label
le = preprocessing.LabelEncoder()
y = le.fit_transform(df_final['OUTPUT_LABEL'])
y = to_categorical(y)
y[:2]

array([[0., 1.],
       [1., 0.]], dtype=float32)

In [62]:
#split dataset to test and train sets
x_train, x_test, y_train, y_test = train_test_split(np.array(text), y, test_size=0.2, stratify=y)

In [121]:
len(y_train) #11004
len(y_test) #2751
len(x_train) #11004
len(x_test) #2751

2751

In [67]:
from tensorflow.keras.layers import Embedding

def gensim_to_keras_embedding(model, train_embeddings=False):
    """Get a Keras 'Embedding' layer with weights set from Word2Vec model's learned word embeddings.

    Parameters
    ----------
    train_embeddings : bool
        If False, the returned weights are frozen and stopped from being updated.
        If True, the weights can / will be further updated in Keras.

    Returns
    -------
    `keras.layers.Embedding`
        Embedding layer, to be used as input to deeper network layers.

    """
    keyed_vectors = model  # structure holding the result of training
    weights = keyed_vectors.vectors  # vectors themselves, a 2D numpy array    
    index_to_key = keyed_vectors.index_to_key  # which row in `weights` corresponds to which word?

    layer = Embedding(
        input_dim=weights.shape[0],
        output_dim=weights.shape[1],
        weights=[weights],
        trainable=train_embeddings,
    )
    return layer

In [112]:
keras_model = Sequential()
keras_model.add(gensim_to_keras_embedding(model_w2v.wv,True))
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(filters=50, kernel_size=1, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(filters=100, kernel_size=2, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(filters=200, kernel_size=3, activation='relu', padding='same', strides=1))
keras_model.add(GlobalMaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Dense(2))
keras_model.add(Activation('softmax'))
keras_model.compile(loss='binary_crossentropy', metrics=['acc'], optimizer='adam')
keras_model.fit(x_train, y_train, batch_size=16, epochs=3, validation_data=(x_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x20091f4bfd0>

# Evalution 

In [115]:
Y_pred = np.argmax(keras_model.predict(x_test),axis=1)
y_true = []
for i in y_test:
    if(i[0] == 1):
        y_true.append(0)
    else:
        y_true.append(1)




In [118]:
cnn_report1=classification_report(y_true,Y_pred,output_dict=True)
df_cnn1=pd.DataFrame(cnn_report1).transpose()
df_cnn1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.916758,1.0,0.956571,2522.0
1,0.0,0.0,0.0,229.0
accuracy,0.916758,0.916758,0.916758,0.916758
macro avg,0.458379,0.5,0.478286,2751.0
weighted avg,0.840444,0.916758,0.876944,2751.0


# Word2Vec (clinical-embeddings) + CNN

 reference: https://pubmed.ncbi.nlm.nih.gov/34920127/

In [81]:
from gensim.models import FastText, Word2Vec, KeyedVectors # KeyedVectors are used to load the GloVe models

# Load the model
model = Word2Vec.load('w2v_oa_cr_100d.bin')

In [106]:
keras_model = Sequential()
keras_model.add(gensim_to_keras_embedding(model.wv,True))
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(filters=50, kernel_size=1, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(filters=100, kernel_size=2, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(filters=200, kernel_size=3, activation='relu', padding='same', strides=1))
keras_model.add(GlobalMaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Dense(2))
keras_model.add(Activation('softmax'))
keras_model.compile(loss='binary_crossentropy', metrics=['acc'], optimizer='adam')
keras_model.fit(x_train, y_train, batch_size=16, epochs=3, validation_data=(x_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x200b27db580>

# Evalution

In [113]:
Y_pred = np.argmax(keras_model.predict(x_test),axis=1)
y_true = []
for i in y_test:
    if(i[0] == 1):
        y_true.append(0)
    else:
        y_true.append(1)




In [114]:
cnn_report2 =classification_report(y_true,Y_pred,output_dict=True)
df_cnn2=pd.DataFrame(cnn_report2).transpose()
df_cnn2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.916758,1.0,0.956571,2522.0
1,0.0,0.0,0.0,229.0
accuracy,0.916758,0.916758,0.916758,0.916758
macro avg,0.458379,0.5,0.478286,2751.0
weighted avg,0.840444,0.916758,0.876944,2751.0


# Tf-idf and Random Forest

In [123]:
# https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794
# https://www.kaggle.com/code/onadegibert/sentiment-analysis-with-tfidf-and-random-forest
## for processing
import re
import nltk
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

# split into test and training df
dtf_train, dtf_test = model_selection.train_test_split(df_final, test_size=0.2)

# ## get target values
y_train = dtf_train["OUTPUT_LABEL"].values
y_test = dtf_test["OUTPUT_LABEL"].values
X_train = dtf_train["text2"]
X_test = dtf_test["text2"]

In [125]:
print((df_final.OUTPUT_LABEL.sum()/len(df_final))*100)

print(df_final.OUTPUT_LABEL.sum())

print(df_final.OUTPUT_LABEL.sum())


8.316975645219921
1144
1144


In [126]:
# transform training data into tf-idf vector - takes 1 minute to run
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))
corpus = X_train # make sure using the right train data
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [127]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

# Evaluation

In [128]:
x_test = vectorizer.transform(X_test)

In [129]:
predicted = rf.predict(x_test)
ones = [x for x in predicted if x ==1]
len(ones)
ones = [x for x in y_test if x ==1]
len(ones)

98

In [130]:
from sklearn.metrics import classification_report
report = classification_report(y_test, predicted, output_dict=True)
df = pd.DataFrame(report).transpose()
df.to_latex()
df

  df.to_latex()


Unnamed: 0,precision,recall,f1-score,support
0,0.952756,0.568075,0.711765,1278.0
1,0.100977,0.632653,0.174157,98.0
accuracy,0.572674,0.572674,0.572674,0.572674
macro avg,0.526867,0.600364,0.442961,1376.0
weighted avg,0.892091,0.572674,0.673476,1376.0


In [131]:
from sklearn.metrics import precision_recall_fscore_support as score

precision,recall,fscore,support=score(y_test, predicted,average='macro')
print(precision)

0.5268665521044397


In [132]:
def get_classification_report(y_test, y_pred):
    '''Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format'''
    from sklearn import metrics
    report = metrics.classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report

get_classification_report(y_test, predicted)

Unnamed: 0,precision,recall,f1-score,support
0,0.952756,0.568075,0.711765,1278.0
weighted avg,0.892091,0.572674,0.673476,1376.0
accuracy,0.572674,0.572674,0.572674,0.572674
macro avg,0.526867,0.600364,0.442961,1376.0
1,0.100977,0.632653,0.174157,98.0
