In [1]:
from collections import Counter
import numpy as np
import pandas as pd
import pickle

In [2]:
# add path to preprocessed mimic data
mimic_data_dir = './'

# Get data
data, meta, train_ind, valid_ind, test_ind = pd.read_pickle(mimic_data_dir+'mimic_iii_preprocessed_text.pkl')


In [10]:
data

Unnamed: 0,ts_ind,hour,variable,value,TABLE,mean,std
0,8778,467.816667,Text,Admission Date: [**2119-5-4**] D...,noteevents,,
1,15859,28.016667,Text,Admission Date: [**2112-12-8**] ...,noteevents,,
2,36467,155.166667,Text,Admission Date: [**2194-7-18**] ...,noteevents,,
3,20513,52.383333,Text,Admission Date: [**2194-1-7**] D...,noteevents,,
4,2048,73.133333,Text,Admission Date: [**2186-6-7**] Discharge ...,noteevents,,
...,...,...,...,...,...,...,...
74780605,49404,20.400000,MBP,0.152084,chart,79.406058,17.055932
74780606,49404,20.400000,O2 Saturation,-0.639678,chart,96.833354,4.429348
74780607,49404,20.400000,RR,1.473732,chart,19.555516,6.408548
74780608,49404,20.400000,SBP,-0.489288,chart,121.726318,23.966085


In [11]:
meta

Unnamed: 0,ts_ind,HADM_ID,SUBJECT_ID,in_hospital_sepsis
0,0,110404,268,1
1,1,188028,270,0
2,2,173727,271,0
3,3,164716,272,0
4,4,158689,273,0
...,...,...,...,...
49624,41725,143774,94944,0
49625,41726,123750,94950,0
49626,44940,196881,94953,0
49627,41727,118475,94954,0


In [36]:

print(f"Number of Trainpatients: {train_ind.shape[0]}\nNumber of Validationpatients: {valid_ind.shape[0]}\nNumber of Testpatients: {test_ind.shape[0]}")

Number of Trainpatients: 26635
Number of Validationpatients: 6646
Number of Testpatients: 8448


In [3]:
mimic_notext = data.loc[~data.variable.isin(["Text"])]
pickle.dump([mimic_notext, meta, train_ind, valid_ind, test_ind], open('mimic_notext.pkl','wb'))


In [3]:
# get all text data
unprocessed_text = data.loc[data["variable"]=="Text"]

In [41]:
Counter(unprocessed_text["TABLE"])

Counter({'noteevents': 988215})

In [45]:
Counter(unprocessed_text["hour"])

Counter({4.0: 311,
         3.3666666666666667: 310,
         3.15: 298,
         3.5166666666666666: 297,
         3.0833333333333335: 294,
         3.6333333333333333: 293,
         4.083333333333333: 290,
         3.7333333333333334: 288,
         2.433333333333333: 288,
         2.7333333333333334: 288,
         3.8: 287,
         3.3833333333333333: 287,
         3.55: 286,
         6.45: 285,
         3.4833333333333334: 285,
         2.55: 281,
         3.716666666666667: 280,
         2.316666666666667: 279,
         3.9833333333333334: 279,
         2.3833333333333333: 278,
         4.916666666666667: 277,
         6.666666666666667: 277,
         3.466666666666667: 277,
         2.95: 276,
         2.466666666666667: 276,
         3.4: 276,
         3.216666666666667: 275,
         2.8666666666666667: 274,
         4.466666666666667: 274,
         3.45: 274,
         4.25: 274,
         4.233333333333333: 272,
         2.15: 272,
         2.6333333333333333: 272,
         3.5

In [108]:
unprocessed_text["value"].head()

KeyError: 'value'

In [47]:
# text cleaning
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# stopwords: remove mentions of sepsis and some very common phrases
stop = stopwords.words('english')
additional_stop = ['sepsis', "septic", "admission", "discharge", "date", "birth", "sex", "service"]
stop.extend(additional_stop)
import re


def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_num = re.sub('[0-9]+', '', cleantext)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [x for x in tokens if len(x) > 2 if not x in stop]
    return " ".join(filtered_words)




[nltk_data] Downloading package stopwords to /Users/pablo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pablo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# rename value column, as we use it later for processed text
unprocessed_text.rename(columns={"value": "unprocessed_text"}, inplace=True)

In [131]:
# apply preprocessing to text
unprocessed_text["clean_text"] = unprocessed_text['unprocessed_text'].apply(lambda x: preprocess(x)) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unprocessed_text["clean_text"] = unprocessed_text['unprocessed_text'].apply(lambda x: preprocess(x))


In [134]:
unprocessed_text["clean_text"]

0         cardiothoracic allergies amlodipine attending ...
1         medicine allergies sulfonamides attending firs...
2         medicine allergies atorvastatin penicillins co...
3         medicine allergies patient recorded known alle...
4         hospital ward name icu chief complaint black s...
                                ...                        
988210    title chief complaint hour events chest pelvis...
988211    chief complaint hour events fax request form h...
988212    title chief complaint hour events difficult ox...
988213    title chief complaint hour events difficult ox...
988214    title chief complaint abd pain nausea cholecys...
Name: clean_text, Length: 988215, dtype: object

In [132]:
# save as checkpoint
#unprocessed_text.to_pickle("mimic-iii_cleantext.pkl")

In [4]:
unprocessed_text = pd.read_pickle("mimic-iii_cleantext.pkl")

#### sentence bert embeddings
##### https://www.sbert.net/examples/training/distillation/README.html

In [48]:
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import gzip
import csv
import random
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [49]:
sentences = list(unprocessed_text["clean_text"])
# Model for which we apply dimensionality reduction
model = SentenceTransformer("all-MiniLM-L6-v2")
# New size for the embeddings
new_dimension = 50

  return self.fget.__get__(instance, owner)()


In [50]:
random.shuffle(sentences)
# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for 20k random sentences from the mimic-iii dataset
pca_train_sentences = sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

In [51]:
# Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

In [52]:
# We add a dense layer to the model, so that it will produce directly embeddings with the new size
dense = models.Dense(
    in_features=model.get_sentence_embedding_dimension(),
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module("dense", dense)


In [11]:
# Sentences are encoded by calling model.encode()
sentence_embeddings = model.encode(sentences[:5])
# Print the embeddings
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: respiratory care note patient rested increased psv assist control due decreased tidal volumes overnight rsbi left psv tidal volumes around last cxr noted rll consol effusion suctioned small amount thick pale yellow sputum tube cuff pressure bilat decreased bases
Embedding: [-0.03857879 -0.02877028 -0.22183815  0.19591323 -0.09103374 -0.16148004
 -0.25434768  0.07379907  0.05973304 -0.07349437 -0.03239922 -0.31744716
 -0.13007957  0.07044557 -0.04243442 -0.15639265  0.02291434  0.09322543
  0.05804721  0.07837962  0.2580278  -0.06519458  0.07751609  0.18088096
 -0.03801044 -0.02063168 -0.00621178 -0.11165434 -0.08482026  0.09881619
  0.06342077 -0.14559497 -0.02096635  0.00861705  0.01271349 -0.14202681
 -0.02017987 -0.14777979 -0.02171187 -0.06888226  0.04263135 -0.09578017
 -0.09965988  0.05353272 -0.09118355 -0.150098   -0.14980638  0.00250866
  0.03698735  0.04162715]

Sentence: respiratory care note patient well mask bipap ventilator discontinued continues lpm nasal albut

#### Tf-Idf representations

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import classification_report as report
from sklearn.ensemble import RandomForestClassifier


In [6]:
from tqdm import tqdm

unprocessed_text_labels = unprocessed_text.copy()

# get labels
for id in tqdm(np.unique(unprocessed_text_labels["ts_ind"])):
    label = meta["in_hospital_sepsis"].loc[meta["ts_ind"] == id].item()
    unprocessed_text_labels.loc[unprocessed_text_labels["ts_ind"] == id, 'in_hospital_sepsis'] = label



100%|██████████| 47156/47156 [00:34<00:00, 1351.70it/s]


In [17]:
X_train, X_test, y_train, y_test = train_test_split(unprocessed_text_labels.clean_text, unprocessed_text_labels.in_hospital_sepsis, test_size=0.30, random_state=100,stratify=unprocessed_text_labels.in_hospital_sepsis)
print(f"Shape of X_train: {len(X_train)}")
print(f"Shape of X_test: {len(X_test)}")
print("Class Balance in Training Data:", Counter(y_train).items())
print("Class Balance in Test Data:", Counter(y_test).items())

Shape of X_train: 691750
Shape of X_test: 296465
Class Balance in Training Data: dict_items([(1.0, 158259), (0.0, 533491)])
Class Balance in Test Data: dict_items([(0.0, 228639), (1.0, 67826)])


In [19]:
X_train = unprocessed_text_labels["clean_text"].loc[unprocessed_text_labels["ts_ind"].isin(train_ind)]
X_test = unprocessed_text_labels["clean_text"].loc[unprocessed_text_labels["ts_ind"].isin(test_ind)]

y_train = unprocessed_text_labels["in_hospital_sepsis"].loc[unprocessed_text_labels["ts_ind"].isin(train_ind)]
y_test = unprocessed_text_labels["in_hospital_sepsis"].loc[unprocessed_text_labels["ts_ind"].isin(test_ind)]
print(f"Shape of X_train: {len(X_train)}")
print(f"Shape of X_test: {len(X_test)}")
print("Class Balance in Training Data:", Counter(y_train).items())
print("Class Balance in Test Data:", Counter(y_test).items())

Shape of X_train: 612273
Shape of X_test: 189036
Class Balance in Training Data: dict_items([(0.0, 467586), (1.0, 144687)])
Class Balance in Test Data: dict_items([(0.0, 146596), (1.0, 42440)])


In [37]:
clf = Pipeline([
    ("vct", TfidfVectorizer(analyzer="word", stop_words=stop,  max_features=50, max_df=0.7, min_df=0.02, strip_accents='ascii')),
    ("sparse2dense", FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
    ("forest", RandomForestClassifier(random_state=200, n_jobs=-1, verbose=1))
])

In [38]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(report(y_test, y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   45.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s


              precision    recall  f1-score   support

         0.0       0.79      0.97      0.87    146596
         1.0       0.47      0.09      0.15     42440

    accuracy                           0.77    189036
   macro avg       0.63      0.53      0.51    189036
weighted avg       0.71      0.77      0.71    189036



[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.1s finished


In [39]:
importances = clf.named_steps["forest"].feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.named_steps["forest"]], axis=0)
forest_importances = pd.DataFrame(importances, index=list(clf.named_steps["vct"].vocabulary_.keys()), columns=["importance_score"])
df = pd.pivot_table(forest_importances, index=forest_importances.index,aggfunc='sum')
# look at the most important vocabs
df.sort_values(by=["importance_score"], ascending=False, inplace=True)
df[:50]

Unnamed: 0,importance_score
hct,0.034433
fluid,0.029231
total,0.028685
clip,0.028477
icu,0.026879
cont,0.025905
status,0.025435
meq,0.025433
plan,0.025317
examination,0.024791


In [40]:
# compute embeddings
tfidf_embeddings = pd.DataFrame(clf.named_steps["vct"].transform(unprocessed_text_labels["clean_text"]).toarray(), index=unprocessed_text_labels.index).add_prefix('tfidf:')
tfidf_embeddings

Unnamed: 0,tfidf:0,tfidf:1,tfidf:2,tfidf:3,tfidf:4,tfidf:5,tfidf:6,tfidf:7,tfidf:8,tfidf:9,...,tfidf:40,tfidf:41,tfidf:42,tfidf:43,tfidf:44,tfidf:45,tfidf:46,tfidf:47,tfidf:48,tfidf:49
0,0.0,0.0,0.000000,0.000000,0.000000,0.050759,0.160224,0.055658,0.0,0.130801,...,0.109636,0.000000,0.101137,0.000000,0.000000,0.050076,0.000000,0.067118,0.109539,0.000000
1,0.0,0.0,0.000000,0.000000,0.746235,0.045713,0.240493,0.100250,0.0,0.000000,...,0.098737,0.000000,0.000000,0.000000,0.051481,0.225488,0.000000,0.000000,0.000000,0.000000
2,0.0,0.0,0.111707,0.000000,0.145810,0.062524,0.131575,0.034280,0.0,0.000000,...,0.000000,0.000000,0.218016,0.038790,0.140828,0.123366,0.000000,0.082675,0.000000,0.339510
3,0.0,0.0,0.190648,0.000000,0.074655,0.320126,0.101050,0.035103,0.0,0.000000,...,0.103718,0.114921,0.318928,0.158885,0.036052,0.094746,0.000000,0.084660,0.000000,0.104298
4,0.0,0.0,0.025366,0.000000,0.173828,0.085187,0.022408,0.046704,0.0,0.000000,...,0.000000,0.000000,0.021217,0.026425,0.071952,0.168081,0.000000,0.028160,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988210,0.0,0.0,0.061828,0.050984,0.121057,0.051910,0.163857,0.000000,0.0,0.000000,...,0.056061,0.000000,0.155146,0.000000,0.000000,0.102422,0.184231,0.137280,0.000000,0.112749
988211,0.0,0.0,0.228047,0.037610,0.044650,0.076585,0.000000,0.041989,0.0,0.000000,...,0.041355,0.000000,0.076299,0.000000,0.129374,0.037777,0.181204,0.101268,0.041318,0.083173
988212,0.0,0.0,0.440015,0.072568,0.086152,0.073885,0.000000,0.000000,0.0,0.000000,...,0.159588,0.000000,0.147217,0.000000,0.000000,0.072891,0.000000,0.195396,0.079723,0.080240
988213,0.0,0.0,0.113925,0.046972,0.111530,0.047824,0.000000,0.052441,0.0,0.000000,...,0.206597,0.000000,0.000000,0.059341,0.000000,0.141543,0.113155,0.126476,0.051603,0.103876


In [45]:

# grab meta data
meta_data = unprocessed_text[["ts_ind","hour"]]

# split sentence embeddings and concat with meta data
meta_tfidf_embeddings = pd.concat([meta_data, tfidf_embeddings], axis=1)

# get meta data and sentence embeddings in strats format
meta_and_tfidf = (meta_tfidf_embeddings.melt(id_vars=['hour', 'ts_ind'], var_name='variable',value_name='value', ignore_index=False)
       .sort_values(['ts_ind', 'hour'])
       .reset_index(drop=True))

# combine mimic data and new sentence embeddings
mimic_and_tfidf = pd.concat([data[["ts_ind", "hour", "variable", "value"]], meta_and_tfidf], axis=0)

# remove Text variable, add sepsis label and sort by ts_ind and hour
mimic_and_tfidf = mimic_and_tfidf.loc[~mimic_and_tfidf.variable.isin(["Text"])]
#mimic_and_tfidf["in_hospital_sepsis"] = unprocessed_text_labels["in_hospital_sepsis"].values
mimic_and_tfidf.sort_values(by=["ts_ind", "hour"])

# compute new mean and std including sbert, then merge
mean_stds_tfidf = mimic_and_tfidf.groupby('variable').agg({'value':['mean', 'std']})
mean_stds_tfidf.columns = [col[1] for col in mean_stds_tfidf.columns]
mimic_and_tfidf = mimic_and_tfidf.merge(mean_stds_tfidf.reset_index(), on='variable', how='left')

# save
pickle.dump([mimic_and_tfidf, meta, train_ind, valid_ind, test_ind], open('mimic_and_tfidf_for_thesis.pkl','wb'))

mimic_and_tfidf


Unnamed: 0,ts_ind,hour,variable,value,mean,std
0,0,0.000000,Age,66.0,74.449104,54.324803
1,0,0.000000,Gender,1.0,0.435381,0.495812
2,0,0.033333,DBP,-0.571963,-0.0,1.000000
3,0,0.033333,GCS_eye,0.679176,-0.0,1.000000
4,0,0.033333,GCS_motor,0.515222,-0.0,1.000000
...,...,...,...,...,...,...
123203140,49404,14.716667,tfidf:45,0.0,0.053536,0.095498
123203141,49404,14.716667,tfidf:46,0.0,0.054494,0.127372
123203142,49404,14.716667,tfidf:47,0.0,0.036949,0.087733
123203143,49404,14.716667,tfidf:48,0.0,0.06657,0.136188


In [53]:
# compute embeddings
sentence_embeddings = model.encode(sentences)

# grab meta data
meta_data = unprocessed_text[["ts_ind","hour"]]

# split sentence embeddings and concat with meta data
sentence_embeddings_df = pd.DataFrame(sentence_embeddings).add_prefix('sBert:')
meta_sentence_embeddings = pd.concat([meta_data, sentence_embeddings_df], axis=1)

# get meta data and sentence embeddings in strats format
meta_and_sbert = (meta_sentence_embeddings.melt(id_vars=['hour', 'ts_ind'], var_name='variable',value_name='value', ignore_index=False)
       .sort_values(['ts_ind', 'hour'])
       .reset_index(drop=True))

# combine mimic data and new sentence embeddings
mimic_and_sbert = pd.concat([data[["ts_ind", "hour", "variable", "value"]], meta_and_sbert], axis=0)

# remove Text variable, add sepsis label and sort by ts_ind and hour
mimic_and_sbert = mimic_and_sbert.loc[~mimic_and_sbert.variable.isin(["Text"])]
#mimic_and_sbert["in_hospital_sepsis"] = unprocessed_text_labels["in_hospital_sepsis"].values
mimic_and_sbert.sort_values(by=["ts_ind", "hour"])

# compute new mean and std including sbert, then merge
mean_stds = mimic_and_sbert.groupby('variable').agg({'value':['mean', 'std']})
mean_stds.columns = [col[1] for col in mean_stds.columns]
mimic_and_sbert = mimic_and_sbert.merge(mean_stds.reset_index(), on='variable', how='left')

# save
pickle.dump([mimic_and_sbert, meta, train_ind, valid_ind, test_ind], open('mimic_and_sbert_for_thesis.pkl','wb'))

mimic_and_sbert


Unnamed: 0,ts_ind,hour,variable,value,mean,std
0,0,0.000000,Age,66.0,74.449104,54.324803
1,0,0.000000,Gender,1.0,0.435381,0.495812
2,0,0.033333,DBP,-0.571963,-0.0,1.000000
3,0,0.033333,GCS_eye,0.679176,-0.0,1.000000
4,0,0.033333,GCS_motor,0.515222,-0.0,1.000000
...,...,...,...,...,...,...
123203140,49404,14.716667,sBert:45,-0.068908,-0.01839,0.047982
123203141,49404,14.716667,sBert:46,-0.071315,0.0006,0.048094
123203142,49404,14.716667,sBert:47,-0.042605,-0.026331,0.047849
123203143,49404,14.716667,sBert:48,-0.044426,-0.035598,0.047058
