In [1]:
import argparse
import pyprind

import numpy as np
import pandas as pd

from scipy.spatial.distance import cdist
from scipy.interpolate import interp1d
from scipy import stats

from fancyimpute import KNN

## Datapreprocessing to combine cohorts and add text below, text preprocessing afterwards

In [None]:
print('Loading processed files created from database using "preprocess.py"')
text           = pd.read_csv('processed_files/note_events_epochdates.csv',           sep = '|')
demog         = pd.read_csv('processed_files/demog.csv',         sep = '|')

In [4]:
# Initial data manipulations
demog['morta_90'].fillna(0, inplace=True)
demog['morta_hosp'].fillna(0, inplace=True)
demog['elixhauser'].fillna(0, inplace=True)
text['icustay_id'] = np.nan
# Keep only the first icustay of an admission (CRITICAL FIX FROM MATLAB CODE)
demog = demog.drop_duplicates(subset=['admittime','dischtime'],keep='first')

# Get list of all icustayids since that's what we iterate over through the rest of this script
icustayidlist = list(demog.icustay_id.values)

# Calculate the accurate readmission using the demographics data 
# (the SQL code from Komorowski, et al incorrectly cumulatively counts how many icu stays each patient has (preprocess.py:line 414) 
# and does a coarse boolean check if this number is >1). A readmission is now correctly defined by 
# whether the patient has returned to the ICU within 30 days of being previously discharged.

# This is done by grouping all the discharge times for each patient and using them in a comparison 
# with the current row's admission time to see if it's within the 30 day cutoff
subj_dischtime_list = demog.sort_values(by='admittime').groupby('subject_id').apply(lambda df: np.unique(df.dischtime.values)) # Create list of discharge times for each patient (output is a dict keyed by 'subject_id')

def determine_readmission(s, dischtimes=subj_dischtime_list,cutoff=3600*24*30):
    '''
    determine_readmisson evaluates each row of the provided dataframe (designed to operate on the demographics table)
    and chooses whether the current admission occurs within the cutoff of the previous discharge 
    (here, cutoff=30 days is the default)
    '''
    subject, admission, discharge = s[['subject_id','admittime','dischtime']]
    
    # Check for readmission
    subj_stay_idx = np.where(dischtimes[subject]==discharge)[0][0]
    s['re_admission'] = 0
    if subj_stay_idx > 0:
        if (admission - dischtimes[subject][subj_stay_idx-1]) <= cutoff:
            s['re_admission'] = 1
            
    return s
# Apply the above function to determine the appropriate readmissions
demog = demog.apply(determine_readmission,axis=1)

In [5]:
# Fill-in missing ICUSTAY IDs in text
print('Filling-in missing ICUSTAY IDs in text')
bar = pyprind.ProgBar(len(text.index.tolist()))
# Raw Translation
for i in text.index.tolist():
    bar.update()
    if np.isnan(text.loc[i, 'icustay_id']):
        o         = text.loc[i, 'charttime'] 
        subjectid = text.loc[i, 'subject_id']
        hadmid    = text.loc[i, 'hadm_id']
        ii        = demog.index[demog['subject_id'] == subjectid].tolist()
        jj        = demog.index[(demog['subject_id'] == subjectid) & (demog['hadm_id'] == hadmid)].tolist()
        for j in range(len(ii)):
            if (o >= demog.loc[ii[j], 'intime'] - 48*3600) and (o <= demog.loc[ii[j], 'outtime'] + 48*3600):
                text.loc[i,'icustay_id'] = demog.loc[ii[j], 'icustay_id']
            elif len(ii)==1:   # If we cant confirm from admission and discharge time but there is only 1 admission: it's the one!!
                text.loc[i,'icustay_id'] = demog.loc[ii[j], 'icustay_id']

print('Filling-in missing ICUSTAY IDs in bacterio - 2')                
bar = pyprind.ProgBar(len(text.index.tolist()))
for i in text.index.tolist():
    bar.update()
    if np.isnan(text.loc[i, 'icustay_id']):
        subjectid = text.loc[i, 'subject_id']
        hadmid    = text.loc[i, 'hadm_id']
        jj        = demog.index[(demog['subject_id'] == subjectid) & (demog['hadm_id'] == hadmid)].tolist()
        if len(jj) == 1:
            text.loc[i,'icustay_id'] = demog.loc[jj[0], 'icustay_id']

text.to_csv('noteevents_icustays.csv', index=False)

Filling-in missing ICUSTAY IDs in text


0% [##############################] 100% | ETA: 00:00:00

Filling-in missing ICUSTAY IDs in bacterio - 2



Total time elapsed: 01:17:53
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:07:36


In [6]:
culture       = pd.read_csv('processed_files/culture.csv',       sep = '|')
microbio      = pd.read_csv('processed_files/microbio.csv',      sep = '|')
microbio['charttime'] = microbio['charttime'].fillna(microbio['chartdate'])
del microbio['chartdate']
bacterio = pd.concat([microbio, culture], sort=False, ignore_index=True)

In [4]:
#noteevents           = pd.read_csv('noteevents_icustays.csv')
noteevents           = pd.read_csv('noteevents_full_icustays.csv')
sepsis_events =  pd.read_csv('sepsis_final_data_withTimes_1hr_bucket.csv')
no_sepsis_events = pd.read_csv('no_sepsis_final_data_withTimes_1hr_bucket.csv')

In [15]:
sepsis_ids = sorted(np.unique(sepsis_events["m:icustayid"].values))
no_sepsis_ids = sorted(np.unique(no_sepsis_events["m:icustayid"].values))
text_ids = sorted(np.unique(noteevents["icustay_id"].values))
set(sepsis_ids) & set(no_sepsis_ids)

set()

In [6]:
noteevents.isna().sum()

subject_id           0
hadm_id         108055
category             0
description          0
iserror        1956349
text                 0
charttime       266783
chartdate            0
storetime       713862
icustay_id           0
dtype: int64

In [7]:
print("length of ","text_ids:",len(text_ids),"sepsis_ids:", len(sepsis_ids),"no_sepsis_ids:", len(no_sepsis_ids))
print("matches between text and no_sepsis:",len(set(text_ids) & set(no_sepsis_ids)))
print("matches between text and sepsis:",len(set(text_ids) & set(sepsis_ids)))
print("matches between sepsis and no_sepsis:",len(set(sepsis_ids) & set(no_sepsis_ids)))

length of  text_ids: 57402 sepsis_ids: 18986 no_sepsis_ids: 24402
matches between text and no_sepsis: 24366
matches between text and sepsis: 18911
matches between sepsis and no_sepsis: 0


In [7]:
noteevents.dropna(subset="icustay_id", inplace=True)

In [8]:
noteevents.to_csv('noteevents_full_icustays.csv', index=False)

In [13]:
set(no_sepsis_events.columns) & set(sepsis_events.columns)

{'a:action',
 'm:charttime',
 'm:icustayid',
 'm:presumed_onset',
 'o:Arterial_BE',
 'o:Arterial_lactate',
 'o:Arterial_pH',
 'o:BUN',
 'o:Calcium',
 'o:Chloride',
 'o:Creatinine',
 'o:DiaBP',
 'o:FiO2_1',
 'o:GCS',
 'o:Glucose',
 'o:HCO3',
 'o:HR',
 'o:Hb',
 'o:INR',
 'o:Magnesium',
 'o:MeanBP',
 'o:PT',
 'o:PTT',
 'o:PaO2_FiO2',
 'o:Platelets_count',
 'o:Potassium',
 'o:RR',
 'o:SGOT',
 'o:SGPT',
 'o:SIRS',
 'o:SOFA',
 'o:Shock_Index',
 'o:Sodium',
 'o:SpO2',
 'o:SysBP',
 'o:Temp_C',
 'o:Total_bili',
 'o:WBC_count',
 'o:Weight_kg',
 'o:age',
 'o:cumulated_balance',
 'o:gender',
 'o:input_4hourly',
 'o:input_total',
 'o:max_dose_vaso',
 'o:mechvent',
 'o:output_4hourly',
 'o:output_total',
 'o:paCO2',
 'o:paO2',
 'o:re_admission',
 'r:reward',
 'step',
 'traj'}

In [9]:
print("length of ","text_ids:",len(text_ids),"sepsis_ids:", len(sepsis_ids),"no_sepsis_ids:", len(no_sepsis_ids))
print("matches between text and no_sepsis:",len(set(text_ids) & set(no_sepsis_ids)))
print("matches between text and sepsis:",len(set(text_ids) & set(sepsis_ids)))
print("matches between sepsis and no_sepsis:",len(set(sepsis_ids) & set(no_sepsis_ids)))

length of  text_ids: 57402 sepsis_ids: 18986 no_sepsis_ids: 24402
matches between text and no_sepsis: 24366
matches between text and sepsis: 18911
matches between sepsis and no_sepsis: 0


In [2]:
# lists of icustayids that we want to fill with text
sepsis_text_match = set(text_ids) & set(sepsis_ids)
no_sepsis_text_match = set(text_ids) & set(no_sepsis_ids)

NameError: name 'text_ids' is not defined

In [2]:
noteevents = pd.read_csv("noteevents_full_icustays.csv")
noteevents.rename({"icustay_id":"m:icustayid"}, axis="columns", inplace=True)

In [5]:
temp = pd.concat([sepsis_events, no_sepsis_events], ignore_index=True, sort=True)

In [18]:
print(len(np.unique(temp["m:icustayid"])))
print(len(sepsis_ids+no_sepsis_ids))

43388
43388


In [19]:
temp.to_csv("combined_patients_no_text")

In [3]:
from tqdm import tqdm
tqdm.pandas()

In [4]:
import math 
def fill_in_text(df, text_df, ids):
    return_df = df.copy()
    return_df["text"] = np.nan
    for id in tqdm(ids):
        # get slices per icustayid
        tempdf = df[df["m:icustayid"] == id]
        temp_text_df = text_df[text_df["m:icustayid"] == id]
        times = np.unique(tempdf["m:charttime"])
        #print("anzahl der zeitschritte", len(times), times[0], times[-1], times)
        for time in times:
            text = " ".join(list(temp_text_df["text"].loc[temp_text_df["chartdate"] <= time]))
            #print(text)
            return_df.loc[(return_df["m:icustayid"]==id) & (return_df["m:charttime"]==time),"text"] = text
            
    return return_df    
            
            #return_df["text"][return_df["m:icustayid"] == id] & [return_df["charttime"] <= time]




In [153]:
def fill_in_text2(df, text_df, ids):
    return_df = df.copy()
    return_df["text"] = np.nan
    
    # Merge the two DataFrames on "m:icustayid"
    merged_df = pd.merge(return_df, text_df, how="inner", on="m:icustayid", suffixes=("", "_text"))
    
    for id in ids:
        tempdf = merged_df[merged_df["m:icustayid"] == id]
        times = np.unique(tempdf["m:charttime"])
        
        for time in times:
            print(tempdf["chartdate"], time)
            text = " ".join(tempdf.loc[(tempdf["chartdate"] <= time), "text_text"])
            
            # Use boolean indexing to update the "text" column
            return_df.loc[(return_df["m:icustayid"]==id) & (return_df["m:charttime"]==time), "text"] = text
    
    return return_df

In [5]:
patients = pd.read_csv("combined_patients_no_text.csv")

In [9]:
ids = np.unique(patients["m:icustayid"])
ids = ids[:21694]
test = fill_in_text(df=patients, text_df=noteevents, ids=ids)
test.to_csv("combined_patients_text_pt1.csv")

100%|██████████| 21694/21694 [1:54:23<00:00,  3.16it/s]  


In [6]:
ids = np.unique(patients["m:icustayid"])
ids = ids[21694:]
test = fill_in_text(df=patients, text_df=noteevents, ids=ids)
test.to_csv("combined_patients_text_pt2.csv")

100%|██████████| 21694/21694 [1:51:26<00:00,  3.24it/s]  


In [8]:
patients.head()

Unnamed: 0.1,Unnamed: 0,a:action,m:charttime,m:icustayid,m:presumed_onset,o:Arterial_BE,o:Arterial_lactate,o:Arterial_pH,o:BUN,o:Calcium,...,o:max_dose_vaso,o:mechvent,o:output_4hourly,o:output_total,o:paCO2,o:paO2,o:re_admission,r:reward,step,traj
0,0,10,7245486000.0,200003.0,7245590000.0,1.071661,-0.560359,1.06987,-0.380498,0.094949,...,-2.302585,-0.5,0.635895,0.727866,-0.374712,-0.689066,-0.5,0.0,0,1
1,1,10,7245490000.0,200003.0,7245590000.0,1.071661,-0.560359,1.06987,-0.380498,0.094949,...,-2.302585,-0.5,0.844605,0.732548,-0.374712,-0.689066,-0.5,0.0,1,1
2,2,10,7245493000.0,200003.0,7245590000.0,1.071661,-0.560359,1.06987,-0.380498,0.094949,...,-2.302585,-0.5,1.021651,0.740802,-0.374712,-0.689066,-0.5,0.0,2,1
3,3,10,7245497000.0,200003.0,7245590000.0,1.071661,-0.560359,1.06987,-0.380498,0.094949,...,-2.302585,-0.5,0.986171,0.747959,-0.374712,-0.689066,-0.5,0.0,3,1
4,4,10,7245500000.0,200003.0,7245590000.0,1.071661,-0.560359,1.06987,-0.380498,0.094949,...,-2.302585,-0.5,0.690782,0.750601,-0.374712,-0.689066,-0.5,0.0,4,1


In [166]:
test = fill_in_text(df=patients, text_df=noteevents, ids=ids)

KeyboardInterrupt: 

In [138]:
np.unique(text_df["chartdate"])

array([5.9854464e+09, 5.9855328e+09])

In [156]:
text_df

Unnamed: 0,subject_id,hadm_id,category,description,iserror,text,charttime,chartdate,storetime,m:icustayid
865406,20707,129310.0,Nursing/other,Report,,"CCU Nursing PRogress Note\nS-""I am alittle ner...",4390557000.0,4390502000.0,4390558000.0,200007.0
865407,20707,129310.0,Nursing/other,Report,,NSG NOTE\n\nNEURO: ON INITIAL ROUNDS PT TALKAT...,4390603000.0,4390589000.0,4390604000.0,200007.0
865408,20707,129310.0,Radiology,CHEST (PORTABLE AP),,[**2109-2-18**] 7:58 AM\n CHEST (PORTABLE AP) ...,4390617000.0,4390589000.0,,200007.0
865409,20707,129310.0,ECG,Report,,"Sinus rhythm\nAnterseptal myocardial infarct, ...",,4390675000.0,,200007.0
865410,20707,129310.0,Discharge summary,Report,,Admission Date: [**2109-2-17**] Discharge...,,4390762000.0,,200007.0
865411,20707,129310.0,Echo,Report,,PATIENT/TEST INFORMATION:\nIndication: Coronar...,,4390675000.0,,200007.0
865412,20707,129310.0,ECG,Report,,Sinus tachycardia\nPremature ventricular contr...,,4390589000.0,,200007.0
865413,20707,129310.0,ECG,Report,,Sinus rhythm\nRecent anteroseptal infarct\nLat...,,4390589000.0,,200007.0
865414,20707,129310.0,ECG,Report,,Sinus rhythm\nProbable old anteroseptal infarc...,,4390502000.0,,200007.0
865415,20707,129310.0,ECG,Report,,Sinus rhythm\nProbable acute anteroseptal infa...,,4390502000.0,,200007.0


In [161]:
for i in range(len(test["text"])):

    print(test["text"].iloc[i])

CCU Nursing PRogress Note
S-"I am alittle nervous now"
O- Neuro initally unusually happy and joking around with staff but by afternoon admitted to being afraid and anxious. Received ativan 1mg po at 1400.
CV-VSS on IV NTG and heparin on. PTT at 0900 26 rebolused with heparin at 4800u and increased gtt to 1500u. ALso received plavix 300mg po.
Painfree and without SOB or nausea.
Resp- LS clear with O2 sats 96% on 2l np
ID afebrile
GU-voiding
GI NPO except for meds/ice chips
Skin-no impairments
Social-married with one daughter 11years old. Spoke to wife on phone.
A/P-stable post AMI
Check CPK at 1600, 0100
Check PTT  [**2106**]

 Sinus rhythm
Probable old anteroseptal infarct
Lateral T wave changes consistent with myocardial ischemia
Since previous tracing of [**2109-2-17**], no significant change

 Sinus rhythm
Probable acute anteroseptal infarct
Lateral T wave changes are borderline
No previous tracing


CCU Nursing PRogress Note
S-"I am alittle nervous now"
O- Neuro initally unusually 

## Preprocessing of text + Removal of sentences containing mentions of sepsis

In [17]:
corpus = noteevents["text"][:]
corpus.head()

0    Neonatology Attending Triage Note\n\nBaby [**N...
1    Nursing Transfer note\n\n\nPt admitted to NICU...
2    [**2101-10-20**] 5:49 PM\n CT ABDOMEN W/O CONT...
3    [**2101-10-20**] 6:16 PM\n CHEST (PORTABLE AP)...
4    [**2101-10-20**] 10:23 PM\n CHEST (PORTABLE AP...
Name: text, dtype: object

In [19]:
import nltk
from nltk import tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from tqdm import tqdm
tqdm.pandas()

# clinical text preprocessing
def preprocess_text(text):
    text = str(text)
    # case normalization
    text = text.lower().split()
    text = " ".join(text)

    # avoid label leackage: remove senteces with septic or sepsis
    sentences = tokenize.sent_tokenize(text)
    clean_sentences = []
    for sentence in sentences:
        if 'septic' in sentence or 'sepsis' in sentence:
            continue
        else:
            clean_sentences.append(sentence)

    text = ' '.join(clean_sentences)

    # special words and chars
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"don't", "do not ", text)
    text = re.sub(r"aren't", "are not ", text)
    text = re.sub(r"isn't", "is not ", text)
    text = re.sub(r"%", " percent ", text)
    text = re.sub(r"that's", "that is ", text)
    text = re.sub(r"doesn't", "does not ", text)
    text = re.sub(r"he's", "he is ", text)
    text = re.sub(r"she's", "she is ", text)
    text = re.sub(r"it's", "it is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    # text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.lower().split()
    text = [w for w in text if len(w) >= 2]

    # remove stop words
    text = [word for word in text if word not in stopwords.words('english')]
    text = " ".join(text)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pablo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
c = corpus.progress_apply(preprocess_text)

  0%|          | 3851/1957231 [01:31<12:50:16, 42.27it/s]


KeyboardInterrupt: 

In [25]:
c

0    asked evaluate baby dr first name4 namepattern...
1    please see attending note details regarding ma...
2    2101 10 20 49 pm ct abdomen contrast; ct pelvi...
3    2101 10 20 16 pm chest portable ap clip clip n...
4    2101 10 20 10 23 pm chest portable ap clip cli...
Name: text, dtype: object

In [26]:
# original note containing sepsis mentions
corpus[1]

"Nursing Transfer note\n\n\nPt admitted to NICU for sepsis eval. Please see Attending\nnote for details regarding maternal history and delivery\ndetails.\n\nInfant stable in RA. RR 30-40's, sats 96-100%. LS clear/=.\nNo retractions noted. HR 140's. No murmur. Infant [** 5**], well\nperfused. BW 3865g. CBC and BC sent, pending at this time.\nInfant on 48 r/o sepsis with abx Amp and Gent. PIV placed in\nLeft hand, meds administered as ordered. D Stick 72. Infant\nstable for transfer to NBN. Continue to monitor for s/s of\nsepsis.\n\n\n"

In [27]:
# preprocessed note with sentences mentioning sepsis removed -> loss of information at the benefit of avoiding label leakage
c[1]

'please see attending note details regarding maternal history delivery details infant stable ra rr 30 40 sats 96 100 percent ls clear retractions noted hr 140 murmur infant well perfused bw 3865g cbc bc sent pending time piv placed left hand meds administered ordered stick 72 infant stable transfer nbn'

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(c)
vectorizer.vocabulary_
all_feature_names = vectorizer.get_feature_names_out()

for word in all_feature_names:
    indx = vectorizer.vocabulary_.get(word)
    print(f"{word} {vectorizer.idf_[indx]}")

10 1.4054651081081644
100 1.6931471805599454
102 2.09861228866811
137 2.09861228866811
140 2.09861228866811
145 2.09861228866811
15 2.09861228866811
16 2.09861228866811
20 1.4054651081081644
2101 1.4054651081081644
23 2.09861228866811
30 2.09861228866811
34 2.09861228866811
35 2.09861228866811
38 2.09861228866811
3865g 2.09861228866811
40 2.09861228866811
46358 2.09861228866811
46359 2.09861228866811
47629 2.09861228866811
48 2.09861228866811
49 2.09861228866811
72 1.6931471805599454
76 1.4054651081081644
96 2.09861228866811
98 2.09861228866811
aaa 2.09861228866811
ab 2.09861228866811
abd 2.09861228866811
abdomen 2.09861228866811
abdominal 2.09861228866811
acquired 2.09861228866811
acute 2.09861228866811
additionally 2.09861228866811
administered 2.09861228866811
administration 2.09861228866811
adrenal 2.09861228866811
af 2.09861228866811
aga 2.09861228866811
air 2.09861228866811
aneurysm 2.09861228866811
aneurysmal 2.09861228866811
antibiotics 2.09861228866811
aorta 2.09861228866811
a