In [4]:
import os
import pandas as pd 
import numpy as np

In [5]:
DATA_FILEPATH     = './data/all_hourly_data.h5'
NOTES_FILEPATH     = './data/notes/'
GAP_TIME          = 6  # In hours
WINDOW_SIZE       = 24 # In hours
SEED              = 1
ID_COLS           = ['subject_id', 'hadm_id', 'icustay_id']

In [6]:
notes_files = [os.path.join(NOTES_FILEPATH,file) for file in os.listdir(os.path.join(NOTES_FILEPATH))]

In [7]:
patients = pd.read_hdf(DATA_FILEPATH, 'patients').reset_index()
patients.shape

(34472, 31)

In [8]:
patients = patients[patients.max_hours > WINDOW_SIZE + GAP_TIME]
patients.shape

(23944, 31)

In [10]:
notes_df = pd.concat((pd.read_csv(f,delimiter=';') for f in notes_files), ignore_index=True)
notes_df.shape

(2083180, 11)

In [11]:
notes_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
       'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT'],
      dtype='object')

In [12]:
patients.columns

Index(['subject_id', 'hadm_id', 'icustay_id', 'gender', 'ethnicity', 'age',
       'insurance', 'admittime', 'diagnosis_at_admission', 'dischtime',
       'discharge_location', 'fullcode_first', 'dnr_first', 'fullcode', 'dnr',
       'dnr_first_charttime', 'cmo_first', 'cmo_last', 'cmo', 'deathtime',
       'intime', 'outtime', 'los_icu', 'admission_type', 'first_careunit',
       'mort_icu', 'mort_hosp', 'hospital_expire_flag', 'hospstay_seq',
       'readmission_30', 'max_hours'],
      dtype='object')

In [40]:
patient_notes = patients.merge(notes_df[['SUBJECT_ID','HADM_ID','CHARTDATE', 'CHARTTIME','CATEGORY','TEXT']],
                                          right_on=['SUBJECT_ID','HADM_ID'],
                                            left_on=['subject_id','hadm_id'],how='left')
print(patient_notes['subject_id'].nunique())


23944


In [41]:
patient_notes.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,gender,ethnicity,age,insurance,admittime,diagnosis_at_admission,dischtime,...,hospital_expire_flag,hospstay_seq,readmission_30,max_hours,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,CATEGORY,TEXT
0,3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,...,0,1,0,145,3.0,145834.0,2101-10-23,2101-10-23 17:06:00,Nursing/other,Respiratory Care Note:\n\nPt received on A/C a...
1,3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,...,0,1,0,145,3.0,145834.0,2101-10-20,2101-10-20 22:23:00,Radiology,[**2101-10-20**] 10:23 PM\n CHEST (PORTABLE AP...
2,3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,...,0,1,0,145,3.0,145834.0,2101-10-20,,ECG,Sinus rhythm\nInferior/lateral T changes are n...
3,3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,...,0,1,0,145,3.0,145834.0,2101-10-24,2101-10-24 17:00:00,Nursing/other,"npn 7-7p\n\nneuro: Pt is alert, follows comman..."
4,3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,...,0,1,0,145,3.0,145834.0,2101-10-27,,ECG,Sinus rhythm\nP-R interval increased\nLate R w...


In [42]:
patient_notes = patient_notes[patient_notes['CATEGORY']!='Discharge summary']
patient_notes = patient_notes[~patient_notes['CHARTTIME'].isna()]
patient_notes.shape
patient_notes['subject_id'].nunique()

23110

In [47]:
patient_notes['clinical_note_nday'] = ((pd.to_datetime(patient_notes['CHARTTIME'])-patient_notes['intime']).dt.days)
patient_notes = patient_notes[patient_notes['clinical_note_nday'] <  1].reset_index()
patient_notes['subject_id'].nunique()

22911

In [60]:
text = str(patient_notes['TEXT'][1000])
text = text.replace("_","")
text = text.strip()
text = text.replace("\n",'')
text


'CCUNURSING PROGRESS  NOTE11 PM - 7 AMS/P STENT TO RIGHT ICAS " DENIES  COMPLAINTS "O PLS SEE  CAREVIEW FLOWSHEET FOR ALL OBJ/NUMERICAL  DATAHR 50-90\'S...SBP  GOAL > 130 SYSTOLIC ..ON LOW DOSE NEO GTT ..ABLE TO WEAN IV NEO  TO OFF AT 0600..D/T SBP > 150...LUNGS CLEARNEURO ..ALERT AND ORIENTED TIMES 3 ..WITHOUT  CHANGE IN NEURO STATUS /EXAMVOIDING QS IN  URINALSLEPT WITH AMBIENA TRANSIENT BRADYCARDIA/HYPOTENSION S/P RIGHT STENT ICA DEPLOYMENTP RESTART CARDIAC MEDS'

In [61]:
import spacy

med7 = spacy.load("en_core_med7_lg")

# create distinct colours for labels
col_dict = {}
seven_colours = ['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4']
for label, colour in zip(med7.pipe_labels['ner'], seven_colours):
    col_dict[label] = colour

options = {'ents': med7.pipe_labels['ner'], 'colors':col_dict}

#text = 'A patient was prescribed Magnesium hydroxide 400mg/5ml suspension PO of total 30ml bid for the next 5 days.'
#text = ' Hypertension, V-Fib rest, s/p right subclavian line placement of hydroxide.\n\n PORTABLE CHEST:  Comparison is made to previous films from  four hours'
doc = med7(text)

spacy.displacy.render(doc, style='ent', jupyter=True, options=options)

[(ent.text, ent.label_) for ent in doc.ents]

[('LOW DOSE', 'DOSAGE'),
 ('NEO', 'DRUG'),
 ('GTT', 'ROUTE'),
 ('IV', 'ROUTE'),
 ('NEO', 'DRUG')]