## Notebook with various summary statistics for results section

In [1]:
from fastai.text import *

In [2]:
# Root directory of the MIMIC files; built from the home directory because pandas does not expand '~'
base_path = Path.home().joinpath('mimic')

# Every file used during processing is declared in this one place.
# The language model lives at base_path/lm_file, but the fastai loader takes the directory and
# the file name as separate arguments, so only the bare file name is kept here.
lm_file = 'mimic_lm.pickle'
admissions_file = base_path.joinpath('ADMISSIONS.csv')
notes_file = base_path.joinpath('NOTEEVENTS.csv')
notes_pickle_file = base_path.joinpath('noteevents.pickle')

# value handed to load_data as its `bs` argument
bs = 96

In [3]:
# Full path of the saved language model data; load_data itself takes the
# directory and the file name as separate arguments.
tmpfile = base_path/lm_file

# Fail fast: if the file is missing, later cells would otherwise die with a
# confusing NameError on data_lm instead of pointing at the real problem.
if not os.path.isfile(tmpfile):
    raise FileNotFoundError(f'language model data not found: {tmpfile}')

print('loading existing language model')
data_lm = load_data(base_path, lm_file, bs=bs)

loading existing language model


### Compare vocabulary between pre-trained WT-103 and MIMIC

In [4]:
# Location of the pre-trained WT-103 vocabulary (itos) pickle.
# NOTE(review): absolute, user-specific path; consider deriving it from a configurable directory.
wt103_itos_path = '/home/seth/models/wt103-fwd/itos_wt103.pkl'

# Fail fast: if the file is missing, later cells would otherwise die with a
# NameError on wt103_itos instead of reporting the missing file.
if not os.path.isfile(wt103_itos_path):
    raise FileNotFoundError(f'WT-103 vocabulary not found: {wt103_itos_path}')

# pickle.load can execute arbitrary code; only point this at a file from a trusted source.
# The handle has its own name so the path variable is not rebound to a file object.
with open(wt103_itos_path, 'rb') as itos_fh:
    wt103_itos = pickle.load(itos_fh)

In [5]:
# Number of entries in the pre-trained WT-103 vocabulary
len(wt103_itos)

60000

In [6]:
# Number of entries in the MIMIC language model vocabulary
len(data_lm.vocab.itos)

60000

In [7]:
# Count the tokens that the WT-103 and MIMIC vocabularies have in common.
# (Drop the outer len() to list the shared tokens themselves.)
len(set(wt103_itos).intersection(data_lm.vocab.itos))

14655

In [8]:
# Count the tokens that appear in the MIMIC vocabulary but not in WT-103.
# (Drop the outer len() to list those tokens themselves.)
len(set(data_lm.vocab.itos).difference(wt103_itos))

45345

In [9]:
# Report which of the two vocabularies contains the clinical term 'bronchiectasis'.
for itos_list, found_msg in (
    (wt103_itos, 'found in wt103 lm'),
    (data_lm.vocab.itos, 'found in mimic lm'),
):
    if 'bronchiectasis' in itos_list:
        print(found_msg)

found in mimic lm


### Overlap between clinical notes used for language model and clinical notes used for DESCRIPTION classifier

In [10]:
# The full data set is too large to process in reasonable time with limited GPU resources,
# so only this fraction of it is sampled.
pct_data_sample = 0.1
# Fixed seeds for repeatability; the DESCRIPTION classifier sample uses a different seed
# than the language model sample.
desc_seed, lm_seed = 1776, 42

In [11]:
# Load the note events table, using a pickle copy as a cache so the CSV is only parsed once.
if os.path.isfile(notes_pickle_file):
    print('Loading noteevent pickle file')
    orig_df = pd.read_pickle(notes_pickle_file)
else:
    print('Could not find noteevent pickle file; creating it')
    # First run only: convert the CSV to a pickle file so later runs can skip the CSV parse
    orig_df = pd.read_csv(notes_file, low_memory=False, memory_map=True)
    orig_df.to_pickle(notes_pickle_file)

# Report the shape on both paths so the output is the same whether or not the cache existed
print(orig_df.shape)

Loading noteevent pickle file
(2083180, 11)


In [12]:
# Sample for the DESCRIPTION classifier: pct_data_sample of all notes, seeded with desc_seed for repeatability
desc_df = orig_df.sample(frac=pct_data_sample, random_state=desc_seed)

In [17]:
# (rows, columns) of the DESCRIPTION classifier sample
desc_df.shape

(208318, 11)

In [13]:
# Sample drawn with lm_seed: same fraction, different seed than desc_df
# NOTE(review): presumably reproduces the sample the language model was built from — confirm
lm_df = orig_df.sample(frac=pct_data_sample, random_state=lm_seed)

In [18]:
# (rows, columns) of the language model sample
lm_df.shape

(208318, 11)

In [16]:
# Number of notes (by ROW_ID) that appear in both the classifier sample and the language model sample
len(set(desc_df['ROW_ID'].unique()).intersection(lm_df['ROW_ID'].unique()))

20648