In [1]:
import sys
import os
# sys.path.append('../')
from data.utils import reformat
from icd_classifier.settings import MIMIC_3_DIR, DATA_DIR
from data import extract_wvs
from data import get_discharge_summaries
from data import build_vocab
from data import vocab_index_descriptions
from data import word_embeddings
from data import concat_and_split

import numpy as np
import pandas as pd

from collections import Counter, defaultdict
import csv
import math
import operator

# Shared Steps


In [2]:
# Working-directory fix-up: step one level up from the current directory so that
# the relative MIMIC_3_DIR / DATA_DIR paths printed below resolve.
# NOTE: not idempotent -- every re-run of this cell climbs one more level.
start_dir = os.getcwd()
print(start_dir)
corrected_path = os.path.dirname(start_dir)
os.chdir(corrected_path)
print(os.getcwd())

print(MIMIC_3_DIR)
print(DATA_DIR)

/home/modestas.filipavicius/dev/icd-classifier/icd_classifier
/home/modestas.filipavicius/dev/icd-classifier
data/raw
data/processed


Let's do some data processing in a much better way, with a notebook.

First, let's define some stuff.

In [3]:
# Run-wide settings for the "full label set" part of the notebook.
Y = 'full'  # use all available labels in the dataset for prediction (rebound to 50 in the top-50 section)
vocab_size = 'full'  # don't limit the vocab size to a specific number
vocab_min = 3  # discard tokens appearing in fewer than this many documents

# Data processing

## Combine diagnosis and procedure codes and reformat them

The codes in MIMIC-III are given in separate files for procedures and diagnoses, and the codes are given without periods, which might lead to collisions if we naively combine them. So we have to add the periods back in the right place.

In [4]:

# Load the two raw MIMIC-III code tables (procedures, diagnoses) from their gzipped CSVs.
proc_path = '%s/PROCEDURES_ICD.csv.gz' % MIMIC_3_DIR
diag_path = '%s/DIAGNOSES_ICD.csv.gz' % MIMIC_3_DIR
dfproc = pd.read_csv(proc_path)
dfdiag = pd.read_csv(diag_path)

In [5]:
# Put the period back into every raw code and store the result in `absolute_code`.
# The boolean handed to `reformat` is True for the diagnosis table and False for
# the procedure table (presumably an "is diagnosis" flag -- confirm in data.utils).
#
# The code column is the 5th column (position 4) of both tables. It is selected
# explicitly by position with `.iloc` and mapped element-wise: indexing a
# label-indexed row with the integer key `row[4]` relies on a deprecated
# positional fallback, and row-wise `apply` is slow and can upcast the values
# of a row to a common dtype before they are stringified.
dfdiag['absolute_code'] = dfdiag.iloc[:, 4].map(lambda code: str(reformat(str(code), True)))
dfproc['absolute_code'] = dfproc.iloc[:, 4].map(lambda code: str(reformat(str(code), False)))

In [6]:
# Stack the diagnosis rows on top of the procedure rows. Each table keeps its
# own row index, so index values can repeat in `dfcodes`.
dfcodes = pd.concat([dfdiag, dfproc])

In [7]:
dfcodes

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,absolute_code
0,1297,109,172335,1.0,40301,403.01
1,1298,109,172335,2.0,486,486
2,1299,109,172335,3.0,58281,582.81
3,1300,109,172335,4.0,5855,585.5
4,1301,109,172335,5.0,4254,425.4
...,...,...,...,...,...,...
240090,228330,67415,150871,5.0,3736,37.36
240091,228331,67415,150871,6.0,3893,38.93
240092,228332,67415,150871,7.0,8872,88.72
240093,228333,67415,150871,8.0,3893,38.93


In [8]:
# Write the combined code table. The reformatted `absolute_code` column is
# exported under the header name ICD9_CODE.
# NOTE: this assumes upper-case column names in the source tables. If they are
# lower-case, select ['row_id', 'subject_id', 'hadm_id', 'seq_num',
# 'absolute_code'] instead; the written header stays upper-case either way.
id_columns = ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM']
dfcodes.to_csv(
    '%s/ALL_CODES.csv' % MIMIC_3_DIR,
    index=False,
    columns=id_columns + ['absolute_code'],
    header=id_columns + ['ICD9_CODE'],
)

## How many codes are there?

In [9]:
# Distinct codes in the full dataset (not just admissions with discharge summaries).
# ICD9_CODE is read as text so the codes are not re-interpreted as numbers.
df = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR, dtype={"ICD9_CODE": str})
# 8994 unique codes in the recorded run
df['ICD9_CODE'].nunique(dropna=False)

8994

In [13]:
# The twenty most frequent codes in the full code table
# (401.9 leads with 20703 occurrences in the recorded run).
# TODO: find a way to plot Zipfs law
codes_df = df['ICD9_CODE'].to_frame()
codes_df.value_counts().head(20)


ICD9_CODE
401.9        20703
38.93        14731
428.0        13111
427.31       12891
414.01       12429
96.04        10333
96.6          9300
584.9         9119
96.71         9100
250.00        9058
272.4         8690
518.81        7497
99.04         7244
39.61         6838
599.0         6555
530.81        6326
96.72         6048
272.0         5930
99.55         5842
V05.3         5779
dtype: int64

## Tokenize and preprocess raw text --> `disch_full.csv`

Preprocessing time!

This will:
- Select only discharge summaries and their addenda
- Remove punctuation and numeric-only tokens (e.g. `500` is dropped, but `250mg` is kept)
- Lowercase all tokens

In [10]:
# Input: the raw MIMIC-III notes table. Output: the tokenized discharge summaries.
notes_file = "%s/NOTEEVENTS.csv.gz" % MIMIC_3_DIR
out_file = "%s/disch_full.csv" % MIMIC_3_DIR

In [11]:
# This reads all notes, selects only the discharge summaries, and tokenizes them, returning the output filename.
# The recorded run iterated over 2,083,180 note rows in roughly 1m17s.
disch_full_file = get_discharge_summaries.write_discharge_summaries(notes_file, out_file)

processing notes file
writing to data/raw/disch_full.csv


2083180it [01:17, 26844.24it/s]


Let's read this in and see what kind of data we're working with

In [12]:
# NOTE: rebinds `df` -- from here on it holds the tokenized discharge summaries,
# no longer the ALL_CODES table loaded earlier.
df = pd.read_csv('%s/disch_full.csv' % MIMIC_3_DIR)

In [13]:
# Column summary of the tokenized discharge summaries. Recorded run: 59652 rows;
# SUBJECT_ID, HADM_ID (int64) and TEXT (object) are fully populated, while
# CHARTTIME has 0 non-null values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59652 entries, 0 to 59651
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SUBJECT_ID  59652 non-null  int64  
 1   HADM_ID     59652 non-null  int64  
 2   CHARTTIME   0 non-null      float64
 3   TEXT        59652 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 1.8+ MB


In [14]:
# How many distinct admissions have a discharge summary?
# 52,726 in the recorded run (the full label file covers 58,976 admissions).
df['HADM_ID'].nunique(dropna=False)

52726

In [15]:
# Corpus statistics over df['TEXT']: running token total plus the set of distinct tokens.
# unique_words looks like {'hemodynacmially', 'walkway', 'costaphrenic', ...}
unique_words = set()
num_tok = 0
for text in df['TEXT']:
    tokens = text.split()
    unique_words.update(tokens)
    num_tok += len(tokens)

In [17]:
# Recorded run: 150,854 unique words and 79,801,387 word tokens.
print("Num unique words: ", len(unique_words))
print("Num word tokens: ", str(num_tok))

Num unique words:  150854
Num word tokens:  79801387


In [18]:
# Order the notes by patient, then admission, so that they line up with the
# (identically sorted) MIMIC-3 label file.
note_sort_keys = ['SUBJECT_ID', 'HADM_ID']
df = df.sort_values(by=note_sort_keys)

In [19]:
# Load the label file and sort it by the same keys as the notes.
# ICD9_CODE is read as text, as in the earlier read of this file. Without an
# explicit dtype pandas infers the column type chunk by chunk; the recorded run
# emitted a warning on this line (presumably a mixed-types DtypeWarning).
dfl = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR, dtype={"ICD9_CODE": str})
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])

  dfl = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR)


In [21]:
# Distinct admissions in the notes vs. in the label file -- (52726, 58976) in the recorded run.
tuple(frame['HADM_ID'].nunique(dropna=False) for frame in (df, dfl))

(52726, 58976)

## Consolidate labels with set of discharge summaries --> `ALL_CODES_filtered.csv`

It looks like some HADM_IDs have no discharge summary, so they are missing from our notes (52,726 admissions with notes vs. 58,976 admissions with codes).

In [22]:
# Keep only the label rows whose admission also has a discharge summary.
# Input columns (ALL_CODES.csv): ROW_ID, SUBJECT_ID, HADM_ID, SEQ_NUM, ICD9_CODE.
# Output columns: SUBJECT_ID, HADM_ID, ICD9_CODE, ADMITTIME, DISCHTIME, with the
# two time columns left empty.
hadm_ids = set(df['HADM_ID'])
# newline='' is the mode the csv module asks for on files it reads or writes;
# it stops the platform's newline translation from altering the row terminators.
with open('%s/ALL_CODES.csv' % MIMIC_3_DIR, 'r', newline='') as lf, \
        open('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, 'w', newline='') as of:
    r = csv.reader(lf)
    w = csv.writer(of)
    w.writerow(['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE', 'ADMITTIME', 'DISCHTIME'])
    next(r)  # skip the input header
    for row in r:
        hadm_id = int(row[2])
        if hadm_id in hadm_ids:
            # row[1:3] = SUBJECT_ID, HADM_ID; row[-1] = the reformatted code
            w.writerow(row[1:3] + [row[-1], '', ''])

In [23]:
# Reload the filtered labels. ICD9_CODE is forced to text: this frame is sorted
# and written back to disk afterwards, so letting pandas infer the column type
# per chunk (the recorded run warned on this line) risks changing how a code is
# written out again.
dfl = pd.read_csv('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, index_col=None,
                  dtype={'ICD9_CODE': str})

  dfl = pd.read_csv('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, index_col=None)


In [24]:
# Distinct admissions left after filtering -- 52726 in the recorded run, matching the notes.
dfl['HADM_ID'].nunique(dropna=False)

52726

In [25]:
# The filtered labels must also be ordered by patient, then admission;
# the sorted rows replace the file written above.
filtered_path = '%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR
dfl = dfl.sort_values(by=['SUBJECT_ID', 'HADM_ID'])
dfl.to_csv(filtered_path, index=False)

## Append labels to notes in a single file --> `notes_labeled.csv`

In [26]:
# Now let's append each instance with all of its codes.
# The merge is done by a helper script that works on files, so the notes frame
# is written back to disk first.
# NOTE: sorted_file is the same path the tokenized notes were read from, so this
# overwrites disch_full.csv with the current (sorted) contents of `df`.
sorted_file = '%s/disch_full.csv' % MIMIC_3_DIR
outfilename = '%s/notes_labeled.csv' % MIMIC_3_DIR
df.to_csv(sorted_file, index=False)



In [27]:
# Merge the filtered labels with the sorted notes into `outfilename`.
# An earlier run of this cell stopped right after "0 done" with
# "couldn't find matching hadm_id. data is probably not sorted correctly",
# so both inputs presumably have to be sorted by SUBJECT_ID/HADM_ID first.
labeled = concat_and_split.concat_data('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, sorted_file, outfilename)

CONCATENATING
0 done
10000 done
20000 done
30000 done
40000 done
50000 done


In [28]:
# Value returned by concat_data: the name of the file we just made
# (data/raw/notes_labeled.csv in the recorded run).
print(labeled)

data/raw/notes_labeled.csv


Let's sanity check the combined data we just made. Do we have all hadm id's accounted for, and the same vocab stats?

In [29]:
# Recompute the token statistics on the merged file as a sanity check.
dfnl = pd.read_csv(labeled)
unique_words = set()
num_tok = 0
# The note text is the third column of the merged file (position 2).
for text in dfnl.iloc[:, 2]:
    tokens = text.split()
    unique_words.update(tokens)
    num_tok += len(tokens)

In [30]:
# Recorded run: 150854 unique words and 79801387 tokens, identical to the
# statistics computed on the tokenized notes before merging.
print("num unique_words", len(unique_words), "num tokens", num_tok)

num unique_words 150854 num tokens 79801387


In [31]:
# Distinct admissions in the merged file -- 52726 in the recorded run.
dfnl['HADM_ID'].nunique(dropna=False)

52726

## Create train/dev/test splits --> `disch_train/dev/test_split.csv`

In [4]:
# Split the labeled notes into train/dev/test files whose names start with `base_name`.
# tr, dv, te are whatever split_data returns -- presumably the three output paths
# (tr is handed to build_vocab as the training file further down).
fname = '%s/notes_labeled.csv' % MIMIC_3_DIR
base_name = "%s/disch" % MIMIC_3_DIR # for output
print(fname, base_name)
tr, dv, te = concat_and_split.split_data(fname, base_name=base_name)

[2022-12-13 09:53:36 concat_and_split.py->split_data():43]INFO: Splitting
[2022-12-13 09:53:36 concat_and_split.py->split_data():64]INFO: reading in from files: data/raw/train_full_hadm_ids.csv
[2022-12-13 09:53:36 concat_and_split.py->split_data():64]INFO: reading in from files: data/raw/dev_full_hadm_ids.csv
[2022-12-13 09:53:36 concat_and_split.py->split_data():64]INFO: reading in from files: data/raw/test_full_hadm_ids.csv
[2022-12-13 09:53:36 concat_and_split.py->split_data():76]INFO: Row 0 read. HADM_ID = 145834.
Row: ['3', '145834', 'admission date discharge date date of birth sex m service medicine chief complaint admitted from rehabilitation for hypotension systolic blood pressure to the 70s and decreased urine output history of present illness the patient is a year old male who had been hospitalized at the hospital1 from through of after undergoing a left femoral at bypass graft and was subsequently discharged to a rehabilitation facility on he presented again to the hospital

data/raw/notes_labeled.csv data/raw/disch
train split name:  data/raw/disch_train_split.csv
labeled name:  data/raw/notes_labeled.csv


[2022-12-13 09:53:37 concat_and_split.py->split_data():76]INFO: Row 10000 read. HADM_ID = 100184.
Row: ['9566', '100184', 'admission date discharge date date of birth sex f service cardiothoracic allergies morphine heparin agents attending first name3 lf chief complaint mrs known lastname is s p cabg and now has increasing sob doe she underwent cardiac catheterization which showed patent lima lad totally occluded svg om and ectatic svg pda and an aortic valve area of 59cm2 she was admitted to hospital hospital for diuresis due to an elevated wedge pressure and then was transferred to hospital1 for surgery major surgical or invasive procedure s p redo sternotomy cabgx1 svg pda avr 21mm pericardial history of present illness mrs known lastname is s p cabg and now has increasing sob doe she underwent cardiac catheterization which showed patent lima lad totally occluded svg om and ectatic svg pda and an aortic valve area of 59cm2 she was admitted to hospital hospital for diuresis due to an

## Build vocabulary from training data --> `vocab.csv`

In [5]:
# Build the vocabulary from the training split and write it to vocab.csv.
# Recorded run: 51917 words occur in at least `vocab_min` documents, out of 140795 total.
vocab_min = 3  # re-declared here with the same value as in the settings cell
vocab_filename = '%s/vocab.csv' % MIMIC_3_DIR
build_vocab.build_vocab(vocab_min, tr, vocab_filename)

reading in data...


[2022-12-13 09:54:33 build_vocab.py->build_vocab():80]INFO: 51917 word tokens occur in 3 or more documents, out 140795 total words


removing rare terms


[2022-12-13 09:54:33 build_vocab.py->build_vocab():89]INFO: Writing vocab to data/raw/vocab.csv
[2022-12-13 09:54:34 build_vocab.py->build_vocab():93]INFO: Done! Some vocab examples, head: ['admission' 'date' 'discharge' 'of' 'birth'], tail: ['tracheomalatia' 'schwannomatosis' 'ghetto' 'pyocystitis' 'turon']


## Sort each data split by length for batching

In [6]:
# For every split: count the whitespace-separated tokens of each note, order the
# rows by that count, and save the result as <split>_full.csv.
for splt in ('train', 'dev', 'test'):
    filename = '%s/disch_%s_split.csv' % (MIMIC_3_DIR, splt)
    sorted_name = '%s/%s_full.csv' % (MIMIC_3_DIR, splt)
    df = pd.read_csv(filename)
    df['length'] = df['TEXT'].map(lambda text: len(str(text).split()))
    df = df.sort_values(['length'])
    df.to_csv(sorted_name, index=False)

## Pre-train word embeddings

Let's train word embeddings on all words

In [7]:
# Train word2vec on all tokenized discharge summaries (disch_full.csv).
# The 100 matches vector_size=100 in the training log; the trailing 0 and 5 are
# presumably the minimum token count and the number of epochs -- TODO confirm
# against data/word_embeddings. The recorded run saved data/raw/processed_full.w2v.
w2v_file = word_embeddings.word_embeddings('full', '%s/disch_full.csv' % MIMIC_3_DIR, 100, 0, 5)

[2022-12-13 10:00:46 utils.py->add_lifecycle_event():444]DEBUG: starting a new internal lifecycle event log for Word2Vec
[2022-12-13 10:00:46 utils.py->add_lifecycle_event():448]INFO: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2022-12-13T10:00:46.578650', 'gensim': '4.2.0', 'python': '3.10.5 (main, Jul 11 2022, 10:41:02) [GCC 9.4.0]', 'platform': 'Linux-5.15.0-56-generic-x86_64-with-glibc2.31', 'event': 'created'}
[2022-12-13 10:00:46 word2vec.py->scan_vocab():579]INFO: collecting all words and their counts
[2022-12-13 10:00:46 word2vec.py->_scan_vocab():562]INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


building word2vec vocab on data/raw/disch_full.csv...


[2022-12-13 10:00:48 word2vec.py->_scan_vocab():562]INFO: PROGRESS: at sentence #10000, processed 11230545 words, keeping 52828 word types
[2022-12-13 10:00:50 word2vec.py->_scan_vocab():562]INFO: PROGRESS: at sentence #20000, processed 22408775 words, keeping 73469 word types
[2022-12-13 10:00:52 word2vec.py->_scan_vocab():562]INFO: PROGRESS: at sentence #30000, processed 33786509 words, keeping 90109 word types
[2022-12-13 10:00:55 word2vec.py->_scan_vocab():562]INFO: PROGRESS: at sentence #40000, processed 47287714 words, keeping 109614 word types
[2022-12-13 10:00:58 word2vec.py->_scan_vocab():562]INFO: PROGRESS: at sentence #50000, processed 63882861 words, keeping 132475 word types
[2022-12-13 10:01:01 word2vec.py->scan_vocab():585]INFO: collected 150854 word types from a corpus of 79801387 raw words and 59652 sentences
[2022-12-13 10:01:01 word2vec.py->prepare_vocab():634]INFO: Creating a fresh vocabulary
[2022-12-13 10:01:01 utils.py->add_lifecycle_event():448]INFO: Word2Vec li

training...


[2022-12-13 10:01:04 word2vec.py->_log_progress():1594]INFO: EPOCH 0 - PROGRESS: at 3.97% examples, 2165013 words/s, in_qsize 6, out_qsize 2
[2022-12-13 10:01:05 word2vec.py->_log_progress():1594]INFO: EPOCH 0 - PROGRESS: at 8.00% examples, 2196565 words/s, in_qsize 7, out_qsize 0
[2022-12-13 10:01:06 word2vec.py->_log_progress():1594]INFO: EPOCH 0 - PROGRESS: at 12.20% examples, 2232418 words/s, in_qsize 7, out_qsize 0
[2022-12-13 10:01:07 word2vec.py->_log_progress():1594]INFO: EPOCH 0 - PROGRESS: at 16.26% examples, 2232782 words/s, in_qsize 8, out_qsize 1
[2022-12-13 10:01:08 word2vec.py->_log_progress():1594]INFO: EPOCH 0 - PROGRESS: at 20.50% examples, 2249299 words/s, in_qsize 7, out_qsize 0
[2022-12-13 10:01:09 word2vec.py->_log_progress():1594]INFO: EPOCH 0 - PROGRESS: at 24.68% examples, 2263972 words/s, in_qsize 7, out_qsize 0
[2022-12-13 10:01:10 word2vec.py->_log_progress():1594]INFO: EPOCH 0 - PROGRESS: at 28.91% examples, 2271637 words/s, in_qsize 7, out_qsize 0
[2022-12

writing embeddings to data/raw/processed_full.w2v


## Write pre-trained word embeddings with new vocab --> `processed_full.embed`

In [8]:
# Load the trained word2vec model (processed_full.w2v) and export embeddings for
# the tokens in vocab.csv. Y is passed through as-is; it is 'full' here when the
# notebook is run top to bottom.
extract_wvs.gensim_to_embeddings('%s/processed_full.w2v' % MIMIC_3_DIR, '%s/vocab.csv' % MIMIC_3_DIR, Y)

[2022-12-13 10:05:29 utils.py->load():482]INFO: loading Word2Vec object from data/raw/processed_full.w2v
[2022-12-13 10:05:29 smart_open_lib.py->open():166]DEBUG: {'uri': 'data/raw/processed_full.w2v', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
[2022-12-13 10:05:29 utils.py->_load_specials():516]INFO: loading wv recursively from data/raw/processed_full.w2v.wv.* with mmap=None
[2022-12-13 10:05:29 utils.py->_load_specials():521]INFO: loading vectors from data/raw/processed_full.w2v.wv.vectors.npy with mmap=None
[2022-12-13 10:05:29 utils.py->_load_specials():521]INFO: loading syn1neg from data/raw/processed_full.w2v.syn1neg.npy with mmap=None
[2022-12-13 10:05:29 utils.py->_load_specials():554]INFO: setting ignored attribute cum_table to None
[2022-12-13 10:05:30 utils.py->add_lifecycle_event():448]INFO: Word2Vec lifecycle event {'fname': 'data/raw/pro

## Pre-process code descriptions using the vocab --> `description_vectors.vocab` (TODO: the recorded run of this cell failed with a NameError)

In [1]:
# done similar to Mullenbach 2018, get embeddings for labels!
# NOTE: the recorded run raised NameError for `vocab_index_descriptions`. The
# cell carries execution count 1, so it most likely ran on a fresh kernel before
# the import cell -- run the imports first.
vocab_index_descriptions.vocab_index_descriptions('%s/vocab.csv' % MIMIC_3_DIR,
                                                  '%s/description_vectors.vocab' % MIMIC_3_DIR)

NameError: name 'vocab_index_descriptions' is not defined

## Filter each split to the top 50 diagnosis/procedure codes --> `train/dev/test_50.csv`

In [10]:
# Rebind Y (previously 'full'): from here on only the Y most frequent codes are kept.
Y = 50

In [11]:
# Tally how many notes carry each code; this is the basis for picking the top k.
counts = Counter()
dfnl = pd.read_csv('%s/notes_labeled.csv' % MIMIC_3_DIR)
# The ';'-separated label string is the fourth column of the file (position 3).
for labels in dfnl.iloc[:, 3]:
    counts.update(str(labels).split(';'))

In [12]:
# The 20 most frequent codes with their note counts.
counts.most_common(20)

[('401.9', 20053),
 ('38.93', 14444),
 ('428.0', 12842),
 ('427.31', 12594),
 ('414.01', 12179),
 ('96.04', 9932),
 ('96.6', 9161),
 ('584.9', 8907),
 ('250.00', 8784),
 ('96.71', 8619),
 ('272.4', 8504),
 ('518.81', 7249),
 ('99.04', 7147),
 ('39.61', 6809),
 ('599.0', 6442),
 ('530.81', 6156),
 ('96.72', 5926),
 ('272.0', 5766),
 ('285.9', 5296),
 ('88.56', 5240)]

In [13]:
# All (code, count) pairs ordered from most to least frequent.
codes_50 = counts.most_common()

In [14]:
# Names of the Y most frequent codes, most frequent first.
# Derived directly from `counts` rather than from the previous value of
# `codes_50`: the old form rebound `codes_50` from itself, so running the cell a
# second time silently replaced every code with its first character.
codes_50 = [code for code, _ in
            sorted(counts.items(), key=operator.itemgetter(1), reverse=True)[:Y]]

In [15]:
codes_50

['401.9',
 '38.93',
 '428.0',
 '427.31',
 '414.01',
 '96.04',
 '96.6',
 '584.9',
 '250.00',
 '96.71',
 '272.4',
 '518.81',
 '99.04',
 '39.61',
 '599.0',
 '530.81',
 '96.72',
 '272.0',
 '285.9',
 '88.56',
 '244.9',
 '486',
 '38.91',
 '285.1',
 '36.15',
 '276.2',
 '496',
 '99.15',
 '995.92',
 'V58.61',
 '507.0',
 '038.9',
 '88.72',
 '585.9',
 '403.90',
 '311',
 '305.1',
 '37.22',
 '412',
 '33.24',
 '39.95',
 '287.5',
 '410.71',
 '276.1',
 'V45.81',
 '424.0',
 '45.13',
 'V15.82',
 '511.9',
 '37.23']

In [16]:
# Write the selected codes, one per row, to TOP_<Y>_CODES.csv.
# newline='' is the mode the csv module asks for on files it writes, so that the
# platform's newline translation does not alter the row terminators.
with open('%s/TOP_%s_CODES.csv' % (MIMIC_3_DIR, str(Y)), 'w', newline='') as of:
    w = csv.writer(of)
    w.writerows([code] for code in codes_50)

In [17]:
# For each split, copy the rows of notes_labeled.csv whose HADM_ID is listed in
# <split>_50_hadm_ids.csv, reducing the label field to the selected top codes.
# Rows left without any top code are dropped.
top_codes = set(codes_50)  # built once; membership tests happen on every row
for splt in ['train', 'dev', 'test']:
    print(splt)
    with open('%s/%s_50_hadm_ids.csv' % (MIMIC_3_DIR, splt), 'r') as f:
        hadm_ids = {line.rstrip() for line in f}
    # newline='' is the mode the csv module asks for on files it reads or writes.
    with open('%s/notes_labeled.csv' % MIMIC_3_DIR, 'r', newline='') as f, \
            open('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), 'w', newline='') as of:
        r = csv.reader(f)
        w = csv.writer(of)
        # header is copied through unchanged
        w.writerow(next(r))
        for row in r:
            hadm_id = row[1]
            if hadm_id not in hadm_ids:
                continue
            # De-duplicate while keeping the order in which the codes appear in
            # the row. Joining a set instead would make the label order depend
            # on string hashing, which changes from one interpreter run to the next.
            filtered_codes = [code for code in dict.fromkeys(row[3].split(';'))
                              if code in top_codes]
            if filtered_codes:
                w.writerow(row[:3] + [';'.join(filtered_codes)])

train
dev
test


In [18]:
# Re-order each top-Y split file in place by note length (whitespace token count),
# adding that count as a `length` column.
for splt in ('train', 'dev', 'test'):
    filename = '%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y))
    df = pd.read_csv(filename)
    df['length'] = df['TEXT'].map(lambda text: len(str(text).split()))
    df = df.sort_values(['length'])
    # Same path expression as `filename`: the sorted rows replace the input file.
    df.to_csv('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), index=False)

## Done!