# MIMIC-III EDA
<p>Notebook used to develop features for inclusion in the magritte pipeline.
    Dataset location (PhysioNet credentials required):
    wget -r -N -c -np --user [username] --ask-password https://physionet.org/files/mimiciii/1.4/
</p>

# Initialize environment

When doing magritte dev work, I am currently using two repos:
- magritte (for core code changes)
- magritte scratch (for scratch notebooks for development work)

The configs below assume that both repos are checked out in parallel (sibling) directories.
    

In [1]:
import sys
import os

# Directory layout: the '../../magritte' hop below means the notebook's working
# directory sits one level below the scratch repo root (e.g. <scratch>/notebooks),
# with the magritte repo checked out next to the scratch repo.
WORKING_DIR = os.getcwd()
MAGRITTE_DIR = f'{WORKING_DIR}/../../magritte'
UTILITIES_DIR = f'{MAGRITTE_DIR}/utilities'

# MIMIC-III working data. Set the MIMIC_DATA_DIR environment variable to point
# at a different location without editing the notebook.
DATA_DIR = os.environ.get('MIMIC_DATA_DIR', f'{MAGRITTE_DIR}/../data/mimiciii/working')

# Add UTILITIES_DIR to the import path so the helper modules there can be imported.
# Guarded so that re-running this cell does not append duplicate entries.
if UTILITIES_DIR not in sys.path:
    sys.path.append(UTILITIES_DIR)

In [2]:
import pandas as pd
import DataUtils
import pickle

# Load Data (MIMIC-III Dataset)

In [3]:
%%time
# Load three tables from MIMIC-III:
# 1) DIAGNOSES_ICD.csv.gz
# 2) NOTEEVENTS.csv.gz
# 3) D_ICD_DIAGNOSES.csv.gz

# ICD-9 codes are identifiers, not numbers. Force them to str so zero-padded
# codes keep their leading zeros and pandas never infers a numeric or mixed
# dtype for the column (which would also break the .str lookups further down).
ICD_DTYPES = {'ICD9_CODE': str}

diagnosisICD_DF = pd.read_csv(f'{DATA_DIR}/DIAGNOSES_ICD.csv.gz',
                              dtype=ICD_DTYPES,
                              compression='gzip'
                             )

# Timestamp columns of NOTEEVENTS that should be parsed as datetimes.
DATE_COLS = ['CHARTTIME', 'STORETIME', 'CHARTDATE']
notesDF = pd.read_csv(f'{DATA_DIR}/NOTEEVENTS.csv.gz',
                      parse_dates=DATE_COLS,
                      compression='gzip'
                     )

D_ICDDiagnosis_DF = pd.read_csv(f'{DATA_DIR}/D_ICD_DIAGNOSES.csv.gz',
                                dtype=ICD_DTYPES,
                                compression='gzip'
                               )

FileNotFoundError: [Errno 2] No such file or directory: '/home/magni/ML_Root/project_root/mimic-iii_LL/notebooks/../../magritte/../data/mimiciii/working/DIAGNOSES_ICD.csv.gz'

## DIAGNOSES_ICD.csv.gz summary

In [4]:
# Summarize the raw DIAGNOSES_ICD table with the project's DataUtils helper
# (helper lives outside this notebook; its exact output is not visible here).
DataUtils.exploreDataframe(diagnosisICD_DF)

NameError: name 'diagnosisICD_DF' is not defined

In [None]:
# Inspect the distinct ICD9_CODE values in DIAGNOSES_ICD
# (presumably a uniqueness / value-count summary — confirm in DataUtils.showUniqueColVals).
DataUtils.showUniqueColVals(diagnosisICD_DF, 'ICD9_CODE')

## NOTEEVENTS.csv.gz summary

In [None]:
# Summarize the raw NOTEEVENTS table with the project's DataUtils helper
# (helper lives outside this notebook; its exact output is not visible here).
DataUtils.exploreDataframe(notesDF)

In [None]:
# Inspect the distinct HADM_ID values in NOTEEVENTS
# (presumably a uniqueness / value-count summary — confirm in DataUtils.showUniqueColVals).
DataUtils.showUniqueColVals(notesDF, 'HADM_ID')

In [None]:
# Inspect the distinct note CATEGORY values.
# NOTE(review): showRecords=15 presumably controls how many values are listed — confirm in DataUtils.
DataUtils.showUniqueColVals(notesDF, 'CATEGORY', showRecords=15)

## D_ICD_DIAGNOSES.csv.gz summary

In [None]:
# Summarize the D_ICD_DIAGNOSES lookup table with the project's DataUtils helper
# (helper lives outside this notebook; its exact output is not visible here).
DataUtils.exploreDataframe(D_ICDDiagnosis_DF)

In [None]:
# Inspect the distinct ICD9_CODE values in the D_ICD_DIAGNOSES lookup table
# (presumably a uniqueness / value-count summary — confirm in DataUtils.showUniqueColVals).
DataUtils.showUniqueColVals(D_ICDDiagnosis_DF, 'ICD9_CODE')

# NOTEEVENTS.csv.gz explore, prep, and clean

## Explore

In [None]:
# Re-inspect NOTEEVENTS before cleaning.
# NOTE(review): showRecords=1 presumably limits the preview to a single record — confirm in DataUtils.
DataUtils.exploreDataframe(notesDF, showRecords=1)

In [None]:
# Number of notes per (HADM_ID, CATEGORY) pair, as a flat frame with a
# 'record_count' column.
sumDF = (
    notesDF
    .groupby(['HADM_ID', 'CATEGORY'])
    .size()
    .reset_index(name='record_count')
)
sumDF.head(10)

In [None]:
# Distribution of "Discharge summary" counts per HADM_ID.
# Reading a row: record_count=5, sum_by_record_count=30 means 30 HADM_IDs
# each have exactly 5 "Discharge summary" notes.
sumDF2 = (
    sumDF.loc[sumDF['CATEGORY'] == 'Discharge summary']
    .groupby(['CATEGORY', 'record_count'])
    .size()
    .reset_index(name='sum_by_record_count')
)
sumDF2.head(10)

# Discharge summary may not provide helpful support for this experiment

## Clean and trim

In [None]:
# Only the admission id and the note text are needed from here on.
notesDF_working = notesDF.loc[:, ['HADM_ID', 'TEXT']]
DataUtils.exploreDataframe(notesDF_working, showRecords=1)

In [None]:
# Drop nulls for HADM_ID
# NOTE(review): dropNullRows is called on the whole frame with no column argument,
# so it presumably also drops rows with a null TEXT — confirm in DataUtils.dropNullRows.
notesDF_working = DataUtils.dropNullRows(notesDF_working)

In [None]:
# Check how HADM_ID values are distributed after the null rows were removed
# (several notes can still share one HADM_ID at this point).
DataUtils.showUniqueColVals(dataFrame=notesDF_working, colName='HADM_ID')

In [None]:
# Collapse all notes belonging to one admission into a single space-separated
# string, giving exactly one row per HADM_ID
# (showUniqueColVals should then report 100% unique).
notesDF_compressed = (
    notesDF_working
    .groupby('HADM_ID')['TEXT']
    .agg(lambda texts: ' '.join(texts))
    .reset_index()
)

DataUtils.showUniqueColVals(dataFrame=notesDF_compressed, colName='HADM_ID')

# DIAGNOSES_ICD.csv.gz explore, prep, clean


In [None]:
# Re-inspect the raw DIAGNOSES_ICD table before filtering it
# (DataUtils helper; its exact output is not visible in this notebook).
DataUtils.exploreDataframe(diagnosisICD_DF)

In [None]:
# Keep only SEQ_NUM == 1 rows: sequence 1 holds the "primary" diagnosis
# for the patient's admission.
diagnosisICD_DF_working = diagnosisICD_DF.loc[diagnosisICD_DF['SEQ_NUM'].eq(1.0)]
DataUtils.showUniqueColVals(diagnosisICD_DF_working, 'SEQ_NUM', showRecords=10)

In [None]:
# Check HADM_ID uniqueness after the SEQ_NUM == 1 filter
# (presumably each admission should now appear once — verify in the output).
DataUtils.showUniqueColVals(diagnosisICD_DF_working, 'HADM_ID', showRecords=1)

In [None]:
# Trim to the join key and the label, then drop rows with null values
# (author's note: 47 nulls in each of SEQ_NUM and ICD9_CODE).
diagnosisICD_DF_working = DataUtils.dropNullRows(
    diagnosisICD_DF_working.loc[:, ['HADM_ID', 'ICD9_CODE']]
)

# Merge datasets for two outputs
- Output 1: All notes remain separate, each paired with the ICD9 code from SEQ_NUM 1.
- Output 2: Notes are combined together. One set of notes for each HADM_ID.

## Output 1: All notes remain separate but with the ICD9 code from SEQ_NUM 1

In [None]:
# Inner-join every individual note to the SEQ_NUM 1 ICD9 code of its admission.
mimic3_notes_separateDF = notesDF_working.merge(
    diagnosisICD_DF_working,
    how='inner',
    on='HADM_ID',
)
DataUtils.exploreDataframe(mimic3_notes_separateDF, showRecords=2)

In [None]:
# HADM_ID was only needed as the join key, so drop it.
# Reassign instead of using inplace=True, and ignore an already-missing column,
# so that re-running this cell does not raise KeyError.
mimic3_notes_separateDF = mimic3_notes_separateDF.drop(columns=['HADM_ID'], errors='ignore')
DataUtils.exploreDataframe(mimic3_notes_separateDF, showRecords=2)

## Output 2: Notes are combined together. One set of notes for each HADM_ID.

In [None]:
# Inner-join the one-row-per-admission notes to the SEQ_NUM 1 ICD9 code.
mimic3_notes_combinedDF = notesDF_compressed.merge(
    diagnosisICD_DF_working,
    how='inner',
    on='HADM_ID',
)
DataUtils.exploreDataframe(mimic3_notes_combinedDF, showRecords=2)

In [None]:
# HADM_ID was only needed as the join key, so drop it.
# Reassign instead of using inplace=True, and ignore an already-missing column,
# so that re-running this cell does not raise KeyError.
mimic3_notes_combinedDF = mimic3_notes_combinedDF.drop(columns=['HADM_ID'], errors='ignore')
DataUtils.exploreDataframe(mimic3_notes_combinedDF, showRecords=2)

# Get top ICD9 codes for filtering

In [None]:
# showUniqueColVals returns a 2-tuple, unpacked here as the "top" and "bottom" ICD9 codes;
# topICD_codes is the allow-list applied to both output datasets further down.
# NOTE(review): showRecords=5 presumably sets how many top codes are returned — confirm in DataUtils.
# NOTE(review): the ranking is computed on the per-note frame, so codes are presumably
# weighted by number of notes rather than number of admissions — confirm this is intended.
topICD_codes, bottomICD_codesList = DataUtils.showUniqueColVals(mimic3_notes_separateDF, colName='ICD9_CODE', showRecords=5)

# Filter and persist the datasets

## Output 1: All notes remain separate but with the ICD9 code from SEQ_NUM 1

In [None]:
# Keep only the notes whose ICD9_CODE is in the selected top codes.
mimic_separate_DF = mimic3_notes_separateDF.loc[
    mimic3_notes_separateDF['ICD9_CODE'].isin(topICD_codes)
]
_, _ = DataUtils.showUniqueColVals(mimic_separate_DF, 'ICD9_CODE')


In [None]:
# Persist the filtered per-note dataset under DATA_DIR as a pickle; the '.gz' suffix
# lets pandas infer gzip compression (to_pickle defaults to compression='infer').
mimic_separate_DF.to_pickle(f'{DATA_DIR}/notes_separate.pkl.gz')

## Output 2: Notes are combined together. One set of notes for each HADM_ID.

In [None]:
# Keep only the admissions whose ICD9_CODE is in the selected top codes.
mimic_combined_DF = mimic3_notes_combinedDF.loc[
    mimic3_notes_combinedDF['ICD9_CODE'].isin(topICD_codes)
]
_, _ = DataUtils.showUniqueColVals(mimic_combined_DF, 'ICD9_CODE')


In [None]:
# Persist the filtered one-row-per-admission dataset under DATA_DIR as a pickle; the '.gz'
# suffix lets pandas infer gzip compression (to_pickle defaults to compression='infer').
mimic_combined_DF.to_pickle(f'{DATA_DIR}/notes_combined.pkl.gz')

In [None]:
# Exact-match lookup of code 414 in the ICD-9 dictionary, also trying
# zero-padded spellings of the code.
is_414 = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['ICD9_CODE'].isin(['00414', '0414', '414'])
]
display(is_414)

In [None]:
# Substring search: every dictionary entry whose ICD9 code contains "414".
# na=False treats missing/non-string codes as "no match"; without it the mask
# would contain NaN and .loc would raise.
filteredICD414 = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['ICD9_CODE'].str.contains("414", case=False, na=False)
]
display(filteredICD414)

In [None]:
# Case-insensitive search of the long titles for "ischemic".
# na=False treats missing titles as "no match"; without it the mask would
# contain NaN and .loc would raise.
filteredICD414Word = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['LONG_TITLE'].str.contains("ischemic", case=False, na=False)
]
display(filteredICD414Word)

In [None]:
# Exact-match lookup of code 038 in the ICD-9 dictionary, trying several
# zero-padded / unpadded spellings of the code.
is_038 = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['ICD9_CODE'].isin(['0038', '038', '38'])
]
display(is_038)

In [None]:
# Substring search: every dictionary entry whose ICD9 code contains "38".
# na=False treats missing/non-string codes as "no match"; without it the mask
# would contain NaN and .loc would raise.
filteredICD038 = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['ICD9_CODE'].str.contains("38", case=False, na=False)
]
display(filteredICD038)

In [None]:
# Case-insensitive search of the long titles for "septicemia".
# na=False treats missing titles as "no match"; without it the mask would
# contain NaN and .loc would raise.
filteredICD038Word = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['LONG_TITLE'].str.contains("septicemia", case=False, na=False)
]
display(filteredICD038Word)

In [None]:
# Exact-match lookup of code 410 in the ICD-9 dictionary, also trying
# zero-padded spellings of the code.
is_410 = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['ICD9_CODE'].isin(['00410', '0410', '410'])
]
display(is_410)

In [None]:
# Substring search: every dictionary entry whose ICD9 code contains "410".
# na=False treats missing/non-string codes as "no match"; without it the mask
# would contain NaN and .loc would raise.
filteredICD410 = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['ICD9_CODE'].str.contains("410", case=False, na=False)
]
display(filteredICD410)

In [None]:
# Case-insensitive search of the long titles for "myocardial".
# na=False treats missing titles as "no match"; without it the mask would
# contain NaN and .loc would raise.
filteredICD410Word = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['LONG_TITLE'].str.contains("myocardial", case=False, na=False)
]
display(filteredICD410Word)

In [None]:
# Exact-match lookup of code 424 in the ICD-9 dictionary, also trying
# zero-padded spellings of the code.
is_424 = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['ICD9_CODE'].isin(['00424', '0424', '424'])
]
display(is_424)

In [None]:
# Substring search: every dictionary entry whose ICD9 code contains "424".
# na=False treats missing/non-string codes as "no match"; without it the mask
# would contain NaN and .loc would raise.
filteredICD424 = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['ICD9_CODE'].str.contains("424", case=False, na=False)
]
display(filteredICD424)

In [None]:
# Case-insensitive search of the long titles for "endocardium".
# na=False treats missing titles as "no match"; without it the mask would
# contain NaN and .loc would raise.
filteredICD424Word = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['LONG_TITLE'].str.contains("endocardium", case=False, na=False)
]
display(filteredICD424Word)