# MIMIC-III EDA
<p>Notebook for exploring MIMIC-III and developing features to include in the magritte pipeline.
    Dataset location (requires PhysioNet credentials); download with:
    wget -r -N -c -np --user [username] --ask-password https://physionet.org/files/mimiciii/1.4/
</p>

# Initialize environment

When doing magritte dev work, I am currently using two repos:
- magritte (for core code changes)
- magritte scratch (for scratch notebooks for development work)

The configuration below assumes both repos are checked out in sibling (parallel) directories.
    

In [1]:
import sys
import os

# Directory layout: this notebook is assumed to live in the root of the
# magritte scratch repo, with the magritte repo and the data directory
# reachable two levels up.
WORKING_DIR = os.getcwd()
MAGRITTE_DIR = f'{WORKING_DIR}/../../magritte'
UTILITIES_DIR = f'{MAGRITTE_DIR}/utilities'

DATA_DIR = f'{WORKING_DIR}/../../data/mimiciii'

# Add UTILITIES_DIR to the import path so the utility modules can be imported.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if UTILITIES_DIR not in sys.path:
    sys.path.append(UTILITIES_DIR)

In [2]:
import pandas as pd
import DataUtils
import pickle

# Load Data (MIMIC-III Dataset)

In [3]:
%%time
# Loading three tables from MIMIC-III
# 1) DIAGNOSES_ICD.csv.gz
# 2) NOTEEVENTS.csv.gz
# 3) D_ICD_DIAGNOSES.csv.gz

diagnosisICD_DF = pd.read_csv(f'{DATA_DIR}/DIAGNOSES_ICD.csv.gz',
                              #dtype = {'ROW_ID': int, 'SUBJECT_ID':int, 'HADM_ID':int, 'SEQ_NUM':float, 'ICD9_CODE': str},
                              compression='gzip'
                             )


DATE_COLS=['CHARTTIME','STORETIME', 'CHARTDATE']
notesDF = pd.read_csv(f'{DATA_DIR}/NOTEEVENTS.csv.gz',
                      parse_dates=DATE_COLS,
                      compression='gzip'
                     )

D_ICDDiagnosis_DF = pd.read_csv(f'{DATA_DIR}/D_ICD_DIAGNOSES.csv.gz', compression='gzip')

CPU times: user 30.8 s, sys: 1.31 s, total: 32.2 s
Wall time: 33.1 s


## DIAGNOSES_ICD.csv.gz summary

In [4]:
# Print shape, info(), first/last rows and per-column null counts for the diagnoses table.
DataUtils.exploreDataframe(diagnosisICD_DF)

dataframe shape: (651047, 5)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651047 entries, 0 to 651046
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ROW_ID      651047 non-null  int64  
 1   SUBJECT_ID  651047 non-null  int64  
 2   HADM_ID     651047 non-null  int64  
 3   SEQ_NUM     651000 non-null  float64
 4   ICD9_CODE   651000 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 24.8+ MB
None

First 5 in dataframe


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254



Last 5 in dataframe


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
651042,639798,97503,188195,2.0,20280
651043,639799,97503,188195,3.0,V5869
651044,639800,97503,188195,4.0,V1279
651045,639801,97503,188195,5.0,5275
651046,639802,97503,188195,6.0,5569



Null value count by column:


ROW_ID         0
SUBJECT_ID     0
HADM_ID        0
SEQ_NUM       47
ICD9_CODE     47
dtype: int64

In [5]:
# Summarise ICD9_CODE across all diagnosis rows: unique/null counts and the most and
# least frequent codes. The call returns a (top, bottom) pair of lists, echoed below.
DataUtils.showUniqueColVals(diagnosisICD_DF, 'ICD9_CODE')

Data type of column [ICD9_CODE] is: object
Total number of rows: 651047
Unique values in column: 6985 [percent unique: 1.0999999999999999%]
Null values in column: 47
List of unique values:
['40301' '486' '58281' ... 'E0070' '6940' '20930']

Top 5 records by frequency for ICD9_CODE
     ICD9_CODE  record_count
1962      4019         20703
2109      4280         13111
2098     42731         12891
2019     41401         12429
2957      5849          9119

Bottom 5 records by frequency for ICD9_CODE
     ICD9_CODE  record_count
6983     V9103             1
6854      V562             1
1336      3060             1
202      07953             1
1338      3062             1


(['4019', '4280', '42731', '41401', '5849'],
 ['V9103', 'V562', '3060', '07953', '3062'])

## NOTEEVENTS.csv.gz summary

In [6]:
# Print shape, info(), first/last rows and per-column null counts for the notes table.
DataUtils.exploreDataframe(notesDF)

dataframe shape: (2083180, 11)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2083180 entries, 0 to 2083179
Data columns (total 11 columns):
 #   Column       Dtype         
---  ------       -----         
 0   ROW_ID       int64         
 1   SUBJECT_ID   int64         
 2   HADM_ID      float64       
 3   CHARTDATE    datetime64[ns]
 4   CHARTTIME    datetime64[ns]
 5   STORETIME    datetime64[ns]
 6   CATEGORY     object        
 7   DESCRIPTION  object        
 8   CGID         float64       
 9   ISERROR      float64       
 10  TEXT         object        
dtypes: datetime64[ns](3), float64(3), int64(2), object(3)
memory usage: 174.8+ MB
None

First 5 in dataframe


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,NaT,NaT,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,NaT,NaT,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,NaT,NaT,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,NaT,NaT,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,NaT,NaT,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...



Last 5 in dataframe


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
2083175,2070657,31097,115637.0,2132-01-21,2132-01-21 03:27:00,2132-01-21 03:38:00,Nursing/other,Report,17581.0,,NPN\n\n\n#1 Infant remains in RA with O2 sats...
2083176,2070658,31097,115637.0,2132-01-21,2132-01-21 09:50:00,2132-01-21 09:53:00,Nursing/other,Report,19211.0,,"Neonatology\nDOL #5, CGA 36 weeks.\n\nCVR: Con..."
2083177,2070659,31097,115637.0,2132-01-21,2132-01-21 16:42:00,2132-01-21 16:44:00,Nursing/other,Report,20104.0,,Family Meeting Note\nFamily meeting held with ...
2083178,2070660,31097,115637.0,2132-01-21,2132-01-21 18:05:00,2132-01-21 18:16:00,Nursing/other,Report,16023.0,,NPN 1800\n\n\n#1 Resp: [**Known lastname 2243*...
2083179,2070661,31097,115637.0,2132-01-21,2132-01-21 18:05:00,2132-01-21 18:31:00,Nursing/other,Report,16023.0,,NPN 1800\nNursing Addendum:\n[**Known lastname...



Null value count by column:


ROW_ID               0
SUBJECT_ID           0
HADM_ID         231836
CHARTDATE            0
CHARTTIME       316566
STORETIME       836776
CATEGORY             0
DESCRIPTION          0
CGID            836776
ISERROR        2082294
TEXT                 0
dtype: int64

In [7]:
# How many distinct admissions have notes, and how many notes the busiest admissions have.
# HADM_ID was loaded as float64 here because some note rows have no admission id (nulls).
DataUtils.showUniqueColVals(notesDF, 'HADM_ID')

Data type of column [HADM_ID] is: float64
Total number of rows: 2083180
Unique values in column: 58362 [percent unique: 2.8000000000000003%]
Null values in column: 231836
List of unique values:
[167853. 107527. 167118. ... 132855. 115098. 175166.]

Top 5 records by frequency for HADM_ID
        HADM_ID  record_count
10266  117448.0          1233
23848  140792.0          1144
22421  138363.0          1099
44071  175448.0          1084
53982  192431.0          1073

Bottom 5 records by frequency for HADM_ID
        HADM_ID  record_count
29180  149973.0             1
56293  196399.0             1
38232  165601.0             1
38166  165500.0             1
38108  165375.0             1


([117448.0, 140792.0, 138363.0, 175448.0, 192431.0],
 [149973.0, 196399.0, 165601.0, 165500.0, 165375.0])

In [8]:
# List every note CATEGORY with its frequency (showRecords=15 covers all 15 categories).
# Some category labels carry a trailing space (e.g. 'Physician '), so exact-match
# filters on CATEGORY must include it.
# The trailing ';' suppresses the echo of the returned (top, bottom) lists, which
# only repeats the printed tables.
DataUtils.showUniqueColVals(notesDF, 'CATEGORY', showRecords=15);

Data type of column [CATEGORY] is: object
Total number of rows: 2083180
Unique values in column: 15 [percent unique: 0.0%]
Null values in column: 0
List of unique values:
['Discharge summary' 'Echo' 'ECG' 'Nursing' 'Physician ' 'Rehab Services'
 'Case Management ' 'Respiratory ' 'Nutrition' 'General' 'Social Work'
 'Pharmacy' 'Consult' 'Radiology' 'Nursing/other']

Top 15 records by frequency for CATEGORY
             CATEGORY  record_count
7       Nursing/other        822497
11          Radiology        522279
6             Nursing        223556
3                 ECG        209051
10         Physician         141624
2   Discharge summary         59652
4                Echo         45794
13       Respiratory          31739
8           Nutrition          9418
5             General          8301
12     Rehab Services          5431
14        Social Work          2670
0    Case Management            967
9            Pharmacy           103
1             Consult            98

Bottom 15 reco

(['Nursing/other',
  'Radiology',
  'Nursing',
  'ECG',
  'Physician ',
  'Discharge summary',
  'Echo',
  'Respiratory ',
  'Nutrition',
  'General',
  'Rehab Services',
  'Social Work',
  'Case Management ',
  'Pharmacy',
  'Consult'],
 ['Consult',
  'Pharmacy',
  'Case Management ',
  'Social Work',
  'Rehab Services',
  'General',
  'Nutrition',
  'Respiratory ',
  'Echo',
  'Discharge summary',
  'Physician ',
  'ECG',
  'Nursing',
  'Radiology',
  'Nursing/other'])

## D_ICD_DIAGNOSES.csv.gz summary

In [9]:
# Print shape, info(), first/last rows and per-column null counts for the ICD9 title lookup table.
DataUtils.exploreDataframe(D_ICDDiagnosis_DF)

dataframe shape: (14567, 4)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14567 entries, 0 to 14566
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ROW_ID       14567 non-null  int64 
 1   ICD9_CODE    14567 non-null  object
 2   SHORT_TITLE  14567 non-null  object
 3   LONG_TITLE   14567 non-null  object
dtypes: int64(1), object(3)
memory usage: 455.3+ KB
None

First 5 in dataframe


Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."



Last 5 in dataframe


Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
14562,14432,V7399,Scrn unspcf viral dis,Special screening examination for unspecified ...
14563,14433,V740,Screening for cholera,Screening examination for cholera
14564,14434,V741,Screening-pulmonary TB,Screening examination for pulmonary tuberculosis
14565,14435,V742,Screening for leprosy,Screening examination for leprosy (Hansen's di...
14566,14436,V743,Screening for diphtheria,Screening examination for diphtheria



Null value count by column:


ROW_ID         0
ICD9_CODE      0
SHORT_TITLE    0
LONG_TITLE     0
dtype: int64

In [10]:
# Check that ICD9_CODE is a unique key in the lookup table (expect 100% unique).
DataUtils.showUniqueColVals(D_ICDDiagnosis_DF, 'ICD9_CODE')

Data type of column [ICD9_CODE] is: object
Total number of rows: 14567
Unique values in column: 14567 [percent unique: 100.0%]
Null values in column: 0
List of unique values:
['01166' '01170' '01171' ... 'V741' 'V742' 'V743']

Top 5 records by frequency for ICD9_CODE
     ICD9_CODE  record_count
0         0010             1
9730     80186             1
9704     80154             1
9705     80155             1
9706     80156             1

Bottom 5 records by frequency for ICD9_CODE
   ICD9_CODE  record_count
0       0010             1
28      0054             1
27      0053             1
13     00323             1
2       0019             1


(['0010', '80186', '80154', '80155', '80156'],
 ['0010', '0054', '0053', '00323', '0019'])

# NOTEEVENTS.csv.gz explore, prep, and clean

## Explore

In [11]:
# Re-display the notes table summary as the starting point for cleaning;
# showRecords=1 limits the head/tail preview to a single row each.
DataUtils.exploreDataframe(notesDF, showRecords=1)

dataframe shape: (2083180, 11)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2083180 entries, 0 to 2083179
Data columns (total 11 columns):
 #   Column       Dtype         
---  ------       -----         
 0   ROW_ID       int64         
 1   SUBJECT_ID   int64         
 2   HADM_ID      float64       
 3   CHARTDATE    datetime64[ns]
 4   CHARTTIME    datetime64[ns]
 5   STORETIME    datetime64[ns]
 6   CATEGORY     object        
 7   DESCRIPTION  object        
 8   CGID         float64       
 9   ISERROR      float64       
 10  TEXT         object        
dtypes: datetime64[ns](3), float64(3), int64(2), object(3)
memory usage: 174.8+ MB
None

First 1 in dataframe


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,NaT,NaT,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...



Last 1 in dataframe


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
2083179,2070661,31097,115637.0,2132-01-21,2132-01-21 18:05:00,2132-01-21 18:31:00,Nursing/other,Report,16023.0,,NPN 1800\nNursing Addendum:\n[**Known lastname...



Null value count by column:


ROW_ID               0
SUBJECT_ID           0
HADM_ID         231836
CHARTDATE            0
CHARTTIME       316566
STORETIME       836776
CATEGORY             0
DESCRIPTION          0
CGID            836776
ISERROR        2082294
TEXT                 0
dtype: int64

In [12]:
# Count notes per (HADM_ID, CATEGORY) pair; one row per pair, with the count
# in 'record_count'. Rows with a null HADM_ID fall out of the groupby.
sumDF = (
    notesDF
    .groupby(['HADM_ID','CATEGORY'])
    .size()
    .reset_index(name='record_count')
)
sumDF.head(10)

Unnamed: 0,HADM_ID,CATEGORY,record_count
0,100001.0,Discharge summary,1
1,100001.0,Radiology,1
2,100003.0,Discharge summary,1
3,100003.0,ECG,1
4,100003.0,Echo,1
5,100003.0,Nursing,12
6,100003.0,Physician,9
7,100003.0,Radiology,2
8,100006.0,Discharge summary,2
9,100006.0,ECG,1


In [13]:
# Distribution of the number of "Discharge summary" notes per HADM_ID.
# Reading the result: record_count=5, sum_by_record_count=30 means that 30 HADM_IDs
# have exactly 5 "Discharge summary" notes.
# Discharge summary may not provide helpful support for this experiment
sumDF2 = (
    sumDF
    .loc[sumDF['CATEGORY'] == 'Discharge summary']
    .groupby(['CATEGORY', 'record_count'])
    .size()
    .reset_index(name='sum_by_record_count')
)
sumDF2.head(10)

Unnamed: 0,CATEGORY,record_count,sum_by_record_count
0,Discharge summary,1,47006
1,Discharge summary,2,4782
2,Discharge summary,3,732
3,Discharge summary,4,161
4,Discharge summary,5,30
5,Discharge summary,6,13
6,Discharge summary,7,2


## Clean and trim

In [14]:
# Keep only the join key (HADM_ID) and the note body (TEXT); every other column is dropped.
# NOTE(review): ISERROR is non-null for 886 rows (2083180 - 2082294 nulls) and those notes are
# kept here - confirm whether notes flagged in ISERROR should be excluded before merging.
notesDF_working = notesDF[['HADM_ID', 'TEXT']]
DataUtils.exploreDataframe(notesDF_working, showRecords=1)

dataframe shape: (2083180, 2)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2083180 entries, 0 to 2083179
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   HADM_ID  float64
 1   TEXT     object 
dtypes: float64(1), object(1)
memory usage: 31.8+ MB
None

First 1 in dataframe


Unnamed: 0,HADM_ID,TEXT
0,167853.0,Admission Date: [**2151-7-16**] Dischar...



Last 1 in dataframe


Unnamed: 0,HADM_ID,TEXT
2083179,115637.0,NPN 1800\nNursing Addendum:\n[**Known lastname...



Null value count by column:


HADM_ID    231836
TEXT            0
dtype: int64

In [15]:
# Drop notes that cannot be tied to an admission. dropNullRows removes rows where any
# column is null; only HADM_ID has nulls here (TEXT has none), and a new frame is returned.
notesDF_working = DataUtils.dropNullRows(notesDF_working)

Dropping rows where any column is null

Original dataFrame shape: (2083180, 2)
Original null value count by column:


HADM_ID    231836
TEXT            0
dtype: int64


*** Rows with nulls meeting criteria have been dropped

New values:
dataframe shape: (1851344, 2)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1851344 entries, 0 to 2083179
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   HADM_ID  float64
 1   TEXT     object 
dtypes: float64(1), object(1)
memory usage: 42.4+ MB
None

First 1 in dataframe


Unnamed: 0,HADM_ID,TEXT
0,167853.0,Admission Date: [**2151-7-16**] Dischar...



Last 1 in dataframe


Unnamed: 0,HADM_ID,TEXT
2083179,115637.0,NPN 1800\nNursing Addendum:\n[**Known lastname...



Null value count by column:


HADM_ID    0
TEXT       0
dtype: int64

New dataframe returned


In [16]:
# After dropping null HADM_IDs: number of distinct admissions and notes per admission.
# Many notes share one HADM_ID, so the column is far from unique at this stage.
DataUtils.showUniqueColVals(dataFrame=notesDF_working, colName='HADM_ID')

Data type of column [HADM_ID] is: float64
Total number of rows: 1851344
Unique values in column: 58361 [percent unique: 3.2%]
Null values in column: 0
List of unique values:
[167853. 107527. 167118. ... 132855. 115098. 175166.]

Top 5 records by frequency for HADM_ID
        HADM_ID  record_count
10266  117448.0          1233
23848  140792.0          1144
22421  138363.0          1099
44071  175448.0          1084
53982  192431.0          1073

Bottom 5 records by frequency for HADM_ID
        HADM_ID  record_count
29180  149973.0             1
56293  196399.0             1
38232  165601.0             1
38166  165500.0             1
38108  165375.0             1


([117448.0, 140792.0, 138363.0, 175448.0, 192431.0],
 [149973.0, 196399.0, 165601.0, 165500.0, 165375.0])

In [17]:
# Collapse all notes of an admission into a single space-joined TEXT value,
# in the row order of notesDF_working.
# Result has exactly one row per HADM_ID (showUniqueColVals should report 100% unique).
notesDF_compressed = (
    notesDF_working
    .groupby('HADM_ID')['TEXT']
    .agg(' '.join)
    .reset_index()
)

DataUtils.showUniqueColVals(dataFrame=notesDF_compressed, colName='HADM_ID')

Data type of column [HADM_ID] is: float64
Total number of rows: 58361
Unique values in column: 58361 [percent unique: 100.0%]
Null values in column: 0
List of unique values:
[100001. 100003. 100006. ... 199995. 199998. 199999.]

Top 5 records by frequency for HADM_ID
        HADM_ID  record_count
0      100001.0             1
38926  166803.0             1
38900  166757.0             1
38901  166758.0             1
38902  166760.0             1

Bottom 5 records by frequency for HADM_ID
     HADM_ID  record_count
0   100001.0             1
28  100040.0             1
27  100039.0             1
13  100021.0             1
2   100006.0             1


([100001.0, 166803.0, 166757.0, 166758.0, 166760.0],
 [100001.0, 100040.0, 100039.0, 100021.0, 100006.0])

# DIAGNOSES_ICD.csv.gz explore, prep, and clean


In [18]:
# Re-display the diagnoses table summary as the starting point for cleaning.
DataUtils.exploreDataframe(diagnosisICD_DF)

dataframe shape: (651047, 5)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651047 entries, 0 to 651046
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ROW_ID      651047 non-null  int64  
 1   SUBJECT_ID  651047 non-null  int64  
 2   HADM_ID     651047 non-null  int64  
 3   SEQ_NUM     651000 non-null  float64
 4   ICD9_CODE   651000 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 24.8+ MB
None

First 5 in dataframe


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254



Last 5 in dataframe


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
651042,639798,97503,188195,2.0,20280
651043,639799,97503,188195,3.0,V5869
651044,639800,97503,188195,4.0,V1279
651045,639801,97503,188195,5.0,5275
651046,639802,97503,188195,6.0,5569



Null value count by column:


ROW_ID         0
SUBJECT_ID     0
HADM_ID        0
SEQ_NUM       47
ICD9_CODE     47
dtype: int64

In [19]:
# Keep only the SEQ_NUM == 1 row of each admission: sequence 1 holds the
# "primary" diagnosis for the patient. Rows with a null SEQ_NUM never equal 1.0,
# so they are excluded by this filter as well.
diagnosisICD_DF_working = diagnosisICD_DF.loc[diagnosisICD_DF['SEQ_NUM'].eq(1.0)]
DataUtils.showUniqueColVals(dataFrame=diagnosisICD_DF_working, colName='SEQ_NUM', showRecords=10)

Data type of column [SEQ_NUM] is: float64
Total number of rows: 58929
Unique values in column: 1 [percent unique: 0.0%]
Null values in column: 0
List of unique values:
[1.]

Top 10 records by frequency for SEQ_NUM
   SEQ_NUM  record_count
0      1.0         58929

Bottom 10 records by frequency for SEQ_NUM
   SEQ_NUM  record_count
0      1.0         58929


([1.0], [1.0])

In [20]:
# Check that each admission has exactly one primary diagnosis row (expect HADM_ID 100% unique);
# the merges below rely on this so that notes are not duplicated.
DataUtils.showUniqueColVals(diagnosisICD_DF_working, 'HADM_ID', showRecords=1)

Data type of column [HADM_ID] is: int64
Total number of rows: 58929
Unique values in column: 58929 [percent unique: 100.0%]
Null values in column: 0
List of unique values:
[172335 173633 174105 ... 189314 168949 188195]

Top 1 records by frequency for HADM_ID
   HADM_ID  record_count
0   100001             1

Bottom 1 records by frequency for HADM_ID
   HADM_ID  record_count
0   100001             1


([100001], [100001])

In [21]:
# Keep only the join key and the label, then drop any rows containing nulls.
# The raw table has 47 nulls in each of SEQ_NUM and ICD9_CODE, but after the SEQ_NUM == 1
# filter none remain in these two columns, so dropNullRows acts as a safety net here
# (shape is unchanged in the output below).
diagnosisICD_DF_working = diagnosisICD_DF_working[['HADM_ID','ICD9_CODE']]
diagnosisICD_DF_working = DataUtils.dropNullRows(diagnosisICD_DF_working)

Dropping rows where any column is null

Original dataFrame shape: (58929, 2)
Original null value count by column:


HADM_ID      0
ICD9_CODE    0
dtype: int64


*** Rows with nulls meeting criteria have been dropped

New values:
dataframe shape: (58929, 2)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58929 entries, 0 to 651041
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   HADM_ID    58929 non-null  int64 
 1   ICD9_CODE  58929 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB
None

First 1 in dataframe


Unnamed: 0,HADM_ID,ICD9_CODE
0,172335,40301



Last 1 in dataframe


Unnamed: 0,HADM_ID,ICD9_CODE
651041,188195,7842



Null value count by column:


HADM_ID      0
ICD9_CODE    0
dtype: int64

New dataframe returned


# Merge datasets for two outputs
- Output 1: All notes remain separate but with the ICD9 code from SEQ_NUM 1
- Output 2: Notes are combined together. One set of notes for each HADM_ID.

## Output 1: All notes remain separate but with the ICD9 code from SEQ_NUM 1

In [22]:
# Attach the primary ICD9 code to every individual note via HADM_ID.
# Inner join: notes whose admission has no primary diagnosis row are dropped.
# validate='many_to_one' makes pandas raise MergeError if HADM_ID is ever not unique
# on the diagnoses side, instead of silently duplicating notes.
mimic3_notes_separateDF = pd.merge(notesDF_working, diagnosisICD_DF_working,
                                   on='HADM_ID', how='inner', validate='many_to_one')
DataUtils.exploreDataframe(mimic3_notes_separateDF, showRecords=2)

dataframe shape: (1851243, 3)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1851243 entries, 0 to 1851242
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   HADM_ID    float64
 1   TEXT       object 
 2   ICD9_CODE  object 
dtypes: float64(1), object(2)
memory usage: 56.5+ MB
None

First 2 in dataframe


Unnamed: 0,HADM_ID,TEXT,ICD9_CODE
0,167853.0,Admission Date: [**2151-7-16**] Dischar...,1193
1,167853.0,Admission Date: [**2151-7-16**] Dischar...,1193



Last 2 in dataframe


Unnamed: 0,HADM_ID,TEXT,ICD9_CODE
1851241,175166.0,Neonatology\nBaby Girl [**Known lastname 1672*...,V3001
1851242,175166.0,NPN NICU\nTerm female admitted to NICU for sep...,V3001



Null value count by column:


HADM_ID      0
TEXT         0
ICD9_CODE    0
dtype: int64

In [23]:
# HADM_ID was only needed as the join key; the output keeps just TEXT and ICD9_CODE.
# Reassigning with errors='ignore' (rather than an inplace drop) keeps this cell
# re-runnable: a second execution is a no-op instead of raising KeyError.
mimic3_notes_separateDF = mimic3_notes_separateDF.drop(columns=['HADM_ID'], errors='ignore')
DataUtils.exploreDataframe(mimic3_notes_separateDF, showRecords=2)

dataframe shape: (1851243, 2)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1851243 entries, 0 to 1851242
Data columns (total 2 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   TEXT       object
 1   ICD9_CODE  object
dtypes: object(2)
memory usage: 42.4+ MB
None

First 2 in dataframe


Unnamed: 0,TEXT,ICD9_CODE
0,Admission Date: [**2151-7-16**] Dischar...,1193
1,Admission Date: [**2151-7-16**] Dischar...,1193



Last 2 in dataframe


Unnamed: 0,TEXT,ICD9_CODE
1851241,Neonatology\nBaby Girl [**Known lastname 1672*...,V3001
1851242,NPN NICU\nTerm female admitted to NICU for sep...,V3001



Null value count by column:


TEXT         0
ICD9_CODE    0
dtype: int64

## Output 2: Notes are combined together. One set of notes for each HADM_ID.

In [24]:
# Attach the primary ICD9 code to the combined (one row per HADM_ID) notes.
# Inner join: admissions without a primary diagnosis row are dropped.
# validate='one_to_one' makes pandas raise MergeError if HADM_ID is duplicated on
# either side, which would otherwise silently multiply rows.
mimic3_notes_combinedDF = pd.merge(notesDF_compressed, diagnosisICD_DF_working,
                                   on='HADM_ID', how='inner', validate='one_to_one')
DataUtils.exploreDataframe(mimic3_notes_combinedDF, showRecords=2)

dataframe shape: (58328, 3)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58328 entries, 0 to 58327
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   HADM_ID    58328 non-null  float64
 1   TEXT       58328 non-null  object 
 2   ICD9_CODE  58328 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.8+ MB
None

First 2 in dataframe


Unnamed: 0,HADM_ID,TEXT,ICD9_CODE
0,100001.0,Admission Date: [**2117-9-11**] ...,25013
1,100003.0,Admission Date: [**2150-4-17**] ...,53100



Last 2 in dataframe


Unnamed: 0,HADM_ID,TEXT,ICD9_CODE
58326,199998.0,Admission Date: [**2119-2-18**] ...,41401
58327,199999.0,Admission Date: [**2136-4-4**] D...,48284



Null value count by column:


HADM_ID      0
TEXT         0
ICD9_CODE    0
dtype: int64

In [25]:
# HADM_ID was only needed as the join key; the output keeps just TEXT and ICD9_CODE.
# Reassigning with errors='ignore' (rather than an inplace drop) keeps this cell
# re-runnable: a second execution is a no-op instead of raising KeyError.
mimic3_notes_combinedDF = mimic3_notes_combinedDF.drop(columns=['HADM_ID'], errors='ignore')
DataUtils.exploreDataframe(mimic3_notes_combinedDF, showRecords=2)

dataframe shape: (58328, 2)

dataframe info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58328 entries, 0 to 58327
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   TEXT       58328 non-null  object
 1   ICD9_CODE  58328 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB
None

First 2 in dataframe


Unnamed: 0,TEXT,ICD9_CODE
0,Admission Date: [**2117-9-11**] ...,25013
1,Admission Date: [**2150-4-17**] ...,53100



Last 2 in dataframe


Unnamed: 0,TEXT,ICD9_CODE
58326,Admission Date: [**2119-2-18**] ...,41401
58327,Admission Date: [**2136-4-4**] D...,48284



Null value count by column:


TEXT         0
ICD9_CODE    0
dtype: int64

# Get top ICD9 codes for filtering

In [26]:
# Rank ICD9 codes by number of note rows in the per-note dataset and keep the top 5.
# topICD_codes is the filter applied to both outputs below. Because the ranking is by
# note count, it differs from a ranking by admission count (compare the combined
# dataset frequencies further down).
topICD_codes, bottomICD_codesList = DataUtils.showUniqueColVals(mimic3_notes_separateDF, colName='ICD9_CODE', showRecords=5)

Data type of column [ICD9_CODE] is: object
Total number of rows: 1851243
Unique values in column: 2769 [percent unique: 0.1%]
Null values in column: 0
List of unique values:
['01193' '5191' '51884' ... 'V502' 'V292' '7765']

Top 5 records by frequency for ICD9_CODE
     ICD9_CODE  record_count
2732     V3001        157174
2735     V3101        109767
2731     V3000         87594
47        0389         76775
833      41401         64187

Bottom 5 records by frequency for ICD9_CODE
     ICD9_CODE  record_count
2513      9471             1
2221      8364             1
2413      8749             1
131       1430             1
235       1723             1


# Filter and persist the datasets

## Output 1: All notes remain separate but with the ICD9 code from SEQ_NUM 1

In [27]:
# Restrict the per-note dataset to notes labelled with one of the top ICD9 codes,
# then print the resulting label distribution (returned lists are discarded).
mimic_separate_DF = mimic3_notes_separateDF.loc[
    lambda frame: frame['ICD9_CODE'].isin(topICD_codes)
]
_, _ = DataUtils.showUniqueColVals(dataFrame=mimic_separate_DF, colName='ICD9_CODE')


Data type of column [ICD9_CODE] is: object
Total number of rows: 495497
Unique values in column: 5 [percent unique: 0.0%]
Null values in column: 0
List of unique values:
['41401' '0389' 'V3000' 'V3101' 'V3001']

Top 5 records by frequency for ICD9_CODE
  ICD9_CODE  record_count
3     V3001        157174
4     V3101        109767
2     V3000         87594
0      0389         76775
1     41401         64187

Bottom 5 records by frequency for ICD9_CODE
  ICD9_CODE  record_count
1     41401         64187
0      0389         76775
2     V3000         87594
4     V3101        109767
3     V3001        157174


In [28]:
# Persist the filtered per-note dataset as a gzip-compressed pickle (compression is
# inferred from the .gz suffix). Create the target directory first so the write does
# not fail on a fresh data checkout.
os.makedirs(f'{DATA_DIR}/working', exist_ok=True)
mimic_separate_DF.to_pickle(f'{DATA_DIR}/working/notes_separate.pkl.gz')

## Output 2: Notes are combined together. One set of notes for each HADM_ID.

In [29]:
# Restrict the per-admission dataset to admissions labelled with one of the top ICD9
# codes, then print the resulting label distribution (returned lists are discarded).
mimic_combined_DF = mimic3_notes_combinedDF.loc[
    lambda frame: frame['ICD9_CODE'].isin(topICD_codes)
]
_, _ = DataUtils.showUniqueColVals(dataFrame=mimic_combined_DF, colName='ICD9_CODE')


Data type of column [ICD9_CODE] is: object
Total number of rows: 12655
Unique values in column: 5 [percent unique: 0.0%]
Null values in column: 0
List of unique values:
['41401' 'V3000' 'V3001' '0389' 'V3101']

Top 5 records by frequency for ICD9_CODE
  ICD9_CODE  record_count
1     41401          3497
2     V3000          3427
3     V3001          2695
0      0389          2043
4     V3101           993

Bottom 5 records by frequency for ICD9_CODE
  ICD9_CODE  record_count
4     V3101           993
0      0389          2043
3     V3001          2695
2     V3000          3427
1     41401          3497


In [30]:
# Persist the filtered per-admission dataset as a gzip-compressed pickle (compression
# is inferred from the .gz suffix). Create the target directory first so the write
# does not fail on a fresh data checkout.
os.makedirs(f'{DATA_DIR}/working', exist_ok=True)
mimic_combined_DF.to_pickle(f'{DATA_DIR}/working/notes_combined.pkl.gz')

In [31]:
# Look up the 3-digit ICD9 category '414' by exact match, trying zero-padded spellings.
# The lookup table holds codes without a decimal point, so a bare category code may not
# exist as its own row.
is_414 = D_ICDDiagnosis_DF[(D_ICDDiagnosis_DF['ICD9_CODE'].isin(['00414','0414', '414']))]
display(is_414)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE


In [32]:
# Substring search for "414" anywhere in the code. This also matches unrelated codes
# (e.g. 04141, 1414, 4414), not only the 414xx family.
# NOTE(review): if only the 414xx family is wanted, a prefix match may be more appropriate.
filteredICD414 = D_ICDDiagnosis_DF.loc[D_ICDDiagnosis_DF['ICD9_CODE'].str.contains("414", case=False)]
display(filteredICD414)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
447,498,04141,Shiga txn-produce E.coli,Shiga toxin-producing Escherichia coli [E. col...
448,499,04142,Shga txn prod E.coli NEC,Other specified Shiga toxin-producing Escheric...
449,500,04143,Shga txn prod E.coli NOS,Shiga toxin-producing Escherichia coli [E. col...
450,501,04149,E.coli infection NEC/NOS,Other and unspecified Escherichia coli [E. coli]
1204,926,1414,Mal neo ant 2/3 tongue,Malignant neoplasm of anterior two-thirds of t...
4218,4219,4414,Abdom aortic aneurysm,Abdominal aneurysm without mention of rupture
4372,4373,41400,Cor ath unsp vsl ntv/gft,Coronary atherosclerosis of unspecified type o...
4373,4374,41401,Crnry athrscl natve vssl,Coronary atherosclerosis of native coronary ar...
4374,4375,41402,Crn ath atlg vn bps grft,Coronary atherosclerosis of autologous vein by...
4375,4376,41403,Crn ath nonatlg blg grft,Coronary atherosclerosis of nonautologous biol...


In [33]:
# Case-insensitive search of the long titles for "ischemic", to find related codes by description.
filteredICD414Word = D_ICDDiagnosis_DF.loc[D_ICDDiagnosis_DF['LONG_TITLE'].str.contains("ischemic", case=False)]
display(filteredICD414Word)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
4367,4368,41189,Ac ischemic hrt dis NEC,Other acute and subacute forms of ischemic hea...
4387,4388,4148,Chr ischemic hrt dis NEC,Other specified forms of chronic ischemic hear...
4388,4389,4149,Chr ischemic hrt dis NOS,"Chronic ischemic heart disease, unspecified"
4599,4598,38802,Trans ischemic deafness,Transient ischemic deafness
4823,4822,37741,Ischemic optic neuropthy,Ischemic optic neuropathy
5333,5062,4371,Ac cerebrovasc insuf NOS,Other generalized ischemic cerebrovascular dis...
8410,8697,76870,Hypoxc-ischem enceph NOS,"Hypoxic-ischemic encephalopathy, unspecified"
8411,8698,76871,Mild hypox-ischem enceph,Mild hypoxic-ischemic encephalopathy
8412,8699,76872,Mod hypox-ischem enceph,Moderate hypoxic-ischemic encephalopathy
8413,8700,76873,Sev hypox-ischem enceph,Severe hypoxic-ischemic encephalopathy


In [34]:
# Look up the 3-digit ICD9 category '038' by exact match, trying zero-padded spellings.
# Note that '0038' is itself a valid 4-character code (a salmonella infection), so a hit
# on it is not the 038 category.
is_038 = D_ICDDiagnosis_DF[(D_ICDDiagnosis_DF['ICD9_CODE'].isin(['0038','038', '38']))]
display(is_038)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
76,249,38,Salmonella infection NEC,Other specified salmonella infections


In [35]:
# Substring search for "38" anywhere in the code. This is very broad and matches many
# unrelated codes (e.g. 09381, E9838, V7388).
# NOTE(review): if only the 038x family is wanted, a prefix match on '038' may be more appropriate.
filteredICD038 = D_ICDDiagnosis_DF.loc[D_ICDDiagnosis_DF['ICD9_CODE'].str.contains("38", case=False)]
display(filteredICD038)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
76,249,0038,Salmonella infection NEC,Other specified salmonella infections
370,421,09381,Syphilitic pericarditis,Syphilitic pericarditis
371,422,09382,Syphilitic myocarditis,Syphilitic myocarditis
372,423,09389,Cardiovascular syph NEC,Other specified cardiovascular syphilis
526,577,0538,H zoster complicated NOS,Herpes zoster with unspecified complication
...,...,...,...,...
14313,14183,E9838,Undet circ-suffocate NEC,Strangulation or suffocation by other specifie...
14369,14239,8738,Open wound of head NEC,Other and unspecified open wound of head witho...
14530,14400,E9938,War inj:explosion NEC,Injury due to war operations by other specifie...
14559,14429,V7388,Scrn oth spcf chlmyd dis,Special screening examination for other specif...


In [36]:
# Case-insensitive search of the long titles for "septicemia", to find related codes by description.
filteredICD038Word = D_ICDDiagnosis_DF.loc[D_ICDDiagnosis_DF['LONG_TITLE'].str.contains("septicemia", case=False)]
display(filteredICD038Word)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
69,242,31,Salmonella septicemia,Salmonella septicemia
542,593,545,Herpetic septicemia,Herpetic septicemia
595,646,380,Streptococcal septicemia,Streptococcal septicemia
596,647,3810,Staphylcocc septicem NOS,"Staphylococcal septicemia, unspecified"
597,648,3811,Meth susc Staph aur sept,Methicillin susceptible Staphylococcus aureus ...
598,649,3812,MRSA septicemia,Methicillin resistant Staphylococcus aureus se...
599,650,3819,Staphylcocc septicem NEC,Other staphylococcal septicemia
600,651,382,Pneumococcal septicemia,Pneumococcal septicemia [Streptococcus pneumon...
601,652,383,Anaerobic septicemia,Septicemia due to anaerobes
602,653,3840,Gram-neg septicemia NOS,"Septicemia due to gram-negative organism, unsp..."


In [37]:
# Exact-match lookup: keep only the dictionary rows whose ICD9_CODE equals one
# of the candidate spellings of the code (with and without leading zeros).
is_410 = D_ICDDiagnosis_DF.loc[
    lambda icd_rows: icd_rows['ICD9_CODE'].isin(['00410','0410', '410'])
]
display(is_410)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE


In [38]:
# Substring search on the code column: a row is kept when "410" occurs anywhere
# inside its ICD9_CODE, not only at the start, so unrelated codes that merely
# contain those digits are returned as well.
filteredICD410 = D_ICDDiagnosis_DF.loc[
    lambda icd_rows: icd_rows['ICD9_CODE'].str.contains("410", case=False)
]
display(filteredICD410)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
440,491,04109,Other streptococcus,Streptococcus infection in conditions classifi...
529,580,05410,Genital herpes NOS,"Genital herpes, unspecified"
626,677,04100,Streptococcus unspecf,Streptococcus infection in conditions classifi...
627,678,04101,Streptococcus group a,Streptococcus infection in conditions classifi...
628,679,04102,Streptococcus group b,Streptococcus infection in conditions classifi...
...,...,...,...,...
12945,10939,79410,Abn stimul response NOS,Nonspecific abnormal response to nerve stimula...
13725,10670,E8410,Pow aircraft acc-spcrft,"Accident to powered aircraft, other and unspec..."
13929,11365,92410,Contusion of lower leg,Contusion of lower leg
14137,9309,86410,Liver injury NOS-open,"Injury to liver with open wound into cavity, u..."


In [39]:
# Case-insensitive keyword search over the long descriptions: keep every
# dictionary row whose LONG_TITLE mentions "myocardial".
# na=False makes a row with a missing LONG_TITLE count as "no match"; without
# it the mask would contain NaN and boolean .loc indexing would raise.
filteredICD410Word = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['LONG_TITLE'].str.contains("myocardial", case=False, na=False)
]
display(filteredICD410Word)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
4334,4335,41000,"AMI anterolateral,unspec",Acute myocardial infarction of anterolateral w...
4335,4336,41001,"AMI anterolateral, init",Acute myocardial infarction of anterolateral w...
4336,4337,41002,"AMI anterolateral,subseq",Acute myocardial infarction of anterolateral w...
4337,4338,41010,"AMI anterior wall,unspec",Acute myocardial infarction of other anterior ...
4338,4339,41011,"AMI anterior wall, init",Acute myocardial infarction of other anterior ...
4339,4340,41012,"AMI anterior wall,subseq",Acute myocardial infarction of other anterior ...
4340,4341,41020,"AMI inferolateral,unspec",Acute myocardial infarction of inferolateral w...
4341,4342,41021,"AMI inferolateral, init",Acute myocardial infarction of inferolateral w...
4342,4343,41022,"AMI inferolateral,subseq",Acute myocardial infarction of inferolateral w...
4343,4344,41030,"AMI inferopost, unspec",Acute myocardial infarction of inferoposterior...


In [40]:
# Exact-match lookup: keep only the dictionary rows whose ICD9_CODE equals one
# of the candidate spellings of the code (with and without leading zeros).
is_424 = D_ICDDiagnosis_DF.loc[
    lambda icd_rows: icd_rows['ICD9_CODE'].isin(['00424','0424', '424'])
]
display(is_424)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE


In [41]:
# Substring search on the code column: a row is kept when "424" occurs anywhere
# inside its ICD9_CODE, not only at the start, so unrelated codes that merely
# contain those digits are returned as well.
filteredICD424 = D_ICDDiagnosis_DF.loc[
    lambda icd_rows: icd_rows['ICD9_CODE'].str.contains("424", case=False)
]
display(filteredICD424)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
1784,1787,24240,Thyrotox-ect nod no cris,Thyrotoxicosis from ectopic thyroid nodule wit...
1785,1788,24241,Thyrotox-ect nod w cris,Thyrotoxicosis from ectopic thyroid nodule wit...
3665,4049,36424,Vogt-koyanagi syndrome,Vogt-koyanagi syndrome
3872,3569,38424,Mult perf tympanic memb,Multiple perforations of tympanic membrane
4422,4423,4240,Mitral valve disorder,Mitral valve disorders
4423,4424,4241,Aortic valve disorder,Aortic valve disorders
4424,4425,4242,Nonrheum tricusp val dis,"Tricuspid valve disorders, specified as nonrhe..."
4425,4426,4243,Pulmonary valve disorder,Pulmonary valve disorders
4426,4427,42490,Endocarditis NOS,"Endocarditis, valve unspecified, unspecified c..."
4427,4428,42491,Endocarditis in oth dis,Endocarditis in diseases classified elsewhere


In [42]:
# Case-insensitive keyword search over the long descriptions: keep every
# dictionary row whose LONG_TITLE mentions "endocardium".
# na=False makes a row with a missing LONG_TITLE count as "no match"; without
# it the mask would contain NaN and boolean .loc indexing would raise.
filteredICD424Word = D_ICDDiagnosis_DF.loc[
    D_ICDDiagnosis_DF['LONG_TITLE'].str.contains("endocardium", case=False, na=False)
]
display(filteredICD424Word)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
4570,4569,3979,Rheum endocarditis NOS,"Rheumatic diseases of endocardium, valve unspe..."
