In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint
import os

pd.set_option('display.max_colwidth',100000) #https://stackoverflow.com/questions/54692405/output-truncation-in-google-colab

# Below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML



In [3]:
from datetime import datetime
def convert_time(x):
  date_time_str = x
  if date_time_str[-1] == ':':
    date_time_str += '00'
  date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
  return date_time_obj

In [4]:
MV = pd.read_csv('Downloads/derived_MV.csv')
MV.head(5)

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,stay_id
0,0,10001884,26184834,37510196
1,1,10002013,23581541,39060235
2,2,10002428,23473524,35479615
3,3,10002428,28662225,38875437
4,4,10002760,28094813,31831386


In [5]:
print(MV['stay_id'].nunique())

26892


In [6]:
pO2 = pd.read_csv('Downloads/derived_pO2.csv')
pO2.charttime = pO2.charttime.apply(convert_time)
pO2.head(5)

Unnamed: 0.1,Unnamed: 0,subject_id,po2,charttime
0,0,10000935,86.0,2187-10-22 15:40:00
1,1,10001884,72.0,2131-01-10 13:15:00
2,2,10001884,65.0,2131-01-12 21:04:00
3,3,10001884,69.0,2131-01-13 02:28:00
4,4,10001884,74.0,2131-01-11 03:42:00


In [7]:
print(pO2['subject_id'].nunique())

40029


In [8]:
fiO2 = pd.read_csv('Downloads/derived_fiO2.csv')
fiO2.charttime = fiO2.charttime.apply(convert_time)
fiO2.head(5)

Unnamed: 0.1,Unnamed: 0,subject_id,fio2_chartevents,charttime,converted_fiO2
0,0,10000935,,2187-10-22 15:40:00,
1,1,10001884,,2131-01-10 13:15:00,
2,2,10001884,40.0,2131-01-12 21:04:00,0.4
3,3,10001884,40.0,2131-01-13 02:28:00,0.4
4,4,10001884,,2131-01-11 03:42:00,


In [9]:
print(fiO2['subject_id'].nunique())

40029


In [10]:
merged_tables = pO2.merge(fiO2,how='inner',on='subject_id', suffixes=('_pO2', '_fiO2'))
merged_tables = merged_tables.drop(['Unnamed: 0_pO2', 'Unnamed: 0_fiO2', 'fio2_chartevents'], 
               axis=1)
merged_tables.head(5)

Unnamed: 0,subject_id,po2,charttime_pO2,charttime_fiO2,converted_fiO2
0,10000935,86.0,2187-10-22 15:40:00,2187-10-22 15:40:00,
1,10001884,72.0,2131-01-10 13:15:00,2131-01-10 13:15:00,
2,10001884,72.0,2131-01-10 13:15:00,2131-01-12 21:04:00,0.4
3,10001884,72.0,2131-01-10 13:15:00,2131-01-13 02:28:00,0.4
4,10001884,72.0,2131-01-10 13:15:00,2131-01-11 03:42:00,


In [11]:
merged_tables['time_diff'] = merged_tables.charttime_pO2 - merged_tables.charttime_fiO2
merged_tables = merged_tables[merged_tables['time_diff'] > pd.Timedelta(0)]

merged_tables['time_diff'] = merged_tables['time_diff'] / pd.Timedelta(minutes=1)
merged_tables['min_time_diff'] = merged_tables.groupby('subject_id')['time_diff'].transform('min')

PF_table = merged_tables.where(merged_tables['time_diff'] == merged_tables['min_time_diff'])

PF_table['PF_ratio'] = PF_table.po2 / PF_table.converted_fiO2

PF_table = PF_table[PF_table['PF_ratio'].notna()]

PF_table.head(20)

Unnamed: 0,subject_id,po2,charttime_pO2,charttime_fiO2,converted_fiO2,time_diff,min_time_diff,PF_ratio
12,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5
118,10002155.0,108.0,2131-03-10 01:46:00,2131-03-10 00:15:00,0.6,91.0,91.0,180.0
819,10002428.0,151.0,2156-05-12 19:12:00,2156-05-12 18:11:00,0.4,61.0,61.0,377.5
2256,10004235.0,114.0,2196-02-25 15:10:00,2196-02-25 14:28:00,0.7,42.0,42.0,162.857143
3462,10004606.0,99.0,2159-02-21 04:17:00,2159-02-20 20:11:00,0.4,486.0,486.0,247.5
3476,10004720.0,140.0,2186-11-13 07:43:00,2186-11-13 00:26:00,0.5,437.0,437.0,280.0
3544,10005348.0,199.0,2130-10-27 20:25:00,2130-10-27 18:52:00,0.5,93.0,93.0,398.0
4994,10006053.0,86.0,2111-11-14 02:45:00,2111-11-14 02:03:00,0.8,42.0,42.0,107.5
5267,10007818.0,273.0,2146-06-22 17:51:00,2146-06-22 17:13:00,0.5,38.0,38.0,546.0
6383,10007928.0,72.0,2129-04-07 12:32:00,2129-04-07 06:53:00,0.95,339.0,339.0,75.789474


In [12]:
print(PF_table['subject_id'].nunique())

9137


In [13]:
PF_ARDS = PF_table.where(PF_table['PF_ratio'] < 300)
PF_ARDS = PF_ARDS[PF_ARDS['PF_ratio'].notna()]
PF_ARDS.head(5)

Unnamed: 0,subject_id,po2,charttime_pO2,charttime_fiO2,converted_fiO2,time_diff,min_time_diff,PF_ratio
12,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5
118,10002155.0,108.0,2131-03-10 01:46:00,2131-03-10 00:15:00,0.6,91.0,91.0,180.0
2256,10004235.0,114.0,2196-02-25 15:10:00,2196-02-25 14:28:00,0.7,42.0,42.0,162.857143
3462,10004606.0,99.0,2159-02-21 04:17:00,2159-02-20 20:11:00,0.4,486.0,486.0,247.5
3476,10004720.0,140.0,2186-11-13 07:43:00,2186-11-13 00:26:00,0.5,437.0,437.0,280.0


In [14]:
print(PF_ARDS['subject_id'].nunique())

6736


In [15]:
ARDS_patients = PF_ARDS.merge(MV, how='inner', on='subject_id', suffixes=(None, '_MV'))
ARDS_patients = ARDS_patients.drop(['Unnamed: 0'], 
               axis=1)
ARDS_patients.head(5)

Unnamed: 0,subject_id,po2,charttime_pO2,charttime_fiO2,converted_fiO2,time_diff,min_time_diff,PF_ratio,hadm_id,stay_id
0,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5,26184834,37510196
1,10004235.0,114.0,2196-02-25 15:10:00,2196-02-25 14:28:00,0.7,42.0,42.0,162.857143,24181354,34100191
2,10004606.0,99.0,2159-02-21 04:17:00,2159-02-20 20:11:00,0.4,486.0,486.0,247.5,29242151,30213599
3,10004720.0,140.0,2186-11-13 07:43:00,2186-11-13 00:26:00,0.5,437.0,437.0,280.0,22081550,35009126
4,10006053.0,86.0,2111-11-14 02:45:00,2111-11-14 02:03:00,0.8,42.0,42.0,107.5,22942076,34617352


In [16]:
print(ARDS_patients['subject_id'].nunique())

6250


In [17]:
mimic_cxr = pd.read_csv('Downloads/mimic_cxr.csv', low_memory=False)
mimic_cxr = mimic_cxr.rename(columns={"PatientID": "subject_id"})

In [18]:
ARDS_Xray = ARDS_patients.merge(mimic_cxr, how='inner', on='subject_id')
pd.set_option('display.max_columns', None)
ARDS_Xray.head(5)

Unnamed: 0.1,subject_id,po2,charttime_pO2,charttime_fiO2,converted_fiO2,time_diff,min_time_diff,PF_ratio,hadm_id,stay_id,Unnamed: 0,dicom,StudyID,StudyDate,StudyTime
0,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5,26184834,37510196,69,1f413a3b-78c5a4aa-978ff5ff-f72a424c-b3a51b92,57839849,21261103,220052.734
1,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5,26184834,37510196,70,dc78e84c-6bf41805-cab3f4a8-0a76d337-bb2a1fc6,57839849,21261103,220052.734
2,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5,26184834,37510196,71,dae10c54-dcb7bb1f-428bb377-af3b739e-9b16e579,58788638,21270724,161830.718
3,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5,26184834,37510196,72,e8f7736b-091930e2-a27c0c41-4ccd2003-d694a9fb,58788638,21270724,161830.718
4,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5,26184834,37510196,73,9f5446a9-46ea84a3-6806d8b7-2c0f6d4d-38799159,53268982,21280715,131123.0


In [19]:
print(ARDS_Xray['subject_id'].nunique())
print(ARDS_Xray['dicom'].nunique())

2473
39860


In [20]:
ARDS_Xray['pO2_date'] = ARDS_Xray['charttime_pO2'].dt.date
ARDS_Xray['pO2_date'] = ARDS_Xray['pO2_date'].astype(str)
ARDS_Xray['pO2_date'] = ARDS_Xray['pO2_date'].str.replace('-','')
ARDS_Xray['pO2_date'] = ARDS_Xray['pO2_date'].astype(int)

In [21]:
ARDS_Xray['Xray_day_diff'] = ARDS_Xray['pO2_date'] - ARDS_Xray['StudyDate']
ARDS_Xray['Xray_day_diff'] = ARDS_Xray['Xray_day_diff'].abs()
final_Xrays = ARDS_Xray.where(ARDS_Xray['Xray_day_diff'] <= 1)

final_Xrays = final_Xrays[final_Xrays['dicom'].notna()]
final_Xrays = final_Xrays.drop(['Unnamed: 0'], 
               axis=1)

final_Xrays.head(5)

Unnamed: 0,subject_id,po2,charttime_pO2,charttime_fiO2,converted_fiO2,time_diff,min_time_diff,PF_ratio,hadm_id,stay_id,dicom,StudyID,StudyDate,StudyTime,pO2_date,Xray_day_diff
51,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5,26184834.0,37510196.0,7b25b3ed-e780a527-319cb7b3-02d5d071-f1cddee9,50712381.0,21310112.0,45656.359,21310113.0,1.0
52,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5,26184834.0,37510196.0,c1ad3e27-62d05ef8-95018fe3-b8bcfe4b-bbba0e1f,56722923.0,21310113.0,44918.484,21310113.0,0.0
53,10001884.0,69.0,2131-01-13 02:28:00,2131-01-12 21:04:00,0.4,324.0,324.0,172.5,26184834.0,37510196.0,9b1a8a51-2b8e4a04-1719059d-aa6bc888-7ace612b,59305618.0,21310114.0,103428.765,21310113.0,1.0
55,10004235.0,114.0,2196-02-25 15:10:00,2196-02-25 14:28:00,0.7,42.0,42.0,162.857143,24181354.0,34100191.0,3813b9b6-88d998b4-941e767b-601ba7c1-98f61102,52379321.0,21960224.0,123746.937,21960225.0,1.0
56,10004235.0,114.0,2196-02-25 15:10:00,2196-02-25 14:28:00,0.7,42.0,42.0,162.857143,24181354.0,34100191.0,606ea60e-f3c5c58e-68fd84ca-db4e599b-127aa53e,57318275.0,21960225.0,55402.109,21960225.0,0.0


In [22]:
print(final_Xrays['dicom'].nunique())
print(final_Xrays['StudyID'].nunique())
print(final_Xrays['subject_id'].nunique())

6337
5526
1889


In [23]:
age = pd.read_csv('Downloads/age.csv')
final_patients = final_Xrays.merge(age, how='inner', on='subject_id')
final_adult_patients = final_patients.where(final_patients['age'] >= 18)

In [24]:
print(final_adult_patients['dicom'].nunique())
print(final_adult_patients['StudyID'].nunique())
print(final_adult_patients['subject_id'].nunique())

6337
5526
1889


In [25]:
final_dicom = final_adult_patients['dicom'].copy()
final_dicom = final_dicom.drop_duplicates()

In [26]:
final_dicom.to_csv('Downloads/final_dicom.csv')

In [27]:
print(final_dicom.nunique)

<bound method IndexOpsMixin.nunique of 0        7b25b3ed-e780a527-319cb7b3-02d5d071-f1cddee9
22       c1ad3e27-62d05ef8-95018fe3-b8bcfe4b-bbba0e1f
44       9b1a8a51-2b8e4a04-1719059d-aa6bc888-7ace612b
66       3813b9b6-88d998b4-941e767b-601ba7c1-98f61102
69       606ea60e-f3c5c58e-68fd84ca-db4e599b-127aa53e
                             ...                     
44668    b3de193f-980afff8-5f90661c-819d6823-6757f24d
44670    2eb70dfe-52fa728e-a36e09be-ec0ed3cf-0a2ea7f0
44673    ef49bac7-16939860-5a4f182e-c568720f-e0c9d278
44676    46510b80-411ac511-fe6ffab2-d7dfdc76-dff1a762
44679    f7e95a22-cb958055-47114ddf-38532ef4-b4c172d5
Name: dicom, Length: 6337, dtype: object>


In [29]:
final_adult_patients.to_csv('Downloads/final_adult_patients.csv')