In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
sys.path.append("/home/moritz/repositories/med_leak/")


In [3]:
# Directory that holds all CSV files read by this notebook.
data_dir = Path("/home/moritz/data/mimic-cxr/mimic-cxr-jpg")
# is_dir() is already False for a path that does not exist, so a single check is enough.
assert data_dir.is_dir()

In [4]:
# Image-level tables: metadata, diagnosis labels and the split assignment.
metadata_df = pd.read_csv(data_dir.joinpath('mimic-cxr-2.0.0-metadata.csv'))
diagnosis_df = pd.read_csv(data_dir.joinpath('mimic-cxr-2.0.0-negbio.csv'))
split_df = pd.read_csv(data_dir.joinpath('mimic-cxr-2.0.0-split.csv'))
# Patient-level tables; both files have to be downloaded from the MIMIC-IV dataset
# and placed in data_dir.
admissions_df = pd.read_csv(data_dir.joinpath('admissions.csv'))
patients_df = pd.read_csv(data_dir.joinpath('patients.csv'))

In [5]:
# Preview the first rows of the admissions table.
admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [None]:
def get_info(df):
    """Return ``(n_images, n_patients)`` for a table with one row per image.

    The row count is used as the number of images and the number of distinct
    ``subject_id`` values as the number of patients.
    """
    return len(df), df["subject_id"].nunique()
# Baseline counts from the raw metadata table; the merge cells below assert against
# these values to make sure no image or patient is lost.
original_n_images, original_n_patients = get_info(metadata_df)
print(f"{original_n_images} images from {original_n_patients} patients")

377110 images from 65379 patients


In [7]:
# Find patients whose admissions carry more than one documented race value.
# credit to github.com/robintibor
# One row per distinct (subject_id, race) combination.
ethnicity_df = admissions_df[['subject_id', 'race']].drop_duplicates()

# After de-duplication a subject_id that still occurs more than once has
# at least two different race values.
v = ethnicity_df['subject_id'].value_counts()
subject_id_more_than_once = v[v > 1].index

ambiguous_ethnicity_df = ethnicity_df[ethnicity_df['subject_id'].isin(subject_id_more_than_once)]
inconsistent_race = ambiguous_ethnicity_df['subject_id'].unique()

# Show how often each combination of conflicting race values occurs
# (values are sorted and joined with "_" per patient).
grouped = ambiguous_ethnicity_df.groupby('subject_id')
grouped['race'].agg(lambda races: "_".join(sorted(races))).value_counts()

race
OTHER_WHITE                                                                                 1745
UNKNOWN_WHITE                                                                                969
WHITE_WHITE - OTHER EUROPEAN                                                                 804
WHITE_WHITE - RUSSIAN                                                                        450
HISPANIC OR LATINO_HISPANIC/LATINO - PUERTO RICAN                                            437
                                                                                            ... 
HISPANIC OR LATINO_HISPANIC/LATINO - CENTRAL AMERICAN_HISPANIC/LATINO - GUATEMALAN_OTHER       1
ASIAN_ASIAN - SOUTH EAST ASIAN_UNABLE TO OBTAIN                                                1
BLACK/AFRICAN AMERICAN_HISPANIC/LATINO - CENTRAL AMERICAN                                      1
ASIAN_BLACK/AFRICAN                                                                            1
BLACK/AFRICAN_BLACK/AFRIC

In [8]:
# Patients with conflicting race entries are assigned 'OTHER/UNKNOWN'.
has_conflicting_race = ethnicity_df['subject_id'].isin(subject_id_more_than_once)
ethnicity_df.loc[has_conflicting_race, 'race'] = 'OTHER/UNKNOWN'
# Keep a single row per patient.
ethnicity_df = ethnicity_df.drop_duplicates(subset='subject_id')
# Sanity check: every subject_id now appears exactly once.
assert len(ethnicity_df) == ethnicity_df['subject_id'].nunique()

In [9]:
# Inspect the image metadata table before merging.
metadata_df

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,
...,...,...,...,...,...,...,...,...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,CHEST (PA AND LAT),PA,3056,2544,21520708,224550.171,CHEST (PA AND LAT),postero-anterior,Erect
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,19999733,57132437,CHEST (PA AND LAT),LATERAL,3056,2544,21520708,224550.171,CHEST (PA AND LAT),lateral,Erect
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,CHEST (PORTABLE AP),AP,2544,3056,21451104,51448.218,CHEST (PORTABLE AP),antero-posterior,Erect
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,CHEST (PORTABLE AP),AP,3056,2544,21451102,202809.234,CHEST (PORTABLE AP),antero-posterior,Erect


In [10]:
# Attach the study-level diagnosis labels to every image.
# how='left' keeps images whose study has no diagnosis row; validate='many_to_one'
# raises if a (subject_id, study_id) pair occurs more than once in diagnosis_df.
merge_df = pd.merge(metadata_df, diagnosis_df, on=['subject_id', 'study_id'], how='left', validate='many_to_one')
assert get_info(merge_df) == (original_n_images, original_n_patients) # check we haven't lost any images or patients
# Attach the per-patient race. validate='many_to_one' (as for the other merges) raises
# instead of silently multiplying image rows if ethnicity_df has a duplicated subject_id.
merge_df = pd.merge(merge_df, ethnicity_df, how='left', on='subject_id', validate='many_to_one')
assert get_info(merge_df) == (original_n_images, original_n_patients) # check we haven't lost any images or patients

In [None]:
# Image and patient counts after merging diagnosis labels and race.
get_info(merge_df)

(377110, 65379)

In [12]:
assert patients_df.subject_id.nunique() == len(patients_df) # check for no subject duplicates in patients_df
# Attach per-patient attributes; how='left' keeps images of patients missing from patients_df.
df_cxr = pd.merge(merge_df, patients_df, how='left', on='subject_id', validate='many_to_one')
# Attach the split assignment of each image. validate='one_to_one' actually checks that the
# key is unique on both sides ('many_to_many' performs no check at all). The join is an
# explicit inner join, so an image without a split entry is caught by the assertion below.
df_cxr = pd.merge(df_cxr, split_df, how='inner', on=['study_id', 'subject_id', 'dicom_id'], validate='one_to_one')
assert get_info(df_cxr) == (original_n_images, original_n_patients) # check we haven't lost any images or patients

In [None]:
# Counts of the merged table and of the split table, side by side.
get_info(df_cxr), get_info(split_df)

((377110, 65379), (377110, 65379))

In [14]:
# List the columns available after all merges.
df_cxr.columns

Index(['dicom_id', 'subject_id', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'Atelectasis',
       'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
       'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding',
       'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax',
       'Support Devices', 'race', 'gender', 'anchor_age', 'anchor_year',
       'anchor_year_group', 'dod', 'split'],
      dtype='object')

In [15]:
# summarize values for race column
white = 'White'
asian = 'Asian'
black = 'Black'
other = "Other/Unknown"

black_mask = (df_cxr.race.str.contains("BLACK", na=False))
white_mask = (df_cxr.race.str.contains("WHITE", na=False))
asian_mask = (df_cxr.race.str.contains("ASIAN", na=False))
other_mask = np.logical_not(black_mask | white_mask | asian_mask)

df_cxr.loc[black_mask, "race"] = black
df_cxr.loc[white_mask, "race"] = white
df_cxr.loc[asian_mask, "race"] = asian
df_cxr.loc[other_mask, "race"] = other

In [16]:
# Number of rows (images) per summarised race group.
df_cxr.race.value_counts()

race
White            197205
Other/Unknown    120244
Black             50750
Asian              8911
Name: count, dtype: int64

In [17]:
# rename gender column to sex
df_cxr = df_cxr.rename(columns={'gender': 'Sex'})
#add path column
df_cxr.subject_id = df_cxr.subject_id.astype(str)
df_cxr.study_id = df_cxr.study_id.astype(str)
df_cxr.insert(2, "path", "")
df_cxr.path = df_cxr.subject_id.str[0:2]
df_cxr.path = "p" + df_cxr.path
df_cxr.path = df_cxr.path + "/p" + df_cxr.subject_id + "/s" + df_cxr.study_id + "/" + df_cxr.dicom_id + ".jpg"
# change M/F in sex column to Male/ Female
df_cxr['Sex'] = df_cxr['Sex'].replace({'M': 'Male', 'F': 'Female'})


In [18]:
#df_cxr.to_csv("/home/moritz/repositories/med_leak/data/csv/mimic_train.csv", index=False)

### Cohort Table

In [19]:
# View positions kept as their own category; everything else (including missing
# values) is grouped under "Other".
view_positions = ["AP", "PA", "LATERAL", "LL"]
is_known_view = df_cxr.ViewPosition.isin(view_positions)
df_cxr.loc[~is_known_view, "ViewPosition"] = "Other"
# Shorten the label "LATERAL" to "L".
# NOTE: run this cell once per freshly built df_cxr; "L" is not in view_positions,
# so a second run would move the renamed rows to "Other".
is_lateral = df_cxr.ViewPosition == "LATERAL"
df_cxr.loc[is_lateral, "ViewPosition"] = "L"
df_cxr.ViewPosition.value_counts()

ViewPosition
AP       147173
PA        96161
L         82853
LL        35133
Other     15790
Name: count, dtype: int64

In [20]:
from utils import cohort_table
from src.data_utils.constants import CXR_LABELS

In [21]:
# Variables summarised per race group: sex, view position and every diagnostic label.
other_vars = ['Sex', "ViewPosition"] + CXR_LABELS
cohort_df = cohort_table(df_cxr, strat_var='race', strat_var_vals=['Asian', 'Black', 'White', 'Other/Unknown'], other_vars=other_vars, include_age=False, patient_id_col="subject_id")
# keep only rows where diagnostic label was present (value 1.0)
# Raw strings: "\%" is not a valid escape sequence in a normal string literal and triggers a
# DeprecationWarning/SyntaxWarning; the resulting text (backslash + percent sign) is unchanged.
dropped_rows = [r"-1.0 (\%)", r"0.0 (\%)", r"nan (\%)"]
# .copy() makes the filtered table independent, so the assignment below unambiguously
# modifies cohort_df itself.
cohort_df = cohort_df.loc[~cohort_df.Variable.isin(dropped_rows)].copy()
# Relabel the last len(CXR_LABELS) rows with the label names.
# NOTE(review): assumes these rows are the label rows, in CXR_LABELS order — confirm
# against cohort_table.
string_cxr_labels = [rf"{a} (\%)" for a in CXR_LABELS]
cohort_df.iloc[-len(CXR_LABELS):, cohort_df.columns.get_loc('Variable')] = string_cxr_labels
cohort_df

OrderedDict([('Variable', ['Patients', 'Records']), ('All', ['65379', '377110']), ('Asian', ['1637', '8911']), ('Black', ['8051', '50750']), ('White', ['30941', '197205']), ('Other/Unknown', ['24750', '120244'])])


Unnamed: 0,Variable,All,Asian,Black,White,Other/Unknown
0,Patients,65379,1637,8051,30941,24750
1,Records,377110,8911,50750,197205,120244
2,Female (\%),170803 (45.3),3862 (43.3),30792 (60.7),87711 (44.5),48438 (40.3)
3,Male (\%),186739 (49.5),5049 (56.7),19958 (39.3),109494 (55.5),52238 (43.4)
5,PA (\%),96161 (25.5),2301 (25.8),14300 (28.2),45739 (23.2),33821 (28.1)
6,L (\%),82853 (22.0),1752 (19.7),14189 (28.0),37598 (19.1),29314 (24.4)
7,AP (\%),147173 (39.0),3523 (39.5),17303 (34.1),84344 (42.8),42003 (34.9)
8,LL (\%),35133 (9.3),1027 (11.5),4097 (8.1),20088 (10.2),9921 (8.3)
9,Other (\%),15790 (4.2),308 (3.5),861 (1.7),9436 (4.8),5185 (4.3)
10,No Finding (\%),149986 (39.8),3286 (36.9),22643 (44.6),68604 (34.8),55453 (46.1)


In [22]:
# Write the cohort table as LaTeX. The output directory is created first, because
# to_latex fails if "./tables" does not exist yet.
Path("./tables").mkdir(parents=True, exist_ok=True)
cohort_df.to_latex("./tables/mimic.tex", index=False)