In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
np.random.seed(420)
sys.path.append("/home/moritz/repositories/med_leak/")

In [3]:
def get_info(df):
    n_images = len(df)
    n_patients = df.patient_id.nunique()
    return n_images, n_patients

In [4]:
cxr_df = pd.read_csv('/home/moritz/data/chexpert/CheXpert-v1.0/train.csv')
cxp_demo = pd.read_csv('/home/moritz/data/chexpert/CheXpert-v1.0/CHEXPERT_DEMO.csv')

In [5]:
def process_main(df):
    df.fillna(0, inplace=True)
    if "path" not in df.columns:
        df.rename(columns={'Path': 'path'}, inplace=True)
    df['patient_id']= df['path']
    df['patient_id'] = df['patient_id'].map(lambda x: str(x).split("/")[2][7:])
    df['study_id'] = df['path']
    df['study_id'] = df['study_id'].map(lambda x: str(x).split("/")[3][5:])
    return df
    
def process_demo(df):
    if "patient_id" not in df.columns:
        df.rename(columns={'PATIENT': 'patient_id'}, inplace=True)
    df['patient_id'] = df['patient_id'].map(lambda x: str(x)[7:])
    df = df.rename(columns={'PRIMARY_RACE': 'race'})
    df = df.rename(columns={'PATIENT': 'patient_id'})
    df = df.rename(columns={'GENDER': 'sex'})
    df = df.rename(columns={'AGE_AT_CXR': 'age'})
    df = df.rename(columns={'ETHNICITY': 'ethnicity'})
    df = df.drop(['sex', 'age'], axis=1)
    return df
cxr_df = process_main(cxr_df)
cxp_demo = process_demo(cxp_demo)
cxr_df = cxr_df[cxr_df["Sex"] != "Unknown"]

In [6]:
n_original_images, n_original_patients = get_info(cxr_df)
f"{n_original_images} images from {n_original_patients} patients"

'223413 images from 64539 patients'

In [7]:
cxr_df = pd.merge(cxr_df, cxp_demo, on='patient_id', how='left')
assert get_info(cxr_df) == (n_original_images, n_original_patients), f"Number of images and patients should be the same: {get_info(cxr_df)} vs {(n_original_images, n_original_patients)}"

In [8]:
cxr_df.head()

Unnamed: 0,path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,...,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,patient_id,study_id,race,ethnicity
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,1,Other,Non-Hispanic/Non-Latino
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,0.0,0.0,-1.0,1.0,0.0,...,-1.0,0.0,-1.0,0.0,1.0,0.0,2,2,"White, non-Hispanic",Non-Hispanic/Non-Latino
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2,1,"White, non-Hispanic",Non-Hispanic/Non-Latino
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2,1,"White, non-Hispanic",Non-Hispanic/Non-Latino
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,1,"White, non-Hispanic",Non-Hispanic/Non-Latino


In [9]:
white = "White"
asian = "Asian"
black = "Black"
unknown = "Other/Unknown"

mask = cxr_df.race.str.contains("Black", na=False)
cxr_df.loc[mask, "race"] = black
mask = cxr_df.race.str.contains("White", na=False)
cxr_df.loc[mask, "race"] = white
mask = cxr_df.race.str.contains("Asian", na=False)
cxr_df.loc[mask, "race"] = asian

other_vals = [
    val for val in cxr_df["race"].unique() if val not in [white, asian, black]
]
cxr_df["race"].replace({val: "Other/Unknown" for val in other_vals}, inplace=True)
assert get_info(cxr_df) == (
    n_original_images,
    n_original_patients,
), f"Number of images and patients should be the same: {get_info(cxr_df)} vs {get_info(cxp_df)}"

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cxr_df["race"].replace({val: "Other/Unknown" for val in other_vals}, inplace=True)


In [10]:
cxr_df.Sex.value_counts(), cxr_df.race.value_counts()

(Sex
 Male      132636
 Female     90777
 Name: count, dtype: int64,
 race
 White            125491
 Other/Unknown     62689
 Asian             23272
 Black             11961
 Name: count, dtype: int64)

In [11]:
cxr_df.head()

Unnamed: 0,path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,...,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,patient_id,study_id,race,ethnicity
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,1,Other/Unknown,Non-Hispanic/Non-Latino
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,0.0,0.0,-1.0,1.0,0.0,...,-1.0,0.0,-1.0,0.0,1.0,0.0,2,2,White,Non-Hispanic/Non-Latino
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2,1,White,Non-Hispanic/Non-Latino
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2,1,White,Non-Hispanic/Non-Latino
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,1,White,Non-Hispanic/Non-Latino


In [12]:
#cxr_df.to_csv("/home/moritz/repositories/med_leak/data/csv/chexpert_train.csv", index=False)

### Patient Cohort Table


In [13]:
cxr_df

Unnamed: 0,path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,...,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,patient_id,study_id,race,ethnicity
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,00001,1,Other/Unknown,Non-Hispanic/Non-Latino
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,0.0,0.0,-1.0,1.0,0.0,...,-1.0,0.0,-1.0,0.0,1.0,0.0,00002,2,White,Non-Hispanic/Non-Latino
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,00002,1,White,Non-Hispanic/Non-Latino
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,00002,1,White,Non-Hispanic/Non-Latino
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,00003,1,White,Non-Hispanic/Non-Latino
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223408,CheXpert-v1.0/train/patient64537/study2/view1_...,Male,59,Frontal,AP,0.0,0.0,0.0,-1.0,0.0,...,-1.0,0.0,1.0,0.0,0.0,0.0,64537,2,Black,Non-Hispanic/Non-Latino
223409,CheXpert-v1.0/train/patient64537/study1/view1_...,Male,59,Frontal,AP,0.0,0.0,0.0,-1.0,0.0,...,-1.0,0.0,-1.0,0.0,0.0,0.0,64537,1,Black,Non-Hispanic/Non-Latino
223410,CheXpert-v1.0/train/patient64538/study1/view1_...,Female,0,Frontal,AP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,64538,1,Other/Unknown,
223411,CheXpert-v1.0/train/patient64539/study1/view1_...,Female,0,Frontal,AP,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,64539,1,Other/Unknown,


In [14]:
cxr_df.loc[np.logical_not(cxr_df["AP/PA"].isin(["AP", "PA"])), "AP/PA"] = "Other"

In [15]:
from utils import cohort_table
from src.data_utils.constants import CXR_LABELS

In [16]:
other_vars = ['Sex', "AP/PA"] + CXR_LABELS
cohort_df = cohort_table(cxr_df, strat_var='race', strat_var_vals=['Asian', 'Black', 'White', 'Other/Unknown'], other_vars=other_vars, patient_id_col="patient_id")
# keep only rows where diagnostic label was present (value 1.0)
cohort_df = cohort_df.loc[np.logical_not(cohort_df.Variable.isin(["-1.0 (\%)", "0.0 (\%)"]))]
string_cxr_labels = [f"{a} (\%)" for a in  CXR_LABELS]
cohort_df.iloc[-len(CXR_LABELS): ,cohort_df.columns.get_loc('Variable')] = string_cxr_labels
cohort_df

OrderedDict([('Variable', ['Patients', 'Records', 'Median age (SD)']), ('All', ['64539', '223413', '62 (18)']), ('Asian', ['6936', '23272', '63 (17)']), ('Black', ['3093', '11961', '56 (17)']), ('White', ['36110', '125491', '64 (17)']), ('Other/Unknown', ['18400', '62689', '57 (18)'])])


Unnamed: 0,Variable,All,Asian,Black,White,Other/Unknown
0,Patients,64539,6936,3093,36110,18400
1,Records,223413,23272,11961,125491,62689
2,Median age (SD),62 (18),63 (17),56 (17),64 (17),57 (18)
3,Female (\%),90777 (40.6),9958 (42.8),5839 (48.8),49504 (39.4),25476 (40.6)
4,Male (\%),132636 (59.4),13314 (57.2),6122 (51.2),75987 (60.6),37213 (59.4)
5,AP (\%),161590 (72.3),16292 (70.0),8387 (70.1),91254 (72.7),45657 (72.8)
6,Other (\%),32404 (14.5),3636 (15.6),1910 (16.0),18003 (14.3),8855 (14.1)
7,PA (\%),29419 (13.2),3344 (14.4),1664 (13.9),16234 (12.9),8177 (13.0)
8,No Finding (\%),22380 (10.0),2412 (10.4),1398 (11.7),11786 (9.4),6784 (10.8)
11,Enlarged Cardiomediastinum (\%),10798 (4.8),1123 (4.8),593 (5.0),6131 (4.9),2951 (4.7)


In [18]:
cohort_df.to_latex("./tables/chexpert.tex", index=False)