# CheXpert

In [35]:
# Text columns carried over from the label CSVs into the exported files.
columns = [
    "Path",
    "report",
    "section_findings",
    "section_impression",
]

# Patient metadata columns kept alongside the report text.
metadata_columns = [
    "age",
    "sex",
    "race",
]

# Label columns (one per finding) kept in the exported files.
diseases = [
    "No Finding",
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory that holds both the label CSVs and the embedding CSVs.
CHEXPERT_DIR = "/mnt/data2/datasets_lfay/MedImageInsights/data/Chexpert"

# Number of leading '/'-separated components removed from every embedding path
# so that it lines up with the label CSV's path. For an absolute path the first
# component is the empty string in front of the leading '/'.
# NOTE(review): assumes the embedding CSVs store paths with exactly this many
# leading components - confirm if the embeddings are regenerated elsewhere.
EMBEDDING_PATH_PREFIX_PARTS = 6


def _load_embeddings(split):
    """Read `<split>_embeddings.csv` and normalise its path column.

    The unnamed first CSV column is renamed to "Path" and the leading
    `EMBEDDING_PATH_PREFIX_PARTS` path components are stripped from each value.

    Args:
        split: Split name used in the file name ("train", "val" or "test").

    Returns:
        DataFrame with a "Path" column followed by the embedding columns.
    """
    embeddings = pd.read_csv(f"{CHEXPERT_DIR}/{split}_embeddings.csv")
    embeddings = embeddings.rename(columns={"Unnamed: 0": "Path"})
    embeddings["Path"] = embeddings["Path"].apply(
        lambda p: os.path.join(*p.split('/')[EMBEDDING_PATH_PREFIX_PARTS:])
    )
    return embeddings


def _load_labels(split):
    """Read `<split>.csv` and expose its image path as a "Path" column.

    Args:
        split: Split name used in the file name ("train", "val" or "test").

    Returns:
        DataFrame whose "path_to_image" column (if present) is renamed to
        "Path"; `rename` silently ignores a missing source column.
    """
    labels = pd.read_csv(f"{CHEXPERT_DIR}/{split}.csv")
    labels = labels.rename(columns={"path_to_image": "Path"})
    # Split on '/' and re-join with os.path.join so the key is built the same
    # way as the embedding paths.
    labels["Path"] = labels["Path"].apply(lambda p: os.path.join(*p.split('/')[:]))
    return labels


def _merge_split(labels, embeddings, split):
    """Inner-join labels and embeddings on "Path", failing loudly on no match.

    Args:
        labels: Frame returned by `_load_labels`.
        embeddings: Frame returned by `_load_embeddings`.
        split: Split name, used only in the error message.

    Returns:
        The merged DataFrame (label columns first, embedding columns last).

    Raises:
        ValueError: If both inputs have rows but none of them share a "Path".
    """
    merged = pd.merge(labels, embeddings, on="Path")
    # An inner merge drops unmatched rows without warning. A completely empty
    # result means the keys are incompatible - e.g. this notebook was already
    # run and `<split>.csv` now contains the exported, re-prefixed paths.
    if merged.empty and not labels.empty and not embeddings.empty:
        raise ValueError(
            f"No '{split}' rows matched between labels and embeddings on 'Path'. "
            f"If this notebook has already been run, {split}.csv was overwritten "
            "with the merged output and must be restored before re-running."
        )
    return merged


## LOAD EMBEDDINGS
df_embeddings_train = _load_embeddings("train")
df_embeddings_val = _load_embeddings("val")
df_embeddings_test = _load_embeddings("test")

## LOAD DATASET
df_train = _load_labels("train")
df_val = _load_labels("val")
df_test = _load_labels("test")

## MERGE
df_train_merged = _merge_split(df_train, df_embeddings_train, "train")
df_val_merged = _merge_split(df_val, df_embeddings_val, "val")
df_test_merged = _merge_split(df_test, df_embeddings_test, "test")

In [13]:
# Prefix every image path with the dataset's image root.
IMAGE_ROOT = "/CheXpert-v1.0-512/images/"

# os.path.join discards the earlier components when the later one is already
# absolute, so re-running this cell does not stack the prefix twice.
for merged_frame in (df_train_merged, df_val_merged, df_test_merged):
    merged_frame["Path"] = merged_frame["Path"].apply(
        lambda rel_path: os.path.join(IMAGE_ROOT, rel_path)
    )

# Show the first rewritten path as a sanity check (positional lookup, so it
# does not depend on the index labels).
df_train_merged["Path"].iloc[0]

'/CheXpert-v1.0-512/images/train/patient08517/study15/view1_frontal.jpg'

In [36]:
# Columns to export: report text, patient metadata, labels, then the trailing
# block of columns of the merged frame.
# NOTE(review): the trailing block is presumably the embedding vector (the
# preview shows columns named up to 1023) - this relies on the merge placing
# those columns last.
TRAILING_COLUMN_COUNT = 1024
trailing_columns = df_train_merged.columns[-TRAILING_COLUMN_COUNT:].tolist()
all_columns = [*columns, *metadata_columns, *diseases, *trailing_columns]


In [37]:
# Preview the first two rows restricted to the columns that will be exported.
df_train_merged.loc[:, all_columns].head(2)


Unnamed: 0,Path,report,section_findings,section_impression,age,sex,race,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/CheXpert-v1.0-512/images/train/patient08517/s...,NARRATIVE:\nRADIOGRAPHIC EXAMINATION OF THE CH...,\n \nStable appearance of median sternotomy wi...,\n \n1. Interval resolution of pulmonary edem...,53.0,Male,Asian,0.0,0.0,0.0,...,0.015382,0.009719,-0.018445,-0.006051,-0.011464,-0.016469,-0.016203,-0.019582,0.011147,0.025024
1,/CheXpert-v1.0-512/images/train/patient55989/s...,"NARRATIVE:\nEXAM: Chest 1 View, May 29, 2006\n...",,\n \n1. NEW LEFT BASE ATELECTASIS OR CONSOLID...,30.0,Female,White,0.0,0.0,0.0,...,0.015354,-0.014592,0.00421,-0.05169,-0.000128,0.002003,-0.001599,0.002281,-0.02411,0.000979


In [38]:
# Write the selected columns of each merged split to CSV, without the index.
# NOTE(review): these destinations are the same files the label data was read
# from earlier in this notebook, so the original label CSVs are replaced by
# the merged output - keep a copy of the originals before running this cell.
export_targets = [
    ("/mnt/data2/datasets_lfay/MedImageInsights/data/Chexpert/train.csv", df_train_merged),
    ("/mnt/data2/datasets_lfay/MedImageInsights/data/Chexpert/val.csv", df_val_merged),
    ("/mnt/data2/datasets_lfay/MedImageInsights/data/Chexpert/test.csv", df_test_merged),
]
for csv_path, merged_frame in export_targets:
    merged_frame[all_columns].to_csv(csv_path, index=False)
