# Split PadChest dataset into different subsets to train the models for each pathology

In [1]:
import os
import pandas as pd
import numpy as np
import shutil
from sklearn.model_selection import train_test_split
import random

## Prepare the dataset

In [2]:
# Path to the PadChest labels CSV, relative to the current working directory
csv_path = '../CodeChestXRay/TrainXray/Data/PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv'

In [4]:
# Read the label table from the path stored in `csv_path`, instead of
# repeating the same file as a machine-specific absolute path.
csv_table = pd.read_csv(csv_path, low_memory=False)

# Replace missing values in the "Labels" column with an empty string.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that form is chained assignment, which pandas deprecates
# and which leaves the frame untouched when Copy-on-Write is enabled.
csv_table['Labels'] = csv_table['Labels'].fillna('')

# Drop the rows whose labels contain 'exclude' or 'suboptimal study'
csv_table = csv_table[~csv_table["Labels"].str.contains('exclude')]
csv_table = csv_table[~csv_table["Labels"].str.contains('suboptimal study')]

# Keep only the rows whose projection is PA
csv_table = csv_table[csv_table['Projection'] == 'PA']

# Filter the images by the available IDs (the text file holds one ID per line)
avail_imgID_path = "PadChest_512_PA_manually_filtered_87946.txt"

with open(avail_imgID_path, "r") as f:
    avail_imgID = [line.strip() for line in f]

csv_table = csv_table[csv_table["ImageID"].isin(avail_imgID)]

print("Images PA in PadChest512 (excluding the labels “exclude” o “suboptimal study”"+
      f"and manually filtered): {len(csv_table)}")

Images PA in PadChest512 (excluding the labels “exclude” o “suboptimal study”and manually filtered): 87946


## Obtain the occurrence of each pathology

In [5]:
# Pathology groups used throughout the notebook.
# Each key is a group name; each value lists the label strings that are
# searched for in the "Labels" column to decide whether an image belongs to
# that group.
conditions = {
    "Infiltrations": [
        "infiltrates",
        "interstitial pattern",
        "ground glass pattern",
        "reticular interstitial pattern",
        "reticulonodular interstitial pattern",
        "alveolar pattern",
        "consolidation",
        "air bronchogram",
    ],
    "Atelectasis": ["atelectasis"],
    "Nodule": ["nodule", "multiple nodules"],
    "Mass": ["pulmonary mass"],
    "Cavitation": ["cavitation", "abscess", "cyst"],
    "Air Trapping": ["air trapping"],
    "Cardiomegaly": ["cardiomegaly"],
    "Heart Insufficiency": ["heart insufficiency"],
    "Pleural Effusion": ["pleural effusion"],
    "Pneumothorax": ["pneumothorax"],
    "Rib Fracture": ["rib fracture"],
    "Thoracic Cage Deformation": ["thoracic cage deformation"],
    "Mediastinal Conditions": [
        "mediastinal shift",
        "mediastinal mass",
        "pneumomediastinum",
    ],
}

In [13]:
# Print, for every pathology group, how many rows of csv_table have a
# "Labels" string containing at least one of the group's labels.
for pathology, labels in conditions.items():
    # Joining with '|' makes str.contains treat the labels as regex
    # alternatives, so a row matches when any one of them occurs in it.
    has_pathology = csv_table['Labels'].str.contains('|'.join(labels))
    # Summing the boolean mask counts the matches without copying the rows.
    count = int(has_pathology.sum())
    print(f"{pathology}: {count}")

Infiltrations: 7503
Atelectasis: 3839
Nodule: 3540
Mass: 520
Cavitation: 284
Air Trapping: 3239
Cardiomegaly: 8114
Heart Insufficiency: 679
Pleural Effusion: 3205
Pneumothorax: 217
Rib Fracture: 2080
Thoracic Cage Deformation: 151
Mediastinal Conditions: 225


### Create subsets containing the pathology and not containing the pathology.

In [26]:
subsets_with_pathology = {}  # pathology name -> rows whose labels match the pathology
subsets_without_pathology = {}  # pathology name -> sampled rows whose labels do not match

# Iterate over the conditions dictionary
for pathology, labels in conditions.items():
    # Compute the match mask once; '|' makes the labels regex alternatives.
    has_pathology = csv_table['Labels'].str.contains('|'.join(labels))
    subset_with_pathology = csv_table[has_pathology]
    candidates_without_pathology = csv_table[~has_pathology]

    # Randomly sample the rows without pathology to match the number of rows
    # with pathology. Sampling is done WITHOUT replacement so the same image
    # cannot be drawn twice (duplicates could otherwise end up in more than
    # one of the train/validation/test splits), and with a fixed seed so the
    # subsets are reproducible. min() keeps the request valid if there are
    # fewer negative candidates than positive rows.
    n_without = min(len(subset_with_pathology), len(candidates_without_pathology))
    subset_without_pathology = candidates_without_pathology.sample(
        n=n_without, replace=False, random_state=42
    )

    subsets_with_pathology[pathology] = subset_with_pathology
    subsets_without_pathology[pathology] = subset_without_pathology

### Create a list of (image ID, labels) pairs for each subset, so that positive and negative samples can be combined

In [35]:
def get_image_label_list(df, sample_size = None):
    """Return the (ImageID, Labels) pairs of ``df`` as a list of tuples.

    Args:
        df: DataFrame with an 'ImageID' column and a 'Labels' column.
        sample_size: If given, return only this many pairs, drawn at random
            without replacement via ``random.sample``. If None, return every
            pair in row order.

    Returns:
        A list of ``(ImageID, Labels)`` tuples.

    Raises:
        ValueError: If ``sample_size`` is larger than the number of rows
            (raised by ``random.sample``).
    """
    # Zip the two columns directly instead of looping with iterrows(), which
    # builds a Series object for every row.
    image_label_list = list(zip(df['ImageID'], df['Labels']))

    if sample_size is not None:
        image_label_list = random.sample(image_label_list, sample_size)

    return image_label_list

### Divide each subset into training, validation and test

In [38]:
def _write_image_ids(path, image_label_pairs):
    """Write the image ID of each (ImageID, Labels) pair to ``path``, one per line."""
    with open(path, "w") as file:
        file.writelines(f"{image_id}\n" for image_id, _ in image_label_pairs)


# For every pathology: combine its positive and negative samples, split them
# into train/validation/test, and write each split's image IDs to a txt file.
for pathology in conditions:

    # (ImageID, Labels) pairs of the subset with the pathology
    yes_list = get_image_label_list(subsets_with_pathology[pathology])

    # (ImageID, Labels) pairs of the subset without the pathology
    not_list = get_image_label_list(subsets_without_pathology[pathology])

    # Combine the lists. Repeated pairs are dropped (keeping first-seen order)
    # so that one image cannot be placed in more than one split.
    combined_list = list(dict.fromkeys(not_list + yes_list))

    # Hold out 10% for test, then 10% of the remainder for validation
    train, test = train_test_split(combined_list, test_size=0.10, random_state=42)
    train, valid = train_test_split(train, test_size=0.1, random_state=42)

    # Write the image IDs of each split to its own txt file; each file is
    # opened and closed on its own rather than nested inside another.
    _write_image_ids(f"{pathology}_train_imageIDs.txt", train)
    _write_image_ids(f"{pathology}_valid_imageIDs.txt", valid)
    _write_image_ids(f"{pathology}_test_imageIDs.txt", test)