# Import

In [4]:
import os
import pandas as pd
import numpy as np
import shutil
from sklearn.model_selection import train_test_split
import random

## Save all the images in the same folder

In [4]:
# Top-level directory that holds the original, nested image tree
root_folder = 'BIMCV-PadChest-FULL-resized512'

# Full path of every PNG found anywhere below root_folder.
# os.walk() visits the root and then each subfolder in turn, so the
# comprehension sees every file exactly once.
images = [
    os.path.join(current_dir, filename)
    for current_dir, _subdirs, filenames in os.walk(root_folder)
    for filename in filenames
    if filename.endswith(".png")
]

# Report how many image paths were collected
print("Number of images:", len(images))

Number of images: 157105


In [5]:
# Flat folder that will receive a copy of every collected image
dest_path = 'PadChest512_Full'

# Only create the folder when it is not there yet
destination_missing = not os.path.exists(dest_path)
if destination_missing:
    os.makedirs(dest_path)

# Copy the images one by one; each file keeps its own base name inside
# dest_path, so the nested source layout is flattened
for source_image in images:
    shutil.copy(source_image, dest_path)

## Load the images

In [2]:
# Locations used by the rest of the notebook:
#   csv_path  - CSV file with the labels of the studies
#   dest_path - flat folder containing all the images
csv_path = 'CodeChestXRay/TrainXray/Data/PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv'
dest_path = 'PadChest512_Full'

In [140]:
# Read the CSV file with the labels
csv_table = pd.read_csv(csv_path, low_memory=False)

#---------------------Filters--------------------:
imageIDs_available = "CodeChestXRay/TrainXray/normal_classification/normal_classify_training/Resources/available_imageIDs_PadChest512.txt"
with open(imageIDs_available, "r") as f:
    # One image ID per line; strip the trailing newline / surrounding blanks
    avail_imgID = [line.strip() for line in f]

# Filter by available image ID (skipped when the list is empty)
if avail_imgID:
    csv_table = csv_table[csv_table["ImageID"].isin(avail_imgID)]

# Filter by projection.
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of the original table
# (which would raise pandas' SettingWithCopyWarning).
csv_table = csv_table[csv_table["Projection"].isin(["PA"])].copy()

# Remove rows whose Labels contain 'exclude' or 'suboptimal study'.
# Labels is cast to str first so that missing values can be searched too;
# regex=False because both patterns are plain substrings.
csv_table["Labels"] = csv_table["Labels"].astype(str)
has_exclude = csv_table["Labels"].str.contains('exclude', regex=False)
has_suboptimal = csv_table["Labels"].str.contains('suboptimal study', regex=False)
csv_table = csv_table[~(has_exclude | has_suboptimal)]

In [141]:
# Number of rows left in csv_table after the filters applied above
print("PA and not exclude or suboptimal study:", csv_table.shape[0])

PA and not exclude or suboptimal study: 88018


## Save the PA images in a new folder: PadChest512_PA_Full

In [9]:
# Plain Python list with the ImageID of every row kept in csv_table
imageIDs = csv_table.loc[:, 'ImageID'].to_list()

In [12]:
# Folder that receives the subset of images listed in imageIDs
pa_folder = 'PadChest512_PA_Full'

# shutil.copy() does not create missing directories, so make sure the
# target folder exists before copying (no-op when it is already there)
os.makedirs(pa_folder, exist_ok=True)

# Copy every selected image from the flat folder into the PA folder,
# keeping the same file name
for img in imageIDs:
    source_path = os.path.join(dest_path, img)
    target_path = os.path.join(pa_folder, img)
    shutil.copy(source_path, target_path)

## We delete some samples manually: PadChest512_PA_Manually_filtered

### Discard images

In [136]:
# Names of all entries currently stored in the PA folder
tatal_PA_IDs = list(os.listdir('PadChest512_PA_Full'))
print('PA images available %s' % len(tatal_PA_IDs))

PA images available 88018


In [142]:
# Names of all entries that remain in the manually filtered folder
image_validIDs = list(os.listdir('PadChest512_PA_Manually_filtered'))
print('PA images available after filtering: %s' % len(image_validIDs))

PA images available after filtering: 87946


In [138]:
# Image IDs that are in the full PA folder but no longer in the manually
# filtered one, i.e. the images that were removed by hand.
# sorted() gives the list (and therefore the file below) a stable order;
# a bare set difference has no defined order and can change between runs.
remove = sorted(set(tatal_PA_IDs) - set(image_validIDs))

# Write the discarded image IDs to a txt file, one ID per line
with open("discard_manually_PA_imageIDs.txt", "w") as f:
    f.writelines(imageID + "\n" for imageID in remove)

In [143]:
# Read the discarded image IDs back (one per line) and report how many there are
with open('discard_manually_PA_imageIDs.txt', "r") as f:
    deleted_imageID = [entry.strip() for entry in f]
    print('Images deleted: %s' % len(deleted_imageID))

Images deleted: 72


In [145]:
# Save the name of every PNG left in the manually filtered folder,
# one file name per line
with open('available_imageIDs_PC512_PA_filtered.txt', 'w') as f:
    png_names = (
        entry
        for entry in os.listdir('PadChest512_PA_Manually_filtered')
        if entry.endswith('.png')
    )
    f.writelines(entry + '\n' for entry in png_names)

## Create a txt file with the train, valid and test images (balance normal and abnormal images)

In [149]:
# Reload the label table from the CSV file
csv_table = pd.read_csv(csv_path, low_memory=False)

#---------------------Filters--------------------:
# Text file with one available image ID per line
imageIDs_available = "available_imageIDs_PC512_PA_filtered.txt"
with open(imageIDs_available, "r") as f:
    avail_imgID = [raw_line.strip() for raw_line in f]

# Keep only the rows whose ImageID is listed as available
# (the table is left untouched when the list is empty)
if len(avail_imgID) > 0:
    keep_rows = csv_table["ImageID"].isin(avail_imgID)
    csv_table = csv_table[keep_rows]

In [150]:
# Number of rows left in csv_table after filtering by available image ID
csv_table.shape[0]

87946

In [151]:
# List of (ImageID, Labels) tuples, one per row of csv_table.
# Zipping the two columns gives the same tuples as looping with iterrows(),
# without building a Series object for each of the rows.
images_list = list(zip(csv_table['ImageID'], csv_table['Labels']))

# An image is "normal" only when its Labels value is exactly "['normal']";
# every other value counts as pathological
normal_images = [img for img in images_list if img[1] == "['normal']"]
pathological_images = [img for img in images_list if img[1] != "['normal']"]

# Count the number of images in each category
num_normal_images = len(normal_images)
num_pathological_images = len(pathological_images)

In [152]:
# Summary of the class balance before any sampling
class_summary = (
    ('Normal images:', num_normal_images),
    ('Abnormal images:', num_pathological_images),
    ('Total number of images:', len(images_list)),
)
for caption, amount in class_summary:
    print(caption, amount)

Normal images: 33366
Abnormal images: 54580
Total number of images: 87946


### Sample pathological images and write the samples in a file

In [154]:
# Draw 35000 pathological images without replacement.
# A dedicated, seeded generator makes the draw reproducible: the same subset
# is selected on every run, independently of the global `random` state.
sampling_rng = random.Random(42)
pathologica_images_sampled = sampling_rng.sample(pathological_images, 35000)

In [155]:
# Store the ImageID (first tuple element) of every sampled pathological
# image, one per line
with open("sampled_phatological_35000.txt", "w") as file:
    file.writelines(str(sampled[0]) + "\n" for sampled in pathologica_images_sampled)

In [157]:
# Summary of the class balance after sampling the pathological images
num_sampled_pathological = len(pathologica_images_sampled)
print('Normal images:', num_normal_images)
print('Abnormal images:', num_sampled_pathological)
print('Total number of images:', num_normal_images + num_sampled_pathological)

Normal images: 33366
Abnormal images: 35000
Total number of images: 68366


### Split the dataset

In [159]:
# Every split below uses the same settings: 10% held out, fixed seed
split_options = dict(test_size=0.1, random_state=42)

# Per class: 90% goes to the training pool, 10% to validation
train_normal, valid_normal = train_test_split(normal_images, **split_options)
train_pathological, valid_pathological = train_test_split(pathologica_images_sampled, **split_options)

# Validation set = validation part of both classes
valid_imgs = valid_normal + valid_pathological

# The test set is carved out of the combined training pool: 10% of the
# pooled normal + pathological training images become test images and
# the remaining 90% are the final training set
train_imgs, test_imgs = train_test_split(train_normal + train_pathological, **split_options)

### Write the image IDs in a text file

In [168]:
# Write the image IDs of every split to its own text file, one ID per line.
# The test file keeps its original name; the train and valid files follow
# the same naming pattern so that a single run of this cell produces all
# three files.
split_files = {
    "Full_PA_balanced_train_imageIDs.txt": train_imgs,
    "Full_PA_balanced_valid_imageIDs.txt": valid_imgs,
    "Full_PA_balanced_test_imageIDs.txt": test_imgs,
}
for split_filename, split_imgs in split_files.items():
    with open(split_filename, "w") as file:
        # Each entry is an (ImageID, Labels) tuple; only the ID is stored
        for imageID in split_imgs:
            file.write(str(imageID[0]) + "\n")

### Count the number of images in each file

In [190]:
# Reset the per-split counters: normal images ...
normal_train = normal_valid = normal_test = 0
# ... and pathological images
pathological_train = pathological_valid = pathological_test = 0

In [191]:
def _count_normal_and_pathological(imgs, known_ids, normal_ids):
    """Count the normal and pathological images of one split.

    Parameters
    ----------
    imgs : list of tuple
        Entries of the split; the first element of each tuple is the ImageID.
    known_ids : set
        ImageIDs present in the CSV table. Images whose ID is not in this
        set are skipped and counted in neither category.
    normal_ids : set
        ImageIDs that have a row labelled exactly "['normal']".

    Returns
    -------
    tuple of int
        (number of normal images, number of pathological images)
    """
    normal = 0
    pathological = 0
    for img in imgs:
        img_name = img[0]
        # Images without a row in the CSV table are ignored
        if img_name not in known_ids:
            continue
        if img_name in normal_ids:
            normal += 1
        else:
            pathological += 1
    return normal, pathological


# Build the lookup sets once. Membership tests on a set are constant time,
# whereas scanning the ImageID column for every image costs a full pass
# over the table per image.
known_ids = set(csv_table['ImageID'])
normal_ids = set(csv_table.loc[csv_table['Labels'] == "['normal']", 'ImageID'])

# The counters are assigned (not incremented), so re-running this cell
# always gives the same numbers.
normal_train, pathological_train = _count_normal_and_pathological(train_imgs, known_ids, normal_ids)

# Print the number of normal and pathological images in the train split
print("Number of normal images in train dataset:", normal_train)
print("Number of pathological images in train dataset:", pathological_train)

print("--------------------------------------------------------------")

normal_valid, pathological_valid = _count_normal_and_pathological(valid_imgs, known_ids, normal_ids)

# Print the number of normal and pathological images in the valid split
print("Number of normal images in valid dataset:", normal_valid)
print("Number of pathological images in valid dataset:", pathological_valid)

print("--------------------------------------------------------------")

normal_test, pathological_test = _count_normal_and_pathological(test_imgs, known_ids, normal_ids)

# Print the number of normal and pathological images in the test split
print("Number of normal images in test dataset:", normal_test)
print("Number of pathological images in test dataset:", pathological_test)

Number of normal images in train dataset: 27001
Number of pathological images in train dataset: 28375
--------------------------------------------------------------
Number of normal images in valid dataset: 3337
Number of pathological images in valid dataset: 3500
--------------------------------------------------------------
Number of normal images in test dataset: 3028
Number of pathological images in test dataset: 3125
