<a href="https://colab.research.google.com/github/matsu641/APS360Project/blob/main/APS360_project_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in the
# following cells all live under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os
import shutil

# Read the CSV that holds the per-image metadata and finding labels
csv_path = "/content/drive/MyDrive/NIH_ChestXray/Data_Entry_2017.csv"
df = pd.read_csv(csv_path)

# This project works with four classes only
target_classes = ["No Finding", "Pneumonia", "Effusion", "Cardiomegaly"]

# Exact-match filter: a row is kept only when its whole "Finding Labels" value
# equals one target class, so multi-label entries ("A|B") are dropped
is_target_row = df["Finding Labels"].isin(target_classes)
df_filtered = df[is_target_row]

print("Original dataset size:", len(df))
print("Filtered dataset size:", len(df_filtered))

Original dataset size: 112120
Filtered dataset size: 65731


In [None]:
# Flatten every "|"-separated "Finding Labels" entry into one list of single labels
all_labels = [
    label
    for entry in df["Finding Labels"]
    for label in entry.split('|')
]

# How often each individual label occurs across the full dataset
label_counts = pd.Series(all_labels).value_counts()

# Occurrences relative to the number of rows in df, in percent
# (an image can carry several labels, so the column need not sum to 100)
label_percent = label_counts / len(df) * 100

# Side-by-side table of raw counts and rounded percentages
summary = pd.DataFrame(
    {"Count": label_counts, "Percentage": label_percent.round(2)}
)

print(summary)

                    Count  Percentage
No Finding          60361       53.84
Infiltration        19894       17.74
Effusion            13317       11.88
Atelectasis         11559       10.31
Nodule               6331        5.65
Mass                 5782        5.16
Pneumothorax         5302        4.73
Consolidation        4667        4.16
Pleural_Thickening   3385        3.02
Cardiomegaly         2776        2.48
Emphysema            2516        2.24
Edema                2303        2.05
Fibrosis             1686        1.50
Pneumonia            1431        1.28
Hernia                227        0.20


In [None]:
# Build the subset dataset: one sub-directory of output_dir per target class,
# each filled with the images of df_filtered that carry that label
output_dir = "/content/drive/MyDrive/NIH_ChestXray_subset"
os.makedirs(output_dir, exist_ok=True)

for cls in target_classes:
    os.makedirs(os.path.join(output_dir, cls), exist_ok=True)

# Copy selected images into class-specific folders
images_dir = "/content/drive/MyDrive/NIH_ChestXray/images"

# Rows whose image file is absent from images_dir are still skipped (best
# effort), but both outcomes are counted so the cell reports how complete the
# subset really is instead of silently dropping files
copied_count = 0
missing_count = 0

# Iterate over just the two needed columns; this avoids constructing a Series
# per row the way iterrows() does
for fname, label in zip(df_filtered["Image Index"], df_filtered["Finding Labels"]):
    src = os.path.join(images_dir, fname)
    dst = os.path.join(output_dir, label, fname)

    if os.path.exists(src):
        shutil.copy(src, dst)
        copied_count += 1
    else:
        missing_count += 1

print(
    f"Copied {copied_count} of {len(df_filtered)} selected images "
    f"({missing_count} not found in {images_dir} and skipped)"
)
print("Subset dataset created successfully!")


Subset dataset created successfully!


In [None]:
# Tally the image files found in each sub-directory of output_dir
image_extensions = ('.png', '.jpg', '.jpeg')
counts = {}
for cls in os.listdir(output_dir):
    class_path = os.path.join(output_dir, cls)
    # Only directories are treated as class folders; loose files are ignored
    if not os.path.isdir(class_path):
        continue
    # Extension check is case-insensitive
    num_files = sum(
        1 for name in os.listdir(class_path)
        if name.lower().endswith(image_extensions)
    )
    counts[cls] = num_files
    print(f"{cls}: {num_files} images")

No Finding: 5999 images
Pneumonia: 34 images
Effusion: 309 images
Cardiomegaly: 91 images


In [None]:
# Report each class's share of the images counted in `counts`
total = sum(counts.values())

for cls, num in counts.items():
    # With no images at all (total == 0) the division would raise
    # ZeroDivisionError, so report 0.00% in that case
    ratio = num / total * 100 if total else 0.0
    print(f"{cls}: {num} images ({ratio:.2f}%)")

print(f"\nTotal images: {total}")

No Finding: 5999 images (93.25%)
Pneumonia: 34 images (0.53%)
Effusion: 309 images (4.80%)
Cardiomegaly: 91 images (1.41%)

Total images: 6433
