## Dataset and sampling

**Authors**: Alisa Ochs & André Schomakers

**Date**: 29.05.25

This interactive Python notebook `.ipynb` is designed to sample $\frac{1}{10}$ of the chosen dataset _Chest X-Rays Computer Vision Project_, available on:
[Roboflow](https://universe.roboflow.com/mohamed-traore-2ekkp/chest-x-rays-qjmia).

The original dataset contains:

train folder:
- 1349 images with class NORMAL
- 3883 images with class PNEUMONIA

test folder:
- 234 images with class NORMAL
- 390 images with class PNEUMONIA

Within the configured `target_directory` we now have $\frac{1}{10}$ of the corresponding images, which allows faster model training in our main `analysis.ipynb`.



In [1]:
# imports
import os
import shutil
import random
from pathlib import Path


In [2]:

# --- Configuration ---
# Share of each class folder to keep (0.1 == 10 percent).
SAMPLING_FRACTION = 0.1
# Dataset locations, relative to the directory the notebook is run from.
SOURCE_BASE_DIR = Path("ChestXRay2017") / "chest_xray"
TARGET_BASE_DIR = SOURCE_BASE_DIR.with_name("chest_xray_sampled")


In [3]:
# AI generated code for sampling (Gemini 2.5)

def get_image_files(directory):
    """Return the sorted names of the image files directly inside *directory*.

    Only regular files whose name ends in ``.jpeg``, ``.jpg`` or ``.png``
    (case-insensitive) are reported; sub-directories are ignored even if
    their name looks like an image, because callers copy the results as files.

    Args:
        directory: Path (``str`` or ``Path``) of the folder to scan.

    Returns:
        A list of file names (not full paths) in ascending order, or an
        empty list when *directory* does not exist or is not a directory.
    """
    image_extensions = ('.jpeg', '.jpg', '.png')
    dir_path = Path(directory)  # accept both str and Path
    if not dir_path.is_dir():
        return []
    # Sorting makes the listing independent of the file system's enumeration
    # order, so downstream random sampling can be made reproducible.
    return sorted(
        entry.name
        for entry in dir_path.iterdir()
        if entry.is_file() and entry.name.lower().endswith(image_extensions)
    )

# --- Main Sampling Logic ---
def _count_to_sample(total: int, fraction: float) -> int:
    """Return how many of *total* files to draw for the given *fraction*."""
    expected = total * fraction
    if not expected > 0:
        return 0
    # Round to the nearest integer, but never round a positive request down
    # to zero, and never ask for more files than exist.
    return min(max(1, int(round(expected))), total)


def sample_dataset(source_base: Path, target_base: Path, fraction: float, seed=None):
    """Copy a random fraction of every split/class folder into a new tree.

    For each of the splits ``train`` and ``test`` and each of the classes
    ``NORMAL`` and ``PNEUMONIA`` found under *source_base*, a random subset
    of the image files is copied to the same relative location under
    *target_base*. Missing split or class folders are reported and skipped.
    A summary of the resulting tree is printed at the end.

    Args:
        source_base: Root of the original dataset.
        target_base: Root of the sampled copy. An existing directory at this
            location is deleted first.
        fraction: Share of the files of each class folder to copy. At least
            one file is copied whenever the share is positive.
        seed: Optional seed for the random selection. With a seed the same
            files are chosen on every run; without one the module-level
            ``random`` generator is used.

    Raises:
        FileNotFoundError: If *source_base* is not an existing directory.
        ValueError: If *target_base* is *source_base* or one of its parent
            directories, since clearing the target would delete the source.
    """
    source_resolved = source_base.resolve()
    target_resolved = target_base.resolve()

    print("Starting dataset sampling...")
    print(f"Source: {source_resolved}")
    print(f"Target: {target_resolved}")
    print(f"Sampling fraction: {fraction*100:.1f}%\n")

    # Validate before anything is deleted: a wrong source path must not wipe
    # an existing sample, and the target must never cover the source data.
    if not source_base.is_dir():
        raise FileNotFoundError(f"Source directory '{source_base}' not found.")
    if target_resolved == source_resolved or target_resolved in source_resolved.parents:
        raise ValueError(
            f"Target directory '{target_base}' is or contains the source directory "
            f"'{source_base}'; refusing to delete it."
        )

    if target_base.exists():
        print(f"Target directory '{target_base}' already exists. Removing it to start fresh.")
        shutil.rmtree(target_base)

    target_base.mkdir(parents=True, exist_ok=True)

    rng = random if seed is None else random.Random(seed)

    splits = ['train', 'test']
    classes = ['NORMAL', 'PNEUMONIA']

    for split_name in splits:
        source_split_dir = source_base / split_name
        target_split_dir = target_base / split_name

        if not source_split_dir.exists():
            print(f"  WARNING: Source split directory '{source_split_dir}' not found. Skipping split: {split_name}")
            continue

        target_split_dir.mkdir(exist_ok=True)  # only mirror splits that exist in the source
        print(f"Processing split: {split_name}")

        for class_name in classes:
            source_class_dir = source_split_dir / class_name
            target_class_dir = target_split_dir / class_name

            if not source_class_dir.exists():
                print(f"    WARNING: Source class directory '{source_class_dir}' not found. Skipping class: {class_name} in split: {split_name}")
                continue

            target_class_dir.mkdir(exist_ok=True)  # only mirror classes that exist in the source
            print(f"  Processing class: {class_name}")

            # A fixed candidate order is required for a seed to select the
            # same files on every run.
            image_files = sorted(get_image_files(source_class_dir))

            if not image_files:
                print(f"    No image files found in '{source_class_dir}'. Skipping.")
                continue

            num_total_files = len(image_files)
            num_to_sample = _count_to_sample(num_total_files, fraction)

            if num_to_sample == 0:
                print(f"Found {num_total_files} images. Sampling fraction {fraction*100:.1f}% results in 0 files to sample for this class.\n")
                continue

            print(f"    Found {num_total_files} images. Sampling {num_to_sample} images.")
            sampled_files = rng.sample(image_files, num_to_sample)
            for file_name in sampled_files:
                # copy2 also preserves file metadata such as timestamps
                shutil.copy2(source_class_dir / file_name, target_class_dir / file_name)
            print(f"Copied {len(sampled_files)} files to '{target_class_dir}'\n")

    print("Dataset sampling complete!")
    print_dataset_summary(target_base)


def print_dataset_summary(base_dir: Path):
    """Print the number of image files per split and class under *base_dir*.

    Every sub-directory of *base_dir* is reported as a split and every
    sub-directory of a split as a class; plain files at either level are
    ignored. Entries are visited in sorted order so the report is stable.
    """
    print(f"\n--- Summary of Sampled Dataset at '{base_dir.resolve()}' ---")
    for split_path in sorted(base_dir.iterdir()):
        if not split_path.is_dir():
            continue
        print(f"Split: {split_path.name}")
        for class_path in sorted(split_path.iterdir()):
            if not class_path.is_dir():
                continue
            image_count = len(get_image_files(class_path))
            print(f"  Class: {class_path.name} - Files: {image_count}")
    print("--------------------------------------------------")


In [4]:
# The dataset paths in the following cell are built relative to this directory,
# so show it to make path problems easy to spot.
current_working_directory = Path.cwd()
print(
    "Current Working Directory for path resolution: "
    f"{current_working_directory.resolve()}"
)

Current Working Directory for path resolution: C:\Users\alisa\OneDrive\UMIT\2 Semester\Modul 12 Applications of Machine Learning in Health Care\AMLHC-final-exam


In [5]:
# Source and target dataset roots, located directly inside the current working directory.
# NOTE(review): these differ from SOURCE_BASE_DIR / TARGET_BASE_DIR in the configuration
# cell, which point below "./ChestXRay2017/" - confirm which location is the intended one.
CHESTXRAY_ROOT_SOURCE_BASE_DIR = current_working_directory / "chest_xray"
CHESTXRAY_ROOT_TARGET_BASE_DIR = current_working_directory / "chest_xray_sampled"

In [None]:
# Run the sampling with the fraction from the configuration cell, so the
# share of images kept is defined in one place only.
sample_dataset(
    CHESTXRAY_ROOT_SOURCE_BASE_DIR,
    CHESTXRAY_ROOT_TARGET_BASE_DIR,
    fraction=SAMPLING_FRACTION,
)