In [1]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# For reproducibility
import random
import numpy as np

random.seed(42)
np.random.seed(42)

In [3]:
# Define paths
base_dir = "../data/medical_images/"
raw_images_dir = os.path.join(base_dir, "raw/all_images/")
labels_csv = os.path.join(base_dir, "raw/image_labels.csv")

# Define output directories
train_dir = os.path.join(base_dir, "train/")
val_dir = os.path.join(base_dir, "val/")
test_dir = os.path.join(base_dir, "test/")

# Create directories if they don't exist
for directory in [train_dir, val_dir, test_dir]:
    os.makedirs(directory, exist_ok=True)

In [6]:
# Load the CSV file
df = pd.read_csv(labels_csv)

# Display the first few rows
print("Dataset Preview:")
print(df.head())

# Dataset information
print("\nDataset Info:")
print(df.info())
print(f"\nNumber of Images: {df.shape[0]}")

Dataset Preview:
      filename  label
0  image_0.jpg      0
1  image_1.jpg      0
2  image_2.jpg      0
3  image_3.jpg      0
4  image_4.jpg      0

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5856 entries, 0 to 5855
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  5856 non-null   object
 1   label     5856 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 91.6+ KB
None

Number of Images: 5856


In [7]:
# Check class distribution
print("\nClass Distribution:")
print(df['label'].value_counts())


Class Distribution:
label
1    4273
0    1583
Name: count, dtype: int64


In [8]:
# Split dataset into train, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=42)

# Print the sizes of each split
print("\nSplit Sizes:")
print(f"Training Set: {train_df.shape[0]} samples")
print(f"Validation Set: {val_df.shape[0]} samples")
print(f"Testing Set: {test_df.shape[0]} samples")



Split Sizes:
Training Set: 4215 samples
Validation Set: 469 samples
Testing Set: 1172 samples


In [12]:
def organize_images(dataframe, source_dir, target_dir):
    """
    Copies images into class-labeled folders for a given dataframe.
    """
    for _, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0], desc=f"Organizing {target_dir}"):
        label = row['label']
        filename = row['filename']
        src_path = os.path.join(source_dir, filename)
        label_dir = os.path.join(target_dir, str(label))
        os.makedirs(label_dir, exist_ok=True)
        dst_path = os.path.join(label_dir, filename)
        shutil.copy(src_path, dst_path)

In [13]:
# Organize images into train, val, and test directories
organize_images(train_df, raw_images_dir, train_dir)
organize_images(val_df, raw_images_dir, val_dir)
organize_images(test_df, raw_images_dir, test_dir)

Organizing ../data/medical_images/train/: 100%|██████████| 4215/4215 [01:05<00:00, 64.10it/s]
Organizing ../data/medical_images/val/: 100%|██████████| 469/469 [00:06<00:00, 67.37it/s]
Organizing ../data/medical_images/test/: 100%|██████████| 1172/1172 [00:15<00:00, 73.65it/s]


In [14]:
def count_images_in_dir(directory):
    """
    Counts the number of images in each subdirectory (class) under the given directory.
    """
    counts = {}
    for label_dir in os.listdir(directory):
        label_path = os.path.join(directory, label_dir)
        if os.path.isdir(label_path):
            counts[label_dir] = len(os.listdir(label_path))
    return counts

print("\nTraining Set Distribution:")
print(count_images_in_dir(train_dir))

print("\nValidation Set Distribution:")
print(count_images_in_dir(val_dir))

print("\nTesting Set Distribution:")
print(count_images_in_dir(test_dir))


Training Set Distribution:
{'0': 1018, '1': 3076}

Validation Set Distribution:
{'0': 127, '1': 342}

Testing Set Distribution:
{'0': 305, '1': 855}


**Summary:**
- Loaded and explored the image labels dataset.
- Split the data into training, validation, and testing sets (80/10/10 split).
- Organized the images into labeled subdirectories under `train/`, `val/`, and `test/`.
- Saved the splits as CSV files for future reference.

**Next Steps:**
- Move on to model creation and training in the next notebook.