## Data Preprocessing

### Re-split the data

Dataset Resplitting Strategy
Due to class imbalance and missing classes in the original test set, the initial train–test split was not suitable for reliable model evaluation. Therefore, all images from the original training and test folders were merged and re-split using a stratified strategy to ensure that each class is represented in all subsets. The dataset was divided into 70% training, 15% validation, and 15% testing data, preserving class proportions across splits. This approach ensures fair evaluation, stable performance metrics, and prevents bias caused by an unrepresentative test set.

In [1]:
import os
import shutil
import random

In [2]:
def ensure_dir(path):
    """Create the directory ``path``, including any missing parents.

    Calling this on a directory that already exists is a no-op.
    """
    os.makedirs(name=path, exist_ok=True)

In [7]:
def stratified_split(
    train_dir,
    test_dir,
    output_dir,
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
    seed=42
):
    """Merge two class-structured image folders and re-split them per class.

    Both ``train_dir`` and ``test_dir`` are expected to contain one
    sub-directory per class. For every class, the files from both sources are
    pooled, shuffled, and copied into ``output_dir/Train/<class>``,
    ``output_dir/Val/<class>`` and ``output_dir/Test/<class>``.

    Args:
        train_dir: Source folder with one sub-directory per class.
        test_dir: Second source folder with the same layout. It may be missing,
            and it may lack (or add) classes relative to ``train_dir``.
        output_dir: Destination root; created if necessary.
        train_ratio: Fraction of each class copied to ``Train``.
        val_ratio: Fraction of each class copied to ``Val``.
        test_ratio: Fraction for ``Test``. Only used for validation: ``Test``
            receives whatever remains after the Train and Val slices.
        seed: Seed for the global ``random`` generator, for reproducibility.

    Raises:
        ValueError: If the three ratios do not sum to 1 (within 1e-6).

    Notes:
        Slice sizes are truncated with ``int()``, so very small classes can end
        up with an empty ``Train`` or ``Val`` folder.
    """
    import os
    import shutil
    import random

    # Explicit exception instead of ``assert`` so the check survives ``python -O``.
    if not abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6:
        raise ValueError(
            "train_ratio, val_ratio and test_ratio must sum to 1.0, got "
            f"{train_ratio} + {val_ratio} + {test_ratio}"
        )

    random.seed(seed)

    for split in ["Train", "Val", "Test"]:
        os.makedirs(os.path.join(output_dir, split), exist_ok=True)

    def list_class_dirs(src_dir):
        # Only sub-directories are classes; stray files such as ``.DS_Store``
        # must not be treated as a class.
        return {
            entry
            for entry in os.listdir(src_dir)
            if os.path.isdir(os.path.join(src_dir, entry))
        }

    # Take the union of both sources so a class that exists only in
    # ``test_dir`` is not silently dropped.
    classes = list_class_dirs(train_dir)
    if os.path.isdir(test_dir):
        classes |= list_class_dirs(test_dir)

    # ``os.listdir`` order is arbitrary; sorting makes the seeded shuffle
    # reproducible across machines and file systems.
    for cls in sorted(classes):
        print(f"Processing class: {cls}")
        images = []

        # Collect files only (ignore directories)
        for src_dir in [train_dir, test_dir]:
            cls_path = os.path.join(src_dir, cls)
            if not os.path.isdir(cls_path):
                continue

            for item in sorted(os.listdir(cls_path)):
                item_path = os.path.join(cls_path, item)
                if os.path.isfile(item_path):
                    images.append(item_path)

        if not images:
            print(f" Skipping {cls}: no images found")
            continue

        random.shuffle(images)
        n = len(images)

        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)

        splits = {
            "Train": images[:n_train],
            "Val": images[n_train:n_train + n_val],
            "Test": images[n_train + n_val:]
        }

        # Destination names already handed out for this class. The two source
        # folders can contain files with the same basename; without this the
        # later copy would overwrite the earlier one and images would be lost.
        used_names = set()

        for split_name, split_imgs in splits.items():
            cls_out = os.path.join(output_dir, split_name, cls)
            os.makedirs(cls_out, exist_ok=True)

            for img_path in split_imgs:
                dest_name = os.path.basename(img_path)
                if dest_name in used_names:
                    stem, ext = os.path.splitext(dest_name)
                    suffix = 1
                    while f"{stem}_{suffix}{ext}" in used_names:
                        suffix += 1
                    dest_name = f"{stem}_{suffix}{ext}"
                used_names.add(dest_name)

                shutil.copy(img_path, os.path.join(cls_out, dest_name))

        print(
            f"  Total: {n} | "
            f"Train: {len(splits['Train'])} | "
            f"Val: {len(splits['Val'])} | "
            f"Test: {len(splits['Test'])}"
        )


In [8]:
# Pool the class folders under data/Train and data/Test and copy them into
# data_new/{Train,Val,Test} using a 70/15/15 split (default seed).
stratified_split(
    train_dir="data/Train",
    test_dir="data/Test",
    output_dir="data_new",
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15
)

Processing class: actinic keratosis
  Total: 130 | Train: 91 | Val: 19 | Test: 20
Processing class: basal cell carcinoma
  Total: 392 | Train: 274 | Val: 58 | Test: 60
Processing class: dermatofibroma
  Total: 111 | Train: 77 | Val: 16 | Test: 18
Processing class: melanoma
  Total: 454 | Train: 317 | Val: 68 | Test: 69
Processing class: nevus
  Total: 373 | Train: 261 | Val: 55 | Test: 57
Processing class: pigmented benign keratosis
  Total: 462 | Train: 323 | Val: 69 | Test: 70
Processing class: seborrheic keratosis
  Total: 80 | Train: 56 | Val: 12 | Test: 12
Processing class: squamous cell carcinoma
  Total: 197 | Train: 137 | Val: 29 | Test: 31
Processing class: vascular lesion
  Total: 142 | Train: 99 | Val: 21 | Test: 22


In [9]:
import os
import pandas as pd

def count_images_per_class(base_dir):
    """Return a ``{class_name: file_count}`` mapping for ``base_dir``.

    Each immediate sub-directory of ``base_dir`` is treated as one class.
    Only regular files directly inside a class folder are counted; nested
    directories are skipped, as are non-directory entries of ``base_dir``.
    No check is made that the files are actually images.
    """
    counts = {}
    for entry in os.listdir(base_dir):
        class_dir = os.path.join(base_dir, entry)
        if not os.path.isdir(class_dir):
            continue
        counts[entry] = sum(
            1
            for name in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, name))
        )
    return counts


# Folders of the three subsets to be counted.
train_dir = "data_new/Train"
val_dir   = "data_new/Val"
test_dir  = "data_new/Test"


# Per-class file counts for each subset.
train_counts = count_images_per_class(train_dir)
val_counts   = count_images_per_class(val_dir)
test_counts  = count_images_per_class(test_dir)

# One row per class, one column per subset; a class missing from a subset
# would otherwise appear as NaN, so it is filled with 0.
df_counts = pd.DataFrame({
    "Train": train_counts,
    "Val": val_counts,
    "Test": test_counts
}).fillna(0)

# Bare expression: in a notebook this renders the table as the cell output.
df_counts


Unnamed: 0,Train,Val,Test
actinic keratosis,91,19,20
basal cell carcinoma,274,58,60
dermatofibroma,77,16,18
melanoma,317,68,69
nevus,261,55,57
pigmented benign keratosis,323,69,70
seborrheic keratosis,56,12,12
squamous cell carcinoma,137,29,31
vascular lesion,99,21,22
