In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split
from collections import Counter

In [None]:
# Source folders of the original dataset; each is scanned for one
# sub-folder per entry in CLASSES.
TRAIN_DIR = "./data/Training"
TEST_DIR = "./data/Testing"

# Root folder that receives the re-split dataset.
NEW_DIR = "./new_data"

# Class labels; they double as the sub-folder names on disk.
CLASSES = [
    "glioma_tumor",
    "meningioma_tumor",
    "no_tumor",
    "pituitary_tumor",
]

# Fractions of the merged dataset assigned to each subset.
TRAIN_SPLIT = 0.7
VAL_SPLIT = 0.15
TEST_SPLIT = 0.15

In [None]:
def collect_all_images(train_dir=None, test_dir=None, classes=None):
    """Gather every ``.jpg`` image from the original train and test folders.

    For each class, the class sub-folder of ``train_dir`` is scanned first and
    the one of ``test_dir`` second. Missing class folders are skipped.

    Args:
        train_dir: Root of the original training images. Defaults to the
            module-level ``TRAIN_DIR``.
        test_dir: Root of the original testing images. Defaults to the
            module-level ``TEST_DIR``.
        classes: Iterable of class (sub-folder) names. Defaults to the
            module-level ``CLASSES``.

    Returns:
        A list of ``(image_path, class_name)`` tuples, ordered by class, then
        source folder (train before test), then file name.
    """
    # Resolve defaults at call time so the module constants remain the single
    # source of truth when the function is called without arguments.
    if train_dir is None:
        train_dir = TRAIN_DIR
    if test_dir is None:
        test_dir = TEST_DIR
    if classes is None:
        classes = CLASSES

    all_imgs = []
    for class_name in classes:
        for source_dir in (train_dir, test_dir):
            class_dir = os.path.join(source_dir, class_name)
            if not os.path.isdir(class_dir):
                continue
            # os.listdir() yields entries in arbitrary order; sorting keeps the
            # result (and any seeded split computed from it) reproducible.
            for filename in sorted(os.listdir(class_dir)):
                # Case-insensitive match so ".JPG" files are included as well.
                if filename.lower().endswith(".jpg"):
                    img_path = os.path.join(class_dir, filename)
                    all_imgs.append((img_path, class_name))

    return all_imgs


def create_directory_structure(base_dir=None, classes=None):
    """Create the ``<subset>/<class>`` folder tree for the re-split dataset.

    One folder per class is created under each of ``Training``,
    ``Validation`` and ``Testing``. Existing folders are left untouched, so
    the function can be called repeatedly.

    Args:
        base_dir: Root folder of the new dataset. Defaults to the
            module-level ``NEW_DIR``.
        classes: Iterable of class (sub-folder) names. Defaults to the
            module-level ``CLASSES``.
    """
    if base_dir is None:
        base_dir = NEW_DIR
    if classes is None:
        classes = CLASSES

    # "subset" rather than "set" so the builtin is not shadowed.
    for subset in ("Training", "Validation", "Testing"):
        for class_name in classes:
            os.makedirs(os.path.join(base_dir, subset, class_name), exist_ok=True)


def split_and_copy(dataset, base_dir=None):
    """Split the dataset into train/val/test and copy the files accordingly.

    The split is stratified by class and seeded (``random_state=42``). It is
    done in two steps: ``TRAIN_SPLIT`` of the data goes to training, and the
    remainder is divided between validation and testing in the proportion
    ``VAL_SPLIT : TEST_SPLIT``.

    Files are copied to ``<base_dir>/<subset>/<class>/<file name>``. When two
    source files of the same class and subset share a file name, the later
    one is stored with a ``dup_`` prefix (``dup2_``, ``dup3_`` ... for further
    clashes). Name clashes are tracked per call, so files left at the same
    destination by an earlier run are overwritten instead of being duplicated
    under a ``dup_`` name. Files from earlier runs at other destinations are
    not removed.

    Args:
        dataset: Sequence of ``(image_path, class_name)`` tuples.
        base_dir: Root folder of the new dataset. Defaults to the
            module-level ``NEW_DIR``.

    Returns:
        A dict with keys ``"train"``, ``"val"`` and ``"test"``; each value is
        a ``(paths, class_names)`` tuple of parallel lists.
    """
    if base_dir is None:
        base_dir = NEW_DIR

    paths = [path for path, _ in dataset]
    class_names = [class_name for _, class_name in dataset]

    train_paths, temp_paths, train_classes, temp_classes = train_test_split(
        paths,
        class_names,
        train_size=TRAIN_SPLIT,
        stratify=class_names,
        random_state=42,
    )

    # Share of the held-out part that becomes validation data.
    val_ratio_adjusted = VAL_SPLIT / (VAL_SPLIT + TEST_SPLIT)

    val_paths, test_paths, val_classes, test_classes = train_test_split(
        temp_paths,
        temp_classes,
        train_size=val_ratio_adjusted,
        stratify=temp_classes,
        random_state=42,
    )

    subsets = {
        "Training": (train_paths, train_classes),
        "Validation": (val_paths, val_classes),
        "Testing": (test_paths, test_classes),
    }

    prepared_dirs = set()
    # Destinations written during this call; used instead of a filesystem
    # check so that a re-run does not treat its own earlier output as a clash.
    used_destinations = set()

    for subset, (s_path, s_class) in subsets.items():
        for current_path, current_class in zip(s_path, s_class):
            class_dir = os.path.join(base_dir, subset, current_class)
            if class_dir not in prepared_dirs:
                os.makedirs(class_dir, exist_ok=True)
                prepared_dirs.add(class_dir)

            filename = os.path.basename(current_path)
            dst = os.path.join(class_dir, filename)
            attempt = 1
            while dst in used_destinations:
                prefix = "dup_" if attempt == 1 else f"dup{attempt}_"
                dst = os.path.join(class_dir, f"{prefix}{filename}")
                attempt += 1
            used_destinations.add(dst)

            shutil.copy(current_path, dst)

    return {
        "train": (train_paths, train_classes),
        "val": (val_paths, val_classes),
        "test": (test_paths, test_classes),
    }


def print_statistics(dataset, classes=None):
    """Print the size and per-class distribution of every subset.

    Args:
        dataset: Mapping of subset name to a ``(paths, class_names)`` tuple of
            parallel sequences, as returned by ``split_and_copy``.
        classes: Iterable of class names to report, in output order. Defaults
            to the module-level ``CLASSES``.
    """
    if classes is None:
        classes = CLASSES

    print("STATS FOR DATASET AFTER RANDOM DIVISION")

    for subset, (s_path, s_class) in dataset.items():
        total = len(s_path)
        print(f"{subset.upper()}: {total} images total")
        counts = Counter(s_class)
        for class_name in classes:
            count = counts[class_name]
            # An empty subset would otherwise raise ZeroDivisionError.
            percentage = count / total * 100 if total else 0.0

            print(f"\t{class_name}: {count} ({percentage:.1f}%)")