# Data Preprocessing

## Resplit images in dataset

In [1]:
# Import libraries
from pathlib import Path
import math, shutil

In [2]:
# Resplit images in dataset
BASE = Path("dataset")
SRC_TRAIN = BASE / "train"
SRC_TEST = BASE / "test"
OUT_TRAIN = BASE / "train"
OUT_VAL = BASE / "val"
OUT_TEST = BASE / "test"


def _class_names(*roots):
    """Return the sorted union of sub-directory names under the roots that exist."""
    names = set()
    for root in roots:
        # A missing split folder simply contributes no classes instead of raising.
        if root.is_dir():
            names.update(p.name for p in root.iterdir() if p.is_dir())
    return sorted(names)


def _free_destination(out_dir, src):
    """Return a path inside out_dir for src that never clobbers a different file.

    When ``out_dir / src.name`` is already taken by another file, a numeric
    suffix (``name_1.jpg``, ``name_2.jpg``, ...) is appended until the name is free.
    If the natural destination is src itself, that path is returned unchanged.
    """
    dst = out_dir / src.name
    attempt = 1
    while dst.exists() and dst.resolve() != src.resolve():
        dst = out_dir / f"{src.stem}_{attempt}{src.suffix}"
        attempt += 1
    return dst


def resplit_dataset(base, train_frac=0.70, val_frac=0.15):
    """Pool each class's train+test ``*.jpg`` files and redistribute them.

    Files are gathered from ``base/train/<class>`` and then ``base/test/<class>``
    (each list sorted, searched recursively) and cut, in that order, into
    train / val / test groups that are moved to ``base/{train,val,test}/<class>``.
    The test group receives whatever remains after the floor-rounded train and
    val groups. Files already sitting at their destination are left alone.

    NOTE(review): the order is deterministic (no shuffling), and running this a
    second time re-splits only what is currently in train/ and test/.

    Args:
        base: dataset root containing ``train`` and/or ``test`` folders.
        train_frac: fraction of each class assigned to the train split.
        val_frac: fraction of each class assigned to the validation split.

    Raises:
        ValueError: if a fraction is negative or the two fractions exceed 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError("train_frac and val_frac must be >= 0 and sum to at most 1")
    base = Path(base)
    src_roots = (base / "train", base / "test")
    for c in _class_names(*src_roots):
        files = []
        for root in src_roots:
            class_dir = root / c
            if class_dir.exists():
                files += sorted(class_dir.rglob("*.jpg"))
        n = len(files)
        n_train = math.floor(n * train_frac)
        n_val = math.floor(n * val_frac)
        splits = [
            (base / "train" / c, files[:n_train]),
            (base / "val" / c, files[n_train:n_train + n_val]),
            (base / "test" / c, files[n_train + n_val:]),
        ]
        for out_dir, group in splits:
            out_dir.mkdir(parents=True, exist_ok=True)
            for f in group:
                dst = _free_destination(out_dir, f)
                if f.resolve() == dst.resolve():
                    continue
                shutil.move(str(f), str(dst))


classes = _class_names(SRC_TRAIN, SRC_TEST)
resplit_dataset(BASE)

In [3]:
# Report how many .jpg images each class holds in every split, plus totals
BASE = Path("dataset")
all_total = 0
for split_name in ("train", "val", "test"):
    split_dir = BASE / split_name
    if split_dir.exists():
        split_total = 0
        for class_dir in split_dir.iterdir():
            # Only class folders are counted; loose files beside them are ignored.
            if not class_dir.is_dir():
                continue
            image_count = len(list(class_dir.rglob("*.jpg")))
            print(f"{split_name}/{class_dir.name}: {image_count}")
            split_total += image_count
        print(f"{split_name} total: {split_total}")
        all_total += split_total
print(f"All total: {all_total}")

train/glioma: 1134
train/meningioma: 1151
train/pituitary: 1229
train/notumor: 1400
train total: 4914
val/glioma: 243
val/meningioma: 246
val/pituitary: 263
val/notumor: 300
val total: 1052
test/glioma: 244
test/meningioma: 248
test/pituitary: 265
test/notumor: 300
test total: 1057
All total: 7023


## Data augmentation

In [4]:
# Import libraries
from pathlib import Path
from PIL import Image, ImageEnhance
import random, shutil

In [5]:
# Data augmentation (rotate, brightness, flip)
# Every training image is replaced by four JPEGs: the RGB-converted original,
# a copy rotated by a random angle in [-15, 15] degrees, a copy whose brightness
# is scaled by a random factor in [0.8, 1.2], and a horizontally mirrored copy.
# The outputs are renamed to "<class>_<running index>.jpg".
BASE = Path("dataset")
d = BASE / "train"
if d.exists():
    # Newer Pillow exposes the flip constant on Image.Transpose; fall back to the
    # module-level constant on releases that lack the enum.
    flip_lr = getattr(Image, "Transpose", Image).FLIP_LEFT_RIGHT
    for cls in [p for p in d.iterdir() if p.is_dir()]:
        tmp = cls / ".tmp_augment"
        # Exclude anything inside the staging folder (e.g. left over from an aborted run).
        srcs = [p for p in cls.rglob("*.jpg") if tmp not in p.parents]
        tmp.mkdir(exist_ok=True)
        i = 1
        for f in srcs:
            with Image.open(f) as im:
                im = im.convert("RGB")
                imgs = [
                    im,
                    im.rotate(random.uniform(-15, 15), expand=False),
                    ImageEnhance.Brightness(im).enhance(random.uniform(0.8, 1.2)),
                    im.transpose(flip_lr),
                ]
            # New files are staged in tmp so they can never overwrite a source
            # image that has not been read yet.
            for img in imgs:
                dst = (tmp / f"{cls.name}_{i:06d}").with_suffix(".jpg")
                img.save(dst, quality=95)
                i += 1
        for old in srcs:
            try:
                old.unlink()
            except OSError as err:
                # Best effort: keep going, but report the leftover so it is not
                # silently mixed in with the augmented set.
                print(f"warning: could not remove {old}: {err}")
        for newf in tmp.glob("*.jpg"):
            shutil.move(str(newf), str(cls / newf.name))
        tmp.rmdir()

In [6]:
# Tally the .jpg images per class for the train, val and test splits
BASE = Path("dataset")
all_total = 0
for split_name in ("train", "val", "test"):
    split_root = BASE / split_name
    if not split_root.exists():
        continue
    # Collect one count per class folder, printing each as it is measured.
    class_counts = []
    for folder in split_root.iterdir():
        if folder.is_dir():
            found = sum(1 for _jpg in folder.rglob("*.jpg"))
            class_counts.append(found)
            print(f"{split_name}/{folder.name}: {found}")
    subtotal = sum(class_counts)
    all_total = all_total + subtotal
    print(f"{split_name} total: {subtotal}")
print(f"All total: {all_total}")

train/glioma: 4536
train/meningioma: 4604
train/pituitary: 4916
train/notumor: 5600
train total: 19656
val/glioma: 243
val/meningioma: 246
val/pituitary: 263
val/notumor: 300
val total: 1052
test/glioma: 244
test/meningioma: 248
test/pituitary: 265
test/notumor: 300
test total: 1057
All total: 21765


## Resize dataset images to 224x224 pixels

In [7]:
# Import libraries
from pathlib import Path
from PIL import Image

In [8]:
# Resize images in dataset to 224x224 pixels
# Each .jpg (any letter case) under train/val/test is converted to RGB, resized
# with bilinear resampling, and written back over the original file.
BASE = Path("dataset")
SIZE = (224, 224)
try:
    R = Image.Resampling.BILINEAR
except AttributeError:
    # Pillow releases without the Resampling enum only have the module constant.
    R = Image.BILINEAR
exts = {".jpg"}
for split in ["train", "val", "test"]:
    d = BASE / split
    if not d.exists():
        continue
    for cls in [p for p in d.iterdir() if p.is_dir()]:
        for f in cls.rglob("*"):
            if not (f.is_file() and f.suffix.lower() in exts):
                continue
            with Image.open(f) as im:
                # Already in the target format: skip so that re-running the cell
                # does not re-encode (and further degrade) the JPEG.
                if im.size == SIZE and im.mode == "RGB":
                    continue
                resized = im.convert("RGB").resize(SIZE, resample=R)
            # Write only after the source handle is closed; `resized` is an
            # independent in-memory image.
            resized.save(f)

## Normalize pixel values to [0, 1] or with ImageNet statistics

In [9]:
# Import libraries
from pathlib import Path
from PIL import Image
import numpy as np

In [10]:
# Scale pixel values to [0, 1] and store each image as a float32 .npy array
BASE = Path("dataset")
# ImageNet channel statistics; defined here but not applied in this cell.
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
# Sibling output folder next to the dataset: "<dataset name>_unit".
out_unit = BASE.parent / (BASE.name + "_unit")
for split_name in ("train", "val", "test"):
    split_dir = BASE / split_name
    if split_dir.exists():
        class_dirs = [entry for entry in split_dir.iterdir() if entry.is_dir()]
        for class_dir in class_dirs:
            target_dir = out_unit / split_name / class_dir.name
            for jpg_path in class_dir.rglob("*.jpg"):
                with Image.open(jpg_path) as handle:
                    rgb = handle.convert("RGB")
                    scaled = np.asarray(rgb, dtype=np.float32) / 255.0
                # Output mirrors <split>/<class>/ and reuses the image's stem.
                npy_path = (target_dir / jpg_path.stem).with_suffix(".npy")
                npy_path.parent.mkdir(parents=True, exist_ok=True)
                np.save(npy_path, scaled.astype(np.float32))

In [11]:
# Standardize images with the ImageNet per-channel mean/std and store them as .npy
BASE = Path("dataset")
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
# Sibling output folder next to the dataset: "<dataset name>_imagenet".
out_imagenet = BASE.parent / (BASE.name + "_imagenet")
for split_name in ("train", "val", "test"):
    split_dir = BASE / split_name
    if split_dir.exists():
        class_dirs = [entry for entry in split_dir.iterdir() if entry.is_dir()]
        for class_dir in class_dirs:
            target_dir = out_imagenet / split_name / class_dir.name
            for jpg_path in class_dir.rglob("*.jpg"):
                with Image.open(jpg_path) as handle:
                    rgb = handle.convert("RGB")
                    # First bring the pixels to [0, 1] ...
                    unit = np.asarray(rgb, dtype=np.float32) / 255.0
                    # ... then subtract the mean and divide by the std; the
                    # 3-element vectors broadcast over the trailing channel axis.
                    standardized = np.divide(np.subtract(unit, mean), std)
                npy_path = (target_dir / jpg_path.stem).with_suffix(".npy")
                npy_path.parent.mkdir(parents=True, exist_ok=True)
                np.save(npy_path, standardized.astype(np.float32))