In [1]:
# prepare_dataset_split_aug.py
# Split the data 70:30 (train:test), then augment the training split only
# Output is saved under data/Final/train/ and data/Final/test/

import os
import random
from pathlib import Path
import numpy as np
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from
# any library — consider narrowing it by category or module.
warnings.filterwarnings("ignore")
# Only let TensorFlow's logger emit messages at ERROR level.
tf.get_logger().setLevel('ERROR')

In [2]:
# --------------------------
# PARAMETERS
# --------------------------
SOURCE_DIR = "data/Original"  # input root; its sub-directories are treated as classes
TARGET_DIR = "data/Final"     # output root for the train/ and test/ folders
IMG_SIZE = (224, 224)         # default target size used when loading images
SEED = 42

# Seed every RNG the pipeline can touch (Python, NumPy, TensorFlow) so that
# "Restart Kernel -> Run All" regenerates the same split and augmentations.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Augmentation pipeline, applied only to training images.
# Each random layer receives an explicit (distinct) seed so its draws do not
# depend on version-specific fallback seeding and are not correlated with
# the other layers.
data_augmentation = tf.keras.Sequential([
    layers.RandomFlip("horizontal", seed=SEED),
    layers.RandomRotation(0.2, seed=SEED + 1),
    layers.RandomZoom(0.2, seed=SEED + 2),
    layers.RandomContrast(0.2, seed=SEED + 3),
])

def load_and_preprocess(img_path, target_size=IMG_SIZE):
    """Read an image file, resize it, and return it as a float array.

    Args:
        img_path: Path of the image file to read.
        target_size: Size handed to ``load_img`` for resizing
            (defaults to ``IMG_SIZE``).

    Returns:
        The image as an array whose pixel values have been divided by 255.
    """
    pil_img = image.load_img(img_path, target_size=target_size)
    pixels = image.img_to_array(pil_img)
    return pixels / 255.0

def save_image(arr, save_path):
    """Write a [0, 1]-scaled image array to disk as an 8-bit image.

    Args:
        arr: Image array with values nominally in [0, 1]. Values are
            multiplied by 255 and clipped to 0-255 before being cast to uint8.
        save_path: Destination path. No explicit file format is passed, so
            the format follows the file extension (not necessarily JPG).
    """
    arr = np.clip(arr * 255.0, 0, 255).astype(np.uint8)
    # scale=False is required: with the default scale=True, save_img min-max
    # stretches every image to the full 0-255 range, silently changing the
    # brightness/contrast of each saved picture.
    tf.keras.utils.save_img(save_path, arr, scale=False)

In [3]:
# --------------------------
# 1. List classes & files
# --------------------------
# Every sub-directory of SOURCE_DIR is treated as one class.
# os.listdir returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted to keep the class order (and everything that iterates over
# it, including random draws made per class) stable across machines and runs.
classes = sorted(
    d for d in os.listdir(SOURCE_DIR)
    if os.path.isdir(os.path.join(SOURCE_DIR, d))
)
print("Kelas ditemukan:", classes)

os.makedirs(TARGET_DIR, exist_ok=True)

Kelas ditemukan: ['Benign', 'Early', 'Pre', 'Pro']


In [4]:
# --------------------------
# 2. Split train:test (70:30)
# --------------------------
split_ratio = 0.3  # fraction of each class that goes to the test split
train_files = {}   # class name -> list of source paths assigned to train
test_files = {}    # class name -> list of source paths assigned to test

for cls in classes:
    src_folder = Path(SOURCE_DIR) / cls
    # Path.glob yields files in arbitrary order; sorting makes the input to
    # train_test_split deterministic, so random_state=SEED really does
    # reproduce the same split on every run and every machine.
    files = sorted(
        path
        for pattern in ("*.jpg", "*.png", "*.jpeg")
        for path in src_folder.glob(pattern)
    )

    # The split is done per class, so each class keeps the same 70:30 ratio.
    train_f, test_f = train_test_split(files, test_size=split_ratio, random_state=SEED, shuffle=True)
    train_files[cls] = train_f
    test_files[cls] = test_f

In [5]:
# --------------------------
# 3. Copy train & test
# --------------------------
# Each image is loaded (and resized by load_and_preprocess), then written
# under TARGET_DIR/<split>/<class>/ with its original file name.
print("\nMenyimpan data train & test...")
for cls in classes:
    out_dirs = {split: Path(TARGET_DIR) / split / cls for split in ("train", "test")}
    for out_dir in out_dirs.values():
        out_dir.mkdir(parents=True, exist_ok=True)

    # Train is processed before test, one progress bar per split and class.
    for split, split_files in (("train", train_files), ("test", test_files)):
        for src in tqdm(split_files[cls], desc=f"Copy {split} {cls}"):
            save_image(load_and_preprocess(str(src)), out_dirs[split] / src.name)


Menyimpan data train & test...


Copy train Benign: 100%|██████████| 352/352 [00:02<00:00, 173.76it/s]
Copy test Benign: 100%|██████████| 152/152 [00:00<00:00, 203.89it/s]
Copy train Early: 100%|██████████| 689/689 [00:03<00:00, 193.00it/s]
Copy test Early: 100%|██████████| 296/296 [00:01<00:00, 191.42it/s]
Copy train Pre: 100%|██████████| 674/674 [00:03<00:00, 197.78it/s]
Copy test Pre: 100%|██████████| 289/289 [00:01<00:00, 202.05it/s]
Copy train Pro: 100%|██████████| 562/562 [00:02<00:00, 190.16it/s]
Copy test Pro: 100%|██████████| 242/242 [00:01<00:00, 188.60it/s]


In [6]:
# --------------------------
# 4. Augment TRAIN data
# --------------------------
print("\nAugmentasi data training...")
target_min = 3000   # minimum number of training images per class
# NOTE(review): final_target_total is not referenced anywhere in this cell;
# only the per-class minimum above is enforced.
final_target_total = 12000  # intended minimum overall total

class_counts = {}

for cls in classes:
    train_dir = Path(TARGET_DIR) / "train" / cls
    # Same extensions as the split step (including *.jpeg) so every copied
    # training image is counted; sorted so random.choice is reproducible.
    files = sorted(
        path
        for pattern in ("*.jpg", "*.png", "*.jpeg")
        for path in train_dir.glob(pattern)
    )
    # Sample only from non-generated images: files written by this cell are
    # named "aug_*", and re-augmenting them (within this run or after a
    # re-run) would compound distortions and JPEG artefacts.
    source_files = [path for path in files if not path.name.startswith("aug_")]
    count = len(files)

    needed = max(0, target_min - count)
    if needed and not source_files:
        # Fail with a clear message instead of random.choice's IndexError.
        raise ValueError(f"Tidak ada gambar asli di {train_dir} untuk diaugmentasi")

    # Progress bar: this loop does one disk read, one model call and one
    # disk write per generated image, so it can run for a long time.
    for _ in tqdm(range(needed), desc=f"Augmentasi {cls}"):
        img_path = random.choice(source_files)
        img = load_and_preprocess(str(img_path))
        img_batch = np.expand_dims(img, axis=0)  # add a leading batch axis

        # training=True so the random layers actually transform the image.
        aug_img = data_augmentation(img_batch, training=True)[0].numpy()
        save_name = f"aug_{count+1:05d}.jpg"
        save_path = train_dir / save_name
        save_image(aug_img, save_path)

        count += 1

    class_counts[cls] = count
    print(f"Kelas {cls}: {count} gambar di train setelah augmentasi")

print("\nDistribusi train akhir:", class_counts)
print("Total train dataset:", sum(class_counts.values()))


Augmentasi data training...


KeyboardInterrupt: 

In [None]:
# --------------------------
# 5. Count the test distribution
# --------------------------
# Number of test images per class, taken from the split made earlier
# (the test split is never augmented in this notebook).
test_counts = {}
for cls in classes:
    test_counts[cls] = len(test_files[cls])
total_test = sum(test_counts.values())
print("Distribusi test:", test_counts, "Total:", total_test)

print("\nDataset final tersimpan di:", TARGET_DIR)