# Dataset Preparation
Notebook untuk persiapan dan preprocessing dataset deteksi mata uang Rupiah.

File: notebooks/01_dataset_preparation.ipynb
Author: Alfrida Sabar

In [None]:
# Sel 1: Import Dependensi
import os
import sys
import yaml
import numpy as np
from pathlib import Path

# Import custom modules
from handlers.roboflow_handler import RoboflowHandler
from handlers.data_handler import DataHandler
from utils.logger import SmartCashLogger
from utils.preprocessing import ImagePreprocessor

# Module-wide logger shared by every cell below
logger = SmartCashLogger(__name__)

# Project locations, all derived from the current working directory
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath('data')
CONFIG_PATH = BASE_DIR.joinpath('configs', 'base_config.yaml')

In [None]:
# Sel 2: Fungsi Utilitas Konfigurasi
def load_config(path):
    """
    Load the configuration stored in a YAML file.

    Args:
        path (Path | str): Location of the YAML configuration file.

    Returns:
        dict: The parsed YAML document. An empty document yields an empty
        dict instead of ``None``.

    Raises:
        Exception: Any error raised while opening or parsing the file is
            logged through the module logger and then re-raised unchanged.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
    except Exception as e:
        logger.error(f"Gagal memuat konfigurasi: {e}")
        raise

    # yaml.safe_load returns None for an empty document; normalise that to
    # an empty mapping so callers always receive the documented type.
    return {} if loaded is None else loaded

In [None]:
# Sel 3: Manajemen Sumber Dataset
def get_dataset(
    source='local',
    config_path=CONFIG_PATH,
    data_dir=DATA_DIR
):
    """
    Resolve the train/validation/test locations for the chosen source.

    Args:
        source (str): 'local' to use the split folders under ``data_dir``,
            or 'roboflow' to pull the dataset through ``RoboflowHandler``.
        config_path (Path): Configuration file passed to the Roboflow handler.
        data_dir (Path): Root directory of the dataset.

    Returns:
        tuple: ``(train_path, val_path, test_path)``.

    Raises:
        ValueError: If ``source`` is neither 'local' nor 'roboflow'.
    """
    # Reject unknown sources up front
    if source not in ('local', 'roboflow'):
        raise ValueError("Sumber dataset tidak valid. Gunakan 'local' atau 'roboflow'")

    if source == 'local':
        logger.info(f"🏠 Menggunakan dataset lokal: {data_dir}")

        # One sub-folder per split
        train_path, val_path, test_path = (
            data_dir / folder for folder in ('train', 'valid', 'test')
        )
    else:
        # Roboflow: the handler receives both locations as strings
        handler = RoboflowHandler(
            config_path=str(config_path),
            data_dir=str(data_dir),
        )

        # Log the dataset description before pulling the data
        info = handler.get_dataset_info()
        logger.data(f"📊 Informasi Dataset Roboflow:\n{info}")

        train_path, val_path, test_path = handler.pull_dataset()

    # Report the resolved split locations
    summary = (
        f"📥 Dataset berhasil diakses:\n"
        f"Train: {train_path}\n"
        f"Validasi: {val_path}\n"
        f"Test: {test_path}"
    )
    logger.success(summary)

    return train_path, val_path, test_path

In [None]:
# Sel 4: Validasi Dataset
def validate_dataset(config_path, data_dir):
    """
    Verify the dataset through ``DataHandler`` and return its statistics.

    Args:
        config_path (Path): Path to the configuration file.
        data_dir (Path): Root directory of the dataset.

    Returns:
        The value of ``DataHandler.get_dataset_stats()`` when verification
        succeeds, otherwise ``None`` (a warning is logged in that case).
    """
    # The handler takes both locations as strings
    handler = DataHandler(config_path=str(config_path), data_dir=str(data_dir))

    if handler.verify_dataset():
        # Verification passed: collect and log the statistics
        stats = handler.get_dataset_stats()
        logger.data(f"📊 Statistik Dataset:\n{stats}")
        return stats

    logger.warning("⚠️ Struktur dataset tidak valid!")
    return None

In [None]:
# Sel 5: Preprocessing Dataset
def preprocess_dataset(config_path, train_path, val_path, test_path):
    """
    Run the image preprocessor over every dataset split.

    For each split, images are read from ``<split>/images`` and written to
    ``<split>/processed_images``. Augmentation is requested for the training
    split only.

    Args:
        config_path (Path | str): Path to the configuration file.
        train_path (Path | str): Root directory of the training split.
        val_path (Path | str): Root directory of the validation split.
        test_path (Path | str): Root directory of the test split.
    """
    # The preprocessor receives the configuration location as a string
    preprocessor = ImagePreprocessor(
        config_path=str(config_path)
    )

    # Split label paired with its root directory
    splits = [
        ('train', train_path),
        ('valid', val_path),
        ('test', test_path),
    ]

    for split_name, split_path in splits:
        # Coerce to Path so the "/" joins below also work when a caller
        # supplies a plain string (the type returned by the dataset handlers
        # is not guaranteed here).
        split_dir = Path(split_path)

        logger.start(f"Preprocessing dataset {split_name.upper()}")
        preprocessor.process_dataset(
            input_dir=split_dir / 'images',
            output_dir=split_dir / 'processed_images',
            augment=(split_name == 'train'),  # augment the training set only
        )

In [None]:
# Sel 6: Fungsi Utama
def run_notebook(source='local'):
    """
    Run the dataset preparation pipeline end to end.

    The steps are: load the configuration, resolve the dataset paths,
    validate the dataset and, if validation yields statistics, preprocess
    every split. Any exception raised along the way is logged and not
    propagated.

    Args:
        source (str): Dataset source, 'local' or 'roboflow'.
    """
    try:
        logger.start("🚀 Memulai Persiapan Dataset SmartCash")

        # Step 1: load the configuration (the parsed value is not used here)
        load_config(CONFIG_PATH)
        logger.info("✅ Konfigurasi berhasil dimuat")

        # Step 2: resolve the three split locations
        train_dir, val_dir, test_dir = get_dataset(source=source)

        # Step 3: validate; a falsy result stops the pipeline
        stats = validate_dataset(CONFIG_PATH, DATA_DIR)
        if stats:
            # Step 4: preprocess every split
            preprocess_dataset(CONFIG_PATH, train_dir, val_dir, test_dir)

            logger.success("🎉 Persiapan Dataset SmartCash Selesai!")

    except Exception as exc:
        logger.error(f"❌ Kesalahan dalam Persiapan Dataset: {exc!s}")

# Cell 7: run the pipeline when executed as a script
if __name__ == "__main__":
    run_notebook()