# Dataset Preparation
Notebook untuk persiapan dan preprocessing dataset deteksi mata uang Rupiah.

File: notebooks/01_dataset_preparation.ipynb
Author: Alfrida Sabar

In [None]:

# Sel 1: Impor Dependensi
# =====================================================
import os
import sys
import yaml
import argparse
import numpy as np

# Dapatkan path absolut direktori project
project_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_dir)

# Import custom handlers dan utilitas
from handlers.roboflow_handler import RoboflowHandler
from handlers.data_handler import DataHandler
from utils.logger import SmartCashLogger
from utils.preprocessing import ImagePreprocessor

# Setup logging
logger = SmartCashLogger(__name__)

# Path konfigurasi utama
CONFIG_PATH = os.path.join(project_dir, 'configs', 'base_config.yaml')
DATA_DIR = os.path.join(project_dir, 'data')

In [None]:
# Sel 2: Fungsi Utility Konfigurasi
# =====================================================
def load_config(path):
    """
    Muat konfigurasi dari file YAML
    
    Args:
        path (str): Path ke file konfigurasi
    
    Returns:
        dict: Konfigurasi yang dimuat
    """
    try:
        with open(path, 'r') as f:
            return yaml.safe_load(f)
    except Exception as e:
        logger.error(f"Gagal memuat konfigurasi: {e}")
        raise


In [None]:
# Sel 3: Manajemen Sumber Dataset
# =====================================================
def get_dataset(
    source='local', 
    config_path=CONFIG_PATH, 
    data_dir=DATA_DIR, 
    local_path=None
):
    """
    Dapatkan dataset dari sumber yang berbeda
    
    Args:
        source (str): Sumber dataset ('roboflow' atau 'local')
        config_path (str): Path ke file konfigurasi
        data_dir (str): Direktori penyimpanan data
        local_path (str, optional): Path dataset lokal
    
    Returns:
        tuple: Path train, validasi, dan test set
    """
    # Pilih sumber dataset
    if source == 'local':
        # Tentukan path dataset
        dataset_path = local_path if local_path else data_dir
        logger.info(f"🏠 Menggunakan dataset lokal: {dataset_path}")
        
        # Path untuk setiap split
        train_path = os.path.join(dataset_path, 'train')
        val_path = os.path.join(dataset_path, 'valid')
        test_path = os.path.join(dataset_path, 'test')
        
        # Validasi keberadaan direktori
        for path in [train_path, val_path, test_path]:
            if not os.path.exists(path):
                raise ValueError(f"Direktori tidak ditemukan: {path}")
    
    elif source == 'roboflow':
        # Download dataset dari Roboflow
        roboflow_handler = RoboflowHandler(
            config_path=config_path,
            data_dir=data_dir
        )
        
        # Dapatkan informasi dan download dataset
        dataset_info = roboflow_handler.get_dataset_info()
        logger.data(f"📊 Informasi Dataset Roboflow:\n{dataset_info}")
        
        train_path, val_path, test_path = roboflow_handler.pull_dataset()
    
    else:
        raise ValueError("Sumber dataset tidak valid. Gunakan 'local' atau 'roboflow'")
    
    # Log path dataset
    logger.success(f"📥 Dataset berhasil diakses:\n"
                   f"Train: {train_path}\n"
                   f"Validasi: {val_path}\n"
                   f"Test: {test_path}")
    
    return train_path, val_path, test_path

In [None]:
# Sel 4: Validasi Dataset
# =====================================================
def validate_dataset(config_path, data_dir):
    """
    Validasi struktur dan integritas dataset
    
    Args:
        config_path (str): Path ke file konfigurasi
        data_dir (str): Direktori penyimpanan data
    
    Returns:
        dict: Statistik dataset
    """
    # Inisiasi data handler
    data_handler = DataHandler(
        config_path=config_path,
        data_dir=data_dir
    )
    
    # Validasi struktur dataset
    is_valid = data_handler.verify_dataset()
    if not is_valid:
        logger.warning("⚠️ Struktur dataset tidak valid!")
        return None
    
    # Dapatkan statistik dataset
    dataset_stats = data_handler.get_dataset_stats()
    logger.data(f"📊 Statistik Dataset:\n{dataset_stats}")
    
    return dataset_stats


In [None]:
# Sel 5: Preprocessing Dataset
# =====================================================
def preprocess_dataset(config_path, train_path, val_path, test_path):
    """
    Preprocessing dataset dengan fokus pada augmentasi training set
    
    Args:
        config_path (str): Path ke file konfigurasi
        train_path (str): Path dataset training
        val_path (str): Path dataset validasi
        test_path (str): Path dataset testing
    """
    # Inisiasi preprocessor
    preprocessor = ImagePreprocessor(
        config_path=config_path
    )
    
    # Konfigurasi split dataset
    splits = [
        ('train', train_path),
        ('valid', val_path),
        ('test', test_path)
    ]
    
    # Preprocessing untuk setiap split
    for split_name, split_path in splits:
        logger.start(f"Preprocessing dataset {split_name.upper()}")
        preprocessor.preprocess_dataset(
            input_dir=os.path.join(split_path, 'images'),
            output_dir=os.path.join(split_path, 'processed_images'),
            augment=(split_name == 'train')  # Augmentasi hanya pada training set
        )

In [None]:
# Sel 6: Proses Utama
# =====================================================
def main(args):
    """
    Proses utama persiapan dataset
    
    Args:
        args (Namespace): Argumen baris perintah
    """
    logger.start("🚀 Memulai Persiapan Dataset SmartCash")
    
    try:
        # 1. Muat konfigurasi
        config = load_config(CONFIG_PATH)
        logger.info("✅ Konfigurasi berhasil dimuat")
        
        # 2. Dapatkan dataset
        train_path, val_path, test_path = get_dataset(
            source=args.source, 
            local_path=args.local_path
        )
        
        # 3. Validasi dataset
        dataset_stats = validate_dataset(CONFIG_PATH, DATA_DIR)
        if not dataset_stats:
            return
        
        # 4. Preprocessing dataset
        preprocess_dataset(CONFIG_PATH, train_path, val_path, test_path)
        
        logger.success("🎉 Persiapan Dataset SmartCash Selesai!")
        
    except Exception as e:
        logger.error(f"❌ Kesalahan dalam Persiapan Dataset: {str(e)}")


In [None]:
# Sel 7: Parsing Argumen
# =====================================================
def parse_arguments():
    """
    Parse argumen baris perintah untuk fleksibilitas sumber dataset
    
    Returns:
        Namespace: Argumen yang di-parse
    """
    parser = argparse.ArgumentParser(description='Persiapan Dataset SmartCash')
    parser.add_argument(
        '--source', 
        type=str, 
        choices=['local', 'roboflow'], 
        default='local',
        help='Sumber dataset (local atau roboflow)'
    )
    parser.add_argument(
        '--local-path', 
        type=str, 
        default=None,
        help='Path dataset lokal (opsional, default ke data/)'
    )
    return parser.parse_args()



In [37]:
# Sel 8: Eksekusi Notebook
# =====================================================
def run_notebook(
    source='local', 
    local_path=None
):
    """
    Jalankan proses persiapan dataset di Jupyter Notebook
    
    Args:
        source (str): Sumber dataset ('local' atau 'roboflow')
        local_path (str, optional): Path dataset lokal
    """
    # Buat namespace argumen untuk kompatibilitas
    from argparse import Namespace
    
    args = Namespace(
        source=source, 
        local_path=local_path
    )
    
    # Jalankan proses utama
    main(args)

# Mode default untuk Jupyter
def jupyter_main():
    """
    Mode utama untuk lingkungan Jupyter
    """
    print("🚀 Notebook Persiapan Dataset SmartCash")
    print("Gunakan run_notebook() dengan opsi:")
    print("  - run_notebook() : Dataset lokal default")
    print("  - run_notebook(source='roboflow') : Download dari Roboflow")
    print("  - run_notebook(local_path='/path/custom') : Dataset lokal khusus")

# Logika eksekusi
if __name__ == '__main__':
    # Jika dijalankan sebagai script
    args = parse_arguments()
    main(args)
else:
    # Jika tidak ada argumen, aktifkan mode Jupyter
    jupyter_main()

usage: ipykernel_launcher.py [-h] [--source {local,roboflow}]
                             [--local-path LOCAL_PATH]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/masdevid/Library/Jupyter/runtime/kernel-v38a2b3915a8f19a66df662fe215901f21aa39716a.json


SystemExit: 2

# Catatan Pengguna:
1. Pastikan dependensi terinstal
2. Default: dataset lokal di direktori `data/`
3. Gunakan `get_dataset()` untuk sumber berbeda
4. Periksa log untuk detail proses