# ML Quality Eval: Train/Test Split (80:20)
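Handles both multi-file input (whole files are assigned to either train or test) and single-file input (rows are split, grouped by trip_id when the file contains more than one trip).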

In [None]:
import os

# --- Run parameters ---------------------------------------------------------
# Label shown in the run banner to identify this execution.
RUN_TIMESTAMP = "2025-01-01_00-00-00"
# S3 location that is scanned for input ``*.csv`` files.
INPUT_FOLDER = "s3://processed-data"
# Destinations of the pickled train / test DataFrames.
OUTPUT_TRAIN_DATA = "s3://models-quality-eval-ml/train/train_data.pkl"
OUTPUT_TEST_DATA = "s3://models-quality-eval-ml/test/test_data.pkl"

# Fraction of files / trips / rows assigned to the training set
# (passed as ``train_size`` to ``train_test_split``).
TRAIN_RATIO = 0.8
# Seed used for NumPy and for every ``train_test_split`` call.
RANDOM_SEED = 42
# Single-file inputs with fewer rows than this only trigger a warning.
MIN_ROWS_FOR_SPLIT = 1000

# --- MinIO connection -------------------------------------------------------
# Values can be overridden through environment variables so real credentials
# do not have to be committed with the notebook. The fallbacks are the
# original development defaults.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password123")

In [None]:
import pandas as pd
import numpy as np
import pickle
import s3fs
from sklearn.model_selection import train_test_split

In [None]:
# Run banner: echo the parameters that identify this execution.
print(
    "=== ML Quality Eval: Train/Test Split ===",
    f"Run Timestamp: {RUN_TIMESTAMP}",
    f"MinIO Endpoint: {MINIO_ENDPOINT}",
    f"Input Folder: {INPUT_FOLDER}",
    sep="\n",
)

# Connection settings in the keyword form that pandas forwards to the S3
# backend (used by the ``pd.read_csv(..., storage_options=...)`` calls).
storage_options = dict(
    key=MINIO_ACCESS_KEY,
    secret=MINIO_SECRET_KEY,
    client_kwargs=dict(endpoint_url=MINIO_ENDPOINT),
)

# Filesystem handle for listing, creating and writing objects directly.
# It receives its own copy of ``client_kwargs`` so the two objects stay
# independent of each other.
fs = s3fs.S3FileSystem(
    key=storage_options["key"],
    secret=storage_options["secret"],
    client_kwargs=dict(storage_options["client_kwargs"]),
)

In [None]:
# Create the output bucket if it does not exist yet.
# ``fs.ls`` raising FileNotFoundError is treated as "bucket is missing".
try:
    fs.ls('models-quality-eval-ml')
except FileNotFoundError:
    fs.mkdir('models-quality-eval-ml')
    print("✅ Created bucket 'models-quality-eval-ml'")
else:
    print("✅ Bucket 'models-quality-eval-ml' exists")

# Create the folder structure used for datasets, models and metrics.
folders = [
    'models-quality-eval-ml/train',
    'models-quality-eval-ml/test',
    'models-quality-eval-ml/models',
    'models-quality-eval-ml/metrics'
]

# NOTE(review): on object stores a key prefix may not exist as a real
# directory until an object is written under it -- confirm that ``fs.mkdir``
# has a lasting effect here, otherwise "Created folder" is printed each run.
for folder in folders:
    try:
        fs.ls(folder)
    except FileNotFoundError:
        fs.mkdir(folder)
        print(f"Created folder: {folder}")

In [None]:
# Convert the s3:// URL into the "bucket/prefix/" form used for globbing.
input_prefix = INPUT_FOLDER.replace('s3://', '')
input_prefix = input_prefix if input_prefix.endswith('/') else input_prefix + '/'

# List the CSV files directly under the input prefix and report them.
try:
    raw_paths = fs.glob(input_prefix + "*.csv")
    file_paths = list(map("s3://{}".format, raw_paths))

    print("Found {} CSV file(s)".format(len(file_paths)))
    for position, csv_path in enumerate(file_paths, start=1):
        print(f"   {position}. {csv_path.rsplit('/', 1)[-1]}")

except Exception as e:
    # Report which folder failed, then let the original error propagate.
    print(f"Error listing files in {INPUT_FOLDER}: {e}")
    raise

# Nothing to split without at least one input file.
if not file_paths:
    raise ValueError(f"No CSV files found in {INPUT_FOLDER}")

In [None]:
# CELL 6: Determine Split Strategy
# Two or more files are split at file level; a single file is split by rows.
split_strategy = "file-level" if len(file_paths) >= 2 else "row-level"

if split_strategy == "row-level":
    print(f"   Single file detected: {file_paths[0].split('/')[-1]}")
    print("   Will split by trip_id or random rows")
elif len(file_paths) >= 10:
    print(f"   Multiple files available ({len(file_paths)} files)")
else:
    # Fewer than 10 files: still a file-level split, but flag the small sample.
    print(f"   Limited files: {len(file_paths)} files")
    print("   Warning: Split may not be fully representative")

In [None]:
# CELL 7: Perform Split with Auto-Generated trip_id
# Seed NumPy's global random generator so NumPy-based randomness in this
# session is repeatable. The train_test_split calls in this cell also pass
# random_state=RANDOM_SEED explicitly.
np.random.seed(RANDOM_SEED)

# Helper that derives a trip_id from a file's name.
def get_trip_id_from_path(path):
    """Return the file name of *path* without its trailing ``.csv`` extension.

    Args:
        path: Slash-separated path or URL, e.g. ``s3://bucket/trip_001.csv``.

    Returns:
        The last path segment with one trailing ``.csv`` suffix removed.
        The suffix is matched case-insensitively; names that do not end in
        ``.csv`` are returned unchanged.
    """
    # Take the last path segment, i.e. the bare file name.
    filename = path.split('/')[-1]
    # Strip only a trailing extension: a blanket ``str.replace`` would also
    # delete ".csv" from the middle of a name and would miss mixed-case
    # suffixes such as ".Csv".
    suffix = '.csv'
    if filename[-len(suffix):].lower() == suffix:
        return filename[:-len(suffix)]
    return filename

def _load_csv_files(paths, label):
    """Read each CSV in *paths* and return the frames that loaded successfully.

    Args:
        paths: S3 URLs of the CSV files to read.
        label: Name shown in the progress output ("Train" or "Test").

    Returns:
        A list with one DataFrame per readable file. A file that fails to
        load is reported and skipped (best effort), so the list may be
        shorter than *paths* or empty.
    """
    frames = []
    for csv_path in paths:
        try:
            print(f"  → Loading {label}: {csv_path.split('/')[-1]}")
            frame = pd.read_csv(csv_path, storage_options=storage_options)

            # Generate trip_id from the file name when the file has none.
            if 'trip_id' not in frame.columns:
                frame['trip_id'] = get_trip_id_from_path(csv_path)

            frames.append(frame)
        except Exception as e:
            # Deliberate best effort: one unreadable file must not abort the run.
            print(f"  ⚠️  Error loading {csv_path}: {e}")
    return frames


if split_strategy == "file-level":
    # === STRATEGY 1: MULTIPLE FILES (split the list of files) ===

    train_files, test_files = train_test_split(
        file_paths,
        train_size=TRAIN_RATIO,
        random_state=RANDOM_SEED,
        shuffle=True
    )

    print("\n=== Split Results ===")
    print(f"Train Files: {len(train_files)} ({len(train_files)/len(file_paths)*100:.1f}%)")
    print(f"Test Files: {len(test_files)} ({len(test_files)/len(file_paths)*100:.1f}%)")

    # --- LOAD TRAIN FILES ---
    train_dfs = _load_csv_files(train_files, "Train")
    if not train_dfs:
        raise ValueError("Could not load any training files")

    train_data = pd.concat(train_dfs, ignore_index=True)
    print(f"✅ Train data combined: {train_data.shape}")

    # --- LOAD TEST FILES ---
    test_dfs = _load_csv_files(test_files, "Test")
    if not test_dfs:
        raise ValueError("Could not load any test files")

    test_data = pd.concat(test_dfs, ignore_index=True)
    print(f"✅ Test data combined: {test_data.shape}")

else:
    # === STRATEGY 2: SINGLE FILE (split its rows) ===
    print(f"\nLoading single file: {file_paths[0].split('/')[-1]}")
    df_full = pd.read_csv(file_paths[0], storage_options=storage_options)

    # Generate trip_id from the file name when the file has none.
    if 'trip_id' not in df_full.columns:
        print("   Generating trip_id from filename...")
        df_full['trip_id'] = get_trip_id_from_path(file_paths[0])

    print(f"✅ Total rows: {len(df_full):,}")

    # Small inputs are still split, but flagged.
    if len(df_full) < MIN_ROWS_FOR_SPLIT:
        print(f"\n⚠️  Warning: {len(df_full):,} rows < MIN_ROWS_FOR_SPLIT ({MIN_ROWS_FOR_SPLIT:,})")
        print("   Metrics may be unstable with limited data")

    unique_trips = df_full['trip_id'].unique()
    if len(unique_trips) > 1:
        # The single file contains several trips: assign whole trips to one
        # side so that no trip contributes rows to both train and test.
        print("\n✅ Splitting by existing trip_id...")

        train_trips, test_trips = train_test_split(
            unique_trips,
            train_size=TRAIN_RATIO,
            random_state=RANDOM_SEED,
            shuffle=True
        )

        train_data = df_full[df_full['trip_id'].isin(train_trips)].copy()
        test_data = df_full[df_full['trip_id'].isin(test_trips)].copy()
    else:
        # Only one trip in the file: split its rows without shuffling, so the
        # leading rows go to train and the trailing rows to test and the
        # original row order is kept.
        print("\n✅ Single trip detected. Splitting rows in file order (no shuffling) to preserve row order")

        train_data, test_data = train_test_split(
            df_full,
            train_size=TRAIN_RATIO,
            random_state=RANDOM_SEED,
            shuffle=False
        )

    print("\n=== Split Results === ")
    print(f"Train rows: {len(train_data):,} ({len(train_data)/len(df_full)*100:.1f}%)")
    print(f"Test rows: {len(test_data):,} ({len(test_data)/len(df_full)*100:.1f}%)")

print(f"\n✅ Split complete (seed={RANDOM_SEED})")

In [None]:
# CELL 8: Verify Data Integrity
# Print basic shape information for both splits.
print("\n=== Data Verification ===")
print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")
print(f"Total samples: {len(train_data) + len(test_data):,}")

# Show columns (at most the first ten, plus a count of the remainder).
print(f"\nColumns ({len(train_data.columns)}):")
print(f"  First 10: {list(train_data.columns[:10])}")
if len(train_data.columns) > 10:
    print(f"  ... and {len(train_data.columns) - 10} more")

# Check for trip_id overlap (data leakage).
# NOTE(review): the check only runs for row-level splits; file-level splits
# are not checked even if the input files carry their own trip_id column.
if 'trip_id' in train_data.columns and split_strategy == 'row-level':
    train_trips = set(train_data['trip_id'].unique())
    test_trips = set(test_data['trip_id'].unique())
    overlap = train_trips & test_trips

    if overlap:
        print(f"\nWARNING: {len(overlap)} trips in both train and test!")
        print("   This indicates data leakage")
    else:
        print("\n✅ No trip_id overlap (no data leakage)")

In [None]:
# CELL 9: Save Train and Test Data
print("\n=== Saving Results ===")

# Pickle each split to its configured destination through the S3 filesystem.
# NOTE: pickle files should only ever be loaded from trusted storage.
for lead, label, destination, frame in (
    ("", "train", OUTPUT_TRAIN_DATA, train_data),
    ("\n", "test", OUTPUT_TEST_DATA, test_data),
):
    print(f"{lead}Saving {label} data to {destination}...")
    with fs.open(destination, 'wb') as f:
        pickle.dump(frame, f)
    print(f"✅ {label.capitalize()} data saved")

# Final summary of the run.
print("\n" + "="*70)
print("🎉 TRAIN/TEST SPLIT COMPLETE")
print("="*70)
print(f"Strategy: {split_strategy.upper()}")
print(f"Train samples: {len(train_data):,}")
print(f"Test samples: {len(test_data):,}")
# Configured ratio, not the ratio actually achieved by the split.
print(f"Split ratio: {TRAIN_RATIO*100:.0f}% / {(1-TRAIN_RATIO)*100:.0f}%")
print(f"Random seed: {RANDOM_SEED}")
print("\nOutput files:")
print(f"  Train: {OUTPUT_TRAIN_DATA}")
print(f"  Test: {OUTPUT_TEST_DATA}")
print("="*70)