# ML Quality Eval: Train Model (Simplified)
Trains on the training set only, matching the approach used by your SpeedAccelerationPredictor.

In [None]:
# CELL 1: Parameters
# Run-level settings read by the later cells. Everything here is a plain
# top-level constant.

# --- Object-store connection ---
# NOTE(review): the access key and secret are hard-coded in the notebook;
# confirm these are throwaway development credentials.
MINIO_ENDPOINT = "http://minio:9000"
MINIO_ACCESS_KEY = "admin"
MINIO_SECRET_KEY = "password123"

# --- Run identity and artifact locations ---
RUN_TIMESTAMP = "2025-01-01_00-00-00"
INPUT_TRAIN_DATA = "s3://models-quality-eval-ml/train/train_data.pkl"
OUTPUT_ML_MODEL_PATH = "s3://models-quality-eval-ml/models/speed_accel_model.pkl"

# --- Training options ---
# Cross-validation toggle: slower but more stable, meant for limited data.
USE_CROSS_VALIDATION = False

In [None]:
# CELL 2: Imports
import pandas as pd
import numpy as np
import pickle
import s3fs
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# XGBoost is optional: record whether it could be imported so later cells
# can skip the XGBoost model instead of failing.
try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    # `except Exception` (not a bare `except:`) keeps the best-effort
    # behaviour for a missing or broken install while letting
    # KeyboardInterrupt / SystemExit propagate.
    HAS_XGB = False
    print("‚ö†Ô∏è  XGBoost not available, will skip")

print("‚úÖ Libraries imported!")

In [None]:
# CELL 3: MinIO Config
# Connection settings as a plain dict (key / secret / client_kwargs).
storage_options = dict(
    key=MINIO_ACCESS_KEY,
    secret=MINIO_SECRET_KEY,
    client_kwargs=dict(endpoint_url=MINIO_ENDPOINT),
)

# Filesystem handle used by the later cells through fs.open(...).
# It gets its own copy of client_kwargs so the two objects stay independent.
fs = s3fs.S3FileSystem(
    key=storage_options["key"],
    secret=storage_options["secret"],
    client_kwargs=dict(storage_options["client_kwargs"]),
)

In [None]:
# CELL 4: Load Training Data
# Reads the pickled training DataFrame from object storage into `df`.
print("=== ML Quality Eval: Training on Train Set ===")
print(f"Run Timestamp: {RUN_TIMESTAMP}")
print(f"\nLoading training data from {INPUT_TRAIN_DATA}...")

try:
    # NOTE: pickle.load executes whatever the file contains; only point
    # INPUT_TRAIN_DATA at a trusted bucket.
    with fs.open(INPUT_TRAIN_DATA, 'rb') as handle:
        df = pickle.load(handle)
except FileNotFoundError:
    # Print a hint about the missing upstream step, then re-raise unchanged.
    print(f"‚ùå Error: {INPUT_TRAIN_DATA} not found. Run step 01 first.")
    raise

# Anything other than a DataFrame is rejected.
if not isinstance(df, pd.DataFrame):
    raise TypeError(f"Expected DataFrame, got {type(df)}")
print(f"‚úÖ Loaded DataFrame with {len(df):,} rows")

print(f"Training dataset shape: {df.shape}")
print(f"Columns: {list(df.columns[:15])}...")

In [None]:
# CELL 5: Column Normalization
# Maps alternative source column names onto the names the rest of the
# notebook uses.
print("\nNormalizing column names...")

column_mapping = {
    'timestamp_sensor': 'timestamp',
    'latitude': 'position_lat',
    'longitude': 'position_long',
    'speed_ms': 'speed_mps',
    'altitude': 'enhanced_altitude',
    'acc_forward': 'acceleration_m_s2',
}

# Rename only when the source column exists and the target name is still
# free, so an already-present canonical column is never overwritten.
# No target name is also a source name, so one batched rename gives the
# same result as renaming one column at a time.
pending_renames = {
    source: target
    for source, target in column_mapping.items()
    if source in df.columns and target not in df.columns
}
df.rename(columns=pending_renames, inplace=True)

print("‚úÖ Column normalization complete")

In [None]:
# CELL 6: Advanced Feature Engineering & Cleaning (Training Set)
# Builds lag, rolling, segment, map and heading features on `df`, then drops
# near-stationary rows. Differences, shifts and rolling windows are computed
# per trip when a 'trip_id' column exists, so no window mixes rows from two
# different trips.
# NOTE(review): any inference-side copy of this feature code must compute the
# rolling features the same way, otherwise train and serve features diverge.
print("\nPerforming Advanced Feature Engineering & Cleaning...")

has_trip_id = 'trip_id' in df.columns

# 1. SORT DATA (required): every diff/shift/rolling below relies on rows
# being in time order within a trip.
if has_trip_id:
    df = df.sort_values(['trip_id', 'seconds_elapsed'])
else:
    df = df.sort_values('seconds_elapsed')

# Group keys as a positional array (taken after sorting) so that grouping a
# derived Series never depends on index alignment.
trip_keys = df['trip_id'].to_numpy() if has_trip_id else None


def _trip_diff(column):
    """Row-to-row difference of df[column], restarted at each trip; first row -> 0."""
    source = df.groupby('trip_id')[column] if has_trip_id else df[column]
    return source.diff().fillna(0)


def _within_trip(series, func):
    """Apply func (Series -> same-length Series) to each trip separately."""
    if trip_keys is None:
        return func(series)
    return series.groupby(trip_keys, sort=False).transform(func)


# 2. REPAIR DATA (when the speed column is broken / all zeros)
if df['speed_mps'].mean() < 0.1 and 'position_lat' in df.columns:
    print("‚ö†Ô∏è Speed data broken. Recalculating from GPS coords...")
    # Simple Euclidean approximation: lat/lon difference times 111000.
    # The per-row distance is used directly as speed.
    # NOTE(review): that only equals m/s if rows are 1 s apart - confirm.
    # Works with or without 'trip_id' (the ungrouped case used to raise KeyError).
    df['d_lat'] = _trip_diff('position_lat')
    df['d_lon'] = _trip_diff('position_long')
    df['calc_dist'] = np.sqrt(df['d_lat']**2 + df['d_lon']**2) * 111000
    # 3-sample centred smoothing, kept inside each trip.
    df['speed_mps'] = _within_trip(
        df['calc_dist'], lambda s: s.rolling(window=3, center=True).mean()
    ).fillna(0)

# 3. FEATURE ENGINEERING
# A. Basic Lag Features
grouper = df.groupby('trip_id')['speed_mps'] if has_trip_id else df['speed_mps']
df['speed_mps_prev1'] = grouper.shift(1).fillna(0)
df['speed_mps_prev2'] = grouper.shift(2).fillna(0)
df['accel_prev1'] = (df['speed_mps_prev1'] - df['speed_mps_prev2']).fillna(0)

# B. Rolling Features (5-sample trend) - smooths the predicted curve.
# Shift by one row first so the window never contains the current target,
# then roll within each trip so the window does not reach into the previous trip.
roll_src = grouper.shift(1)
df['speed_roll_mean_5s'] = _within_trip(
    roll_src, lambda s: s.rolling(5, min_periods=1).mean()
).fillna(0)
df['speed_roll_std_5s'] = _within_trip(
    roll_src, lambda s: s.rolling(5, min_periods=1).std()
).fillna(0)

# C. Segment Context (tells the model whether this is a fast or slow road)
# NOTE(review): both variants average 'speed_mps' including the current row,
# i.e. the value that is later used as the training target.
if 'segment_id' in df.columns:
    df['segment_avg_speed'] = df.groupby('segment_id')['speed_mps'].transform('mean')
else:
    # Fallback: expanding mean, restarted per trip when trips are known.
    df['segment_avg_speed'] = _within_trip(
        df['speed_mps'], lambda s: s.expanding().mean()
    )

# D. Map Features
if 'enhanced_altitude' in df.columns:
    df['elev_gain_m'] = _trip_diff('enhanced_altitude')
else:
    df['elev_gain_m'] = 0

if 'label_traffic' in df.columns:
    traffic_map = {'heavy': 2, 'moderate': 1, 'light': 0}
    # Unknown or missing labels fall back to 1 ("moderate").
    df['traffic_level'] = df['label_traffic'].map(traffic_map).fillna(1)
else:
    df['traffic_level'] = 1

if 'position_lat' in df.columns:
    df['delta_lat'] = _trip_diff('position_lat')
    df['delta_lon'] = _trip_diff('position_long')
    # Same unit as the lat/lon columns (not converted to metres here).
    df['delta_dist'] = np.sqrt(df['delta_lat']**2 + df['delta_lon']**2)
else:
    df['delta_lat'] = 0
    df['delta_lon'] = 0
    df['delta_dist'] = 0

if 'bearing' in df.columns:
    df['heading_change'] = _trip_diff('bearing')
    # NOTE(review): a plain difference does not wrap around (e.g. 359 -> 1);
    # presumably bearing is in degrees - confirm before relying on turn_count.
    df['turn_count'] = (np.abs(df['heading_change']) > 15).astype(int)
else:
    df['heading_change'] = 0
    df['turn_count'] = 0

df = df.fillna(0)

# 4. FILTER STATIONARY (important so the model does not learn a flat line)
# Done after the lag features so they are computed from the unfiltered rows.
print(f"Original Rows: {len(df)}")
df = df[df['speed_mps'] > 0.5]  # drop parked / near-stationary rows
print(f"‚úÖ Filtered (Moving) Rows: {len(df)}")

print("‚úÖ Advanced Feature Engineering Complete.")

In [None]:
# Display the first rows of df for a quick visual check.
df.head()

In [None]:
# CELL 6.5: DATA SANITY CHECK & REPAIR (required)
# Re-checks the speed column, rebuilds it from GPS step distance if it looks
# broken, drops stationary rows and stops the run when nothing is left.
print("\n=== CHECKING & REPAIRING SPEED DATA ===")

# 1. Check whether the speed column is (almost) all zeros.
max_speed = df['speed_mps'].max()
mean_speed = df['speed_mps'].mean()
print(f"Original Data -> Max Speed: {max_speed:.4f} m/s, Mean Speed: {mean_speed:.4f} m/s")

# 2. If the data is broken (all zeros), recompute speed from GPS distance.
# NOTE(review): once Cell 6 has filtered df to speed_mps > 0.5 the mean cannot
# be below 0.1, so this branch only runs if that filter is skipped.
if mean_speed < 0.1:  # suspicion threshold
    print("‚ö†Ô∏è WARNING: Speed data seems broken (too low/zero). Recalculating from GPS...")

    # delta_dist must already exist (computed in Cell 6). It is the raw
    # lat/lon difference, so convert it with the same 111000 factor Cell 6's
    # repair uses; assigning it unconverted would not be a speed in m/s.
    # Speed = distance / time, assuming 1 Hz data (dt = 1).
    step_m = df['delta_dist'] * 111000

    def _smooth(s):
        # 3-sample centred moving average to reduce noise.
        return s.rolling(window=3, center=True).mean()

    if 'trip_id' in df.columns:
        # Keep smoothing and differencing inside each trip.
        _trip_keys = df['trip_id'].to_numpy()
        df['speed_mps'] = step_m.groupby(_trip_keys, sort=False).transform(_smooth).fillna(0)
        # Recalculate acceleration from the repaired speed.
        df['accel_from_speed'] = df['speed_mps'].groupby(_trip_keys, sort=False).diff().fillna(0)
    else:
        df['speed_mps'] = _smooth(step_m).fillna(0)
        # Recalculate acceleration from the repaired speed.
        df['accel_from_speed'] = df['speed_mps'].diff().fillna(0)

    print(f"‚úÖ REPAIRED Data -> Max Speed: {df['speed_mps'].max():.4f} m/s")

# 3. Drop stationary rows so the model focuses on driving behaviour.
initial_len = len(df)
df = df[df['speed_mps'] > 0.5]  # keep rows moving faster than 0.5 m/s
print(f"Dropped {initial_len - len(df)} stationary rows. New training size: {len(df)}")

if len(df) == 0:
    raise ValueError("‚ùå STOP: Tidak ada data bergerak di Training Set! Cek Step 01 (Split).")

In [None]:
# CELL 7: Prepare Training Data
# Model inputs, grouped by origin. The concatenation order is the column
# order of X and is stored with the model artifact.
_lag_features = ['speed_mps_prev1', 'speed_mps_prev2', 'accel_prev1']
_rolling_features = ['speed_roll_mean_5s', 'speed_roll_std_5s']
_segment_features = ['segment_avg_speed']
_raw_features = ['enhanced_altitude', 'bearing']
_position_features = ['delta_dist', 'delta_lat', 'delta_lon']
_context_features = ['elev_gain_m', 'traffic_level']
_heading_features = ['heading_change', 'turn_count']

feature_cols = (
    _lag_features
    + _rolling_features
    + _segment_features
    + _raw_features
    + _position_features
    + _context_features
    + _heading_features
)

# Any feature column that is not present is created as a constant 0.
absent_cols = [col for col in feature_cols if col not in df.columns]
for col in absent_cols:
    df[col] = 0

# Feature matrix and target as NumPy arrays; the target is the current speed.
X = df[feature_cols].to_numpy()
y_speed = df['speed_mps'].to_numpy()

print(f"‚úÖ Features: {feature_cols}")

In [None]:
# CELL 8: Scale Features
# Creates the shared `scaler` object and a scaled copy of the full matrix.
# NOTE(review): this fit uses every row of X; Cell 9.5 later re-fits the same
# scaler on X_train only.
print("\nScaling features...")
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print("‚úÖ Features scaled")

In [None]:
# CELL 9: Train-Val Split (RAW FEATURES)
# shuffle=False keeps row order: the first 80% of rows train, the last 20%
# validate.
split_parts = train_test_split(
    X,
    y_speed,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)
X_train, X_val, y_speed_train, y_speed_val = split_parts

n_train_rows = len(X_train)
n_val_rows = len(X_val)

print("\nTrain/Validation split:")
print(f"   Training samples: {n_train_rows:,}")
print(f"   Validation samples: {n_val_rows:,}")

In [None]:
# CELL 9.5: Scale ONLY for SVR & ANN
# Fit the scaler on the training rows only, then apply those statistics to
# both splits.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [None]:
# CELL 10: Define 5 Core Models (Environment Compatible)
# Builds the `models` dict (name -> unfitted estimator) used by the training
# loop. XGBoost is added only when the import in Cell 2 succeeded.
print("Defining 5 core models (XGBoost using MAE objective)...")

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
# xgboost is intentionally NOT re-imported here: an unconditional import
# would crash this cell when the package is missing and defeat the HAS_XGB
# guard. When HAS_XGB is True, `xgb` is already bound by the guarded import.

models = {
    # 1. RANDOM FOREST (Baseline)
    'RandomForest': RandomForestRegressor(
        n_estimators=300,
        max_depth=30,        # Deep trees to capture variance
        min_samples_leaf=2,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    ),

    # 2. DECISION TREE (Simple)
    'DecisionTree': DecisionTreeRegressor(
        max_depth=20,
        min_samples_leaf=4,
        random_state=42
    ),

    # 3. SVR (Support Vector)
    'SVR': SVR(
        C=100,           # Large C = weak regularization, i.e. aggressive fitting
        gamma=0.1,
        epsilon=0.01,
        kernel='rbf'
    ),

    # 4. ANN (Neural Network)
    'ANN': MLPRegressor(
        hidden_layer_sizes=(128, 64, 32),
        activation='relu',
        alpha=0.0001,
        learning_rate_init=0.001,
        max_iter=500,
        early_stopping=True,
        validation_fraction=0.1,
        random_state=42
    )
}

# 5. XGBOOST (Compatible & Robust)
if HAS_XGB:
    # NOTE(review): 'reg:absoluteerror' needs a reasonably recent xgboost
    # release - confirm the version installed in the target environment.
    models['XGBoost'] = xgb.XGBRegressor(
        objective='reg:absoluteerror',
        n_estimators=600,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.9,
        reg_alpha=0.1,
        reg_lambda=1.0,
        n_jobs=-1,
        random_state=42
    )
    print(f"‚úÖ {len(models)} models defined (including XGBoost MAE)")
else:
    print(f"‚úÖ {len(models)} models defined (XGBoost skipped)")

for name in models.keys():
    print(f"   - {name}")

In [None]:
# CELL 11: Training Loop (Smart Scaling)
# Fits every model on the training split, scores it on the validation split
# and records R2 / RMSE / MAE per model.
print("\n=== STARTING TRAINING LOOP ===")

results = []
trained_models = {}

# The estimators expect a 1-D target.
y_train_flat = y_speed_train.ravel()

# SVR and the neural network get standardized inputs; the tree-based models
# (RF, DT, XGB) are trained on the raw feature values.
scaled_model_names = ('SVR', 'ANN')

for name, model in models.items():
    print(f"\nüöÄ Training: {name}")

    wants_scaled = name in scaled_model_names
    if wants_scaled:
        X_tr, X_v = X_train_scaled, X_val_scaled
    else:
        X_tr, X_v = X_train, X_val
    print(
        "   -> Using SCALED data (StandardScaler)"
        if wants_scaled
        else "   -> Using RAW data (Original Units)"
    )

    # Fit on the training split, predict the validation split.
    model.fit(X_tr, y_train_flat)
    y_pred = model.predict(X_v)

    # Validation metrics.
    r2 = r2_score(y_speed_val, y_pred)
    mse = mean_squared_error(y_speed_val, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_speed_val, y_pred)

    # Store the scores and keep the fitted estimator.
    scores = {'Model': name, 'R¬≤': r2, 'RMSE': rmse, 'MAE': mae}
    results.append(scores)
    trained_models[name] = model

    print(f"   R¬≤   : {r2:.4f}")
    print(f"   RMSE : {rmse:.4f} m/s")

In [None]:
# CELL 12: Select Best Model (by RMSE, like your original)
# Rank all models by validation RMSE (ascending); the first row wins.
df_results = pd.DataFrame(results).sort_values('RMSE').reset_index(drop=True)

winner_row = df_results.iloc[0]
best_name = winner_row['Model']
best_model = trained_models[best_name]
best_r2, best_rmse, best_mae = winner_row['R¬≤'], winner_row['RMSE'], winner_row['MAE']

print("\n‚úÖ BEST MODEL (Speed):", best_name)
print(df_results)

# Errors are shown in m/s and, multiplied by 3.6, in km/h.
rmse_kmh = best_rmse*3.6
mae_kmh = best_mae*3.6

print("\nüèÜ BEST MODEL SELECTED")
print(f"‚û° Model : {best_name}")
print(f"‚û° R¬≤    : {best_r2:.4f}")
print(f"‚û° RMSE  : {best_rmse:.4f} m/s ({rmse_kmh:.2f} km/h)")
print(f"‚û° MAE   : {best_mae:.4f} m/s ({mae_kmh:.2f} km/h)")

In [None]:
# Refit the winning model configuration on every available row (training and
# validation rows together) to produce the model that gets saved.
from sklearn.base import clone

# Use an unfitted copy with the same hyper-parameters. Refitting
# models[best_name] directly would overwrite best_model /
# trained_models[best_name], which are the same object and are the fit the
# reported validation metrics describe.
final_model = clone(models[best_name])

if best_name in ['SVR', 'ANN']:
    # The scaler is re-fit on all rows here, so arrays scaled earlier with the
    # train-only fit no longer match this scaler's statistics.
    X_full = scaler.fit_transform(X)
else:
    # Tree-based models are trained on the raw feature values.
    X_full = X

final_model.fit(X_full, y_speed)

print("‚úÖ Final model trained on full data")

In [None]:
# CELL 13.5: Calculate & Validate Derived Acceleration
# ---------------------------------------------------
# Logic: Accel_Pred(t) = Speed_Pred(t) - Speed_Pred(t-1)
# Acceleration is derived purely from the predicted speed, without looking at
# the true speed of the same row.
#
# NOTE(review): final_model was fitted on all of X, which contains the
# validation rows, so the numbers below are optimistic. Consecutive rows may
# also belong to different trips or straddle removed stationary rows, so some
# differences are not true one-step accelerations.

print("\n=== DERIVED ACCELERATION VALIDATION ===")

# 1. Predict speed on the validation rows.
# X_val must be in time order (shuffle=False in the split is essential).
# For SVR/ANN, transform X_val with the scaler in its current state: the
# final-fit cell re-fits the scaler on all rows, so the earlier X_val_scaled
# no longer matches what final_model was trained on.
if best_name in ['SVR', 'ANN']:
    X_val_input = scaler.transform(X_val)
else:
    X_val_input = X_val
y_pred_speed = final_model.predict(X_val_input)

# 2. Predicted acceleration: difference between consecutive predictions.
pred_speed_series = pd.Series(y_pred_speed)
pred_accel = pred_speed_series.diff().fillna(0).values

# 3. Reference acceleration, computed the same way from the true validation
# speed so the comparison is like-for-like.
real_speed_series = pd.Series(y_speed_val)
real_accel = real_speed_series.diff().fillna(0).values

# 4. Accuracy metrics for the derived acceleration.
accel_rmse = np.sqrt(mean_squared_error(real_accel, pred_accel))
accel_mae = mean_absolute_error(real_accel, pred_accel)
accel_r2 = r2_score(real_accel, pred_accel)

print(f"üìä Derived Acceleration Metrics (Calculated from Predicted Speed):")
print(f"   R¬≤ Score : {accel_r2:.4f}")
print(f"   RMSE     : {accel_rmse:.4f} m/s¬≤")
print(f"   MAE      : {accel_mae:.4f} m/s¬≤")

# 5. Quick visual check (optional): first 200 validation points.

plt.figure(figsize=(15, 5))
plt.plot(real_accel[:200], label='Real Accel (Diff from Real Speed)', alpha=0.7)
plt.plot(pred_accel[:200], label='Pred Accel (Diff from Pred Speed)', alpha=0.7)
plt.title("Derived Acceleration: Real vs Predicted (First 200 pts)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Write the per-row predictions of the final model next to the training data.
print("Saving training logs to CSV...")

# Feed the model the same representation it was trained on: standardized
# features for SVR/ANN, raw features for the tree-based models. Scaling the
# input of a model trained on raw values would give meaningless predictions.
log_inputs = df[feature_cols].values
if best_name in ['SVR', 'ANN']:
    log_inputs = scaler.transform(log_inputs)

df['predicted_speed'] = final_model.predict(log_inputs)
df.to_csv("train_predictions_log.csv", index=False)

In [None]:
# CELL 14: Save Model (YOUR ARTIFACT FORMAT)
# Pickles the scaler, the final model and its metadata to object storage and
# prints a run summary.
print(f"\nSaving model to {OUTPUT_ML_MODEL_PATH}...")

# The metric values come from the validation split scored in the training
# loop; the key name "train_metrics" is part of the artifact format.
metrics_block = {
    "r2": float(best_r2),
    "rmse": float(best_rmse),
    "mae": float(best_mae),
    "model_name": best_name,
}

# Keep this structure (keys and nesting) exactly as downstream code expects.
# NOTE(review): the scaler is only applied for SVR/ANN in this notebook;
# consumers presumably need to branch on speed_model_name - confirm.
artifact = {
    "scaler": scaler,
    "speed_model": final_model,
    "speed_model_name": best_name,
    "feature_cols": feature_cols,
    "train_metrics": metrics_block,
}

with fs.open(OUTPUT_ML_MODEL_PATH, 'wb') as f:
    pickle.dump(artifact, f)

print("‚úÖ ML Model saved successfully")

# Run summary; errors are shown in m/s and, multiplied by 3.6, in km/h.
banner = "="*70
summary_rmse_kmh = best_rmse*3.6
summary_mae_kmh = best_mae*3.6

print("\n" + banner)
print("üéâ TRAINING COMPLETE")
print(banner)
print(f"Run Timestamp: {RUN_TIMESTAMP}")
print(f"Best Model: {best_name}")
print(f"Training R¬≤: {best_r2:.4f}")
print(f"Training RMSE: {best_rmse:.4f} m/s ({summary_rmse_kmh:.2f} km/h)")
print(f"Training MAE: {best_mae:.4f} m/s ({summary_mae_kmh:.2f} km/h)")
print(f"Model saved to: {OUTPUT_ML_MODEL_PATH}")