# ML Quality Eval: Train Model (Simplified)
Train on the training set only. This matches the approach used by your SpeedAccelerationPredictor.

In [None]:
# CELL 1: Parameters
# Run-level configuration for the training notebook. Every value below is a
# default; the MinIO connection settings can additionally be overridden through
# environment variables of the same name so that real credentials do not have
# to be committed with the notebook.
import os

RUN_TIMESTAMP = "2025-01-01_00-00-00"
INPUT_TRAIN_DATA = "s3://models-quality-eval-ml/train/train_data.pkl"
OUTPUT_ML_MODEL_PATH = "s3://models-quality-eval-ml/models/speed_accel_model.pkl"

# Use cross-validation for extra robustness? (slower but more stable)
USE_CROSS_VALIDATION = False  # Set True if you have limited data

# Environment variables win when set; otherwise the original defaults apply.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password123")

In [None]:
# CELL 2: Imports
import pandas as pd
import numpy as np
import pickle
import s3fs
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# XGBoost is optional: when it cannot be loaded the notebook continues with
# the scikit-learn models only and HAS_XGB tells later cells to skip it.
try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    # `Exception` (not a bare `except`) keeps the best-effort behaviour for
    # import/library-load failures while letting KeyboardInterrupt and
    # SystemExit propagate.
    HAS_XGB = False
    print("‚ö†Ô∏è  XGBoost not available, will skip")

print("‚úÖ Libraries imported!")

In [None]:
# CELL 3: MinIO Config
# Connection settings kept as a plain dict, alongside the filesystem handle
# that is built from the same values.
storage_options = {
    "key": MINIO_ACCESS_KEY,
    "secret": MINIO_SECRET_KEY,
    "client_kwargs": {"endpoint_url": MINIO_ENDPOINT},
}

# The filesystem gets its own copy of the client kwargs so the two objects
# stay independent, as in the original two-literal form.
fs = s3fs.S3FileSystem(
    key=storage_options["key"],
    secret=storage_options["secret"],
    client_kwargs=dict(storage_options["client_kwargs"]),
)

In [None]:
# CELL 4: Load Training Data
# Reads the pickled training DataFrame from object storage and fails fast if
# the file is missing or does not contain a DataFrame.
print("=== ML Quality Eval: Training on Train Set ===")
print(f"Run Timestamp: {RUN_TIMESTAMP}")
print(f"\nLoading training data from {INPUT_TRAIN_DATA}...")

try:
    # NOTE: unpickling executes code embedded in the file, so the bucket must
    # only contain artifacts written by trusted pipeline steps.
    with fs.open(INPUT_TRAIN_DATA, 'rb') as train_file:
        df = pickle.load(train_file)
except FileNotFoundError:
    print(f"‚ùå Error: {INPUT_TRAIN_DATA} not found. Run step 01 first.")
    raise

# Anything other than a DataFrame means the upstream step wrote the wrong object.
if not isinstance(df, pd.DataFrame):
    raise TypeError(f"Expected DataFrame, got {type(df)}")
print(f"‚úÖ Loaded DataFrame with {len(df):,} rows")

print(f"Training dataset shape: {df.shape}")
print(f"Columns: {list(df.columns[:15])}...")

In [None]:
# CELL 5: Column Normalization (YOUR WAY)
# Renames known alternative column names to the canonical ones used by the
# feature-engineering cell. A rename is skipped when the canonical column is
# already present, so the first matching alias wins.
print("\nNormalizing column names...")

column_mapping = {
    'timestamp_sensor': 'timestamp',
    'latitude': 'position_lat',
    'longitude': 'position_long',
    'speed_ms': 'speed_mps',
    'altitude': 'enhanced_altitude',
    'acc_forward': 'acceleration_m_s2',
    'acceleration': 'acceleration_m_s2',
}

for alias, canonical in column_mapping.items():
    # Nothing to do if the alias is absent or the canonical name is taken.
    if alias not in df.columns or canonical in df.columns:
        continue
    df.rename(columns={alias: canonical}, inplace=True)

print("‚úÖ Column normalization complete")

In [None]:
# CELL 6: Feature Engineering (EXACTLY YOUR WAY)
# Derives lag, delta, elevation, traffic and heading features. When a trip_id
# column exists every shift/diff is computed within a trip so values never
# carry over from one trip to the next; otherwise the whole frame is treated
# as a single sequence.
print("\nPerforming feature engineering...")

has_trip_id = 'trip_id' in df.columns

# Chronological order (within each trip when trips are known).
df = df.sort_values(['trip_id', 'seconds_elapsed'] if has_trip_id else 'seconds_elapsed')


def _by_trip(column):
    """Return `column` grouped per trip when trip_id exists, else the plain Series."""
    return df.groupby('trip_id')[column] if has_trip_id else df[column]


# Previous speed values (lags 1 and 2); the first rows have no history -> 0.
for lag in (1, 2):
    df[f'speed_mps_prev{lag}'] = _by_trip('speed_mps').shift(lag).fillna(0)

# Position deltas between consecutive samples.
# NOTE(review): delta_dist is expressed in raw lat/lon coordinate units, not metres.
if {'position_lat', 'position_long'}.issubset(df.columns):
    df['delta_lat'] = _by_trip('position_lat').diff().fillna(0)
    df['delta_lon'] = _by_trip('position_long').diff().fillna(0)
    df['delta_dist'] = np.sqrt(df['delta_lat']**2 + df['delta_lon']**2)
else:
    df['delta_lat'] = df['delta_lon'] = df['delta_dist'] = 0

# Elevation gain between consecutive samples.
df['elev_gain_m'] = (
    _by_trip('enhanced_altitude').diff().fillna(0)
    if 'enhanced_altitude' in df.columns
    else 0
)

# Traffic level: ordinal encoding, unknown labels fall back to "moderate" (1).
if 'label_traffic' not in df.columns:
    df['traffic_level'] = 1
else:
    traffic_map = {'heavy': 2, 'moderate': 1, 'light': 0}
    df['traffic_level'] = df['label_traffic'].map(traffic_map).fillna(1)

# Heading/bearing features; a missing bearing column becomes a constant 0.
if 'bearing' not in df.columns:
    df['bearing'] = 0

# NOTE(review): a plain difference does not wrap around 360 degrees, so a
# 359 -> 1 transition registers as a large heading change.
df['heading_change'] = _by_trip('bearing').diff().fillna(0)

# Turn count (sharp heading changes > 15 degrees)
df['turn_count'] = df['heading_change'].abs().gt(15).astype(int)

# Fill any remaining NaN values (applies to every column, including the target).
df = df.fillna(0)

print("‚úÖ Feature engineering complete")
print(f"   Dataset shape after features: {df.shape}")

In [None]:
# CELL 7: Prepare Training Data (YOUR FEATURES)
# Builds the feature matrix X and the speed target from the engineered frame.
feature_cols = [
    'enhanced_altitude', 'bearing',
    'speed_mps_prev1', 'speed_mps_prev2',
    'delta_dist', 'delta_lat', 'delta_lon',
    'elev_gain_m', 'traffic_level',
    'heading_change', 'turn_count',
]

# Any feature the frame lacks is added as an all-zero column so the matrix
# always has the same width and column order.
absent_features = [col for col in feature_cols if col not in df.columns]
if absent_features:
    print(f"‚ö†Ô∏è  Warning: Missing columns {absent_features}. Creating with zeros.")
for feature_name in absent_features:
    df[feature_name] = 0

X = df.loc[:, feature_cols].values
y_speed = df['speed_mps'].values

print(f"\n‚úÖ Training data prepared:")
print(f"   X shape: {X.shape}")
print(f"   y_speed shape: {y_speed.shape}")
print(f"   Features: {feature_cols}")

In [None]:
# CELL 8: Scale Features
# Standardises every feature column using statistics of the full feature
# matrix; the fitted scaler is kept because it is saved with the model.
print("\nScaling features...")
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print("‚úÖ Features scaled")

In [None]:
# CELL 9: Train-Val Split (for model selection)
# The split is taken on the *unscaled* matrix and a dedicated scaler is fitted
# on the training fold only. Splitting the already-scaled matrix would let the
# validation rows influence the mean/std used to scale the training rows,
# which makes the validation metrics optimistic. With the same random_state
# the rows assigned to each fold are identical to splitting X_scaled.
# The full-data `scaler` / `X_scaled` are left untouched for the final refit.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y_speed, test_size=0.2, random_state=42
)

selection_scaler = StandardScaler().fit(X_train_raw)
X_train = selection_scaler.transform(X_train_raw)
X_val = selection_scaler.transform(X_val_raw)

print(f"\nTrain/Validation split:")
print(f"   Training samples: {len(X_train):,}")
print(f"   Validation samples: {len(X_val):,}")

In [None]:
# CELL 10: Define Models (YOUR MODELS with FIXED hyperparameters)
# Candidate regressors are registered by name in insertion order. The
# hyperparameters are fixed (no grid search) to keep the run cheap.
print("\n=== Defining Models (Fixed Hyperparameters) ===")

# Use best hyperparameters from your production training
# NO GridSearchCV - too expensive for weekly quality checks!
models = {}
models['RandomForest'] = RandomForestRegressor(
    n_estimators=300, max_depth=20, random_state=42, n_jobs=-1
)
models['DecisionTree'] = DecisionTreeRegressor(max_depth=20, random_state=42)
models['SVR'] = SVR(C=10, gamma='scale', kernel='rbf')
models['ANN'] = MLPRegressor(
    hidden_layer_sizes=(128, 64), activation='relu', max_iter=500, random_state=42
)

# XGBoost joins the candidates only when the optional import succeeded.
if not HAS_XGB:
    print(f"‚úÖ {len(models)} models defined (XGBoost skipped)")
else:
    models['XGBoost'] = xgb.XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        tree_method='hist',
        eval_metric='rmse',
        random_state=42,
    )
    print(f"‚úÖ {len(models)} models defined (including XGBoost)")

for name in models:
    print(f"   - {name}")

In [None]:
# CELL 11: Train Models
# Fits every candidate on the training fold, scores it on the validation fold
# and collects one row of metrics per model in `df_results` (sorted by RMSE).
# The fitted estimators are kept in `trained_models`.


def _speed_metrics(y_true, y_hat):
    """Return (r2, rmse, mae, mse, mape) for one set of predictions.

    MAPE is computed only over samples whose true value is non-zero: a zero
    target has no defined percentage error, and substituting a tiny
    denominator would inflate the metric by orders of magnitude. It is NaN
    when every target is zero.
    """
    y_true = np.asarray(y_true)
    y_hat = np.asarray(y_hat)

    r2 = r2_score(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)  # derived from the single MSE computation
    mae = mean_absolute_error(y_true, y_hat)

    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_hat[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = float('nan')
    return r2, rmse, mae, mse, mape


print("\n" + "="*70)
print("‚ö° TRAINING MODELS")
print("="*70)

results = []
trained_models = {}

for name, model in models.items():
    print(f"\n{'='*70}")
    print(f"üöÄ Training: {name}")
    print(f"{'='*70}")

    # Train on training set
    model.fit(X_train, y_train)

    # Predict on validation set
    y_pred = model.predict(X_val)

    # Calculate metrics
    r2, rmse, mae, mse, mape = _speed_metrics(y_val, y_pred)

    print(f"\nüìä SPEED METRICS:")
    print(f"  R¬≤   : {r2:.4f}")
    print(f"  RMSE : {rmse:.4f} m/s ({rmse*3.6:.2f} km/h)")
    print(f"  MAE  : {mae:.4f} m/s ({mae*3.6:.2f} km/h)")
    print(f"  MSE  : {mse:.4f}")
    print(f"  MAPE : {mape:.2f}%")

    # Optional: Cross-validation for extra confidence
    if USE_CROSS_VALIDATION:
        print(f"\n  Running 5-fold CV...")
        cv_scores = cross_val_score(
            model, X_scaled, y_speed,
            cv=5,
            scoring='neg_mean_squared_error',
            n_jobs=-1
        )
        # Scores are negative MSE per fold; convert each fold to RMSE first so
        # the reported mean and spread are in the same unit (m/s).
        fold_rmse = np.sqrt(-cv_scores)
        cv_rmse = fold_rmse.mean()
        cv_rmse_std = fold_rmse.std()
        print(f"  CV RMSE: {cv_rmse:.4f} ¬± {cv_rmse_std:.4f}")

    results.append({
        'Model': name,
        'R¬≤': r2,
        'RMSE': rmse,
        'MAE': mae,
        'MSE': mse,
        'MAPE': mape
    })

    trained_models[name] = model

# Create results DataFrame
df_results = pd.DataFrame(results)
df_results = df_results.sort_values('RMSE').reset_index(drop=True)

print("\n" + "="*70)
print("üìä MODEL COMPARISON (sorted by RMSE)")
print("="*70)
print(df_results.to_string(index=False))

In [None]:
# CELL 12: Select Best Model (by RMSE, like your original)
# df_results is sorted ascending by RMSE, so its first row is the winner.
top_row = df_results.iloc[0]
best_name = top_row['Model']
best_model = trained_models[best_name]
best_r2, best_rmse, best_mae = top_row['R¬≤'], top_row['RMSE'], top_row['MAE']

for summary_line in (
    f"\nüèÜ BEST MODEL SELECTED",
    f"‚û° Model : {best_name}",
    f"‚û° R¬≤    : {best_r2:.4f}",
    f"‚û° RMSE  : {best_rmse:.4f} m/s ({best_rmse*3.6:.2f} km/h)",
    f"‚û° MAE   : {best_mae:.4f} m/s ({best_mae*3.6:.2f} km/h)",
):
    print(summary_line)

In [None]:
# CELL 13: Retrain Best Model on Full Training Set
# Refits the winning configuration on every training row (train + validation
# folds) so the saved model has seen all available data.
from sklearn.base import clone

print(f"\nRetraining {best_name} on full training set...")

# Get a genuinely fresh, unfitted estimator with the same hyperparameters.
# `models[best_name]` is the very object that was fitted on the training fold
# (it is also `best_model`), so refitting it directly would overwrite the
# validation-scored model in place.
final_model = clone(models[best_name])
final_model.fit(X_scaled, y_speed)

print("‚úÖ Final model trained on all training data")

In [None]:
# CELL 14: Save Model (YOUR ARTIFACT FORMAT)
# Bundles the fitted scaler, the final model and its selection metrics into a
# single pickled dict and writes it to object storage.
print(f"\nSaving model to {OUTPUT_ML_MODEL_PATH}...")

# NOTE(review): these values come from the hold-out validation split scored in
# CELL 11, even though the key and the printed labels call them "train".
selection_metrics = {
    "r2": float(best_r2),
    "rmse": float(best_rmse),
    "mae": float(best_mae),
    "model_name": best_name,
}

# Match YOUR artifact structure exactly (key names and order are unchanged).
artifact = {
    "scaler": scaler,
    "speed_model": final_model,
    "speed_model_name": best_name,
    "feature_cols": feature_cols,
    "train_metrics": selection_metrics,
}

with fs.open(OUTPUT_ML_MODEL_PATH, 'wb') as model_file:
    pickle.dump(artifact, model_file)

banner = "=" * 70
for report_line in (
    "‚úÖ ML Model saved successfully",
    "\n" + banner,
    "üéâ TRAINING COMPLETE",
    banner,
    f"Run Timestamp: {RUN_TIMESTAMP}",
    f"Best Model: {best_name}",
    f"Training R¬≤: {best_r2:.4f}",
    f"Training RMSE: {best_rmse:.4f} m/s ({best_rmse*3.6:.2f} km/h)",
    f"Training MAE: {best_mae:.4f} m/s ({best_mae*3.6:.2f} km/h)",
    f"Model saved to: {OUTPUT_ML_MODEL_PATH}",
):
    print(report_line)