# 🚗 ML Models for Speed & Acceleration Prediction

**Machine Learning Models untuk Prediksi Speed & Acceleration per Detik**

---

## Overview
Notebook ini melatih dan membandingkan beberapa model ML untuk prediksi speed:
- SVR (Support Vector Regression)
- Random Forest
- Decision Tree
- XGBoost (Gradient Boosting)
- ANN (MLPRegressor)

**Input Features:**
- `elev_gain_m`, `traffic_index`, `turn_count`, `label_traffic`
- Previous state (speed, acceleration)
- Rolling statistics
- Time-based features

**Output:**
- `speed_mps` per detik (diprediksi langsung oleh model)
- `acceleration` per detik (diturunkan dari selisih predicted speed dengan speed sebelumnya)

## 📦 Install & Import Libraries

In [None]:
# Parameters passed from Papermill
# Every name is a placeholder: a value left as None is replaced by a default
# in a later cell of this notebook.
INPUT_PROCESSED_FOLDER = None    # folder that is globbed for *.csv training files
OUTPUT_ML_MODEL_PATH = None      # destination of the pickled model artifact
OUTPUT_TRAIN_RESULTS_CSV = None  # destination of the per-row results CSV
MINIO_ENDPOINT = None            # endpoint URL handed to the S3 client
MINIO_ACCESS_KEY = None          # access key handed to the S3 client
MINIO_SECRET_KEY = None          # secret key handed to the S3 client

In [None]:
# Parameters
# NOTE(review): this looks like the cell Papermill injects with the values of
# one particular run — confirm. It re-assigns every placeholder, so the
# `is None` fallbacks in the next cell do nothing for this run.
# NOTE(review): the MinIO access key and secret are stored here in plain text;
# check that they are not real credentials before sharing this notebook.
INPUT_PROCESSED_FOLDER = "s3://processed-data"
OUTPUT_ML_MODEL_PATH = "s3://models/ml/speed_accel_model.pkl"
OUTPUT_TRAIN_RESULTS_CSV = "s3://models/ml/speed_accel_train_results.csv"
MINIO_ENDPOINT = "http://minio:9000"
MINIO_ACCESS_KEY = "admin"
MINIO_SECRET_KEY = "password123"

In [None]:
import os


def _param_or_default(value, env_var, default):
    """Resolve one notebook parameter.

    Args:
        value: the value currently bound to the parameter (None = not supplied).
        env_var: name of the environment variable consulted when `value` is None.
        default: value used when neither the parameter nor the variable is set.

    Returns:
        `value` if it is not None, otherwise the environment variable's value,
        otherwise `default`.
    """
    if value is not None:
        return value
    return os.environ.get(env_var, default)


# Precedence for every setting: Papermill parameter > environment variable >
# built-in default (the defaults are the values this cell used before).
INPUT_PROCESSED_FOLDER = _param_or_default(
    INPUT_PROCESSED_FOLDER, "INPUT_PROCESSED_FOLDER", "s3://processed-data"
)
OUTPUT_ML_MODEL_PATH = _param_or_default(
    OUTPUT_ML_MODEL_PATH, "OUTPUT_ML_MODEL_PATH", "s3://models/ml/speed_accel_model.pkl"
)
OUTPUT_TRAIN_RESULTS_CSV = _param_or_default(
    OUTPUT_TRAIN_RESULTS_CSV,
    "OUTPUT_TRAIN_RESULTS_CSV",
    "s3://models/ml/speed_accel_train_results.csv",
)
MINIO_ENDPOINT = _param_or_default(MINIO_ENDPOINT, "MINIO_ENDPOINT", "http://minio:9000")

# Credentials can now come from the environment instead of the notebook.
# NOTE(review): the literal fallbacks are kept only so existing runs keep
# working; prefer MINIO_ACCESS_KEY / MINIO_SECRET_KEY environment variables.
MINIO_ACCESS_KEY = _param_or_default(MINIO_ACCESS_KEY, "MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = _param_or_default(MINIO_SECRET_KEY, "MINIO_SECRET_KEY", "password123")


In [None]:
# Install libraries
!pip install -q scikit-learn matplotlib seaborn pandas numpy joblib

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)
from sklearn.multioutput import MultiOutputRegressor
from sklearn.base import clone

import joblib
import requests
import datetime
import json
import warnings
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("‚úÖ Libraries imported successfully!")

‚úÖ Libraries imported successfully!


In [None]:
import s3fs
import pandas as pd

# Connection settings shared by the s3fs filesystem and by pandas' S3 reader,
# built once instead of once per file.
storage_options = {
    'key': MINIO_ACCESS_KEY,
    'secret': MINIO_SECRET_KEY,
    'client_kwargs': {'endpoint_url': MINIO_ENDPOINT}
}

fs = s3fs.S3FileSystem(**storage_options)

# Load all CSV files in processed-data/
folder = INPUT_PROCESSED_FOLDER.rstrip("/")
# Sorted so the row order of the concatenated frame is deterministic; later
# cells use diff()/shift() and an unshuffled split, which depend on row order.
csv_files = sorted(fs.glob(f"{folder}/*.csv"))

print("Found CSV files:", csv_files)

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder}")

df_list = []
for file in csv_files:
    # ensure correct URI format
    file_path = file if file.startswith("s3://") else f"s3://{file}"
    print("Reading:", file_path)
    df_list.append(pd.read_csv(file_path, storage_options=storage_options))

df = pd.concat(df_list, ignore_index=True)
print("Training dataset shape:", df.shape)


## 📁 Load Data

**Option 1:** Upload your CSV files

**Option 2:** Use demo data (uncomment code below)


## 🔧 Data Preprocessing

In [None]:
# Column normalization
# Alternative source column names -> the names used by the rest of the notebook.
column_mapping = {
    'timestamp_sensor': 'timestamp',
    'latitude': 'position_lat',
    'longitude': 'position_long',
    'speed_ms': 'speed_mps',
    'altitude': 'enhanced_altitude',
    'acc_forward': 'acceleration_m_s2'
}

# rename() leaves the frame untouched for mapping keys that are not columns,
# so only the source names actually present are renamed.
df = df.rename(columns=column_mapping)

In [None]:
# ====== SIMPLE TRAFFIC LEVEL FROM SPEED ======

# free flow speed = 95th percentile (free-flowing road)
free_flow = df['speed_mps'].quantile(0.95)

# A zero/NaN free-flow speed would turn the ratio below into inf/NaN.
if not free_flow > 0:
    raise ValueError(f"Free flow speed must be positive, got {free_flow}")

# traffic index scaled 0-1
# Built from the PREVIOUS row's speed: traffic_level is a model input and
# speed_mps is the prediction target, so using the same row's speed would
# hand the target to the model. The first row has no predecessor and uses 0,
# matching how the lagged speed features are filled.
previous_speed = df['speed_mps'].shift(1).fillna(0)
df['traffic_level'] = (1 - previous_speed / free_flow).clip(0, 1)

print("Free flow speed:", free_flow)
print(df['traffic_level'].describe())


Free flow speed: 9.175667053820742
count    876.000000
mean       0.531686
std        0.315976
min        0.000000
25%        0.268470
50%        0.523144
75%        0.796607
max        1.000000
Name: traffic_level, dtype: float64


In [None]:
# Elevation gain

# Non-numeric altitude values become NaN first.
altitude = pd.to_numeric(df['enhanced_altitude'], errors='coerce')

# Gaps are filled from the neighbouring samples. Filling them with 0 would
# create a large fake drop and climb in the row-to-row difference below.
# 0 is used only when the column holds no numeric value at all.
df['enhanced_altitude'] = altitude.ffill().bfill().fillna(0)

# Signed altitude change versus the previous row (first row -> 0).
df['elev_gain_m'] = df['enhanced_altitude'].diff().fillna(0)


In [None]:
# Turn features

# Absolute bearing difference between consecutive rows (first row -> 0).
bearing_step = df['bearing'].diff().abs().fillna(0)

# Take the smaller of the difference and its complement to 360.
df['heading_change'] = np.minimum(bearing_step, 360 - bearing_step)

# A row counts as a turn when the heading changed by more than 30.
df['is_turn'] = df['heading_change'].gt(30).astype(int)

# Number of turn rows among the current row and up to 29 rows before it.
df['turn_count'] = df['is_turn'].rolling(window=30, min_periods=1).sum()


In [None]:
# === FEATURE ENGINEERING ===

# Speed one and two rows back; rows without a predecessor get 0.
speed = df['speed_mps']
for lag in (1, 2):
    df[f'speed_mps_prev{lag}'] = speed.shift(lag).fillna(0)

# Acceleration from raw speed: change in speed versus the previous row.
df['acceleration'] = speed.sub(df['speed_mps_prev1']).fillna(0)


In [None]:
# Remove every row that has a missing value in any column, then renumber the
# index from 0 (a later cell indexes rows by position through the labels).
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True)

In [None]:
from geopy.distance import geodesic

# (lat, lon) pair of every row, in row order.
points = list(zip(df["position_lat"], df["position_long"]))

# Geodesic distance in metres from each row to the one before it; the first
# row has no predecessor and gets 0.0.
distances = [0.0] + [
    geodesic(previous_point, current_point).meters
    for previous_point, current_point in zip(points, points[1:])
]

df["distance_m"] = distances
df["distance_cum_m"] = df["distance_m"].cumsum()


In [None]:
# Movement deltas
# Row-to-row change of each coordinate (first row -> 0).
for delta_col, source_col in (('delta_lat', 'position_lat'),
                              ('delta_lon', 'position_long')):
    df[delta_col] = df[source_col].diff().fillna(0)

# Distance to the previous row, exposed under the feature name.
df['delta_dist'] = df['distance_m']

## 🤖 Machine Learning Models Class

### Speed ML

In [None]:
def _as_target_array(values):
    """Convert targets/predictions to a float ndarray, flattening an (n, 1) column."""
    arr = np.asarray(values, dtype=float)
    if arr.ndim == 2 and arr.shape[1] == 1:
        arr = arr.ravel()
    return arr


def calculate_metrics(y_true, y_pred, eps=1e-6):
    """Compute regression metrics for one set of predictions.

    Args:
        y_true: observed values (list, Series or array). An (n, 1) column is
            treated as a 1-D vector.
        y_pred: predicted values, same shape as `y_true` after that flattening.
        eps: value substituted for targets that are exactly 0 in the MAPE
            denominator.

    Returns:
        dict with the keys "R2", "MSE", "RMSE", "MAE" and "MAPE" (MAPE in percent).

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    y_true = _as_target_array(y_true)
    y_pred = _as_target_array(y_pred)

    # Without this check an (n, 1) target against an (n,) prediction would
    # broadcast to an (n, n) matrix in the MAPE expression below.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)

    # Avoid divide-by-zero for MAPE
    # NOTE(review): rows whose target is exactly 0 are divided by `eps`, so a
    # few such rows can dominate the reported MAPE.
    y_true_safe = np.where(y_true == 0, eps, y_true)
    mape = np.mean(np.abs((y_true - y_pred) / y_true_safe)) * 100

    r2 = r2_score(y_true, y_pred)

    return {
        "R2": r2,
        "MSE": mse,
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape
    }


In [None]:
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.base import clone


class SpeedAccelerationPredictor:
    """Grid-search several regressors for speed and keep the best one.

    Only the speed target (column 0 of the targets passed to `train_models`)
    is fitted. Features are standardized with one `StandardScaler` that is
    fitted on the training features and reused for every prediction.
    """

    # Upper bound on the number of GroupKFold folds; fewer folds are used when
    # the training data contains fewer distinct groups.
    MAX_CV_SPLITS = 5

    def __init__(self, random_state=42):
        """
        Train models to predict speed (mps).

        Args:
            random_state: seed given to every candidate estimator that accepts
                one, so repeated runs produce the same models. Pass None for
                unseeded estimators.
        """
        self.random_state = random_state

        # ============================
        # MODEL CANDIDATES
        # ============================
        self.models = {
            "SVR": SVR(),
            "RandomForest": RandomForestRegressor(random_state=random_state),
            "DecisionTree": DecisionTreeRegressor(random_state=random_state),
            "XGBoost": xgb.XGBRegressor(
                tree_method="hist", eval_metric="rmse", random_state=random_state
            ),
            # "LightGBM": lgb.LGBMRegressor(),
            "ANN": MLPRegressor(max_iter=500, random_state=random_state)
        }

        # ============================
        # PARAMETER SEARCH GRID
        # ============================
        # One grid per key of self.models.
        self.param_grids = {
            "SVR": {
                "C": [1, 10, 50],
                "gamma": ["scale", 0.1, 0.01],
                "kernel": ["rbf", "poly"]
            },
            "RandomForest": {
                "n_estimators": [100, 300],
                "max_depth": [None, 10, 20]
            },
            "DecisionTree": {
                "max_depth": [None, 10, 20, 30]
            },
            "XGBoost": {
                "n_estimators": [100, 200],
                "learning_rate": [0.05, 0.1],
                "max_depth": [4, 6]
            },
            # "LightGBM": {
            #     "n_estimators": [200, 400],
            #     "learning_rate": [0.05, 0.1],
            #     "num_leaves": [31, 63]
            # },
            "ANN": {
                "hidden_layer_sizes": [(64, 32), (128, 64)],
                "activation": ["relu", "tanh"]
            }
        }

        self.scaler = StandardScaler()
        # model name -> {"model", "params", "metrics", "cv_rmse"}
        self.results = {}

        self.best_speed_model = None
        self.best_speed_name = None

    @staticmethod
    def _speed_column(y):
        """Return the speed target as a 1-D array (column 0 of a 2-D target)."""
        arr = np.asarray(y)
        if arr.ndim == 1:
            return arr
        return arr[:, 0]

    # ============================================================
    # TRAIN MODELS WITH GroupKFold + GridSearchCV
    # ============================================================
    def train_models(self, X_train, y_train, X_test, y_test, groups=None):
        """Tune every candidate with grouped cross-validation and pick the best.

        Args:
            X_train: training features.
            y_train: training targets; 1-D, or 2-D with speed in column 0.
            X_test: held-out features.
            y_test: held-out targets, same layout as `y_train`.
            groups: group label of every training row (required); rows sharing
                a label are kept in the same cross-validation fold.

        Returns:
            `self.results`: model name -> dict with the tuned "model", its
            "params", the test-set "metrics" and the cross-validated "cv_rmse".

        Raises:
            ValueError: if `groups` is missing, does not have one label per
                training row, or holds fewer than two distinct labels.
        """
        # IMPORTANT: use Group K-Fold for trajectory data
        # Checked before any work is done so a bad call fails immediately.
        if groups is None:
            raise ValueError("‚ùå groups must be provided (e.g., segment_id or trip_id)")

        groups = np.asarray(groups)
        if len(groups) != len(X_train):
            raise ValueError(
                f"groups has {len(groups)} entries but X_train has {len(X_train)} rows"
            )

        n_groups = len(np.unique(groups))
        if n_groups < 2:
            raise ValueError("At least 2 distinct groups are required for GroupKFold")

        print("="*70)
        print("‚ö° TRAINING MODELS WITH GROUP K-FOLD + GRID SEARCH")
        print("="*70)

        # Scaling
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled  = self.scaler.transform(X_test)

        y_speed_train = self._speed_column(y_train)
        y_speed_test  = self._speed_column(y_test)

        # GroupKFold cannot make more folds than there are groups.
        gkf = GroupKFold(n_splits=min(self.MAX_CV_SPLITS, n_groups))

        best_rmse = np.inf

        for name, base_model in self.models.items():
            print(f"\n{'='*70}")
            print(f"üöÄ Training: {name}")
            print(f"{'='*70}")

            # Grid Search CV
            # The splitter object is passed as `cv` and the labels go to fit(),
            # so the search asks the splitter for the folds itself.
            grid = GridSearchCV(
                estimator=base_model,
                param_grid=self.param_grids[name],
                scoring="neg_mean_squared_error",
                cv=gkf,
                n_jobs=-1,
                verbose=1
            )

            # Train
            grid.fit(X_train_scaled, y_speed_train, groups=groups)

            best_model = grid.best_estimator_
            print(f"‚úî Best Params: {grid.best_params_}")

            # ===== Predict on Test Set =====
            pred_speed = best_model.predict(X_test_scaled)

            # ===== Metrics =====
            metrics = calculate_metrics(y_speed_test, pred_speed)

            print("\nüìä SPEED METRICS:")
            print(f"  R¬≤   : {metrics['R2']:.4f}")
            print(f"  RMSE : {metrics['RMSE']:.4f}")
            print(f"  MAE  : {metrics['MAE']:.4f}")
            print(f"  MSE  : {metrics['MSE']:.4f}")
            print(f"  MAPE : {metrics['MAPE']:.2f}%")

            # Save results
            self.results[name] = {
                "model": best_model,
                "params": grid.best_params_,
                "metrics": metrics,
                # best_score_ is the negated mean squared error of the best grid point.
                "cv_rmse": float(np.sqrt(-grid.best_score_))
            }

            # Track best by RMSE
            # NOTE(review): the winner is chosen on the test-set RMSE, so the
            # winner's own test metrics are optimistically biased; "cv_rmse"
            # is stored as a comparison that does not use the test set.
            if metrics["RMSE"] < best_rmse:
                best_rmse = metrics["RMSE"]
                self.best_speed_model = best_model
                self.best_speed_name = name


        print("\nüèÜ BEST MODEL SELECTED")
        best = self.results[self.best_speed_name]["metrics"]
        print(f"‚û° Model : {self.best_speed_name}")
        print(f"‚û° RMSE  : {best['RMSE']:.4f}")
        print(f"‚û° R¬≤    : {best['R2']:.4f}")

        return self.results


    # ============================================================
    # PREDICT SEQUENCE
    # ============================================================
    def predict_sequence(self, X):
        """Predict speed for `X` with the selected model.

        Args:
            X: features in the same column order used for training.

        Returns:
            Predictions as a column vector of shape (n_rows, 1).
        """
        X_scaled = self.scaler.transform(X)
        pred_speed = self.best_speed_model.predict(X_scaled)
        return pred_speed.reshape(-1, 1)


    # ============================================================
    # RESULTS TABLE
    # ============================================================
    def build_results_table(self):
        """Show and return the test metrics of every trained model.

        Returns:
            DataFrame with the columns Model, R2, MSE, RMSE, MAE and MAPE,
            sorted by ascending RMSE. It is empty (with those columns) when no
            model has been trained yet.
        """
        metric_cols = ["R2", "MSE", "RMSE", "MAE", "MAPE"]

        rows = [
            {"Model": model_name, **{col: info["metrics"][col] for col in metric_cols}}
            for model_name, info in self.results.items()
        ]

        # Explicit columns keep the sort below valid when there are no rows.
        df_results = pd.DataFrame(rows, columns=["Model"] + metric_cols)
        df_results = df_results.sort_values("RMSE").reset_index(drop=True)

        try:
            from IPython.display import display
            display(df_results.style.format("{:.4f}", subset=metric_cols))
        except ImportError:
            # IPython, or the optional dependency behind DataFrame.style, is
            # not installed: fall back to plain text.
            print(df_results.to_string(float_format="{:.4f}".format))

        return df_results


### Speed Train Models

In [None]:
# Create X, y
# Model inputs, in the column order the scaler and the models are fitted on.
feature_cols = [
    'enhanced_altitude',
    'bearing',
    'speed_mps_prev1',
    'speed_mps_prev2',
    'delta_dist',
    'delta_lat',
    'delta_lon',
    'elev_gain_m',
    'traffic_level',
    'heading_change',
    'turn_count',
]

# Single prediction target.
target_cols = ['speed_mps']

In [None]:
# Features, target and per-row group label, all taken from the same rows of df.
X_df = df.loc[:, feature_cols]
y_df = df.loc[:, target_cols]
groups_df = df.loc[:, "segment_id"]

In [None]:
print("üìä Splitting data...")

# Unshuffled hold-out: the last 20% of the rows become the test set.
# random_state has no effect while shuffle=False.
split_parts = train_test_split(
    X_df, y_df, groups_df,
    test_size=0.2,
    random_state=42,
    shuffle=False
)
X_train_df, X_test_df, y_train_df, y_test_df, groups_train, groups_test = split_parts

print(f"  ‚úÖ Training samples DF: {len(X_train_df)}")
print(f"  ‚úÖ Testing samples DF:  {len(X_test_df)}")


In [None]:
# Plain arrays for the predictor; y_* keep a column axis because they come
# from a one-column DataFrame.
X_train, X_test = X_train_df.values, X_test_df.values
y_train, y_test = y_train_df.values, y_test_df.values

# Group labels of the training rows only (used for grouped cross-validation).
groups = groups_train.values

üìä Splitting data into train/test sets...
  ‚úÖ Training samples: 700
  ‚úÖ Testing samples: 176
ü§ñ TRAINING SEPARATE MODELS FOR SPEED & ACCELERATION

üîÑ Training model: Random Forest

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9999
      RMSE: 0.0193
      MAE:  0.0140
      MSE:  0.0004

üîÑ Training model: Gradient Boosting

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9997
      RMSE: 0.0341
      MAE:  0.0260
      MSE:  0.0012

üîÑ Training model: Ridge Regression

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9811
      RMSE: 0.2732
      MAE:  0.2066
      MSE:  0.0747

üîÑ Training model: Lasso Regression

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9852
      RMSE: 0.2413
      MAE:  0.1693
      MSE:  0.0582

üîÑ Training model: SVR

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9958
      RMSE: 0.1291
      MAE:  0.0958
      MSE:  0.0167

üèÜ BEST MODELS SELECTED:
  ‚û§ Model: Random Forest
      R¬≤:   0.9999
      RMSE: 0.0193
      MAE:  0.0140
      MSE:  0.0004


In [None]:
# Tune every candidate model on the training rows and score it on the test rows.
predictor = SpeedAccelerationPredictor()

results = predictor.train_models(X_train, y_train, X_test, y_test, groups=groups)

In [None]:
# One row of test metrics per trained model, sorted by ascending RMSE.
df_results = predictor.build_results_table()

Unnamed: 0,Model,Speed_R2,Speed_RMSE,Speed_MAE,Speed_MSE
0,Random Forest,0.9999,0.0193,0.014,0.0004
1,Gradient Boosting,0.9997,0.0341,0.026,0.0012
2,SVR,0.9958,0.1291,0.0958,0.0167
3,Lasso Regression,0.9852,0.2413,0.1693,0.0582
4,Ridge Regression,0.9811,0.2732,0.2066,0.0747


### Speed Prediction

In [None]:
# Build X for full dataset
X = df[[
    'enhanced_altitude', 'bearing',
    'speed_mps_prev1', 'speed_mps_prev2',
    'delta_dist', 'delta_lat', 'delta_lon',
    'elev_gain_m', 'traffic_level',
    'heading_change', 'turn_count'
]].values


In [None]:
# Scale with the scaler fitted during training, then predict speed for every
# row of df. X is built from all of df, so rows used for training are included.
X_scaled_full = predictor.scaler.transform(X)
predicted_speed = predictor.best_speed_model.predict(X_scaled_full)

In [None]:
# Attach the prediction to df (one value per row, same row order as X).
df['predicted_speed'] = predicted_speed

### Acceleration Prediction

In [None]:
# Acceleration is derived, not predicted by a model: predicted speed minus the
# previous row's observed speed. There is no division by a time step (a
# `/ df['dt']` term was left disabled) — assumes rows are 1 s apart; TODO confirm.
df['predicted_accel'] = (df['predicted_speed'] - df['speed_mps_prev1'])

In [None]:
# Observed acceleration (row-to-row change of observed speed) versus the
# acceleration derived from the predicted speed.
y_true_accel = df['acceleration']
y_pred_accel = df['predicted_accel']

In [None]:
# Same metrics, computed by the shared helper instead of a second inline copy,
# so speed and acceleration are always scored the same way.
accel_metrics = calculate_metrics(y_true_accel, y_pred_accel)

accel_mse = accel_metrics["MSE"]
accel_rmse = accel_metrics["RMSE"]
accel_mae = accel_metrics["MAE"]
accel_r2 = accel_metrics["R2"]
accel_mape = accel_metrics["MAPE"]

print("\nüìä ACCELERATION REGRESSION METRICS (based on speed diff):")
print("========================================")
print(f"R¬≤   : {accel_r2:.4f}")
print(f"MSE  : {accel_mse:.4f}")
print(f"RMSE : {accel_rmse:.4f}")
print(f"MAE  : {accel_mae:.4f}")
print(f"MAPE : {accel_mape:.2f}%")

üìä ACCURACY OF ACCELERATION (from speed diff):
   R¬≤:   0.9929
   MAE:  0.0151
   RMSE: 0.0504
   MSE:  0.0025


In [None]:
# CELL: Comprehensive Visualization - Speed & Acceleration Predictions
# Draws one figure with five panels (speed: time series, scatter, error
# histogram; acceleration: time series, scatter) and prints a metric summary.
# All metrics in this cell are computed over every row of df.
print("=== GENERATING PREDICTION VISUALIZATIONS ===\n")

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Determine sample size for visualization
# Only the two time-series panels use this window; the scatter and histogram
# panels use all rows.
full_length = len(df['speed_mps'])
sample_length = min(1000, full_length)  # Show max 1000 seconds
sample_start = 0  # Or choose: np.random.randint(0, full_length - sample_length)
sample_end = sample_start + sample_length

print(f"Visualizing {sample_length} seconds of predictions")
print(f"  (from index {sample_start} to {sample_end})\n")

# Create figure
# 3 x 2 grid: row 0 is one full-width panel, rows 1 and 2 hold two panels each.
fig = plt.figure(figsize=(20, 14))
gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.35, wspace=0.25)

# Calculate R² for display
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

speed_r2 = r2_score(df['speed_mps'], df['predicted_speed'])
speed_rmse = np.sqrt(mean_squared_error(df['speed_mps'], df['predicted_speed']))
speed_mae = mean_absolute_error(df['speed_mps'], df['predicted_speed'])

# Overwrites any accel_* values computed in an earlier cell.
accel_r2 = r2_score(df['acceleration'], df['predicted_accel'])
accel_rmse = np.sqrt(mean_squared_error(df['acceleration'], df['predicted_accel']))
accel_mae = mean_absolute_error(df['acceleration'], df['predicted_accel'])

# ========================================
# ROW 1: SPEED VISUALIZATIONS
# ========================================

# Plot 1: Speed Time Series
# The x axis is the row index; it equals seconds only if rows are 1 s apart
# (assumption — TODO confirm).
ax1 = fig.add_subplot(gs[0, :])
time_axis = np.arange(sample_start, sample_end)
ax1.plot(time_axis, df['speed_mps'].iloc[sample_start:sample_end], 
         'b-', linewidth=2, label='Actual Speed', alpha=0.8)
ax1.plot(time_axis, df['predicted_speed'].iloc[sample_start:sample_end], 
         'r-', linewidth=2, label='Predicted Speed', alpha=0.7)
ax1.set_xlabel('Time (seconds)', fontsize=12, fontweight='bold')
ax1.set_ylabel('Speed (m/s)', fontsize=12, fontweight='bold')
ax1.set_title(f'Speed Prediction: Actual vs Predicted\nR¬≤={speed_r2:.4f} | RMSE={speed_rmse:.4f} m/s | MAE={speed_mae:.4f} m/s', 
              fontsize=14, fontweight='bold')
ax1.legend(loc='upper right', fontsize=11, framealpha=0.95)
ax1.grid(True, alpha=0.3)

# Plot 2: Speed Scatter Plot
ax2 = fig.add_subplot(gs[1, 0])
ax2.scatter(df['speed_mps'], df['predicted_speed'], 
            alpha=0.4, s=8, c='steelblue', edgecolors='none')
# Perfect prediction line
# Diagonal spanning the combined range of actual and predicted values.
min_speed = min(df['speed_mps'].min(), df['predicted_speed'].min())
max_speed = max(df['speed_mps'].max(), df['predicted_speed'].max())
ax2.plot([min_speed, max_speed], [min_speed, max_speed], 
         'r--', linewidth=2.5, label='Perfect Prediction')
ax2.set_xlabel('Actual Speed (m/s)', fontsize=11, fontweight='bold')
ax2.set_ylabel('Predicted Speed (m/s)', fontsize=11, fontweight='bold')
ax2.set_title(f'Speed Correlation\nR¬≤={speed_r2:.4f}', fontsize=13, fontweight='bold')
ax2.legend(loc='upper left', fontsize=10)
ax2.grid(True, alpha=0.3)
ax2.set_aspect('equal', adjustable='box')

# Add statistics text
stats_text = f'Mean Actual: {df["speed_mps"].mean():.3f} m/s\n'
stats_text += f'Mean Predicted: {df["predicted_speed"].mean():.3f} m/s\n'
stats_text += f'Std Actual: {df["speed_mps"].std():.3f} m/s\n'
stats_text += f'Std Predicted: {df["predicted_speed"].std():.3f} m/s'
ax2.text(0.05, 0.95, stats_text, transform=ax2.transAxes, 
         fontsize=9, verticalalignment='top',
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

# Plot 3: Speed Error Distribution
# Error is actual minus predicted, so a positive value means under-prediction.
ax3 = fig.add_subplot(gs[1, 1])
speed_errors = df['speed_mps'] - df['predicted_speed']
ax3.hist(speed_errors, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
ax3.axvline(x=0, color='red', linestyle='--', linewidth=2.5, label='Zero Error')
ax3.axvline(x=speed_errors.mean(), color='green', linestyle='-', 
            linewidth=2, label=f'Mean Error: {speed_errors.mean():.4f}')
ax3.set_xlabel('Prediction Error (m/s)', fontsize=11, fontweight='bold')
ax3.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax3.set_title(f'Speed Error Distribution\nMean={speed_errors.mean():.4f} | Std={speed_errors.std():.4f} m/s', 
              fontsize=13, fontweight='bold')
ax3.legend(loc='upper right', fontsize=10)
ax3.grid(True, alpha=0.3, axis='y')

# ========================================
# ROW 2: ACCELERATION VISUALIZATIONS
# ========================================

# Plot 4: Acceleration Time Series
# Same row window as the speed time series above.
ax4 = fig.add_subplot(gs[2, 0])
ax4.plot(time_axis, df['acceleration'].iloc[sample_start:sample_end], 
         'b-', linewidth=2, label='Actual Acceleration', alpha=0.8)
ax4.plot(time_axis, df['predicted_accel'].iloc[sample_start:sample_end], 
         'r-', linewidth=2, label='Predicted Acceleration', alpha=0.7)
ax4.axhline(y=0, color='gray', linestyle='-', linewidth=1, alpha=0.5)
ax4.set_xlabel('Time (seconds)', fontsize=11, fontweight='bold')
ax4.set_ylabel('Acceleration (m/s¬≤)', fontsize=11, fontweight='bold')
ax4.set_title(f'Acceleration Prediction: Actual vs Predicted\nR¬≤={accel_r2:.4f} | RMSE={accel_rmse:.4f} m/s¬≤', 
              fontsize=13, fontweight='bold')
ax4.legend(loc='upper right', fontsize=10)
ax4.grid(True, alpha=0.3)

# Plot 5: Acceleration Scatter Plot
ax5 = fig.add_subplot(gs[2, 1])
ax5.scatter(df['acceleration'], df['predicted_accel'], 
            alpha=0.4, s=8, c='coral', edgecolors='none')
# Perfect prediction line
min_accel = min(df['acceleration'].min(), df['predicted_accel'].min())
max_accel = max(df['acceleration'].max(), df['predicted_accel'].max())
ax5.plot([min_accel, max_accel], [min_accel, max_accel], 
         'r--', linewidth=2.5, label='Perfect Prediction')
# Zero lines mark the boundary between acceleration and deceleration.
ax5.axhline(y=0, color='gray', linestyle='-', linewidth=1, alpha=0.5)
ax5.axvline(x=0, color='gray', linestyle='-', linewidth=1, alpha=0.5)
ax5.set_xlabel('Actual Acceleration (m/s¬≤)', fontsize=11, fontweight='bold')
ax5.set_ylabel('Predicted Acceleration (m/s¬≤)', fontsize=11, fontweight='bold')
ax5.set_title(f'Acceleration Correlation\nR¬≤={accel_r2:.4f}', fontsize=13, fontweight='bold')
ax5.legend(loc='upper left', fontsize=10)
ax5.grid(True, alpha=0.3)
ax5.set_aspect('equal', adjustable='box')

# Add statistics text
stats_text_accel = f'Mean Actual: {df["acceleration"].mean():.4f} m/s¬≤\n'
stats_text_accel += f'Mean Predicted: {df["predicted_accel"].mean():.4f} m/s¬≤\n'
stats_text_accel += f'Std Actual: {df["acceleration"].std():.4f} m/s¬≤\n'
stats_text_accel += f'Std Predicted: {df["predicted_accel"].std():.4f} m/s¬≤'
ax5.text(0.05, 0.95, stats_text_accel, transform=ax5.transAxes, 
         fontsize=9, verticalalignment='top',
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

# Overall title
fig.suptitle('ML Training Results: Speed & Acceleration Predictions', 
             fontsize=16, fontweight='bold', y=0.995)

plt.show()

# Text summary of the same metrics; speed errors are also shown in km/h
# (m/s multiplied by 3.6).
print("\n‚úÖ Visualization complete!")
print("\nüìä SUMMARY:")
print(f"  Speed:")
print(f"    - R¬≤ Score: {speed_r2:.4f}")
print(f"    - RMSE: {speed_rmse:.4f} m/s ({speed_rmse*3.6:.4f} km/h)")
print(f"    - MAE: {speed_mae:.4f} m/s ({speed_mae*3.6:.4f} km/h)")
print(f"\n  Acceleration:")
print(f"    - R¬≤ Score: {accel_r2:.4f}")
print(f"    - RMSE: {accel_rmse:.4f} m/s¬≤")
print(f"    - MAE: {accel_mae:.4f} m/s¬≤")
print(f"\n  Sample size: {len(df):,} data points")
print(f"  Visualization window: {sample_length} seconds")

In [None]:
# Feature names, in model input order; this list is stored in the saved
# model artifact.
feature_cols = [
    'enhanced_altitude',
    'bearing',
    'speed_mps_prev1',
    'speed_mps_prev2',
    'delta_dist',
    'delta_lat',
    'delta_lon',
    'elev_gain_m',
    'traffic_level',
    'heading_change',
    'turn_count',
]


In [None]:
# Preview the first 10 rows, now including predicted_speed and predicted_accel.
df.head(10)

In [None]:
# Save CSV locally (optional)
df.to_csv("train_results.csv", index=False)

# Upload to MinIO
with fs.open(OUTPUT_TRAIN_RESULTS_CSV, "w") as f:
    df.to_csv(f, index=False)

print("Training results saved to:", OUTPUT_TRAIN_RESULTS_CSV)

In [None]:
import pickle

# Everything needed to reproduce a prediction: the fitted scaler, the selected
# model with its name, and the feature order it expects.
artifact = dict(
    scaler=predictor.scaler,
    speed_model=predictor.best_speed_model,
    speed_model_name=predictor.best_speed_name,
    feature_cols=feature_cols,
)

# Pickle straight to the configured model path through the S3 filesystem.
with fs.open(OUTPUT_ML_MODEL_PATH, "wb") as model_file:
    pickle.dump(artifact, model_file)


print("ML Model saved to:", OUTPUT_ML_MODEL_PATH)
