# 🚗 ML Models for Speed & Acceleration Prediction

**Machine Learning Models untuk Prediksi Speed & Acceleration per Detik**

---

## Overview
Notebook ini train dan compare multiple ML models:
- Random Forest
- Gradient Boosting
- Ridge Regression
- Lasso Regression
- SVR (Support Vector Regression)

**Input Features:**
- `elev_gain_m`, `traffic_index`, `turn_count`, `label_traffic`
- Previous state (speed, acceleration)
- Rolling statistics
- Time-based features

**Output:**
- `speed_m_s` per detik
- `acceleration_m_s2` per detik

## 📦 Install & Import Libraries

In [None]:
# Parameters passed from Papermill
# Each name is left as None here; the following cell substitutes a default
# for every parameter that is still None after injection.
INPUT_PROCESSED_FOLDER = None  # folder that is globbed for *.csv training files
OUTPUT_ML_MODEL_PATH = None  # destination path of the pickled model artifact
MINIO_ENDPOINT = None  # endpoint_url handed to the S3 client
MINIO_ACCESS_KEY = None  # access key for the S3 client
MINIO_SECRET_KEY = None  # secret key for the S3 client

In [None]:
import os

# Resolution order for every setting:
#   1. the value injected by Papermill (anything other than None),
#   2. an environment variable with the same name,
#   3. the local-development default below.
# NOTE(review): the credential defaults are hard-coded development values.
# Supply real credentials through Papermill or the environment instead of
# editing this cell.
if INPUT_PROCESSED_FOLDER is None:
    INPUT_PROCESSED_FOLDER = os.environ.get("INPUT_PROCESSED_FOLDER") or "s3://processed-data"

if OUTPUT_ML_MODEL_PATH is None:
    OUTPUT_ML_MODEL_PATH = (
        os.environ.get("OUTPUT_ML_MODEL_PATH") or "s3://models/ml/speed_accel_model.pkl"
    )

if MINIO_ENDPOINT is None:
    MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT") or "http://minio:9000"

if MINIO_ACCESS_KEY is None:
    MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY") or "admin"

if MINIO_SECRET_KEY is None:
    MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY") or "password123"


In [None]:
# Install libraries
!pip install -q scikit-learn matplotlib seaborn pandas numpy joblib

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.base import clone
import joblib
import requests
import datetime
import json
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn's "whitegrid" style and a (12, 6) default figure size.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("‚úÖ Libraries imported successfully!")

‚úÖ Libraries imported successfully!


In [None]:
import s3fs
import pandas as pd

# Connection settings shared by the s3fs client and by pandas' S3 reads, so
# both always talk to the same endpoint with the same credentials.
storage_options = {
    'key': MINIO_ACCESS_KEY,
    'secret': MINIO_SECRET_KEY,
    'client_kwargs': {'endpoint_url': MINIO_ENDPOINT},
}

fs = s3fs.S3FileSystem(**storage_options)

# Load all CSV files directly under the input folder
folder = INPUT_PROCESSED_FOLDER.rstrip("/")
csv_files = fs.glob(f"{folder}/*.csv")

print("Found CSV files:", csv_files)

if not csv_files:
    # pd.concat would fail with an opaque "No objects to concatenate";
    # fail early with a message that names the folder instead.
    raise ValueError(f"No CSV files found under {folder}/")

df_list = []
for file in csv_files:
    # Prefix the scheme when the globbed path does not carry it
    file_path = file if file.startswith("s3://") else f"s3://{file}"
    print("Reading:", file_path)
    df_list.append(pd.read_csv(file_path, storage_options=storage_options))

# Stack all files into one frame with a fresh 0..n-1 index
df = pd.concat(df_list, ignore_index=True)
print("Training dataset shape:", df.shape)


## 📁 Load Data

**Option 1:** Upload your CSV files

**Option 2:** Use demo data (uncomment code below)


## 🔧 Data Preprocessing

In [None]:
# Column normalization
# Map each alternative source column name onto the name the rest of the
# notebook uses.
column_mapping = dict(
    timestamp_sensor='timestamp',
    latitude='position_lat',
    longitude='position_long',
    speed_ms='speed_mps',
    altitude='enhanced_altitude',
    acc_forward='acceleration_m_s2',
)

# rename() leaves the frame untouched for mapping keys it does not contain.
df.rename(columns=column_mapping, inplace=True)

In [None]:
# ====== SIMPLE TRAFFIC LEVEL FROM SPEED ======
# NOTE(review): traffic_level is a function of the same row's speed_mps. If
# that column is also the prediction target, this feature encodes the target;
# confirm this is intended.
# NOTE(review): a free-flow speed of 0 would make the division below produce
# inf/NaN; confirm the data always contains movement.

# Free-flow speed = 95th percentile of the observed speeds (unobstructed road)
free_flow = df['speed_mps'].quantile(q=0.95)

# Traffic index: relative slowdown versus free flow, bounded to the range 0-1
df['traffic_level'] = (1 - df['speed_mps'].div(free_flow)).clip(lower=0, upper=1)

print("Free flow speed:", free_flow)
print(df['traffic_level'].describe())


Free flow speed: 9.175667053820742
count    876.000000
mean       0.531686
std        0.315976
min        0.000000
25%        0.268470
50%        0.523144
75%        0.796607
max        1.000000
Name: traffic_level, dtype: float64


In [None]:
# Elevation gain
# Altitude entries that are missing or cannot be parsed as numbers become 0
# before differencing.
# NOTE(review): a substituted 0 next to real altitudes shows up as a large
# step in elev_gain_m; confirm that is acceptable for this data.
df['enhanced_altitude'] = pd.to_numeric(
    df['enhanced_altitude'], errors='coerce'
).fillna(value=0)

# Row-to-row altitude difference (signed); the first row gets 0.
df['elev_gain_m'] = df['enhanced_altitude'].diff(periods=1).fillna(value=0)


In [None]:
# Turn features
# Absolute bearing change from the previous row, folded so that a change x
# and its complement 360 - x are measured as the smaller of the two.
# NOTE(review): assumes bearing is expressed in degrees within [0, 360); TODO confirm.
bearing_step = df['bearing'].diff().abs().fillna(0)
df['heading_change'] = np.minimum(bearing_step, 360 - bearing_step)

# A row is flagged as a turn when the heading changed by more than 30.
df['is_turn'] = df['heading_change'].gt(30).astype(int)

# Number of flagged rows within the trailing window of up to 30 rows.
df['turn_count'] = df['is_turn'].rolling(window=30, min_periods=1).sum()


In [None]:
# === FEATURE ENGINEERING ===
# Lagged speed: the value one and two rows earlier. Leading rows that have no
# predecessor are filled with 0.
for lag_col, lag in (('speed_mps_prev1', 1), ('speed_mps_prev2', 2)):
    df[lag_col] = df['speed_mps'].shift(lag).fillna(0)

# Acceleration from raw speed: difference to the previous row's speed.
# NOTE(review): not divided by the elapsed time between rows, so this equals
# m/s^2 only if consecutive rows are one second apart; TODO confirm.
df['acceleration'] = df['speed_mps'].sub(df['speed_mps_prev1']).fillna(0)


In [None]:
# Remove every row that has a missing value in any column, then renumber the
# index from 0 so that label-based lookups in later cells match row positions.
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True)

In [None]:
from geopy.distance import geodesic

# Geodesic distance in metres between each row's position and the previous
# row's position. The first row has no predecessor, so its distance is 0.
coords = list(zip(df["position_lat"], df["position_long"]))
distances = [0.0] + [
    geodesic(prev_point, point).meters
    for prev_point, point in zip(coords, coords[1:])
]

df["distance_m"] = distances
# Running total of the per-row distances
df["distance_cum_m"] = df["distance_m"].cumsum()


In [None]:
# Movement deltas
# Row-to-row change in latitude and longitude; the first row gets 0.
for delta_col, source_col in (
    ('delta_lat', 'position_lat'),
    ('delta_lon', 'position_long'),
):
    df[delta_col] = df[source_col].diff().fillna(0)

# Per-row travelled distance, copied under the delta_* naming scheme.
df['delta_dist'] = df['distance_m']

## 🤖 Machine Learning Models Class

### Speed ML

In [None]:
class SpeedAccelerationPredictor:
    """
    Train several candidate regressors on the speed target and keep the best.

    Every candidate in ``self.models`` is cloned, fitted on standardised
    features and evaluated on the supplied test split. The candidate with the
    highest composite score (see ``compute_model_score``) is stored as
    ``best_speed_model``.

    Only the speed target is modelled by this class; no acceleration model is
    trained here.
    """

    def __init__(self):
        # Unfitted templates; train_models() fits clones and leaves these as-is.
        self.models = {
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
            'Ridge Regression': Ridge(alpha=1.0),
            'Lasso Regression': Lasso(alpha=0.1),
            'SVR': SVR(kernel='rbf', C=50, gamma=0.1)
        }

        # Fitted on the training features inside train_models().
        self.scaler = StandardScaler()
        # Per-model fitted estimator, test predictions and metrics.
        self.results = {}

        self.best_speed_model = None
        self.best_speed_name = None

    # ============================================================
    # TRAIN MODELS
    # ============================================================
    @staticmethod
    def _speed_column(y):
        """Return the speed target as a 1-D array (first column of 2-D input)."""
        y = np.asarray(y)
        return y[:, 0] if y.ndim > 1 else y

    def train_models(self, X_train, y_train, X_test, y_test):
        """
        Fit every candidate model on the speed target and select the best one.

        Args:
            X_train: Training feature matrix.
            y_train: Training targets. For 2-D input the first column is used
                as the speed target; 1-D input is used as-is.
            X_test: Test feature matrix, scaled with the scaler fitted on
                ``X_train``.
            y_test: Test targets, handled like ``y_train``.

        Returns:
            ``self.results``: a dict keyed by model name whose values hold the
            fitted model (``"speed_model"``), its test predictions
            (``"pred_speed"``) and ``"metrics"`` -> ``"speed"`` -> r2 / rmse /
            mae / mse.

        Note:
            The best model is chosen by its score on the test split, so the
            metrics reported for it are not an unbiased estimate.
        """
        print("="*70)
        print("ü§ñ TRAINING SEPARATE MODELS FOR SPEED & ACCELERATION")
        print("="*70)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        y_speed_train = self._speed_column(y_train)
        y_speed_test = self._speed_column(y_test)

        best_speed_score = -np.inf
        # Stays None until a candidate beats the initial score.
        best_speed_metrics = None

        for name, base_model in self.models.items():
            print(f"\n{'='*70}")
            print(f"üîÑ Training model: {name}")
            print(f"{'='*70}")

            # Fit a fresh copy so the template in self.models stays unfitted.
            model_speed = clone(base_model)
            model_speed.fit(X_train_scaled, y_speed_train)

            pred_speed = model_speed.predict(X_test_scaled)

            # ---- EVALUATION METRICS (speed) ----
            mse_speed = mean_squared_error(y_speed_test, pred_speed)
            rmse_speed = np.sqrt(mse_speed)
            mae_speed = mean_absolute_error(y_speed_test, pred_speed)
            r2_speed = r2_score(y_speed_test, pred_speed)

            speed_metrics = {
                "r2": r2_speed,
                "rmse": rmse_speed,
                "mae": mae_speed,
                "mse": mse_speed
            }

            print("\nüìä Results:")
            print("  ‚û§ SPEED:")
            print(f"      R¬≤:   {r2_speed:.4f}")
            print(f"      RMSE: {rmse_speed:.4f}")
            print(f"      MAE:  {mae_speed:.4f}")
            print(f"      MSE:  {mse_speed:.4f}")

            # Store all metrics
            self.results[name] = {
                "speed_model": model_speed,
                "pred_speed": pred_speed,
                "metrics": {
                    "speed": speed_metrics
                }
            }

            # Track the best model by composite score
            score_speed = self.compute_model_score(
                r2_speed, rmse_speed, mae_speed, mse_speed
            )

            if score_speed > best_speed_score:
                best_speed_score = score_speed
                self.best_speed_model = model_speed
                self.best_speed_name = name
                # Copy so later changes to self.results do not alter the summary.
                best_speed_metrics = dict(speed_metrics)

        print("\nüèÜ BEST MODELS SELECTED:")
        print(f"  ‚û§ Model: {self.best_speed_name}")
        # Inner subscripts use single quotes: reusing the enclosing double
        # quote inside an f-string is a SyntaxError before Python 3.12.
        if best_speed_metrics is not None:
            print(f"      R¬≤:   {best_speed_metrics['r2']:.4f}")
            print(f"      RMSE: {best_speed_metrics['rmse']:.4f}")
            print(f"      MAE:  {best_speed_metrics['mae']:.4f}")
            print(f"      MSE:  {best_speed_metrics['mse']:.4f}")

        return self.results

    # ============================================================
    # Model Selection Function
    # ============================================================
    def compute_model_score(self, r2, rmse, mae, mse,
                            w_r2=0.5, w_rmse=0.2, w_mae=0.2, w_mse=0.1):
        """
        Combine four regression metrics into one score; higher is better.

        ``r2`` enters directly. Each error metric ``e`` is mapped through
        ``1 / (1 + e)`` so that a smaller error gives a value closer to 1.
        The four terms are then summed with the given weights.

        Args:
            r2: Coefficient of determination.
            rmse: Root mean squared error.
            mae: Mean absolute error.
            mse: Mean squared error.
            w_r2, w_rmse, w_mae, w_mse: Weights of the respective terms.

        Returns:
            The weighted sum as a float.
        """
        # Map the error metrics into a range comparable to r2.
        rmse_norm = 1 / (1 + rmse)
        mae_norm = 1 / (1 + mae)
        mse_norm = 1 / (1 + mse)

        score = (w_r2 * r2) + \
                (w_rmse * rmse_norm) + \
                (w_mae * mae_norm) + \
                (w_mse * mse_norm)

        return score

    # ============================================================
    # PREDICT SPEED WITH THE BEST MODEL
    # ============================================================
    def predict_sequence(self, X):
        """
        Predict speed for the rows of ``X`` with the selected best model.

        ``X`` is scaled with the scaler fitted in ``train_models``. The
        prediction vector is returned as a single column via
        ``np.column_stack``.
        """
        X_scaled = self.scaler.transform(X)
        pred_speed = self.best_speed_model.predict(X_scaled)
        return np.column_stack([pred_speed])

    # ============================================================
    # RESULTS TABLE
    # ============================================================
    def build_results_table(self):
        """
        Build and show a summary table of the speed metrics of every model.

        The table has one row per trained model, sorted by ``Speed_R2`` in
        descending order. It is rendered with ``display`` when that is
        available (IPython/Jupyter) and printed as plain text otherwise.

        Returns:
            pandas.DataFrame with columns Model, Speed_R2, Speed_RMSE,
            Speed_MAE and Speed_MSE. Empty when no model has been trained.
        """
        columns = ["Model", "Speed_R2", "Speed_RMSE", "Speed_MAE", "Speed_MSE"]

        rows = []
        for model_name, content in self.results.items():
            speed = content["metrics"]["speed"]
            rows.append({
                "Model": model_name,
                "Speed_R2": speed["r2"],
                "Speed_RMSE": speed["rmse"],
                "Speed_MAE": speed["mae"],
                "Speed_MSE": speed["mse"],
            })

        # Explicit columns keep the sort below valid when there are no rows.
        df_results = pd.DataFrame(rows, columns=columns)

        # Best model (highest speed R2) first
        df_results = df_results.sort_values("Speed_R2", ascending=False).reset_index(drop=True)

        num_cols = df_results.select_dtypes(include=["number"]).columns
        try:
            display(df_results.style.format("{:.4f}", subset=num_cols))
        except (NameError, ImportError):
            # display() exists only inside IPython, and DataFrame.style needs
            # an optional dependency; fall back to a plain-text table.
            print(df_results.round(4).to_string(index=False))

        return df_results



### Speed Train Models

In [None]:
# Create X, y
target_cols = ['speed_mps']

# Model inputs; this order is the column order of X.
# NOTE(review): traffic_level is computed from the same row's speed_mps,
# which is also the target here; confirm this is intended.
feature_cols = [
    # raw signals
    'enhanced_altitude',
    'bearing',
    # lagged speed
    'speed_mps_prev1',
    'speed_mps_prev2',
    # movement between consecutive rows
    'delta_dist',
    'delta_lat',
    'delta_lon',
    # derived context
    'elev_gain_m',
    'traffic_level',
    'heading_change',
    'turn_count',
]

y = df.loc[:, target_cols].values
X = df.loc[:, feature_cols].values

In [None]:
# Split data
# shuffle=False keeps row order: the leading 80% of rows become the training
# set and the trailing 20% the test set.
print("üìä Splitting data into train/test sets...")
split_kwargs = dict(test_size=0.2, random_state=42, shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

print(f"  ‚úÖ Training samples: {len(X_train)}")
print(f"  ‚úÖ Testing samples: {len(X_test)}")

# Fit every candidate model and keep the per-model results.
predictor = SpeedAccelerationPredictor()
results = predictor.train_models(
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)

üìä Splitting data into train/test sets...
  ‚úÖ Training samples: 700
  ‚úÖ Testing samples: 176
ü§ñ TRAINING SEPARATE MODELS FOR SPEED & ACCELERATION

üîÑ Training model: Random Forest

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9999
      RMSE: 0.0193
      MAE:  0.0140
      MSE:  0.0004

üîÑ Training model: Gradient Boosting

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9997
      RMSE: 0.0341
      MAE:  0.0260
      MSE:  0.0012

üîÑ Training model: Ridge Regression

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9811
      RMSE: 0.2732
      MAE:  0.2066
      MSE:  0.0747

üîÑ Training model: Lasso Regression

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9852
      RMSE: 0.2413
      MAE:  0.1693
      MSE:  0.0582

üîÑ Training model: SVR

üìä Results:
  ‚û§ SPEED:
      R¬≤:   0.9958
      RMSE: 0.1291
      MAE:  0.0958
      MSE:  0.0167

üèÜ BEST MODELS SELECTED:
  ‚û§ Model: Random Forest
      R¬≤:   0.9999
      RMSE: 0.0193
      MAE:  0.0140
      MSE:  0.0004


In [None]:
# Show the per-model speed metrics (sorted by Speed_R2, best first) and keep
# the table as a DataFrame.
df_results = predictor.build_results_table()

Unnamed: 0,Model,Speed_R2,Speed_RMSE,Speed_MAE,Speed_MSE
0,Random Forest,0.9999,0.0193,0.014,0.0004
1,Gradient Boosting,0.9997,0.0341,0.026,0.0012
2,SVR,0.9958,0.1291,0.0958,0.0167
3,Lasso Regression,0.9852,0.2413,0.1693,0.0582
4,Ridge Regression,0.9811,0.2732,0.2066,0.0747


### Speed Prediction

In [None]:
# Run the selected speed model over every row of X (training and test rows
# alike), using the scaler that was fitted during training.
speed_scaler, speed_model = predictor.scaler, predictor.best_speed_model

X_scaled_full = speed_scaler.transform(X)
predicted_speed = speed_model.predict(X_scaled_full)


In [None]:
# Store the model's speed prediction alongside the source rows.
df['predicted_speed'] = predicted_speed


### Acceleration Prediction

In [None]:
# Predicted acceleration: predicted speed minus the previous row's measured speed.
# NOTE(review): the difference is not divided by a time step, so it equals
# m/s^2 only if consecutive rows are one second apart; TODO confirm.
df['predicted_accel'] = df['predicted_speed'].sub(df['speed_mps_prev1'])

In [None]:
# Score the derived acceleration on held-out rows only. The train/test split
# was made with shuffle=False, so the test rows are exactly the rows after
# the first len(X_train); rows the model was fitted on would otherwise
# inflate the reported accuracy.
test_start = len(X_train)

y_true = df['acceleration'].iloc[test_start:]
y_pred = df['predicted_accel'].iloc[test_start:]

r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)

print("üìä ACCURACY OF ACCELERATION (from speed diff):")
print(f"   R¬≤:   {r2:.4f}")
print(f"   MAE:  {mae:.4f}")
print(f"   RMSE: {rmse:.4f}")
print(f"   MSE:  {mse:.4f}")


üìä ACCURACY OF ACCELERATION (from speed diff):
   R¬≤:   0.9929
   MAE:  0.0151
   RMSE: 0.0504
   MSE:  0.0025


In [None]:
# Ordered input column names saved with the model artifact. This must list
# the same columns, in the same order, as the matrix the model was fitted on.
feature_cols = [
    'enhanced_altitude',
    'bearing',
    'speed_mps_prev1',
    'speed_mps_prev2',
    'delta_dist',
    'delta_lat',
    'delta_lon',
    'elev_gain_m',
    'traffic_level',
    'heading_change',
    'turn_count',
]


In [None]:
import pickle

# Bundle what is needed to reproduce predictions: the fitted scaler, the
# selected model with its name, and the ordered input column names.
artifact = dict(
    scaler=predictor.scaler,
    speed_model=predictor.best_speed_model,
    speed_model_name=predictor.best_speed_name,
    feature_cols=feature_cols,
)

# NOTE: unpickling can execute arbitrary code, so this file should only ever
# be loaded from a trusted location.
with fs.open(OUTPUT_ML_MODEL_PATH, mode="wb") as model_file:
    pickle.dump(artifact, model_file)

print("ML Model saved to:", OUTPUT_ML_MODEL_PATH)
