# **Build Feature Engineering Script for GUI Inference (XGBoost)**

This notebook creates a reusable `feature_engineer.py` module for our Tkinter GUI.

Goal:
- Accept raw inputs (sensor_data, driver_data, safety_labels)
- Engineer trip-level features (1 row per bookingID)
- Match the exact feature schema used by the XGBoost model (`xgboost_feature_cols.json`)
- Support both batch inference (all trips) and single-trip inference (select bookingID)

Important:
- The GUI will call this script before sending features to the saved scaler + XGBoost model.

---

In [1]:
# ============================================================
# this cell writes feature_engineer.py into the notebook's current working directory
# (run the notebook from Sprint 2/Tkinter so the file lands at Sprint 2/Tkinter/feature_engineer.py)
# ============================================================

from pathlib import Path

FEATURE_ENGINEER_PY = r'''
import numpy as np
import pandas as pd

# ============================================================
# thresholds follow the Feature_Engineering notebook
# ============================================================
THRESHOLDS = {
    # Central table of cut-offs shared by the clipping step and the per-trip
    # event counters in engineer_features_from_raw_tables.

    # acceleration thresholds (m/s^2)
    "harsh_acceleration": 4.5,        # acceleration_x above this counts as a harsh-acceleration sample
    "harsh_braking": -5.5,            # acceleration_x below this counts as a harsh-braking sample
    "max_acceleration_cap": 10.0,     # acceleration_x / acceleration_y are clipped to +/- this value
    "max_acceleration_z_cap": 12.0,   # acceleration_z is clipped to +/- this value

    # speed thresholds (m/s)
    "speeding_limit": 33.3,      # 120 km/h
    "high_speed": 25.0,          # used with harsh_braking for harsh_decel_at_high_speed_count
    "max_speed_cap": 50.0,       # upper clip bound for speed (lower bound defaults to 0)

    # gyro thresholds (rad/s)
    "sharp_turn": 2.0,           # |gyro_z| above this counts as a sharp-turn sample
    "gyro_stability": 0.5,       # a sample is "stable" when |gyro_x|, |gyro_y| and |gyro_z| are all below this
    "gyro_peak_height": 1.5,     # |gyro_z| above this counts towards gyro_z_peak_count
    "max_gyro_cap": 4.0,         # gyro_x / gyro_y / gyro_z are clipped to +/- this value

    # gps accuracy threshold (m)
    "low_gps_accuracy": 30.0,    # accuracy above this feeds phone_distraction_count

    # rolling windows (rows, since sensor is per-second mostly)
    "rolling_window_5s": 5,
    "rolling_window_10s": 10,
}

# ============================================================
# helpers
# ============================================================
def _coerce_bool_label(x):
    # supports true/false, 0/1, yes/no
    if pd.isna(x):
        return np.nan
    s = str(x).strip().lower()
    if s in {"true", "1", "yes", "y"}:
        return 1
    if s in {"false", "0", "no", "n"}:
        return 0
    try:
        return int(float(s))
    except Exception:
        return np.nan


def _clip_sensor_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with sensor columns coerced to numeric and range-limited.

    Speed is clipped to [speed_lower (default 0), max_speed_cap]; the
    acceleration and gyro axes are clipped symmetrically around zero using the
    caps in THRESHOLDS. accuracy, bearing and second are only coerced to
    numeric. Columns that are absent are skipped and non-numeric values become
    NaN. The input frame is not modified.
    """
    cleaned = df.copy()

    accel_cap = THRESHOLDS["max_acceleration_cap"]
    accel_z_cap = THRESHOLDS["max_acceleration_z_cap"]
    gyro_cap = THRESHOLDS["max_gyro_cap"]

    # column -> (lower bound, upper bound)
    clip_bounds = {
        "speed": (THRESHOLDS.get("speed_lower", 0), THRESHOLDS["max_speed_cap"]),
        "acceleration_x": (-accel_cap, accel_cap),
        "acceleration_y": (-accel_cap, accel_cap),
        "acceleration_z": (-accel_z_cap, accel_z_cap),
        "gyro_x": (-gyro_cap, gyro_cap),
        "gyro_y": (-gyro_cap, gyro_cap),
        "gyro_z": (-gyro_cap, gyro_cap),
    }
    for column, (lower, upper) in clip_bounds.items():
        if column not in cleaned.columns:
            continue
        numeric = pd.to_numeric(cleaned[column], errors="coerce")
        cleaned[column] = numeric.clip(lower, upper)

    # these are made numeric but keep their full range
    for column in ("accuracy", "bearing", "second"):
        if column in cleaned.columns:
            cleaned[column] = pd.to_numeric(cleaned[column], errors="coerce")

    return cleaned


def _rms(x: np.ndarray) -> float:
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(x * x)))


def _skewness(x: np.ndarray) -> float:
    x = np.asarray(x, dtype=float)
    if x.size < 3:
        return 0.0
    m = np.mean(x)
    s = np.std(x) + 1e-12
    return float(np.mean(((x - m) / s) ** 3))


def _kurtosis(x: np.ndarray) -> float:
    x = np.asarray(x, dtype=float)
    if x.size < 4:
        return 0.0
    m = np.mean(x)
    s = np.std(x) + 1e-12
    return float(np.mean(((x - m) / s) ** 4) - 3.0)


def _mean_change(x: np.ndarray) -> float:
    x = np.asarray(x, dtype=float)
    if x.size < 2:
        return 0.0
    return float(np.mean(np.abs(np.diff(x))))


def _max_change(x: np.ndarray) -> float:
    x = np.asarray(x, dtype=float)
    if x.size < 2:
        return 0.0
    return float(np.max(np.abs(np.diff(x))))


def _count_above_mean(x: np.ndarray) -> int:
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return 0
    m = np.mean(x)
    return int(np.sum(x > m))


def _count_below_mean(x: np.ndarray) -> int:
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return 0
    m = np.mean(x)
    return int(np.sum(x < m))


def _ts_stats(prefix: str, x: np.ndarray, include_median: bool, include_abs_max: bool, include_rms: bool) -> dict:
    """Summary statistics of one signal, keyed as ``<prefix>__<stat>``.

    Always emitted, in this order: mean, std, min, max, range, q25, q75, iqr,
    skewness, kurtosis, count_above_mean, count_below_mean, mean_change,
    max_change. median, abs_max and rms are appended (in that order) only when
    the matching flag is true. An empty signal is treated as a single 0.0.
    """
    series = np.asarray(x, dtype=float)
    if series.size == 0:
        series = np.zeros(1, dtype=float)

    lowest = np.min(series)
    highest = np.max(series)
    lower_quartile = float(np.quantile(series, 0.25))
    upper_quartile = float(np.quantile(series, 0.75))

    # order matters: it becomes the column order of the engineered frame
    base_stats = [
        ("mean", float(np.mean(series))),
        ("std", float(np.std(series))),
        ("min", float(lowest)),
        ("max", float(highest)),
        ("range", float(highest - lowest)),
        ("q25", lower_quartile),
        ("q75", upper_quartile),
        ("iqr", float(upper_quartile - lower_quartile)),
        ("skewness", _skewness(series)),
        ("kurtosis", _kurtosis(series)),
        ("count_above_mean", _count_above_mean(series)),
        ("count_below_mean", _count_below_mean(series)),
        ("mean_change", _mean_change(series)),
        ("max_change", _max_change(series)),
    ]
    stats = {f"{prefix}__{name}": value for name, value in base_stats}

    if include_median:
        stats[f"{prefix}__median"] = float(np.median(series))
    if include_abs_max:
        stats[f"{prefix}__abs_max"] = float(np.max(np.abs(series)))
    if include_rms:
        stats[f"{prefix}__rms"] = _rms(series)

    return stats


def _build_vehicle_key(driver_df: pd.DataFrame) -> pd.DataFrame:
    # replicate ETL idea: build a stable surrogate key from car_make + car_model_year
    df = driver_df.copy()

    if "car_make" not in df.columns or "car_model_year" not in df.columns:
        df["vehicle_key"] = 0
        return df

    df["car_make"] = df["car_make"].astype(str)
    df["car_model_year"] = pd.to_numeric(df["car_model_year"], errors="coerce").fillna(0).astype(int)

    # stable mapping across the provided file
    unique = (
        df[["car_make", "car_model_year"]]
        .drop_duplicates()
        .assign(vehicle_id_natural=lambda d: d["car_make"] + "_" + d["car_model_year"].astype(str))
        .sort_values("vehicle_id_natural")
        .reset_index(drop=True)
    )
    unique["vehicle_key"] = np.arange(1, len(unique) + 1)

    df = df.merge(unique[["car_make", "car_model_year", "vehicle_key"]], on=["car_make", "car_model_year"], how="left")
    df["vehicle_key"] = df["vehicle_key"].fillna(0).astype(float)

    return df


def engineer_features_from_raw_tables(
    sensor_df: pd.DataFrame,
    driver_df: pd.DataFrame,
    safety_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    main public API for GUI: build one engineered feature row per bookingID

    inputs:
    - sensor_df: raw sensor table. must include bookingID, speed,
      acceleration_x/y/z and gyro_x/y/z. accuracy, bearing and second are
      optional.
    - driver_df: raw driver table (must include id)
    - safety_df: raw safety labels (must include bookingID and driver_id).
      label is optional for inference.

    output:
    - engineered_df: one row per bookingID found in sensor_df, containing the
      per-trip features, the trip's driver_id / vehicle_key (plus label and
      selected driver columns when available) and the driver history features.

    raises:
    - ValueError: a required column is missing, or sensor_df has no usable rows.

    The inputs are copied; the caller's frames are not modified.
    """

    # -------------------------------
    # normalize column names
    # -------------------------------
    sensor_df = sensor_df.copy()
    driver_df = driver_df.copy()
    safety_df = safety_df.copy()

    sensor_df.columns = [c.strip() for c in sensor_df.columns]
    driver_df.columns = [c.strip() for c in driver_df.columns]
    safety_df.columns = [c.strip() for c in safety_df.columns]

    # -------------------------------
    # validate required columns up front
    # -------------------------------
    if "bookingID" not in sensor_df.columns:
        raise ValueError("sensor_data must include bookingID column")

    # these signals are read unconditionally in the per-trip loop; report them
    # here instead of failing with a bare KeyError half way through
    required_signals = [
        "speed",
        "acceleration_x", "acceleration_y", "acceleration_z",
        "gyro_x", "gyro_y", "gyro_z",
    ]
    missing_signals = [c for c in required_signals if c not in sensor_df.columns]
    if missing_signals:
        raise ValueError(
            "sensor_data is missing required columns: " + ", ".join(missing_signals)
        )

    # ensure safety contains mapping
    if "bookingID" not in safety_df.columns or "driver_id" not in safety_df.columns:
        raise ValueError("safety_labels must include bookingID and driver_id columns")

    # coerce label (optional at inference time)
    has_label = "label" in safety_df.columns
    if has_label:
        safety_df["label"] = safety_df["label"].apply(_coerce_bool_label)

    # driver id key
    if "id" not in driver_df.columns:
        raise ValueError("driver_data must include id column")

    # build vehicle_key
    driver_df = _build_vehicle_key(driver_df)

    # -------------------------------
    # merge driver info into safety
    # -------------------------------
    safety_driver = safety_df.merge(
        driver_df,
        left_on="driver_id",
        right_on="id",
        how="left",
        suffixes=("", "_driver"),
    )

    # -------------------------------
    # sensor preprocessing
    # -------------------------------
    sensor_df = _clip_sensor_values(sensor_df)

    # fill missing numeric with 0 (gui assumption: no nulls, but we still guard)
    for c in ["accuracy","bearing","second","speed","acceleration_x","acceleration_y","acceleration_z","gyro_x","gyro_y","gyro_z"]:
        if c in sensor_df.columns:
            sensor_df[c] = pd.to_numeric(sensor_df[c], errors="coerce").fillna(0.0)

    has_second = "second" in sensor_df.columns
    has_accuracy = "accuracy" in sensor_df.columns

    # sort within trip
    if has_second:
        sensor_df = sensor_df.sort_values(["bookingID", "second"])

    # loop invariants
    w5 = THRESHOLDS["rolling_window_5s"]
    w10 = THRESHOLDS["rolling_window_10s"]
    harsh_accel_thr = THRESHOLDS["harsh_acceleration"]
    harsh_brake_thr = THRESHOLDS["harsh_braking"]
    stability_thr = THRESHOLDS["gyro_stability"]

    # -------------------------------
    # per-trip aggregation
    # -------------------------------
    rows = []
    for bid, g in sensor_df.groupby("bookingID"):
        n = len(g)  # groupby never yields an empty group, so n >= 1

        speed = g["speed"]
        accel_x = g["acceleration_x"]
        gyro_z = g["gyro_z"]

        # time and distance
        if has_second:
            tmin = float(g["second"].min())
            tmax = float(g["second"].max())
            trip_duration_sec = float(max(0.0, tmax - tmin))
        else:
            # no timestamps: fall back to the row count
            trip_duration_sec = float(n)

        # distance estimate: assume ~1 second step
        # distance(m) ~ sum(speed) * 1
        total_distance_km = float(speed.sum() / 1000.0)

        avg_gps_accuracy = float(g["accuracy"].mean()) if has_accuracy else 0.0

        # accel magnitude
        ax = accel_x.to_numpy()
        ay = g["acceleration_y"].to_numpy()
        az = g["acceleration_z"].to_numpy()
        accel_mag = np.sqrt(ax*ax + ay*ay + az*az)

        # gyro magnitude
        gx = g["gyro_x"].to_numpy()
        gy = g["gyro_y"].to_numpy()
        gz = gyro_z.to_numpy()
        gyro_mag = np.sqrt(gx*gx + gy*gy + gz*gz)

        # events
        harsh_acceleration_count = int(np.sum(ax > harsh_accel_thr))
        harsh_braking_count = int(np.sum(ax < harsh_brake_thr))
        sharp_turn_count = int(np.sum(np.abs(gz) > THRESHOLDS["sharp_turn"]))
        speeding_event_count = int(np.sum(speed > THRESHOLDS["speeding_limit"]))

        # phone distraction proxy: poor gps quality count
        phone_distraction_count = (
            int(np.sum(g["accuracy"] > THRESHOLDS["low_gps_accuracy"])) if has_accuracy else 0
        )

        avg_acceleration_magnitude = float(np.mean(accel_mag))
        max_acceleration_magnitude = float(np.max(accel_mag))

        # rolling features
        # pandas std uses ddof=1, so a one-row trip has no defined rolling std
        # (every window is NaN); report 0.0 to keep the no-nulls guarantee
        if n > 1:
            speed_rolling_std_5s = float(speed.rolling(w5, min_periods=1).std().mean())
        else:
            speed_rolling_std_5s = 0.0
        accel_x_rolling_max_10s = float(accel_x.rolling(w10, min_periods=1).max().mean())
        gyro_z_window = gyro_z.rolling(w5, min_periods=1)
        gyro_z_rolling_range_5s = float((gyro_z_window.max() - gyro_z_window.min()).mean())
        accel_magnitude_rolling_mean_5s = float(pd.Series(accel_mag).rolling(w5, min_periods=1).mean().mean())

        # speed change rate
        speed_change_rate = float(_mean_change(speed.to_numpy()))

        # phase features: split the trip into three consecutive parts; the
        # last part absorbs the remainder
        third = max(1, n // 3)
        phase1 = g.iloc[:third]
        phase2 = g.iloc[third:2*third]
        phase3 = g.iloc[2*third:]

        accel_x_first_third_mean = float(phase1["acceleration_x"].mean()) if len(phase1) else 0.0
        # sample std (ddof=1) needs at least two rows, otherwise pandas returns NaN
        speed_last_third_std = float(phase3["speed"].std()) if len(phase3) > 1 else 0.0

        middle_harsh_braking = int(np.sum(phase2["acceleration_x"] < harsh_brake_thr)) if len(phase2) else 0
        # max(1, ...) avoids dividing by zero when the trip has no harsh braking
        harsh_braking_middle_third_ratio = float(middle_harsh_braking / max(1, harsh_braking_count))

        # jerk features (derivative of acceleration)
        # prepend the first sample so the jerk series keeps length n and starts at 0
        jerk_x = np.diff(ax, prepend=ax[0])
        jerk_y = np.diff(ay, prepend=ay[0])
        jerk_z = np.diff(az, prepend=az[0])
        jerk_mag = np.sqrt(jerk_x*jerk_x + jerk_y*jerk_y + jerk_z*jerk_z)

        jerk_x_mean = float(np.mean(jerk_x))
        jerk_y_max = float(np.max(jerk_y))
        jerk_z_std = float(np.std(jerk_z))
        jerk_magnitude_std = float(np.std(jerk_mag))

        # gyro features
        gyro_total_rotation = float(np.sum(np.abs(gx)) + np.sum(np.abs(gy)) + np.sum(np.abs(gz)))
        gyro_magnitude_max = float(np.max(gyro_mag))
        gyro_stability_ratio = float(np.mean(
            (np.abs(gx) < stability_thr) &
            (np.abs(gy) < stability_thr) &
            (np.abs(gz) < stability_thr)
        ))

        # simple peak count proxy
        gyro_z_peak_count = int(np.sum(np.abs(gz) > THRESHOLDS["gyro_peak_height"]))

        # interaction features
        speed_accel_product = float(np.mean(speed.to_numpy() * accel_mag))
        harsh_decel_at_high_speed_count = int(np.sum(
            (accel_x < harsh_brake_thr) &
            (speed > THRESHOLDS["high_speed"])
        ))
        # epsilon keeps stationary trips (mean speed 0) finite
        accel_variance_normalized_by_speed = float(np.var(accel_mag) / (np.mean(speed) + 1e-6))

        # core row
        row = {
            "bookingID": bid,
            "trip_duration_sec": trip_duration_sec,
            "total_distance_km": total_distance_km,
            "avg_gps_accuracy": avg_gps_accuracy,
            "harsh_acceleration_count": harsh_acceleration_count,
            "harsh_braking_count": harsh_braking_count,
            "sharp_turn_count": sharp_turn_count,
            "speeding_event_count": speeding_event_count,
            "phone_distraction_count": phone_distraction_count,
            "avg_acceleration_magnitude": avg_acceleration_magnitude,
            "max_acceleration_magnitude": max_acceleration_magnitude,
            "speed_rolling_std_5s": speed_rolling_std_5s,
            "accel_x_rolling_max_10s": accel_x_rolling_max_10s,
            "gyro_z_rolling_range_5s": gyro_z_rolling_range_5s,
            "speed_change_rate": speed_change_rate,
            "accel_magnitude_rolling_mean_5s": accel_magnitude_rolling_mean_5s,
            "accel_x_first_third_mean": accel_x_first_third_mean,
            "speed_last_third_std": speed_last_third_std,
            "harsh_braking_middle_third_ratio": harsh_braking_middle_third_ratio,
            "jerk_x_mean": jerk_x_mean,
            "jerk_y_max": jerk_y_max,
            "jerk_z_std": jerk_z_std,
            "jerk_magnitude_std": jerk_magnitude_std,
            "gyro_total_rotation": gyro_total_rotation,
            "gyro_z_peak_count": gyro_z_peak_count,
            "gyro_stability_ratio": gyro_stability_ratio,
            "gyro_magnitude_max": gyro_magnitude_max,
            "speed_accel_product": speed_accel_product,
            "harsh_decel_at_high_speed_count": harsh_decel_at_high_speed_count,
            "accel_variance_normalized_by_speed": accel_variance_normalized_by_speed,
        }

        # tsfresh-like stats (only the ones used by model feature list)
        # speed
        row.update(_ts_stats("speed", speed.to_numpy(), include_median=True, include_abs_max=False, include_rms=False))
        # accel
        row.update(_ts_stats("acceleration_x", ax, include_median=False, include_abs_max=True, include_rms=True))
        row.update(_ts_stats("acceleration_y", ay, include_median=False, include_abs_max=True, include_rms=True))
        row.update(_ts_stats("acceleration_z", az, include_median=False, include_abs_max=True, include_rms=True))
        # gyro: all three axes get median and abs_max; rms is emitted for
        # gyro_y and gyro_z only (gyro_x__rms is not produced)
        row.update(_ts_stats("gyro_x", gx, include_median=True, include_abs_max=True, include_rms=False))
        row.update(_ts_stats("gyro_y", gy, include_median=True, include_abs_max=True, include_rms=True))
        row.update(_ts_stats("gyro_z", gz, include_median=True, include_abs_max=True, include_rms=True))

        rows.append(row)

    if not rows:
        # an empty frame has no bookingID column, so the merge below would
        # fail with an unhelpful KeyError
        raise ValueError("sensor_data contains no rows with a bookingID")

    engineered = pd.DataFrame(rows)

    # -------------------------------
    # attach safety + driver columns needed by model
    # -------------------------------
    # keep only minimal driver columns needed for history features and vehicle_key
    keep_driver_cols = ["bookingID", "driver_id"]
    if has_label:
        # label is optional at inference time; only select it when provided
        keep_driver_cols.append("label")
    keep_driver_cols.append("vehicle_key")
    for optional_col in ("no_of_years_driving_exp", "rating", "date_of_birth"):
        if optional_col in safety_driver.columns:
            keep_driver_cols.append(optional_col)

    trip_meta = safety_driver[keep_driver_cols].drop_duplicates("bookingID")
    engineered = engineered.merge(trip_meta, on="bookingID", how="left")

    # trips without a matching safety/driver row get vehicle_key 0
    engineered["vehicle_key"] = pd.to_numeric(engineered["vehicle_key"], errors="coerce").fillna(0.0)

    # -------------------------------
    # driver history features
    # computed using the uploaded dataset (not magic)
    # -------------------------------
    by_driver = engineered.groupby("driver_id")

    # driver average harsh accel from this dataset
    engineered["driver_avg_harsh_accel_historical"] = (
        by_driver["harsh_acceleration_count"].transform("mean").fillna(0.0)
    )

    if has_label:
        # NOTE(review): this mean includes the trip's own label, so the feature
        # carries the target when labels are supplied - confirm this matches
        # how the feature was built for training
        engineered["driver_dangerous_trip_rate_historical"] = (
            by_driver["label"].transform("mean").fillna(0.0)
        )
    else:
        engineered["driver_dangerous_trip_rate_historical"] = 0.0

    # deviation from driver norm (use avg speed mean)
    driver_speed_mean = by_driver["speed__mean"].transform("mean")
    # std is NaN for drivers with a single trip; treat it as 0 spread
    driver_speed_std = by_driver["speed__mean"].transform("std").fillna(0.0)
    engineered["trip_deviation_from_driver_norm"] = (
        (engineered["speed__mean"] - driver_speed_mean).abs() / (driver_speed_std + 1e-6)
    ).fillna(0.0)

    return engineered


def load_raw_csvs(sensor_path: str, driver_path: str, safety_path: str):
    """Read the three raw CSV files and return (sensor_df, driver_df, safety_df)."""
    sensor_df, driver_df, safety_df = (
        pd.read_csv(csv_path)
        for csv_path in (sensor_path, driver_path, safety_path)
    )
    return sensor_df, driver_df, safety_df
'''

# Writes the module into the current working directory; run the notebook from
# the folder that should hold feature_engineer.py, or point out_path elsewhere.
out_path = Path.cwd() / "feature_engineer.py"
out_path.write_text(FEATURE_ENGINEER_PY, encoding="utf-8")
print("wrote:", out_path.resolve())

wrote: C:\PAI-GoBest-Project\Sprint 2\Tkinter\feature_engineer.py


# **Quick Test: Build Engineered Features from the 3 Raw CSVs**

This section validates that the feature engineering script produces 1 row per bookingID
and includes the key engineered columns needed by the XGBoost model.

---

In [2]:
import json
import pandas as pd
from feature_engineer import load_raw_csvs, engineer_features_from_raw_tables

# update these paths to your raw csvs
SENSOR_PATH = "sensor_data.csv"
DRIVER_PATH = "driver_data.csv"
SAFETY_PATH = "safety_labels.csv"

sensor_df, driver_df, safety_df = load_raw_csvs(SENSOR_PATH, DRIVER_PATH, SAFETY_PATH)

engineered = engineer_features_from_raw_tables(sensor_df, driver_df, safety_df)
print(engineered.shape)
print(engineered.head(3))

# check feature coverage using your model feature list
# (the context manager closes the file handle instead of leaving it open)
with open("xgboost_feature_cols.json", "r", encoding="utf-8") as fh:
    feature_cols = json.load(fh)

# columns the engineered frame is allowed to contain: ids, label, model features
expected_cols = {"bookingID", "driver_id", "label", *feature_cols}
missing = [c for c in feature_cols if c not in engineered.columns]
extra = [c for c in engineered.columns if c not in expected_cols]

print("missing required:", len(missing))
print(missing[:20])
print("extra cols:", len(extra))
print(extra[:20])

FileNotFoundError: [Errno 2] No such file or directory: 'sensor_data.csv'