# 02_feature_engineering_backfill.ipynb

This notebook performs a **historical backfill** of engineered features from the raw traffic Feature Group in Hopsworks.

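Steps:
- Read the full historical data from the raw Feature Group
- Validate, clean, and de-duplicate rows on (`point_id`, `ts_10m`)
- Generate temporal, traffic, and per-point rolling features
- Optionally join static monitoring-point metadata (`DO_METADATA_JOIN=1`)
- Write the engineered dataset to a new temporal Feature Group
- Export the engineered dataset to `data/processed/traffic_flow_data_engineered.csv`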

This notebook should be run **once** (or whenever feature logic changes).


In [90]:
import os
import pandas as pd
import numpy as np
import hopsworks

# Notebook display settings: show every column and widen the text repr.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 200),
):
    pd.set_option(option_name, option_value)


In [91]:
# ================= CONFIG =================
# Settings read below fall back to the shown default when the environment
# variable of the same name is not set.
_env = os.environ.get

# Source Feature Group holding the raw traffic observations.
RAW_FG_NAME = _env("RAW_FG_NAME", "traffic_flow_fg")
RAW_FG_VERSION = int(_env("RAW_FG_VERSION", "1"))

# Destination Feature Group for the engineered features.
ENGINEERED_FG_NAME = _env("ENGINEERED_FG_NAME", "traffic_temporal_fg")
ENGINEERED_FG_VERSION = int(_env("ENGINEERED_FG_VERSION", "1"))

# Optional metadata Feature Group; the join only runs when the flag is "1".
METADATA_FG_NAME = _env("METADATA_FG_NAME", "monitoring_points_fg")
METADATA_FG_VERSION = int(_env("METADATA_FG_VERSION", "1"))
DO_METADATA_JOIN = _env("DO_METADATA_JOIN", "0") == "1"

# Rolling window sizes, counted in rows per point.
ROLL_WINDOWS = [3, 6, 12]  # assuming 10-min resolution
# Observations with confidence strictly below this value get flagged.
LOW_CONF_THRESHOLD = float(_env("LOW_CONF_THRESHOLD", "0.7"))


In [92]:
# ============== CONNECT TO HOPSWORKS ==============
# Host and project are overridable through the environment, like the other
# settings in this notebook; the defaults are the values used so far.
HOPSWORKS_HOST = os.getenv("HOPSWORKS_HOST", "eu-west.cloud.hopsworks.ai")
HOPSWORKS_PROJECT = os.getenv("HOPSWORKS_PROJECT", "London_traffic")

project = hopsworks.login(
    host=HOPSWORKS_HOST,
    project=HOPSWORKS_PROJECT,
)
# Feature store handle used by every read/write cell below.
fs = project.get_feature_store()


2026-01-11 11:52:32,171 INFO: Closing external client and cleaning up certificates.
2026-01-11 11:52:32,185 INFO: Connection closed.
2026-01-11 11:52:32,186 INFO: Initializing external client
2026-01-11 11:52:32,186 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 11:52:32,836 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3209


In [93]:
# ============== READ RAW FEATURE GROUP ==============
# Look up the configured raw Feature Group and load all of it into a DataFrame.
raw_fg = fs.get_feature_group(
    version=RAW_FG_VERSION,
    name=RAW_FG_NAME,
)
df = raw_fg.read()

# Quick look at what came back: dimensions first, then the first rows.
print("Raw shape:", df.shape)
df.head()


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.37s) 
Raw shape: (26057, 12)


Unnamed: 0,timestamp_utc,point_id,frc,current_speed,free_flow_speed,current_travel_time,free_flow_travel_time,confidence,road_closure,ts_10m,speed_ratio,delay_seconds
0,2026-01-05 04:06:27.356994+00:00,37825,FRC2,24.0,24.0,209.0,209.0,1.0,False,2026-01-05 04:00:00+00:00,1.0,0.0
1,2026-01-06 06:29:30.303904+00:00,46818,FRC3,27.0,27.0,142.0,142.0,1.0,False,2026-01-06 06:20:00+00:00,1.0,0.0
2,2026-01-06 22:47:48.292950+00:00,17687,FRC3,30.0,30.0,136.0,136.0,1.0,False,2026-01-06 22:40:00+00:00,1.0,0.0
3,2026-01-07 03:50:03.489327+00:00,17524,FRC3,34.0,34.0,316.0,316.0,1.0,False,2026-01-07 03:50:00+00:00,1.0,0.0
4,2026-01-06 10:32:59.405332+00:00,38572,FRC3,23.0,23.0,60.0,60.0,1.0,False,2026-01-06 10:30:00+00:00,1.0,0.0


In [94]:
# ============== SANITY CHECKS ==============
# Columns every later cell relies on. `delay_seconds` is included because the
# rolling-feature cell aggregates it; checking it here fails fast with a clear
# message instead of a KeyError further down.
required_cols = [
    "ts_10m", "timestamp_utc", "point_id",
    "current_speed", "free_flow_speed",
    "current_travel_time", "free_flow_travel_time",
    "confidence", "road_closure",
    "delay_seconds",
]

missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Normalise both time columns to tz-aware UTC; unparseable values become NaT.
df["ts_10m"] = pd.to_datetime(df["ts_10m"], utc=True, errors="coerce")
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True, errors="coerce")

# Rows without a usable primary key cannot be written to the Feature Group.
df = df.dropna(subset=["point_id", "ts_10m"])

# Order rows inside each (point_id, ts_10m) bucket by observation time so that
# keep="last" deterministically retains the most recent reading. NaT goes
# first so a row with an unparseable timestamp never wins over a valid one.
df = df.sort_values(["point_id", "ts_10m", "timestamp_utc"], na_position="first")
df = df.drop_duplicates(subset=["point_id", "ts_10m"], keep="last")

print("After cleaning shape:", df.shape)


After cleaning shape: (26057, 12)


In [95]:
# ============== TEMPORAL FEATURES ==============
# All calendar features are derived from the 10-minute bucket timestamp.
bucket_ts = df["ts_10m"]

# weekday: Monday=0 ... Sunday=6, so Saturday/Sunday are the values >= 5.
df["day_of_week"] = bucket_ts.dt.weekday
df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)

df["hour"] = bucket_ts.dt.hour
df["minute"] = bucket_ts.dt.minute

# Rush hour covers the hours 07-09 and 16-18 inclusive.
# NOTE(review): hours come straight from ts_10m (parsed as UTC earlier in the
# notebook) - confirm that is the intended clock for peak hours.
df["is_rush_hour"] = df["hour"].isin([7, 8, 9, 16, 17, 18]).astype(int)

def time_interval(hour: int) -> str:
    """Map an hour of day to a coarse time-of-day label.

    Returns "morning_peak" for 7 <= hour < 10, "midday" for 10 <= hour < 16,
    "evening_peak" for 16 <= hour < 19, and "night" for anything else
    (including values that fail every comparison, such as NaN).
    """
    if 7 <= hour < 19:
        # Daytime: pick the sub-band by its upper bound.
        if hour < 10:
            return "morning_peak"
        if hour < 16:
            return "midday"
        return "evening_peak"
    return "night"

# One-hot encode the time-of-day label with a fixed category list, so all four
# `ti_*` columns are produced even when some interval is absent from the data
# (otherwise the output schema would depend on which hours were loaded).
# Labels are listed alphabetically, the order get_dummies uses for plain
# string columns, so the column order is unchanged.
TIME_INTERVAL_LABELS = ["evening_peak", "midday", "morning_peak", "night"]
df["time_interval"] = pd.Categorical(
    df["hour"].apply(time_interval),
    categories=TIME_INTERVAL_LABELS,
)
df = pd.get_dummies(df, columns=["time_interval"], prefix="ti")


In [96]:
# ============== TRAFFIC FEATURES ==============
# Small constant that keeps the ratio finite when the free-flow time is zero.
eps = 1e-6

# Positive values mean traffic is moving slower than free flow.
df["speed_diff"] = df["free_flow_speed"].sub(df["current_speed"])

# Ratio of current to free-flow travel time, bounded to the range [0, 10].
df["travel_time_ratio"] = (
    df["current_travel_time"]
    .div(df["free_flow_travel_time"] + eps)
    .clip(lower=0, upper=10)
)

# 1 when the reported confidence is strictly below the configured threshold.
df["low_confidence_flag"] = df["confidence"].lt(LOW_CONF_THRESHOLD).astype(int)


In [97]:
# ============== ROLLING FEATURES (PER POINT) ==============
# Rolling windows are row-based, so rows must be in time order inside each point.
df = df.sort_values(["point_id", "ts_10m"])

for w in ROLL_WINDOWS:
    # One trailing window per series, reused for every statistic.
    # min_periods=1 lets the first rows of each point produce a value from
    # however many observations are available so far.
    speed_windows = df.groupby("point_id")["current_speed"].rolling(window=w, min_periods=1)
    delay_windows = df.groupby("point_id")["delay_seconds"].rolling(window=w, min_periods=1)

    # Dropping the point_id index level restores the original row index so the
    # result aligns with df on assignment.
    df[f"speed_roll_mean_{w}"] = speed_windows.mean().reset_index(level=0, drop=True)

    # A single-observation window has no standard deviation; report 0.0 there.
    df[f"speed_roll_std_{w}"] = (
        speed_windows.std().reset_index(level=0, drop=True).fillna(0.0)
    )

    df[f"delay_roll_mean_{w}"] = delay_windows.mean().reset_index(level=0, drop=True)


In [98]:
# ============== OPTIONAL METADATA JOIN ==============
# Left-joins static per-point metadata onto the engineered rows. Runs only
# when DO_METADATA_JOIN is enabled.
if DO_METADATA_JOIN:
    meta_fg = fs.get_feature_group(name=METADATA_FG_NAME, version=METADATA_FG_VERSION)
    meta_df = meta_fg.read()

    # Without the key there is nothing to join on; fail with a clear message.
    if "point_id" not in meta_df.columns:
        raise ValueError(
            f"Metadata feature group '{METADATA_FG_NAME}' has no 'point_id' column"
        )

    # Keep only useful, stable metadata features
    meta_cols = [
        "point_id",
        "road_type",
        "road_category",
        "link_length_km",
        "latitude",
        "longitude"
    ]
    meta_cols = [c for c in meta_cols if c in meta_df.columns]
    meta_df = meta_df[meta_cols].copy()

    # Ensure unique key in metadata
    meta_df["point_id"] = meta_df["point_id"].astype(str)
    meta_df = meta_df.drop_duplicates(subset=["point_id"], keep="last")

    # Join on a temporary string copy of the key so both sides share a dtype
    # while df["point_id"] keeps its original dtype. Casting the real column
    # would make the primary-key type depend on whether this cell ran.
    join_key = "_point_id_key"
    df = (
        df.assign(**{join_key: df["point_id"].astype(str)})
          .merge(
              meta_df.rename(columns={"point_id": join_key}),
              on=join_key,
              how="left",
              validate="many_to_one",  # metadata must never fan out rows
          )
          .drop(columns=join_key)
    )

    print("After metadata join shape:", df.shape)



In [99]:
# ============== FINAL QA ==============
# The Feature Group is keyed on (point_id, ts_10m); stop before writing if any
# key occurs more than once.
pk_dup_mask = df.duplicated(subset=["point_id", "ts_10m"])
dup = pk_dup_mask.sum()
print("Duplicate PK rows:", dup)
if dup:
    raise ValueError("Primary key duplicates detected")

print("Final shape:", df.shape)

Duplicate PK rows: 0
Final shape: (26057, 33)


In [100]:
# ============== WRITE FEATURE GROUP ==============
# Definition of the destination Feature Group: keyed on (point_id, ts_10m),
# with ts_10m as the event time.
engineered_fg_spec = dict(
    name=ENGINEERED_FG_NAME,
    version=ENGINEERED_FG_VERSION,
    primary_key=["point_id", "ts_10m"],
    event_time="ts_10m",
    description="Backfilled engineered temporal traffic features",
)
engineered_fg = fs.get_or_create_feature_group(**engineered_fg_spec)

# NOTE(review): the recorded output shows a materialization job being launched
# right before the message below - confirm whether insert() waits for that job
# to finish before treating the message as proof the backfill is complete.
engineered_fg.insert(df)

print("Backfill completed successfully")


Uploading Dataframe: 100.00% |██████████| Rows 26057/26057 | Elapsed Time: 00:04 | Remaining Time: 00:00


Launching job: traffic_temporal_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/3209/jobs/named/traffic_temporal_fg_1_offline_fg_materialization/executions
Backfill completed successfully


In [101]:
# df to csv
# Keep a local copy of the engineered dataset. The target directory is created
# first so the export also works on a fresh checkout where it does not exist.
ENGINEERED_CSV_PATH = "data/processed/traffic_flow_data_engineered.csv"
os.makedirs(os.path.dirname(ENGINEERED_CSV_PATH), exist_ok=True)
df.to_csv(ENGINEERED_CSV_PATH, index=False)