# 05_label_generation_backfill.ipynb

This notebook generates **future labels** from the historical traffic feature group.

Labels created (per point, per 10-min timestamp):
- `label_speed_ratio_t_plus_30`  (t + 30 minutes)
- `label_speed_ratio_t_plus_60`  (t + 60 minutes)

Notes:
- Labels are stored in a **separate Feature Group** to avoid leakage and keep online features clean.
- This notebook is for **offline backfill** (training). It does not run as an online inference job.


In [12]:
import os
import pandas as pd
import numpy as np
import hopsworks

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 220)


In [13]:
# ================= CONFIG =================
TRAFFIC_FG_NAME = os.getenv("TRAFFIC_FG_NAME", "traffic_temporal_fg")
TRAFFIC_FG_VERSION = int(os.getenv("TRAFFIC_FG_VERSION", "1"))

POINT_ID_COL = os.getenv("POINT_ID_COL", "point_id")
TIME_COL = os.getenv("TIME_COL", "ts_10m")
TARGET_COL = os.getenv("TARGET_COL", "speed_ratio")

# 10-min resolution horizons
HORIZON_30_STEPS = int(os.getenv("HORIZON_30_STEPS", "3"))   # 30m
HORIZON_60_STEPS = int(os.getenv("HORIZON_60_STEPS", "6"))   # 60m

LABEL_FG_NAME = os.getenv("LABEL_FG_NAME", "labels_speed_ratio_fg")
LABEL_FG_VERSION = int(os.getenv("LABEL_FG_VERSION", "1"))

OVERWRITE = os.getenv("LABEL_OVERWRITE", "true").lower() == "true"


In [14]:
# ============== CONNECT TO HOPSWORKS ==============
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",
    project="London_traffic"
)
fs = project.get_feature_store()


2026-01-10 22:10:33,449 INFO: Closing external client and cleaning up certificates.


2026-01-10 22:10:33,511 INFO: Connection closed.
2026-01-10 22:10:33,520 INFO: Initializing external client
2026-01-10 22:10:33,520 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-10 22:10:34,670 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3209


In [15]:
# ============== READ SOURCE FEATURES ==============
traffic_fg = fs.get_feature_group(name=TRAFFIC_FG_NAME, version=TRAFFIC_FG_VERSION)
df = traffic_fg.read()

needed = [POINT_ID_COL, TIME_COL, TARGET_COL]
missing = [c for c in needed if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns in source FG: {missing}")

df[POINT_ID_COL] = df[POINT_ID_COL].astype(str)
df[TIME_COL] = pd.to_datetime(df[TIME_COL], utc=True, errors="coerce")
df[TARGET_COL] = pd.to_numeric(df[TARGET_COL], errors="coerce")

df = df.dropna(subset=[POINT_ID_COL, TIME_COL, TARGET_COL]).copy()

print("Source shape:", df.shape)
print("Time range:", df[TIME_COL].min(), "->", df[TIME_COL].max())
df.head()


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.77s) 
Source shape: (23807, 33)
Time range: 2026-01-04 21:00:00+00:00 -> 2026-01-10 19:40:00+00:00


Unnamed: 0,timestamp_utc,point_id,frc,current_speed,free_flow_speed,current_travel_time,free_flow_travel_time,confidence,road_closure,ts_10m,speed_ratio,delay_seconds,day_of_week,is_weekend,hour,minute,is_rush_hour,ti_evening_peak,ti_midday,ti_morning_peak,ti_night,speed_diff,travel_time_ratio,low_confidence_flag,speed_roll_mean_3,speed_roll_std_3,delay_roll_mean_3,speed_roll_mean_6,speed_roll_std_6,delay_roll_mean_6,speed_roll_mean_12,speed_roll_std_12,delay_roll_mean_12
0,2026-01-05 04:06:27.356994+00:00,37825,FRC2,24.0,24.0,209.0,209.0,1.0,False,2026-01-05 04:00:00+00:00,1.0,0.0,0,0,4,0,0,False,False,False,True,0.0,1.0,0,24.0,0.0,0.0,24.0,0.0,0.0,23.833333,0.57735,0.833333
1,2026-01-06 06:29:30.303904+00:00,46818,FRC3,27.0,27.0,142.0,142.0,1.0,False,2026-01-06 06:20:00+00:00,1.0,0.0,1,0,6,20,0,False,False,False,True,0.0,1.0,0,27.0,0.0,0.0,27.0,0.0,0.0,27.0,0.0,0.0
2,2026-01-06 22:47:48.292950+00:00,17687,FRC3,30.0,30.0,136.0,136.0,1.0,False,2026-01-06 22:40:00+00:00,1.0,0.0,1,0,22,40,0,False,False,False,True,0.0,1.0,0,30.0,0.0,0.0,30.0,0.0,0.0,30.0,0.0,0.0
3,2026-01-07 03:50:03.489327+00:00,17524,FRC3,34.0,34.0,316.0,316.0,1.0,False,2026-01-07 03:50:00+00:00,1.0,0.0,2,0,3,50,0,False,False,False,True,0.0,1.0,0,34.0,0.0,0.0,34.0,0.0,0.0,34.0,0.0,0.0
4,2026-01-06 10:32:59.405332+00:00,38572,FRC3,23.0,23.0,60.0,60.0,1.0,False,2026-01-06 10:30:00+00:00,1.0,0.0,1,0,10,30,0,False,True,False,False,0.0,1.0,0,16.666667,6.027714,30.666667,18.166667,4.535049,21.0,18.25,3.980064,19.416667


In [16]:
# ============== DUPLICATE KEYS CHECK ==============
dup = df.duplicated(subset=[POINT_ID_COL, TIME_COL]).sum()
print("Duplicate (point_id, ts_10m) rows:", dup)

if dup > 0:
    print("Duplicates detected. Aggregating by mean for target column.")
    df = (
        df.sort_values([POINT_ID_COL, TIME_COL])
          .groupby([POINT_ID_COL, TIME_COL], as_index=False)[TARGET_COL].mean()
    )
    print("After dedup shape:", df.shape)


Duplicate (point_id, ts_10m) rows: 0


In [17]:
# ============== OPTIONAL: CHECK 10-MIN REGULARITY ==============
df = df.sort_values([POINT_ID_COL, TIME_COL]).copy()
delta = df.groupby(POINT_ID_COL)[TIME_COL].diff()

# note: includes first row per point as NaT in delta (counts as non-10m here)
gap_rate = (delta != pd.Timedelta(minutes=10)).mean()
print("Fraction of non-10min deltas (incl. first row per point):", gap_rate)

gaps = df.loc[delta > pd.Timedelta(minutes=10), [POINT_ID_COL, TIME_COL]].head(10)
print("Example gaps (>10m):")
print(gaps)


Fraction of non-10min deltas (incl. first row per point): 0.06120048725164867
Example gaps (>10m):
      point_id                    ts_10m
17624    16435 2026-01-05 10:10:00+00:00
9378     16435 2026-01-05 13:30:00+00:00
11898    16435 2026-01-05 23:20:00+00:00
18482    16435 2026-01-06 00:20:00+00:00
9622     16435 2026-01-06 00:40:00+00:00
12288    16435 2026-01-06 01:10:00+00:00
17543    16435 2026-01-06 02:30:00+00:00
8062     16435 2026-01-06 03:20:00+00:00
10747    16435 2026-01-06 04:20:00+00:00
16276    16435 2026-01-06 04:40:00+00:00


In [18]:
# ============== CREATE LABELS ==============
df["label_speed_ratio_t_plus_30"] = df.groupby(POINT_ID_COL)[TARGET_COL].shift(-HORIZON_30_STEPS)
df["label_speed_ratio_t_plus_60"] = df.groupby(POINT_ID_COL)[TARGET_COL].shift(-HORIZON_60_STEPS)

labels_df = df[[POINT_ID_COL, TIME_COL, "label_speed_ratio_t_plus_30", "label_speed_ratio_t_plus_60"]].copy()

before = len(labels_df)
labels_df = labels_df.dropna(subset=["label_speed_ratio_t_plus_30", "label_speed_ratio_t_plus_60"]).copy()
after = len(labels_df)

print("Labels before dropna:", before)
print("Labels after dropna:", after)
labels_df.head()


Labels before dropna: 23807
Labels after dropna: 23468


Unnamed: 0,point_id,ts_10m,label_speed_ratio_t_plus_30,label_speed_ratio_t_plus_60
17329,16435,2026-01-04 21:00:00+00:00,0.941176,0.944444
14022,16435,2026-01-04 21:10:00+00:00,0.777778,1.0
4923,16435,2026-01-04 21:20:00+00:00,0.944444,1.0
11399,16435,2026-01-04 21:30:00+00:00,0.944444,0.944444
3518,16435,2026-01-04 21:40:00+00:00,1.0,0.888889


In [19]:
# ============== LABEL SANITY CHECKS ==============
for col in ["label_speed_ratio_t_plus_30", "label_speed_ratio_t_plus_60"]:
    s = pd.to_numeric(labels_df[col], errors="coerce")
    print("\n", col)
    print("min/max:", float(s.min()), float(s.max()))
    print("<=0 count:", int((s <= 0).sum()))
    print(">2 count:", int((s > 2).sum()))

dup_labels = labels_df.duplicated(subset=[POINT_ID_COL, TIME_COL]).sum()
print("\nDuplicate keys in labels:", dup_labels)
if dup_labels > 0:
    raise ValueError("Labels DF has duplicate PK rows; investigate upstream duplicates.")



 label_speed_ratio_t_plus_30
min/max: 0.05714285714285714 1.0
<=0 count: 0
>2 count: 0

 label_speed_ratio_t_plus_60
min/max: 0.05714285714285714 1.0
<=0 count: 0
>2 count: 0

Duplicate keys in labels: 0


In [20]:
# ============== WRITE LABEL FEATURE GROUP ==============
label_fg = fs.get_or_create_feature_group(
    name=LABEL_FG_NAME,
    version=LABEL_FG_VERSION,
    primary_key=[POINT_ID_COL, TIME_COL],
    event_time=TIME_COL,
    description=f"Future labels for {TARGET_COL}: t+30 and t+60 minutes (from {TRAFFIC_FG_NAME} v{TRAFFIC_FG_VERSION})."
)

label_fg.insert(
    labels_df
)

print(f"Labels written to {LABEL_FG_NAME} v{LABEL_FG_VERSION} (overwrite={OVERWRITE})")


Uploading Dataframe: 100.00% |██████████| Rows 23468/23468 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: labels_speed_ratio_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/3209/jobs/named/labels_speed_ratio_fg_1_offline_fg_materialization/executions
Labels written to labels_speed_ratio_fg v1 (overwrite=True)
