# 04_tfl_disruptions_backfill_fast.ipynb

Latency-focused TfL disruptions backfill:
- Uses `point` (string like `"[lon,lat]"`) instead of `geometry`
- Vectorized distance matching (no nested loops)
- Aggregates to hourly features per `(point_id, tfl_time_utc)`


In [25]:
import os
import ast
import numpy as np
import pandas as pd
import requests
import hopsworks

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)


In [26]:
# ================= CONFIG =================
METADATA_FG_NAME = os.getenv("METADATA_FG_NAME", "traffic_points_metadata")
METADATA_FG_VERSION = int(os.getenv("METADATA_FG_VERSION", "1"))

TRAFFIC_FG_NAME = os.getenv("TRAFFIC_FG_NAME", "traffic_flow_fg")
TRAFFIC_FG_VERSION = int(os.getenv("TRAFFIC_FG_VERSION", "1"))

TFL_FG_NAME = os.getenv("TFL_FG_NAME", "tfl_disruptions_hourly_fg")
TFL_FG_VERSION = int(os.getenv("TFL_FG_VERSION", "1"))

TFL_BASE_URL = "https://api.tfl.gov.uk"
TFL_APP_ID = os.getenv("TFL_APP_ID", "")
TFL_APP_KEY = os.getenv("TFL_APP_KEY", "")

TFL_START_DATE = os.getenv("TFL_START_DATE", "2026-01-04")  # YYYY-MM-DD
TFL_END_DATE = os.getenv("TFL_END_DATE", "2026-01-09")      # YYYY-MM-DD

RADIUS_KM = float(os.getenv("TFL_RADIUS_KM", "0.5"))
MAX_BACKFILL_DAYS = int(os.getenv("TFL_MAX_BACKFILL_DAYS", "14"))


In [27]:
# ============== CONNECT TO HOPSWORKS ==============
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",
    project="London_traffic"
)
fs = project.get_feature_store()


2026-01-10 22:05:38,387 INFO: Closing external client and cleaning up certificates.
2026-01-10 22:05:38,409 INFO: Connection closed.
2026-01-10 22:05:38,414 INFO: Initializing external client
2026-01-10 22:05:38,414 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-10 22:05:39,514 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3209


In [28]:
# ============== READ MONITORING POINTS ==============
meta_fg = fs.get_feature_group(name=METADATA_FG_NAME, version=METADATA_FG_VERSION)
points_df = meta_fg.read()

required = ["point_id", "latitude", "longitude"]
missing = [c for c in required if c not in points_df.columns]
if missing:
    raise ValueError(f"Missing metadata columns: {missing}")

points_df = points_df[required].copy()
points_df["point_id"] = points_df["point_id"].astype(str)
points_df["latitude"] = pd.to_numeric(points_df["latitude"], errors="coerce")
points_df["longitude"] = pd.to_numeric(points_df["longitude"], errors="coerce")
points_df = points_df.dropna(subset=["latitude", "longitude"]).drop_duplicates("point_id")

print("Monitoring points:", len(points_df))


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.21s) 
Monitoring points: 200


In [29]:
# ============== DATE RANGE ==============
def infer_date_range_from_traffic(max_days: int) -> (str, str):
    traffic_fg = fs.get_feature_group(name=TRAFFIC_FG_NAME, version=TRAFFIC_FG_VERSION)
    tdf = traffic_fg.read()
    if "ts_10m" not in tdf.columns:
        raise ValueError("Traffic FG must contain 'ts_10m' to infer date range.")
    tdf["ts_10m"] = pd.to_datetime(tdf["ts_10m"], utc=True, errors="coerce")
    tdf = tdf.dropna(subset=["ts_10m"])

    end = tdf["ts_10m"].max().normalize()
    start = tdf["ts_10m"].min().normalize()
    if (end - start).days > max_days:
        start = end - pd.Timedelta(days=max_days)

    return start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")

if TFL_START_DATE and TFL_END_DATE:
    start_date, end_date = TFL_START_DATE, TFL_END_DATE
else:
    start_date, end_date = infer_date_range_from_traffic(MAX_BACKFILL_DAYS)

print("TfL backfill range:", start_date, "->", end_date)


TfL backfill range: 2026-01-04 -> 2026-01-09


In [30]:
# ============== FETCH TFL DISRUPTIONS ==============
def fetch_tfl_disruptions(start_date: str, end_date: str) -> list:
    url = f"{TFL_BASE_URL}/Road/all/Disruption"
    params = {"startDate": start_date, "endDate": end_date}
    if TFL_APP_ID:
        params["app_id"] = TFL_APP_ID
    if TFL_APP_KEY:
        params["app_key"] = TFL_APP_KEY

    r = requests.get(url, params=params, timeout=60)
    r.raise_for_status()
    return r.json()

raw = fetch_tfl_disruptions(start_date, end_date)
raw_df = pd.DataFrame(raw)

print("Raw disruptions:", len(raw_df))
raw_df.head()


Raw disruptions: 70


Unnamed: 0,$type,id,url,point,severity,ordinal,category,subCategory,comments,currentUpdate,currentUpdateDateTime,corridorIds,startDateTime,endDateTime,lastModifiedTime,levelOfInterest,location,status,geography,isProvisional,hasClosures,roadDisruptionLines,roadDisruptionImpactAreas,recurringSchedules,geometry,streets
0,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-222381,/Road/All/Disruption/TIMS-222381,"[-0.236916,51.53247]",Serious,1,Works,Utility works,Harrow Road - [A404] Harrow Road (Both directi...,Traffic is slow moving on all approaches to th...,2026-01-10T19:07:39Z,[],2026-01-07T14:00:00Z,2026-01-14T20:00:00Z,2026-01-10T19:07:39Z,High,"[A404] HARROW ROAD (NW10 ) (Brent,Hammersmith ...",Active,"{'type': 'Point', 'coordinates': [-0.236916, 5...",False,False,[],[],[],,
1,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-206772,/Road/All/Disruption/TIMS-206772,"[0.212921,51.592491]",Moderate,2,Works,TfL works,Gallows Corner Flyover Refurbishment - [A12] E...,"Use an alternative route. There are, on averag...",2026-01-10T19:22:20Z,[a12],2025-03-15T21:00:00Z,2026-03-31T21:59:00Z,2026-01-10T19:22:26Z,High,"[A12] EASTERN AVENUE EAST (RM2 ,RM3 ) (Havering)",Active,"{'type': 'Point', 'coordinates': [0.212921, 51...",False,False,[],[],[],"{'type': 'MultiPolygon', 'coordinates': [[[[0....",[{'$type': 'Tfl.Api.Presentation.Entities.Stre...
2,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-222355,/Road/All/Disruption/TIMS-222355,"[-0.243992,51.452782]",Moderate,3,Works,Utility works,[A306] Roehampton Lane (Both directions) at th...,Delays are possible.,2026-01-10T09:49:52Z,[a205],2026-01-07T08:00:00Z,2026-01-12T23:59:00Z,2026-01-10T09:49:52Z,High,[A306] ROEHAMPTON LANE (SW15 ) (Wandsworth),Active,"{'type': 'Point', 'coordinates': [-0.243992, 5...",False,False,[],[],[],"{'type': 'MultiPolygon', 'coordinates': [[[[-0...",[{'$type': 'Tfl.Api.Presentation.Entities.Stre...
3,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-222317,/Road/All/Disruption/TIMS-222317,"[-0.241337,51.45007]",Moderate,4,Works,Utility works,[A306] Roehampton Lane (Both directions) at th...,Delays are possible.,2026-01-10T10:11:55Z,[a205],2026-01-06T01:00:00Z,2026-01-19T23:59:00Z,2026-01-10T10:11:55Z,High,[A306] ROEHAMPTON LANE (SW15 ) (Wandsworth),Active,"{'type': 'Point', 'coordinates': [-0.241337, 5...",False,False,[],[],[],,
4,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-213893,/Road/All/Disruption/TIMS-213893,"[-0.125602,51.440164]",Moderate,5,Works,TfL works,[A23] Streatham Hill (Both directions) between...,The traffic is flowing well past the works.,2026-01-10T20:49:28Z,[a23],2025-08-03T21:00:00Z,2026-07-06T05:00:00Z,2026-01-10T20:49:28Z,High,"[A23] STREATHAM HILL (SW16 ,SW2 ) (Lambeth)",Active,"{'type': 'Point', 'coordinates': [-0.125602, 5...",False,False,[],[],[],"{'type': 'Polygon', 'coordinates': [[[-0.12807...",[{'$type': 'Tfl.Api.Presentation.Entities.Stre...


In [31]:
# ============== PARSE LOCATION (POINT STRING) ==============
def parse_point_string(s):
    # Expected format: "[lon,lat]"
    if isinstance(s, str):
        try:
            v = ast.literal_eval(s)
            if isinstance(v, (list, tuple)) and len(v) == 2:
                lon, lat = float(v[0]), float(v[1])
                return lat, lon
        except Exception:
            return np.nan, np.nan
    return np.nan, np.nan

needed_cols = ["id", "category", "subCategory", "severity", "ordinal", "status", "startDateTime", "endDateTime", "point"]
for c in needed_cols:
    if c not in raw_df.columns:
        raw_df[c] = np.nan

raw_df["startDateTime"] = pd.to_datetime(raw_df["startDateTime"], utc=True, errors="coerce")
raw_df["endDateTime"] = pd.to_datetime(raw_df["endDateTime"], utc=True, errors="coerce")
raw_df = raw_df.dropna(subset=["startDateTime"]).copy()

raw_df[["lat", "lon"]] = raw_df["point"].apply(lambda s: pd.Series(parse_point_string(s)))
raw_df = raw_df.dropna(subset=["lat", "lon"]).copy()

print("Disruptions with lat/lon:", len(raw_df))
raw_df[["id", "lat", "lon", "category", "ordinal", "status"]].head()


Disruptions with lat/lon: 70


Unnamed: 0,id,lat,lon,category,ordinal,status
0,TIMS-222381,51.53247,-0.236916,Works,1,Active
1,TIMS-206772,51.592491,0.212921,Works,2,Active
2,TIMS-222355,51.452782,-0.243992,Works,3,Active
3,TIMS-222317,51.45007,-0.241337,Works,4,Active
4,TIMS-213893,51.440164,-0.125602,Works,5,Active


In [32]:
# ============== VECTORIZED SPATIAL MATCHING ==============
def haversine_km_matrix(lat1, lon1, lat2, lon2):
    R = 6371.0
    lat1 = np.radians(lat1)[:, None]
    lon1 = np.radians(lon1)[:, None]
    lat2 = np.radians(lat2)[None, :]
    lon2 = np.radians(lon2)[None, :]

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2.0 * np.arcsin(np.sqrt(a))
    return R * c

d_lat = raw_df["lat"].to_numpy(dtype=float)
d_lon = raw_df["lon"].to_numpy(dtype=float)

p_lat = points_df["latitude"].to_numpy(dtype=float)
p_lon = points_df["longitude"].to_numpy(dtype=float)
p_ids = points_df["point_id"].to_numpy(dtype=str)

dist = haversine_km_matrix(d_lat, d_lon, p_lat, p_lon)
d_idx, p_idx = np.where(dist <= RADIUS_KM)

links = pd.DataFrame({
    "point_id": p_ids[p_idx],
    "category": raw_df.iloc[d_idx]["category"].to_numpy(),
    "ordinal": pd.to_numeric(raw_df.iloc[d_idx]["ordinal"], errors="coerce").fillna(0).astype(int).to_numpy(),
    "status": raw_df.iloc[d_idx]["status"].to_numpy(),
    "start": raw_df.iloc[d_idx]["startDateTime"].to_numpy(),
    "end": raw_df.iloc[d_idx]["endDateTime"].to_numpy(),
})

print("Point-disruption links:", len(links))
links.head()


Point-disruption links: 21


Unnamed: 0,point_id,category,ordinal,status,start,end
0,8618,Works,7,Active,2024-07-08 07:00:00+00:00,2026-04-30 19:00:00+00:00
1,38269,Works,7,Active,2024-07-08 07:00:00+00:00,2026-04-30 19:00:00+00:00
2,38368,Works,8,Active,2025-09-04 13:21:00+00:00,2026-02-28 23:59:00+00:00
3,17567,Works,9,Active,2025-12-05 20:45:00+00:00,2026-06-15 14:00:00+00:00
4,16202,Works,15,Recurring Works,2026-01-05 10:00:00+00:00,2026-01-05 14:00:00+00:00


In [33]:
# ============== HOURLY EXPANSION + AGGREGATION ==============
if len(links) == 0:
    agg_df = pd.DataFrame(columns=[
        "point_id", "tfl_time_utc",
        "disruption_count", "is_works", "is_incident", "is_active",
        "max_ordinal"
    ])
else:
    links = links.copy()
    links["end_filled"] = links["end"]
    links.loc[pd.isna(links["end_filled"]), "end_filled"] = links["start"] + np.timedelta64(1, "h")

    links["start_hour"] = pd.to_datetime(links["start"], utc=True).dt.floor("H")
    links["end_hour"] = pd.to_datetime(links["end_filled"], utc=True).dt.floor("H")

    links["hours"] = links.apply(lambda r: pd.date_range(r["start_hour"], r["end_hour"], freq="H"), axis=1)
    hourly = links.explode("hours").rename(columns={"hours": "tfl_time_utc"}).copy()

    hourly["disruption_count"] = 1
    hourly["is_works"] = (hourly["category"] == "Works").astype(int)
    hourly["is_incident"] = (hourly["category"] == "Incident").astype(int)
    hourly["is_active"] = (hourly["status"].astype(str).str.lower() == "active").astype(int)
    hourly["max_ordinal"] = pd.to_numeric(hourly["ordinal"], errors="coerce").fillna(0).astype(int)

    agg_df = (
        hourly.groupby(["point_id", "tfl_time_utc"], as_index=False)
        .agg(
            disruption_count=("disruption_count", "sum"),
            is_works=("is_works", "max"),
            is_incident=("is_incident", "max"),
            is_active=("is_active", "max"),
            max_ordinal=("max_ordinal", "max"),
        )
    )

print("Hourly TfL features shape:", agg_df.shape)
agg_df.head()


Hourly TfL features shape: (52081, 7)




Unnamed: 0,point_id,tfl_time_utc,disruption_count,is_works,is_incident,is_active,max_ordinal
0,16202,2026-01-05 10:00:00+00:00,1,1,0,0,15
1,16202,2026-01-05 11:00:00+00:00,1,1,0,0,15
2,16202,2026-01-05 12:00:00+00:00,1,1,0,0,15
3,16202,2026-01-05 13:00:00+00:00,1,1,0,0,15
4,16202,2026-01-05 14:00:00+00:00,1,1,0,0,15


In [34]:
# ============== WRITE FEATURE GROUP ==============
tfl_fg = fs.get_or_create_feature_group(
    name=TFL_FG_NAME,
    version=TFL_FG_VERSION,
    primary_key=["point_id", "tfl_time_utc"],
    event_time="tfl_time_utc",
    description="Hourly TfL road disruption features (backfilled, low-latency point-based matching)."
)

tfl_fg.insert(
    agg_df
)

print(f" TfL disruptions backfill written to {TFL_FG_NAME} v{TFL_FG_VERSION}")


Uploading Dataframe: 100.00% |██████████| Rows 52081/52081 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: tfl_disruptions_hourly_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/3209/jobs/named/tfl_disruptions_hourly_fg_1_offline_fg_materialization/executions
 TfL disruptions backfill written to tfl_disruptions_hourly_fg v1


In [35]:
agg_df.head()

Unnamed: 0,point_id,tfl_time_utc,disruption_count,is_works,is_incident,is_active,max_ordinal
0,16202,2026-01-05 10:00:00+00:00,1,1,0,0,15
1,16202,2026-01-05 11:00:00+00:00,1,1,0,0,15
2,16202,2026-01-05 12:00:00+00:00,1,1,0,0,15
3,16202,2026-01-05 13:00:00+00:00,1,1,0,0,15
4,16202,2026-01-05 14:00:00+00:00,1,1,0,0,15
