In [41]:
import os
import json
from pathlib import Path

import pandas as pd
import numpy as np
import requests


In [42]:
import hopsworks

# ----------------------------
# 1) Load CSV
# ----------------------------
# Reads the monitoring-point metadata, validates it, and leaves the cleaned
# frame in `df` (one row per point_id) for the cells below.
CSV_PATH = "data/interim/monitoring_points_metadata.csv"
df = pd.read_csv(CSV_PATH)

# Basic required columns
required = {"point_id", "latitude", "longitude"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns in CSV: {missing}")

# Null check runs BEFORE the casts: astype(str) can turn a missing point_id
# into the literal string "nan", which isna() would no longer flag.
if df[["point_id", "latitude", "longitude"]].isna().any().any():
    raise ValueError("Nulls found in required columns (point_id, latitude, longitude).")

# Clean + types
df["point_id"] = df["point_id"].astype(str)
df["latitude"] = df["latitude"].astype(float)
df["longitude"] = df["longitude"].astype(float)

# Optional: keep only a curated set if you want
# (comment out if you prefer to keep all columns in the CSV)
curated_cols = [
    "point_id",
    "latitude",
    "longitude",
    "local_authority_id",
    "road_category",
    "road_type",
    "region_id",
    "road_name",
    "link_length_km",
]
# Columns absent from the CSV are silently skipped rather than raising.
curated_cols = [c for c in curated_cols if c in df.columns]
df = df[curated_cols].copy()

# Validations
if df["point_id"].duplicated().any():
    dupes = df[df["point_id"].duplicated(keep=False)].sort_values("point_id")
    raise ValueError(f"Duplicated point_id found. Example rows:\n{dupes.head(10)}")

if not df["latitude"].between(-90, 90).all():
    raise ValueError("Latitude out of range [-90, 90].")

if not df["longitude"].between(-180, 180).all():
    raise ValueError("Longitude out of range [-180, 180].")

print("Loaded metadata rows:", len(df))
display(df.head())

# ----------------------------
# 2) Login to Hopsworks
# ----------------------------

# Open a session on the hosted project, then take a handle on its feature store.
project = hopsworks.login(project="London_traffic", host="eu-west.cloud.hopsworks.ai")
fs = project.get_feature_store()

# ----------------------------
# 3) Create / Get Feature Group
# ----------------------------
FG_NAME = "traffic_points_metadata"
FG_VERSION = 1
FG_DESCRIPTION = "Static metadata for traffic monitoring points (lat/lon + road/LA context)."

primary_key = ["point_id"]

fg = fs.get_or_create_feature_group(
    name=FG_NAME,
    version=FG_VERSION,
    description=FG_DESCRIPTION,
    primary_key=primary_key,
)
# This call may return an already-existing group, and nothing is inserted
# here, so the message must not claim that a group was created.
print(f"Feature Group handle ready (fetched, or created on first insert): {FG_NAME} v{FG_VERSION}")




Loaded metadata rows: 200


Unnamed: 0,point_id,latitude,longitude,local_authority_id,road_category,road_type,road_name,link_length_km
0,18526,51.529441,0.088977,168,PA,Major,A13,2.1
1,7518,51.544222,0.147921,168,PA,Major,A1240,2.8
2,6797,51.367362,0.045676,176,PA,Major,A232,1.0
3,36807,51.48211,0.101649,105,PA,Major,A209,1.6
4,28510,51.501415,-0.0758,103,PA,Major,A200,0.3


2026-01-11 14:58:29,348 INFO: Closing external client and cleaning up certificates.
2026-01-11 14:58:29,349 INFO: Connection closed.
2026-01-11 14:58:29,350 INFO: Initializing external client
2026-01-11 14:58:29,350 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 14:58:30,451 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3209
Feature Group created: traffic_points_metadata v1


In [43]:
# TomTom Traffic API v4 - Flow Segment Data. URL template:
# https://api.tomtom.com/traffic/services/{version}/flowSegmentData/{style}/{zoom}/{format}?point=lat,lon&key=API_KEY
# The style / zoom / format path segments are set by the FLOW_* constants below.

# The key comes from the environment only; stop early when it is unset or empty.
TOMTOM_API_KEY = os.environ.get("TOMTOM_API_KEY")
if TOMTOM_API_KEY is None or TOMTOM_API_KEY == "":
    raise EnvironmentError("Missing TOMTOM_API_KEY env var. Set it in your environment (or in Hopsworks/Spaces secrets).")

FLOW_STYLE = "relative0"  # relative flow style
FLOW_ZOOM = 10            # zoom level used in the request path
FLOW_FORMAT = "json"      # response format
TOMTOM_BASE = f"https://api.tomtom.com/traffic/services/4/flowSegmentData/{FLOW_STYLE}/{FLOW_ZOOM}/{FLOW_FORMAT}"

# On-disk cache of fetched geometries so points are not re-requested.
CACHE_PATH = Path("data/interim/point_geometry_cache.parquet")


In [44]:
def fetch_tomtom_flowsegment(lat: float, lon: float, api_key: str, timeout: int = 20) -> dict:
    """Request flow-segment data for one point and return the decoded JSON body.

    Args:
        lat: Latitude sent as the first half of the ``point`` query parameter.
        lon: Longitude sent as the second half of the ``point`` query parameter.
        api_key: TomTom API key, sent as the ``key`` query parameter.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The response body parsed by ``Response.json()``.

    Raises:
        requests.RequestException: On transport errors or a non-2xx status.
            The same exception class that ``requests`` raised is re-raised,
            with the API key removed from its message.
    """
    from urllib.parse import quote_plus  # local import: only used for redaction

    params = {
        "point": f"{lat},{lon}",
        "key": api_key,
        # Optional: unit / thickness / openLr (keep simple)
    }
    try:
        r = requests.get(TOMTOM_BASE, params=params, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as exc:
        # requests embeds the full request URL (including `key=...`) in its
        # error text; callers print that text, so scrub the secret first.
        safe_message = str(exc)
        for secret in (api_key, quote_plus(api_key or "")):
            if secret:
                safe_message = safe_message.replace(secret, "<redacted>")
        raise type(exc)(safe_message, request=exc.request, response=exc.response) from None
    return r.json()

def flowsegment_to_geojson_linestring(payload: dict) -> str | None:
    """Return GeoJSON LineString string with [lon, lat] coords."""
    fsd = (payload or {}).get("flowSegmentData", {})
    coords_obj = (fsd.get("coordinates") or {}).get("coordinate", None)
    if not coords_obj:
        return None

    coords = []
    for p in coords_obj:
        # TomTom uses latitude/longitude fields in the coordinate list
        lat = p.get("latitude", None)
        lon = p.get("longitude", None)
        if lat is None or lon is None:
            continue
        coords.append([float(lon), float(lat)])

    if len(coords) < 2:
        return None

    geojson = {"type": "LineString", "coordinates": coords}
    return json.dumps(geojson)

def load_cache(path: Path) -> pd.DataFrame:
    """Read the geometry cache from ``path``.

    Returns an empty frame with ``point_id`` and ``geometry_geojson`` columns
    when the file does not exist; otherwise the parquet contents with
    ``point_id`` cast to ``str``.
    """
    if not path.exists():
        return pd.DataFrame(columns=["point_id", "geometry_geojson"])

    cached = pd.read_parquet(path)
    cached["point_id"] = cached["point_id"].astype(str)
    return cached

def save_cache(cache_df: pd.DataFrame, path: Path) -> None:
    """Write ``cache_df`` to ``path`` as parquet without the index.

    The parent directory of ``path`` is created first if it is missing.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    cache_df.to_parquet(path, index=False)

# Load any previously saved geometries (an empty frame if CACHE_PATH does not
# exist yet) and preview the first rows.
cache = load_cache(CACHE_PATH)
cache.head()


Unnamed: 0,point_id,geometry_geojson
0,18526,"{""type"": ""LineString"", ""coordinates"": [[0.0782..."
1,7518,"{""type"": ""LineString"", ""coordinates"": [[0.1478..."
2,6797,"{""type"": ""LineString"", ""coordinates"": [[0.0389..."
3,36807,"{""type"": ""LineString"", ""coordinates"": [[0.1038..."
4,28510,"{""type"": ""LineString"", ""coordinates"": [[-0.077..."


In [45]:
# Enrich df with geometry, using cache
# Only the two cache columns needed are merged, so any extra column in the
# cache file cannot collide with latitude/longitude.
df_geom = df[["point_id", "latitude", "longitude"]].merge(
    cache[["point_id", "geometry_geojson"]], on="point_id", how="left"
)

missing_geom = df_geom["geometry_geojson"].isna()
n_missing = int(missing_geom.sum())
print(f"Points missing geometry: {n_missing} / {len(df_geom)}")

if n_missing > 0:
    rows = df_geom.loc[missing_geom, ["point_id", "latitude", "longitude"]].to_dict("records")
    new_rows = []
    for row in rows:
        pid = row["point_id"]
        lat = float(row["latitude"])
        lon = float(row["longitude"])
        try:
            payload = fetch_tomtom_flowsegment(lat, lon, TOMTOM_API_KEY)
            gj = flowsegment_to_geojson_linestring(payload)
        except Exception as e:
            # Best effort: one failing point must not abort the whole batch.
            # It is recorded with a null geometry and retried on the next run.
            print(f"[WARN] geometry fetch failed for point_id={pid}: {e}")
            gj = None

        new_rows.append({"point_id": pid, "geometry_geojson": gj})

    new_cache_part = pd.DataFrame(new_rows)
    # keep="last" lets a fresh result replace an older (possibly null) entry.
    cache_updated = (
        pd.concat([cache, new_cache_part], ignore_index=True)
          .drop_duplicates(subset=["point_id"], keep="last")
    )
    save_cache(cache_updated, CACHE_PATH)
    cache = cache_updated

# Final join back into df
# Drop any geometry column left by a previous run of this cell first;
# otherwise the merge would produce geometry_geojson_x / _y and the lookup
# below would raise KeyError.
df = df.drop(columns=["geometry_geojson"], errors="ignore").merge(
    cache[["point_id", "geometry_geojson"]], on="point_id", how="left"
)

# Basic sanity
print("Geometry coverage:", df["geometry_geojson"].notna().mean())
df.head()

Points missing geometry: 0 / 200
Geometry coverage: 1.0


Unnamed: 0,point_id,latitude,longitude,local_authority_id,road_category,road_type,road_name,link_length_km,geometry_geojson
0,18526,51.529441,0.088977,168,PA,Major,A13,2.1,"{""type"": ""LineString"", ""coordinates"": [[0.0782..."
1,7518,51.544222,0.147921,168,PA,Major,A1240,2.8,"{""type"": ""LineString"", ""coordinates"": [[0.1478..."
2,6797,51.367362,0.045676,176,PA,Major,A232,1.0,"{""type"": ""LineString"", ""coordinates"": [[0.0389..."
3,36807,51.48211,0.101649,105,PA,Major,A209,1.6,"{""type"": ""LineString"", ""coordinates"": [[0.1038..."
4,28510,51.501415,-0.0758,103,PA,Major,A200,0.3,"{""type"": ""LineString"", ""coordinates"": [[-0.077..."


In [46]:
import hopsworks

# Re-declare the feature group identity and write the enriched frame
# (metadata + geometry_geojson) into it.
FG_NAME = "traffic_points_metadata"
FG_VERSION = 1  # NOTE(review): still 1 even though df now carries geometry_geojson; bump if a v1 with the old schema already exists
FG_DESCRIPTION = "Static metadata for traffic monitoring points (lat/lon + road context + GeoJSON geometry)."

# Primary key: point_id (static table)
primary_key = ["point_id"]

fg = fs.get_or_create_feature_group(
    name=FG_NAME,
    version=FG_VERSION,
    description=FG_DESCRIPTION,
    primary_key=primary_key,
)

# Insert with overwrite=True.
# NOTE(review): presumably this replaces existing rows rather than upserting,
# so the "Upserted" wording in the message below may be inaccurate - confirm.
fg.insert(df, overwrite=True)
print(f"Upserted Feature Group: {FG_NAME} v{FG_VERSION} | rows={len(df)}")

Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/3209/fs/3154/fg/2308


Uploading Dataframe: 100.00% |██████████| Rows 200/200 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: traffic_points_metadata_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/3209/jobs/named/traffic_points_metadata_1_offline_fg_materialization/executions
Upserted Feature Group: traffic_points_metadata v1 | rows=200
