In [1]:
import os
import json
import joblib
import numpy as np
import pandas as pd
from datetime import datetime, timezone

import hopsworks


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# =========================
# Hopsworks
# =========================
PROJECT_NAME = os.environ.get("HOPSWORKS_PROJECT_NAME")
HOPSWORKS_API_KEY = os.environ.get("HOPSWORKS_API_KEY")

# =========================
# Feature Groups
# =========================
TRAFFIC_FG_NAME, TRAFFIC_FG_VER = "traffic_temporal_fg", 1
WEATHER_FG_NAME, WEATHER_FG_VER = "weather_10m_fg", 1
TFL_FG_NAME, TFL_FG_VER         = "tfl_disruptions_10m_fg", 1

# =========================
# Columns
# =========================
POINT_ID_COL = "point_id"

# If your FG already has ts_10m use that; otherwise we create it from timestamp_utc
TS_10M_COL = "ts_10m"
RAW_TS_CANDIDATES = ["timestamp_utc", "timestamp", "datetime", "time"]

# =========================
# Model
# =========================
MODEL_NAME = "traffic_speed_ratio_keras"   
MODEL_VERSION = None                      

PRED_COL_30 = "pred_speed_ratio_t+30"
PRED_COL_60 = "pred_speed_ratio_t+60"

# =========================
# Output
# =========================
N_POINTS = 50
OUT_JSON_PATH = "predictions_latest.json"


In [9]:
# 1. Login to Hopsworks
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",
    project="London_traffic"
)
fs = project.get_feature_store()
mr = project.get_model_registry()

2026-01-11 17:22:55,519 INFO: Closing external client and cleaning up certificates.
2026-01-11 17:22:55,538 INFO: Connection closed.
2026-01-11 17:22:55,540 INFO: Initializing external client
2026-01-11 17:22:55,540 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 17:22:56,803 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3209


In [22]:
def find_timestamp_col(df: pd.DataFrame, candidates=RAW_TS_CANDIDATES) -> str | None:
    for c in candidates:
        if c in df.columns:
            return c
    return None

def ensure_ts_10m(df: pd.DataFrame, ts_10m_col: str = TS_10M_COL) -> pd.DataFrame:
    df = df.copy()

    if ts_10m_col in df.columns:
        df[ts_10m_col] = pd.to_datetime(df[ts_10m_col], utc=True, errors="coerce")
        return df

    ts_col = find_timestamp_col(df)
    if ts_col is None:
        raise ValueError(
            f"No timestamp column found. Looked for {RAW_TS_CANDIDATES}. Available={list(df.columns)}"
        )

    df[ts_col] = pd.to_datetime(df[ts_col], utc=True, errors="coerce")
    df[ts_10m_col] = df[ts_col].dt.floor("10min")
    return df

def read_fg(fs, name: str, version: int) -> pd.DataFrame:
    fg = fs.get_feature_group(name=name, version=version)
    return fg.read()

def join_two(left: pd.DataFrame, right: pd.DataFrame, right_name: str) -> pd.DataFrame:
    left = left.copy()
    right = right.copy()

    if TS_10M_COL not in left.columns or TS_10M_COL not in right.columns:
        raise ValueError(f"Missing {TS_10M_COL} in join: left_has={TS_10M_COL in left.columns}, right_has={TS_10M_COL in right.columns}")

    left_has_pid = POINT_ID_COL in left.columns
    right_has_pid = POINT_ID_COL in right.columns

    if left_has_pid and right_has_pid:
        keys = [POINT_ID_COL, TS_10M_COL]
        out = left.merge(right, on=keys, how="left", suffixes=("", f"_{right_name}"))
        print(f"Joined {right_name} on {keys}. Shape={out.shape}")
        return out

    keys = [TS_10M_COL]
    out = left.merge(right, on=keys, how="left", suffixes=("", f"_{right_name}"))
    print(f"Joined {right_name} on {keys} only (no point_id in one side). Shape={out.shape}")
    return out

def latest_per_point(df: pd.DataFrame, n_points: int = N_POINTS) -> pd.DataFrame:
    if POINT_ID_COL not in df.columns:
        raise ValueError(f"{POINT_ID_COL} not found; cannot take latest per point.")

    df = df.copy()
    df = df.dropna(subset=[TS_10M_COL])
    df = df.sort_values([POINT_ID_COL, TS_10M_COL])
    df = df.drop_duplicates(subset=[POINT_ID_COL], keep="last")
    df = df.sort_values(TS_10M_COL, ascending=False).head(n_points).reset_index(drop=True)
    return df


In [23]:
df_tr = read_fg(fs, TRAFFIC_FG_NAME, TRAFFIC_FG_VER)
df_we = read_fg(fs, WEATHER_FG_NAME, WEATHER_FG_VER)
df_tf = read_fg(fs, TFL_FG_NAME, TFL_FG_VER)

print("Raw shapes:")
print("traffic:", df_tr.shape)
print("weather:", df_we.shape)
print("tfl:", df_tf.shape)

df_tr = ensure_ts_10m(df_tr)
df_we = ensure_ts_10m(df_we)
df_tf = ensure_ts_10m(df_tf)

print("Has columns:")
print("traffic:", list(df_tr.columns)[:25])
print("weather:", list(df_we.columns)[:25])
print("tfl:", list(df_tf.columns)[:25])


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.91s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.51s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.61s) 
Raw shapes:
traffic: (26157, 33)
weather: (230400, 6)
tfl: (389982, 7)
Has columns:
traffic: ['timestamp_utc', 'point_id', 'frc', 'current_speed', 'free_flow_speed', 'current_travel_time', 'free_flow_travel_time', 'confidence', 'road_closure', 'ts_10m', 'speed_ratio', 'delay_seconds', 'day_of_week', 'is_weekend', 'hour', 'minute', 'is_rush_hour', 'ti_evening_peak', 'ti_midday', 'ti_morning_peak', 'ti_night', 'speed_diff', 'travel_time_ratio', 'low_confidence_flag', 'speed_roll_mean_3']
weather: ['point_id', 'ts_10m', 'precipitation', 'rain', 'snowfall', 'temperature_2m']
tfl: ['point_id', 'ts_10m', 'disruption_count', 'is_active', 'is_incident', 'is_works', 'max_ordinal']


In [24]:
df_join = df_tr.copy()
df_join = join_two(df_join, df_we, "weather")
df_join = join_two(df_join, df_tf, "tfl")

df_latest = latest_per_point(df_join, n_points=N_POINTS)

print("Inference snapshot shape:", df_latest.shape)
df_latest[[POINT_ID_COL, TS_10M_COL]].head(10)


Joined weather on ['point_id', 'ts_10m']. Shape=(26157, 37)
Joined tfl on ['point_id', 'ts_10m']. Shape=(26157, 42)
Inference snapshot shape: (50, 42)


Unnamed: 0,point_id,ts_10m
0,36825,2026-01-11 16:00:00+00:00
1,16228,2026-01-11 16:00:00+00:00
2,26114,2026-01-11 16:00:00+00:00
3,38022,2026-01-11 16:00:00+00:00
4,6097,2026-01-11 16:00:00+00:00
5,26146,2026-01-11 16:00:00+00:00
6,26182,2026-01-11 16:00:00+00:00
7,36942,2026-01-11 16:00:00+00:00
8,26434,2026-01-11 16:00:00+00:00
9,26664,2026-01-11 16:00:00+00:00


In [25]:
def _walk_files(root_dir: str):
    for r, _, files in os.walk(root_dir):
        for f in files:
            yield os.path.join(r, f)

def _find_saved_model_dir(root_dir: str):
    for path in _walk_files(root_dir):
        if os.path.basename(path) == "saved_model.pb":
            return os.path.dirname(path)
    return None

def pick_model_from_registry(mr, model_name: str, model_version: int | None = None):
    if model_version is not None:
        model = mr.get_model(model_name, version=model_version)
        print(f"Using model {model_name} v{model_version}")
        return model

    try:
        model = mr.get_model(model_name, stage="production")
        print(f"Using model {model_name} (stage=production) v{model.version}")
        return model
    except Exception:
        print("No production stage found. Falling back to latest version...")
        models = mr.get_models(model_name)
        if len(models) == 0:
            raise ValueError(f"No models found with name={model_name}")
        latest = sorted(models, key=lambda m: m.version)[-1]
        model = mr.get_model(model_name, version=latest.version)
        print(f"Using model {model_name} v{model.version} (latest)")
        return model

def load_scaler_and_keras_from_dir(local_dir: str):
    # 1) Find keras file
    keras_file = None
    for path in _walk_files(local_dir):
        low = path.lower()
        if low.endswith(".keras") or low.endswith(".h5") or low.endswith(".hdf5"):
            keras_file = path
            break

    # 2) Find SavedModel directory
    saved_model_dir = None
    if keras_file is None:
        saved_model_dir = _find_saved_model_dir(local_dir)

    # 3) Find scaler-like pkl/joblib (object has transform but not predict)
    scaler = None
    pkl_candidates = [p for p in _walk_files(local_dir) if p.lower().endswith((".pkl", ".joblib"))]
    for p in pkl_candidates:
        try:
            obj = joblib.load(p)
            if hasattr(obj, "transform") and hasattr(obj, "fit") and not hasattr(obj, "predict"):
                scaler = obj
                print("Loaded scaler artifact:", p)
                break
        except Exception:
            continue

    # 4) Load keras model
    import tensorflow as tf
    if keras_file is not None:
        keras_model = tf.keras.models.load_model(keras_file)
        print("Loaded Keras model file:", keras_file)
        return scaler, keras_model

    if saved_model_dir is not None:
        keras_model = tf.keras.models.load_model(saved_model_dir)
        print("Loaded SavedModel directory:", saved_model_dir)
        return scaler, keras_model

    # 5) Debug: print small tree
    print("Could not find .keras/.h5 or SavedModel. Directory tree (limited):")
    for r, d, f in os.walk(local_dir):
        depth = r.replace(local_dir, "").count(os.sep)
        if depth > 3:
            continue
        print("  " * depth + os.path.basename(r) + "/")
        for ff in f[:20]:
            print("  " * (depth + 1) + ff)

    raise FileNotFoundError(f"No Keras model found under {local_dir}")


In [26]:
model_meta = pick_model_from_registry(mr, MODEL_NAME, MODEL_VERSION)
local_dir = model_meta.download()
print("Downloaded to:", local_dir)

# Optional: print top-level files to debug fast
try:
    print("Top-level files:", os.listdir(local_dir))
except Exception:
    pass

scaler, keras_model = load_scaler_and_keras_from_dir(local_dir)
print("Scaler loaded:", scaler is not None)
print("Keras model loaded ✅")


No production stage found. Falling back to latest version...
Using model traffic_speed_ratio_keras v1 (latest)


Downloading: 100.000%|██████████| 1535/1535 elapsed<00:00 remaining<00:00


Downloading model artifact (0 dirs, 1 files)... 

Downloading: 100.000%|██████████| 697237/697237 elapsed<00:00 remaining<00:00


Downloading model artifact (0 dirs, 2 files)... 

Downloading: 100.000%|██████████| 312/312 elapsed<00:00 remaining<00:00


Downloading model artifact (0 dirs, 3 files)... 

Downloading: 100.000%|██████████| 180/180 elapsed<00:00 remaining<00:00


Downloading model artifact (0 dirs, 4 files)... 

Downloading: 100.000%|██████████| 956/956 elapsed<00:00 remaining<00:00
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Downloaded to: /var/folders/bl/wz4tv3rs7vsd1fmkwrz2gnsc0000gn/T/d354cf28-bf49-4ce0-8b31-8749e753414a/traffic_speed_ratio_keras/1
Top-level files: ['metrics.json', 'scaler.pkl', 'model.keras', 'feature_cols.json', 'meta.json']
Loaded scaler artifact: /var/folders/bl/wz4tv3rs7vsd1fmkwrz2gnsc0000gn/T/d354cf28-bf49-4ce0-8b31-8749e753414a/traffic_speed_ratio_keras/1/scaler.pkl
Loaded Keras model file: /var/folders/bl/wz4tv3rs7vsd1fmkwrz2gnsc0000gn/T/d354cf28-bf49-4ce0-8b31-8749e753414a/traffic_speed_ratio_keras/1/model.keras
Scaler loaded: True
Keras model loaded ✅




In [27]:
# Output columns to keep
OUTPUT_COLS = [POINT_ID_COL, TS_10M_COL]
OUTPUT_COLS = [c for c in OUTPUT_COLS if c in df_latest.columns]

# Exclude typical non-features
EXCLUDE_COLS = {
    POINT_ID_COL,
    TS_10M_COL,
    "timestamp",
    "timestamp_utc",
    # possible labels
    "speed_ratio_t+30", "speed_ratio_t+60",
    "label_t+30", "label_t+60",
}

feature_cols = [c for c in df_latest.columns if c not in EXCLUDE_COLS]
X = df_latest[feature_cols].copy()

print("X shape:", X.shape)
print("Feature cols sample:", feature_cols[:25])

# Convert bool -> int
for c in X.columns:
    if X[c].dtype == bool:
        X[c] = X[c].astype(int)

# Convert object -> numeric if possible
for c in X.columns:
    if X[c].dtype == "object":
        X[c] = pd.to_numeric(X[c], errors="coerce")

# Fill NaNs (ideally match training; default safe)
X = X.fillna(0.0)

# Ensure float32 for keras
X_values = X.values.astype(np.float32)


X shape: (50, 39)
Feature cols sample: ['frc', 'current_speed', 'free_flow_speed', 'current_travel_time', 'free_flow_travel_time', 'confidence', 'road_closure', 'speed_ratio', 'delay_seconds', 'day_of_week', 'is_weekend', 'hour', 'minute', 'is_rush_hour', 'ti_evening_peak', 'ti_midday', 'ti_morning_peak', 'ti_night', 'speed_diff', 'travel_time_ratio', 'low_confidence_flag', 'speed_roll_mean_3', 'speed_roll_std_3', 'delay_roll_mean_3', 'speed_roll_mean_6']


In [None]:
def keras_predict_two_horizons(model, X_np):
    pred = model.predict(X_np, verbose=0)

    if isinstance(pred, (list, tuple)) and len(pred) >= 2:
        p30 = np.asarray(pred[0]).reshape(-1)
        p60 = np.asarray(pred[1]).reshape(-1)
        return p30, p60

    pred = np.asarray(pred)
    if pred.ndim == 2 and pred.shape[1] >= 2:
        return pred[:, 0].reshape(-1), pred[:, 1].reshape(-1)

    if pred.ndim == 1 or (pred.ndim == 2 and pred.shape[1] == 1):
        return pred.reshape(-1), np.full((pred.shape[0],), np.nan)

    raise ValueError(f"Unexpected prediction shape: {pred.shape}")

# Scale if scaler exists
if scaler is not None:
    X_scaled = scaler.transform(X_values)
    X_scaled = np.asarray(X_scaled).astype(np.float32)
else:
    X_scaled = X_values

pred_30, pred_60 = keras_predict_two_horizons(keras_model, X_scaled)

print("Pred shapes:", pred_30.shape, pred_60.shape)
print("Pred sample:", pred_30[:5], pred_60[:5])


In [None]:
df_pred = df_latest[OUTPUT_COLS].copy()
df_pred[PRED_COL_30] = pred_30
df_pred[PRED_COL_60] = pred_60

df_pred[POINT_ID_COL] = df_pred[POINT_ID_COL].astype(str)
df_pred = df_pred.sort_values(POINT_ID_COL).reset_index(drop=True)

df_pred.head(10)
