# 08_sanity_check_feature_groups.ipynb

Sanity checks across Feature Groups before label engineering:
- Traffic temporal FG (lags/rollings/time features)
- Weather hourly FG (Open-Meteo)
- TfL disruptions hourly FG

This notebook reads only (no writes).


In [1]:
import os
import pandas as pd
import numpy as np
import hopsworks

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ================= CONFIG =================
TRAFFIC_FG_NAME = os.getenv("TRAFFIC_FG_NAME", "traffic_temporal_fg")
TRAFFIC_FG_VERSION = int(os.getenv("TRAFFIC_FG_VERSION", "1"))

WEATHER_FG_NAME = os.getenv("WEATHER_FG_NAME", "weather_hourly_fg")
WEATHER_FG_VERSION = int(os.getenv("WEATHER_FG_VERSION", "1"))

TFL_FG_NAME = os.getenv("TFL_FG_NAME", "tfl_disruptions_hourly_fg")
TFL_FG_VERSION = int(os.getenv("TFL_FG_VERSION", "1"))

TRAFFIC_TIME_COL = os.getenv("TRAFFIC_TIME_COL", "ts_10m")
WEATHER_TIME_COL = os.getenv("WEATHER_TIME_COL", "weather_time_utc")
TFL_TIME_COL = os.getenv("TFL_TIME_COL", "tfl_time_utc")

POINT_ID_COL = os.getenv("POINT_ID_COL", "point_id")

# Optional: comma-separated list of expected lag/rolling columns (strict check)
EXPECTED_LAG_COLS = os.getenv("EXPECTED_LAG_COLS", "")


In [3]:
# ============== CONNECT TO HOPSWORKS ==============
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",
    project="London_traffic"
)
fs = project.get_feature_store()


2026-01-09 14:26:03,057 INFO: Initializing external client
2026-01-09 14:26:03,059 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-09 14:26:03,962 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3209


In [4]:
def basic_profile(df: pd.DataFrame, name: str, time_col: str, pk_cols: list):
    print("\n" + "="*90)
    print(f"PROFILE: {name}")
    print("="*90)
    print("shape:", df.shape)
    print("n_columns:", len(df.columns))

    # dtypes sample
    print("\nDtypes (first 30):")
    print(df.dtypes.head(30))

    # time range
    if time_col in df.columns:
        t = pd.to_datetime(df[time_col], utc=True, errors="coerce")
        print(f"\nTime column: {time_col}")
        print("time_min:", t.min())
        print("time_max:", t.max())
        print("time_nulls:", t.isna().sum())
    else:
        print(f"\nTime column missing: {time_col}")

    # PK duplicates
    missing_pk = [c for c in pk_cols if c not in df.columns]
    if missing_pk:
        print("\nMissing PK columns:", missing_pk)
    else:
        dup = df.duplicated(subset=pk_cols).sum()
        print("\nPK columns:", pk_cols)
        print("PK duplicate rows:", dup)

    # Missingness summary (top 20)
    na = df.isna().mean().sort_values(ascending=False)
    print("\nTop missingness (fraction):")
    print(na.head(20))

def assert_no_pk_dups(df: pd.DataFrame, pk_cols: list, name: str):
    if all(c in df.columns for c in pk_cols):
        dup = df.duplicated(subset=pk_cols).sum()
        if dup > 0:
            raise ValueError(f"{name}: found {dup} duplicate rows for PK {pk_cols}")


In [5]:
# ============== READ FEATURE GROUPS ==============
traffic_fg = fs.get_feature_group(name=TRAFFIC_FG_NAME, version=TRAFFIC_FG_VERSION)
weather_fg = fs.get_feature_group(name=WEATHER_FG_NAME, version=WEATHER_FG_VERSION)
tfl_fg = fs.get_feature_group(name=TFL_FG_NAME, version=TFL_FG_VERSION)

traffic_df = traffic_fg.read()
weather_df = weather_fg.read()
tfl_df = tfl_fg.read()

# Normalize types
traffic_df[POINT_ID_COL] = traffic_df[POINT_ID_COL].astype(str)
weather_df[POINT_ID_COL] = weather_df[POINT_ID_COL].astype(str)
tfl_df[POINT_ID_COL] = tfl_df[POINT_ID_COL].astype(str)

traffic_df[TRAFFIC_TIME_COL] = pd.to_datetime(traffic_df[TRAFFIC_TIME_COL], utc=True, errors="coerce")
weather_df[WEATHER_TIME_COL] = pd.to_datetime(weather_df[WEATHER_TIME_COL], utc=True, errors="coerce")
tfl_df[TFL_TIME_COL] = pd.to_datetime(tfl_df[TFL_TIME_COL], utc=True, errors="coerce")


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.71s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.52s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.54s) 


In [6]:
# ============== BASIC PROFILES + PK CHECKS ==============
basic_profile(traffic_df, f"TRAFFIC {TRAFFIC_FG_NAME} v{TRAFFIC_FG_VERSION}", TRAFFIC_TIME_COL, [POINT_ID_COL, TRAFFIC_TIME_COL])
basic_profile(weather_df, f"WEATHER {WEATHER_FG_NAME} v{WEATHER_FG_VERSION}", WEATHER_TIME_COL, [POINT_ID_COL, WEATHER_TIME_COL])
basic_profile(tfl_df, f"TFL {TFL_FG_NAME} v{TFL_FG_VERSION}", TFL_TIME_COL, [POINT_ID_COL, TFL_TIME_COL])

assert_no_pk_dups(traffic_df, [POINT_ID_COL, TRAFFIC_TIME_COL], "traffic")
assert_no_pk_dups(weather_df, [POINT_ID_COL, WEATHER_TIME_COL], "weather")
assert_no_pk_dups(tfl_df, [POINT_ID_COL, TFL_TIME_COL], "tfl")



PROFILE: TRAFFIC traffic_temporal_fg v1
shape: (20168, 33)
n_columns: 33

Dtypes (first 30):
timestamp_utc            datetime64[us, Etc/UTC]
point_id                                  object
frc                                       object
current_speed                            float64
free_flow_speed                          float64
current_travel_time                      float64
free_flow_travel_time                    float64
confidence                               float64
road_closure                                bool
ts_10m                       datetime64[us, UTC]
speed_ratio                              float64
delay_seconds                            float64
day_of_week                                int32
is_weekend                                 int64
hour                                       int32
minute                                     int32
is_rush_hour                               int64
ti_evening_peak                             bool
ti_midday               

In [7]:
# ============== LAG/ROLLING FEATURES PRESENCE (HEURISTIC) ==============
lag_like = [c for c in traffic_df.columns if any(k in c.lower() for k in ["lag", "rolling", "roll", "ema", "ewm"])]
print("\nLag/rolling-like columns found:", len(lag_like))
print(lag_like[:80])

if EXPECTED_LAG_COLS.strip():
    expected = [x.strip() for x in EXPECTED_LAG_COLS.split(",") if x.strip()]
    missing = [c for c in expected if c not in traffic_df.columns]
    print("\nExpected lag cols:", len(expected))
    print("Missing expected lag cols:", missing)



Lag/rolling-like columns found: 10
['low_confidence_flag', 'speed_roll_mean_3', 'speed_roll_std_3', 'delay_roll_mean_3', 'speed_roll_mean_6', 'speed_roll_std_6', 'delay_roll_mean_6', 'speed_roll_mean_12', 'speed_roll_std_12', 'delay_roll_mean_12']


In [8]:
# ============== WEATHER VARS MISSINGNESS ==============
weather_vars = [c for c in weather_df.columns if c not in [POINT_ID_COL, WEATHER_TIME_COL]]
print("\nWeather variables:", weather_vars)

if weather_vars:
    print("\nWeather missingness (fraction):")
    print(weather_df[weather_vars].isna().mean().sort_values(ascending=False))



Weather variables: ['temperature_2m', 'precipitation', 'rain', 'snowfall', 'wind_speed_10m', 'cloud_cover', 'pressure_msl']

Weather missingness (fraction):
temperature_2m    0.0
precipitation     0.0
rain              0.0
snowfall          0.0
wind_speed_10m    0.0
cloud_cover       0.0
pressure_msl      0.0
dtype: float64


In [9]:
# ============== TFL VARS DISTRIBUTION ==============
tfl_vars = [c for c in tfl_df.columns if c not in [POINT_ID_COL, TFL_TIME_COL]]
print("\nTfL variables:", tfl_vars)

if tfl_vars:
    print("\nTfL describe (first 30 rows):")
    display(tfl_df[tfl_vars].describe(include="all").transpose().head(30))



TfL variables: ['disruption_count', 'is_works', 'is_incident', 'is_active', 'max_ordinal']

TfL describe (first 30 rows):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
disruption_count,61709.0,1.004683,0.070378,1.0,1.0,1.0,1.0,3.0
is_works,61709.0,0.97566,0.154104,0.0,1.0,1.0,1.0,1.0
is_incident,61709.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
is_active,61709.0,0.75493,0.430132,0.0,1.0,1.0,1.0,1.0
max_ordinal,61709.0,21.060931,32.44172,3.0,3.0,3.0,24.0,102.0


In [10]:
# ============== JOIN COVERAGE CHECK (TRAFFIC <- WEATHER + TFL) ==============
traffic_df["weather_time_utc"] = traffic_df[TRAFFIC_TIME_COL].dt.floor("H")
traffic_df["tfl_time_utc"] = traffic_df[TRAFFIC_TIME_COL].dt.floor("H")

joined = traffic_df.merge(
    weather_df,
    left_on=[POINT_ID_COL, "weather_time_utc"],
    right_on=[POINT_ID_COL, WEATHER_TIME_COL],
    how="left",
    suffixes=("", "_w"),
)

joined = joined.merge(
    tfl_df,
    left_on=[POINT_ID_COL, "tfl_time_utc"],
    right_on=[POINT_ID_COL, TFL_TIME_COL],
    how="left",
    suffixes=("", "_tfl"),
)

print("\nJoined shape:", joined.shape)

# Coverage metrics
weather_join_cols = [c for c in weather_vars if c in joined.columns]
if weather_join_cols:
    weather_cov = 1.0 - joined[weather_join_cols].isna().all(axis=1).mean()
    print("Weather join coverage (>=1 var present):", weather_cov)

tfl_join_cols = [c for c in tfl_vars if c in joined.columns]
if tfl_join_cols:
    tfl_cov = 1.0 - joined[tfl_join_cols].isna().all(axis=1).mean()
    print("TfL join coverage (>=1 var present):", tfl_cov)

print("\nSample join columns:")
print(joined[[POINT_ID_COL, TRAFFIC_TIME_COL, "weather_time_utc", "tfl_time_utc"]].head())



Joined shape: (20168, 47)
Weather join coverage (>=1 var present): 1.0
TfL join coverage (>=1 var present): 0.10020825069416894

Sample join columns:
  point_id                    ts_10m          weather_time_utc              tfl_time_utc
0    37825 2026-01-05 04:00:00+00:00 2026-01-05 04:00:00+00:00 2026-01-05 04:00:00+00:00
1    46818 2026-01-06 06:20:00+00:00 2026-01-06 06:00:00+00:00 2026-01-06 06:00:00+00:00
2    17687 2026-01-06 22:40:00+00:00 2026-01-06 22:00:00+00:00 2026-01-06 22:00:00+00:00
3    17524 2026-01-07 03:50:00+00:00 2026-01-07 03:00:00+00:00 2026-01-07 03:00:00+00:00
4    38572 2026-01-06 10:30:00+00:00 2026-01-06 10:00:00+00:00 2026-01-06 10:00:00+00:00




In [11]:
# ============== QUICK SANITY: SPEED RATIO RANGE ==============
if "speed_ratio" in joined.columns:
    sr = pd.to_numeric(joined["speed_ratio"], errors="coerce")
    print("\nspeed_ratio min/max:", sr.min(), sr.max())
    print("speed_ratio <= 0 count:", int((sr <= 0).sum()))
    print("speed_ratio > 2 count:", int((sr > 2).sum()))



speed_ratio min/max: 0.08823529411764706 1.0
speed_ratio <= 0 count: 0
speed_ratio > 2 count: 0


## Interpretation guide

- PK duplicates should be 0 for all feature groups
- Weather join coverage should be high during the weather backfill period
- TfL join coverage is naturally sparse, but should not be all-zero if your range includes disruptions
- Lag/rolling columns should exist and should not be entirely NaN
