# 03_weather_feature_backfill.ipynb

This notebook performs a **historical backfill** of hourly weather features from the **Open-Meteo Historical Weather API** and writes them to a dedicated Hopsworks Feature Group.

Why separate?
- Weather is external (API), hourly, and reusable across models.
- Traffic data has 10-minute resolution; it is joined to weather later by flooring each timestamp to the hour: `weather_time_utc = floor(ts_10m, 'H')`.

Feature Group (recommended):
- Name: `weather_hourly_fg`
- Primary key: `["point_id", "weather_time_utc"]`
- Event time: `"weather_time_utc"`


In [12]:
import os
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import hopsworks
import numpy as np
import pandas as pd
import requests

# Display settings for inspecting frames in this notebook:
# no cap on the number of columns shown, and a 200-character display width.
pd.options.display.max_columns = None
pd.options.display.width = 200


In [13]:
# ================= CONFIG =================
# Every setting is read from the environment variable named in its os.getenv() call;
# the second argument is the default used when that variable is unset.


def split_csv(raw: str) -> List[str]:
    """Split a comma-separated string into a clean list.

    Items are whitespace-stripped, empty items are dropped and duplicates are removed
    while keeping the order of first appearance, so ``"a, b,,a"`` becomes ``["a", "b"]``.
    """
    stripped = (item.strip() for item in raw.split(","))
    return list(dict.fromkeys(item for item in stripped if item))


# Source for date range (traffic FG) - optional but convenient
TRAFFIC_FG_NAME = os.getenv("TRAFFIC_FG_NAME", "traffic_flow_fg")
TRAFFIC_FG_VERSION = int(os.getenv("TRAFFIC_FG_VERSION", "1"))

# Monitoring points / metadata (must contain point_id + lat/lon)
METADATA_FG_NAME = os.getenv("METADATA_FG_NAME", "traffic_points_metadata")
METADATA_FG_VERSION = int(os.getenv("METADATA_FG_VERSION", "1"))

# Output weather feature group
WEATHER_FG_NAME = os.getenv("WEATHER_FG_NAME", "weather_hourly_fg")
WEATHER_FG_VERSION = int(os.getenv("WEATHER_FG_VERSION", "1"))

# Open-Meteo Historical Weather API (archive)
ARCHIVE_URL = os.getenv("OPEN_METEO_ARCHIVE_URL", "https://archive-api.open-meteo.com/v1/archive")

# Hourly variables to request (edit freely).
# Each name becomes a DataFrame column, so stray spaces around commas must not survive.
HOURLY_VARS = split_csv(
    os.getenv(
        "OPEN_METEO_HOURLY_VARS",
        "temperature_2m,precipitation,rain,snowfall"
    )
)
if not HOURLY_VARS:
    raise ValueError("OPEN_METEO_HOURLY_VARS must name at least one hourly variable")

# Chunk size for multiple locations per request
CHUNK_SIZE = int(os.getenv("OPEN_METEO_CHUNK_SIZE", "50"))
if CHUNK_SIZE < 1:
    raise ValueError(f"OPEN_METEO_CHUNK_SIZE must be >= 1, got {CHUNK_SIZE}")

# Politeness / safety
SLEEP_SECONDS = float(os.getenv("OPEN_METEO_SLEEP_SECONDS", "0.2"))
REQUEST_TIMEOUT = int(os.getenv("OPEN_METEO_TIMEOUT", "60"))
MAX_RETRIES = int(os.getenv("OPEN_METEO_MAX_RETRIES", "5"))
if MAX_RETRIES < 1:
    # With zero attempts no request would ever be sent.
    raise ValueError(f"OPEN_METEO_MAX_RETRIES must be >= 1, got {MAX_RETRIES}")

# Date range override (if set, use these)
START_DATE = os.getenv("OPEN_METEO_START_DATE", "").strip()  # 'YYYY-MM-DD'
END_DATE = os.getenv("OPEN_METEO_END_DATE", "").strip()      # 'YYYY-MM-DD'


In [14]:
# ============== CONNECT TO HOPSWORKS ==============
# Host and project default to the values this notebook was developed against;
# set HOPSWORKS_HOST / HOPSWORKS_PROJECT to point the backfill at another deployment.
project = hopsworks.login(
    host=os.getenv("HOPSWORKS_HOST", "eu-west.cloud.hopsworks.ai"),
    project=os.getenv("HOPSWORKS_PROJECT", "London_traffic"),
)

# Feature-store handle used by every later cell that reads or writes feature groups.
fs = project.get_feature_store()

2026-01-10 22:05:10,491 INFO: Closing external client and cleaning up certificates.
2026-01-10 22:05:10,538 INFO: Connection closed.
2026-01-10 22:05:10,543 INFO: Initializing external client
2026-01-10 22:05:10,543 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-10 22:05:11,848 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3209


In [15]:
# ============== READ METADATA (POINTS) ==============
meta_fg = fs.get_feature_group(name=METADATA_FG_NAME, version=METADATA_FG_VERSION)
points_raw = meta_fg.read()

needed = ["point_id", "latitude", "longitude"]
missing = [c for c in needed if c not in points_raw.columns]
if missing:
    raise ValueError(f"Metadata FG is missing required columns: {missing}")

# Build the cleaned frame in one chain from the raw read, so re-running the cell is idempotent.
points_df = (
    points_raw[needed]
    # Must happen before the cast: astype(str) would turn a missing id into the literal "nan".
    .dropna(subset=["point_id"])
    .assign(
        point_id=lambda d: d["point_id"].astype(str),
        latitude=lambda d: pd.to_numeric(d["latitude"], errors="coerce"),
        longitude=lambda d: pd.to_numeric(d["longitude"], errors="coerce"),
    )
    .dropna(subset=["latitude", "longitude"])
    # Points are sent to the API in batches, so drop out-of-range coordinates up front
    # rather than let one bad point fail its whole batch.
    .loc[lambda d: d["latitude"].between(-90, 90) & d["longitude"].between(-180, 180)]
    .drop_duplicates(subset=["point_id"], keep="last")
)

n_dropped = len(points_raw) - len(points_df)
if n_dropped:
    print(f"Dropped {n_dropped} metadata rows (missing id, invalid coordinates or duplicate point_id)")
if points_df.empty:
    raise ValueError("Metadata FG contains no point with a valid point_id, latitude and longitude")

print("Points available for weather:", len(points_df))
points_df.head()


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.49s) 
Points available for weather: 200


Unnamed: 0,point_id,latitude,longitude
0,8618,51.509307,-0.084878
1,16756,51.383489,-0.105944
2,38815,51.393451,0.029589
3,6458,51.573103,-0.212077
4,38022,51.589213,0.270734


In [16]:
# ============== DETERMINE DATE RANGE ==============
def infer_date_range_from_traffic() -> Tuple[str, str]:
    """Infer the backfill window from the timestamps in the traffic feature group.

    Reads the ``ts_10m`` column of the feature group identified by ``TRAFFIC_FG_NAME`` /
    ``TRAFFIC_FG_VERSION``, parses it as UTC and discards unparseable values.

    Returns:
        ``(start, end)``: the dates of the earliest and latest valid timestamps,
        formatted as ``'YYYY-MM-DD'``.

    Raises:
        ValueError: if no valid ``ts_10m`` value is found.
    """
    traffic_fg = fs.get_feature_group(name=TRAFFIC_FG_NAME, version=TRAFFIC_FG_VERSION)
    # Only the timestamp column is needed, so do not pull the whole traffic table.
    ts_raw = traffic_fg.select(["ts_10m"]).read()["ts_10m"]
    # Work on a Series rather than assigning into a sliced frame (avoids SettingWithCopyWarning).
    ts = pd.to_datetime(ts_raw, utc=True, errors="coerce").dropna()
    if ts.empty:
        # min()/max() would be NaT here and strftime on NaT fails with an opaque message.
        raise ValueError(
            f"Cannot infer date range: {TRAFFIC_FG_NAME} v{TRAFFIC_FG_VERSION} has no valid ts_10m values"
        )
    start = ts.min().strftime("%Y-%m-%d")
    end = ts.max().strftime("%Y-%m-%d")
    return start, end

def _parse_override_date(raw: str, env_name: str) -> str:
    """Validate a 'YYYY-MM-DD' override and return it zero-padded; an unset value stays ''.

    Raises:
        ValueError: if ``raw`` is non-empty and not a valid calendar date in that format.
    """
    raw = raw.strip()
    if not raw:
        return ""
    try:
        return datetime.strptime(raw, "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError as exc:
        raise ValueError(f"{env_name} must be formatted as YYYY-MM-DD, got {raw!r}") from exc


# Reject malformed overrides here, before any request is sent.
start_date = _parse_override_date(START_DATE, "OPEN_METEO_START_DATE")
end_date = _parse_override_date(END_DATE, "OPEN_METEO_END_DATE")

if not (start_date and end_date):
    # A single override is honoured; only the missing bound is inferred from the traffic data.
    inferred_start, inferred_end = infer_date_range_from_traffic()
    start_date = start_date or inferred_start
    end_date = end_date or inferred_end

# Zero-padded ISO dates compare correctly as plain strings.
if start_date > end_date:
    raise ValueError(f"Backfill start date {start_date} is after end date {end_date}")

print("Weather backfill date range:", start_date, "->", end_date)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.00s) 
Weather backfill date range: 2026-01-04 -> 2026-01-10


In [17]:
# ============== OPEN-METEO FETCH HELPERS ==============
def chunk_dataframe(df: pd.DataFrame, chunk_size: int) -> Iterable[pd.DataFrame]:
    """Split ``df`` into consecutive positional slices of at most ``chunk_size`` rows.

    Args:
        df: Frame to split; rows keep their original order and index labels.
        chunk_size: Maximum number of rows per slice; must be >= 1.

    Returns:
        A lazy iterable of ``df.iloc`` slices. The last slice may be shorter, and an
        empty frame yields no slices.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1. This is checked when the function
            is called, not on first iteration, so a bad size cannot silently produce nothing.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    return (df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size))

def request_with_retries(url: str, params: Dict[str, Any], timeout: int, max_retries: int) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    """GET ``url`` and return its decoded JSON body, retrying failures with backoff.

    Args:
        url: Endpoint to call.
        params: Query-string parameters.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        max_retries: Maximum number of attempts.

    Returns:
        The parsed JSON payload (a dict or a list of dicts).

    Raises:
        RuntimeError: if the server answers with a non-retryable 4xx status, or if
            every attempt fails.
    """
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, params=params, timeout=timeout)
            if 400 <= resp.status_code < 500 and resp.status_code not in (408, 429):
                # The request itself was rejected; repeating it cannot succeed.
                # 408 (timeout) and 429 (rate limit) are excluded so they are still retried.
                raise RuntimeError(
                    f"Open-Meteo request rejected with HTTP {resp.status_code}: {resp.text[:300]}"
                )
            resp.raise_for_status()
            return resp.json()
        except (requests.RequestException, ValueError) as e:
            # RequestException: network / HTTP failures. ValueError: body is not valid JSON.
            last_err = e
            if attempt == max_retries:
                # No point sleeping when there is no further attempt.
                break
            # Exponential backoff capped at 30s, plus up to 0.5s of random jitter.
            sleep = min(2 ** attempt, 30) + np.random.rand() * 0.5
            print(f"Request failed (attempt {attempt}/{max_retries}). Sleeping {sleep:.1f}s. Error: {e}")
            time.sleep(sleep)
    raise RuntimeError(f"Open-Meteo request failed after {max_retries} retries: {last_err}") from last_err

def fetch_open_meteo_archive(lat_list: List[float], lon_list: List[float], start_date: str, end_date: str) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    """Request hourly archive weather for a batch of coordinates in a single call.

    Latitudes, longitudes and the variables in ``HOURLY_VARS`` are each sent as one
    comma-separated query parameter, with timestamps requested in UTC. The call goes
    through ``request_with_retries`` using the notebook-level URL, timeout and retry
    settings.

    Args:
        lat_list: Latitudes, position-aligned with ``lon_list``.
        lon_list: Longitudes, position-aligned with ``lat_list``.
        start_date: First day of the window (passed through unchanged).
        end_date: Last day of the window (passed through unchanged).

    Returns:
        Whatever ``request_with_retries`` returns for the request.
    """
    latitudes = ",".join(map(str, lat_list))
    longitudes = ",".join(map(str, lon_list))
    query = dict(
        latitude=latitudes,
        longitude=longitudes,
        start_date=start_date,
        end_date=end_date,
        hourly=",".join(HOURLY_VARS),
        timezone="UTC",
    )
    return request_with_retries(
        ARCHIVE_URL,
        params=query,
        timeout=REQUEST_TIMEOUT,
        max_retries=MAX_RETRIES,
    )

def one_location_to_df(loc_json: Dict[str, Any], point_id: str, hourly_vars: Optional[List[str]] = None) -> pd.DataFrame:
    """Convert the payload of a single location into a tidy hourly DataFrame.

    Args:
        loc_json: One location object; must contain ``hourly.time``.
        point_id: Identifier written to every row of the result.
        hourly_vars: Variables to extract as columns. Defaults to the notebook-level
            ``HOURLY_VARS`` when omitted.

    Returns:
        A frame with columns ``weather_time_utc`` (UTC-aware timestamps), ``point_id``
        and one column per requested variable. A variable that is absent or null in
        the payload becomes an all-NaN column.

    Raises:
        ValueError: if ``hourly.time`` is missing (any ``reason`` field in the payload is
            included in the message), or if a variable's value count differs from the
            number of timestamps.
    """
    hourly = loc_json.get("hourly") if isinstance(loc_json, dict) else None
    if not isinstance(hourly, dict) or "time" not in hourly:
        reason = loc_json.get("reason") if isinstance(loc_json, dict) else None
        detail = f" (API reason: {reason})" if reason else ""
        raise ValueError(f"Unexpected Open-Meteo payload format: missing hourly.time{detail}")

    variables = HOURLY_VARS if hourly_vars is None else hourly_vars

    times = pd.to_datetime(hourly["time"], utc=True, errors="coerce")
    out = pd.DataFrame({"weather_time_utc": times, "point_id": point_id})

    for v in variables:
        values = hourly.get(v)
        if values is None:
            # Absent or null variable: keep the column so the schema stays stable.
            out[v] = np.nan
            continue
        if len(values) != len(out):
            raise ValueError(
                f"Open-Meteo variable {v!r} has {len(values)} values for {len(out)} timestamps"
            )
        out[v] = values

    return out

def open_meteo_payload_to_df(payload: Union[Dict[str, Any], List[Dict[str, Any]]], point_ids: List[str]) -> pd.DataFrame:
    """Convert a full response into one DataFrame, tagging each location with its point id.

    Args:
        payload: Either a list of location objects or a single location object.
        point_ids: Point ids in request order; location ``i`` is labelled ``point_ids[i]``.

    Returns:
        The per-location frames from ``one_location_to_df`` concatenated with a fresh index.

    Raises:
        ValueError: if the number of locations differs from the number of point ids
            (also checked for a single-object payload), or if there are none.
    """
    # Treat the single-object form as a one-element list so both shapes get the same checks.
    locations = payload if isinstance(payload, list) else [payload]

    if len(locations) != len(point_ids):
        raise ValueError(f"Payload length {len(locations)} != point_ids length {len(point_ids)}")
    if not locations:
        raise ValueError("Open-Meteo payload contains no locations")

    dfs = [one_location_to_df(loc, pid) for loc, pid in zip(locations, point_ids)]
    return pd.concat(dfs, ignore_index=True)


In [18]:
# ============== BACKFILL WEATHER (BATCH) ==============
# One API call per batch of CHUNK_SIZE points; each parsed batch is collected here.
weather_frames = []

for idx, chunk in enumerate(chunk_dataframe(points_df, CHUNK_SIZE), start=1):
    # The three lists are position-aligned: element i of each describes the same point.
    point_ids = chunk["point_id"].tolist()
    lat_list = chunk["latitude"].astype(float).tolist()
    lon_list = chunk["longitude"].astype(float).tolist()

    print(f"Fetching chunk {idx}: n_points={len(point_ids)}")
    payload = fetch_open_meteo_archive(lat_list, lon_list, start_date, end_date)
    wdf = open_meteo_payload_to_df(payload, point_ids)

    weather_frames.append(wdf)
    # Pause between calls (SLEEP_SECONDS) to stay polite to the API.
    time.sleep(SLEEP_SECONDS)

# Normalise key dtypes, drop rows without a usable timestamp, then order and
# de-duplicate on the (point_id, weather_time_utc) key.
weather_df = (
    pd.concat(weather_frames, ignore_index=True)
    .assign(
        point_id=lambda frame: frame["point_id"].astype(str),
        weather_time_utc=lambda frame: pd.to_datetime(
            frame["weather_time_utc"], utc=True, errors="coerce"
        ),
    )
    .dropna(subset=["weather_time_utc"])
    .sort_values(["point_id", "weather_time_utc"])
    .drop_duplicates(subset=["point_id", "weather_time_utc"], keep="last")
)

print("Weather DF shape:", weather_df.shape)
weather_df.head()


Fetching chunk 1: n_points=50
Fetching chunk 2: n_points=50
Fetching chunk 3: n_points=50
Fetching chunk 4: n_points=50
Weather DF shape: (33600, 6)


Unnamed: 0,weather_time_utc,point_id,temperature_2m,precipitation,rain,snowfall
8064,2026-01-04 00:00:00+00:00,16108,-1.5,0.0,0.0,0.0
8065,2026-01-04 01:00:00+00:00,16108,-1.6,0.0,0.0,0.0
8066,2026-01-04 02:00:00+00:00,16108,-1.8,0.0,0.0,0.0
8067,2026-01-04 03:00:00+00:00,16108,-1.9,0.0,0.0,0.0
8068,2026-01-04 04:00:00+00:00,16108,-2.0,0.0,0.0,0.0


In [19]:
# ============== QUALITY CHECKS ==============
# 1) The (point_id, weather_time_utc) pair must be unique; stop here if it is not.
pk_repeated = weather_df.duplicated(subset=["point_id", "weather_time_utc"])
dup = pk_repeated.sum()
print("Duplicate PK rows:", dup)
if dup:
    raise ValueError("Primary key duplicates detected in weather_df")

# 2) Share of missing values per requested variable, worst first.
na_report = (
    weather_df[HOURLY_VARS]
    .isna()
    .mean()
    .sort_values(ascending=False)
)
print("NA fraction per hourly variable:", na_report, sep="\n")


Duplicate PK rows: 0
NA fraction per hourly variable:
temperature_2m    0.0
precipitation     0.0
rain              0.0
snowfall          0.0
dtype: float64


In [20]:
# ============== WRITE WEATHER FEATURE GROUP ==============
# Feature-group definition: keyed on (point_id, weather_time_utc), with the hourly
# timestamp also acting as event time.
weather_fg_spec = dict(
    name=WEATHER_FG_NAME,
    version=WEATHER_FG_VERSION,
    primary_key=["point_id", "weather_time_utc"],
    event_time="weather_time_utc",
    description="Hourly weather features from Open-Meteo Historical Weather API (backfilled).",
)
weather_fg = fs.get_or_create_feature_group(**weather_fg_spec)

# Insert the cleaned backfill frame into the feature group.
weather_fg.insert(weather_df)

print(f"Weather backfill written to {WEATHER_FG_NAME} v{WEATHER_FG_VERSION}")


Uploading Dataframe: 100.00% |██████████| Rows 33600/33600 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_hourly_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/3209/jobs/named/weather_hourly_fg_1_offline_fg_materialization/executions
Weather backfill written to weather_hourly_fg v1


In [21]:
# ============== OPTIONAL: SAVE LOCAL PARQUET FOR DEBUGGING ==============
# `Path` is imported in the notebook's import cell, so all dependencies are declared in one place.
out_path = Path("data/processed/weather_hourly_backfill.parquet")
# Create the target folder on first run; harmless if it already exists.
out_path.parent.mkdir(parents=True, exist_ok=True)
weather_df.to_parquet(out_path, index=False)
print("Saved:", out_path, "rows:", len(weather_df))


Saved: data/processed/weather_hourly_backfill.parquet rows: 33600


In [22]:
# Sanity summary of the backfill: total rows, distinct points and distinct hours.
n_rows = weather_df.shape[0]
n_points, n_hours = (
    weather_df[column].nunique() for column in ("point_id", "weather_time_utc")
)

# max(..., 1) guards the division when the frame has no points.
rows_per_point = n_rows / max(n_points, 1)

for label, value in (
    ("rows:", n_rows),
    ("unique points:", n_points),
    ("unique hours:", n_hours),
    ("rows / points:", rows_per_point),
    ("approx days:", rows_per_point / 24),
):
    print(label, value)


rows: 33600
unique points: 200
unique hours: 168
rows / points: 168.0
approx days: 7.0
