In [19]:
import os
import pandas as pd
import numpy as np

In [23]:
import hopsworks

# ----------------------------
# 1) Load CSV
# ----------------------------
# Reads the monitoring-point metadata, validates it, and leaves the typed,
# curated result in `df`. Any failed validation raises ValueError.
CSV_PATH = "data/interim/monitoring_points_metadata.csv"
df = pd.read_csv(CSV_PATH)

# Basic required columns
required_cols = ["point_id", "latitude", "longitude"]
required = set(required_cols)
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns in CSV: {missing}")

# Null check runs on the raw values, BEFORE any casting: casting point_id to
# str can turn a missing value into the literal text "nan", which isna() would
# no longer flag.
if df[required_cols].isna().any().any():
    raise ValueError("Nulls found in required columns (point_id, latitude, longitude).")

# Clean + types
df = df.astype({"point_id": str, "latitude": float, "longitude": float})

# Optional: keep only a curated set if you want
# (comment out if you prefer to keep all columns in the CSV)
curated_cols = [
    "point_id",
    "latitude",
    "longitude",
    "local_authority_id",
    "road_category",
    "road_type",
    "region_id",
    "road_name",
    "link_length_km",
]
# Columns absent from the CSV are skipped rather than treated as an error.
curated_cols = [c for c in curated_cols if c in df.columns]
df = df[curated_cols].copy()

# Validations
if df["point_id"].duplicated().any():
    # keep=False returns every member of each duplicate group, not just the repeats.
    dupes = df[df["point_id"].duplicated(keep=False)].sort_values("point_id")
    raise ValueError(f"Duplicated point_id found. Example rows:\n{dupes.head(10)}")

if not df["latitude"].between(-90, 90).all():
    raise ValueError("Latitude out of range [-90, 90].")

if not df["longitude"].between(-180, 180).all():
    raise ValueError("Longitude out of range [-180, 180].")

print("Loaded metadata rows:", len(df))
display(df.head())

# ----------------------------
# 2) Login to Hopsworks
# ----------------------------
# Log in to the configured host/project, then fetch the project's feature store handle.
HOPSWORKS_HOST = "eu-west.cloud.hopsworks.ai"
HOPSWORKS_PROJECT = "London_traffic"

project = hopsworks.login(host=HOPSWORKS_HOST, project=HOPSWORKS_PROJECT)
fs = project.get_feature_store()

# ----------------------------
# 3) Create / Get Feature Group
# ----------------------------
# Resolves the feature group handle used for the insert step, keyed on point_id.
FG_NAME = "traffic_points_metadata"
FG_VERSION = 1
FG_DESCRIPTION = "Static metadata for traffic monitoring points (lat/lon + road/LA context)."

primary_key = ["point_id"]

fg = fs.get_or_create_feature_group(
    name=FG_NAME,
    version=FG_VERSION,
    description=FG_DESCRIPTION,
    primary_key=primary_key,
)
# get_or_create may hand back an already-existing group, so the message must
# not claim that a new one was created.
print(f"Feature Group ready (created or retrieved): {FG_NAME} v{FG_VERSION}")

# ----------------------------
# 4) Insert
# ----------------------------
# Writes the metadata frame `df` into the feature group `fg`.
# NOTE(review): this section was labelled "overwrite" and the earlier note called
# overwrite=True ideal for static metadata (idempotent runs), but the call below
# does not pass overwrite=True — confirm which write mode is intended before
# relying on re-runs to remove points that were dropped from the CSV.
fg.insert(df)





Loaded metadata rows: 200


Unnamed: 0,point_id,latitude,longitude,local_authority_id,road_category,road_type,road_name,link_length_km
0,18526,51.529441,0.088977,168,PA,Major,A13,2.1
1,7518,51.544222,0.147921,168,PA,Major,A1240,2.8
2,6797,51.367362,0.045676,176,PA,Major,A232,1.0
3,36807,51.48211,0.101649,105,PA,Major,A209,1.6
4,28510,51.501415,-0.0758,103,PA,Major,A200,0.3


2026-01-07 22:05:02,690 INFO: Closing external client and cleaning up certificates.
2026-01-07 22:05:02,694 INFO: Connection closed.
2026-01-07 22:05:02,696 INFO: Initializing external client
2026-01-07 22:05:02,696 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-07 22:05:03,295 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3209
Feature Group created: traffic_points_metadata v1


Uploading Dataframe: 100.00% |██████████| Rows 200/200 | Elapsed Time: 00:00 | Remaining Time: 00:00
Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/London_traffic/Resources/jobs/traffic_points_metadata_1_offline_fg_materialization/config_1767819827809) to trigger the materialization job again.


(Job('traffic_points_metadata_1_offline_fg_materialization', 'SPARK'), None)