# Notebook to parse the schema into JSON and an .md table

In [None]:
from pathlib import Path
import pandas as pd
import json

# Paths (edit if needed)
# Input and output sit side by side in the schema directory.
schema_dir = Path("..") / "schema"
csv_path = schema_dir / "schema.csv"    # input: the source of truth
out_json = schema_dir / "schema.json"   # output: machine-readable copy

# Load (auto-detect separator: tab vs comma)
# Try tab first, every cell read as a string. Fall back to comma when the tab
# parse fails or yields a single column (the file was not tab-separated after
# all). Only parser errors trigger the fallback: a missing file or a decoding
# problem is unrelated to the separator and is raised straight away instead of
# being swallowed and retried.
try:
    df = pd.read_csv(csv_path, sep="\t", dtype=str)
except pd.errors.ParserError:
    df = None

if df is None or df.shape[1] == 1:
    df = pd.read_csv(csv_path, sep=",", dtype=str)

# Normalise column names: trim surrounding whitespace and lower-case them
normalised_names = [label.strip().lower() for label in df.columns]
df.columns = normalised_names

# Columns the rest of the script relies on; stop early if any are absent
required = {"code", "parent", "group", "name", "ukhab", "definition"}
missing = required.difference(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {sorted(missing)}")

# Clean whitespace / NaNs
# Clean every column, not only the required ones: a NaN left in an optional
# column would be serialised by json.dumps as the bare token NaN, which is not
# valid JSON. Missing cells become "", everything is a string, and surrounding
# whitespace is stripped.
df = df.fillna("").astype(str)
df = df.apply(lambda column: column.str.strip())

# Basic validation
# every row needs a code, and no code may appear more than once
code_col = df["code"]
if code_col.eq("").any():
    raise ValueError("Some rows have empty 'code' values.")

dupes = code_col.loc[code_col.duplicated()].drop_duplicates().tolist()
if dupes:
    raise ValueError(f"Duplicate codes found: {dupes}")

# a row's parent must be a code present in the CSV; rows that are their own
# parent (code == parent) are accepted as-is
codes = set(code_col)
is_own_parent = df["code"] == df["parent"]
parent_is_known = df["parent"].isin(codes)
bad_parent = df[~(is_own_parent | parent_is_known)]
if len(bad_parent.index) > 0:
    offending_rows = bad_parent[["code", "parent"]].to_string(index=False)
    raise ValueError(
        "Some rows reference a parent that does not exist in the CSV:\n"
        + offending_rows
    )

# Build JSON payload
# one dict per row, ordered by group, then parent, then code
ordered = df.sort_values(by=["group", "parent", "code"])
records = ordered.to_dict(orient="records")

# create the output directory if needed, then write UTF-8 text ending in a newline
out_json.parent.mkdir(parents=True, exist_ok=True)
payload = json.dumps(records, indent=2, ensure_ascii=False)
out_json.write_text(payload + "\n", encoding="utf-8")

# Check output :)
# report the row count and the size of the written file, then preview the data
print(f"Loaded: {len(df):,} rows from {csv_path}")
print(f"Wrote:  {out_json} ({out_json.stat().st_size/1024:.1f} KB)")
preview = df.head(8)  # first eight rows
display(preview)

Loaded: 66 rows from ../schema/schema.csv
Wrote:  ../schema/schema.json (19.4 KB)


Unnamed: 0,code,parent,group,name,ukhab,definition
0,a,a,Urban,Artificial land,u,"Buildings, roads, built-up areas, artificially..."
1,a1,a,Urban,Urban and transport,u1c,"Built-up land, such as buildings and car parks..."
2,a2,a,Urban,"Quarries, mineral workings and derelict land",s1,Includes both in use and decommissioned quarri...
3,a3,a,Urban,Open mosaic,u1a,Habitat found on previously disturbed or indus...
4,b,b,Cropland,Cropland,c,Land used for growing crops
5,b1,b,Cropland,Annual crops,c1c,Crops that complete their growing cycle within...
6,b2,b,Cropland,"Permanent crops (vines, bushes, energy crops)",c1d,Crops from plants that produce harvests year o...
7,b3,b,Cropland,Orchards,c1,Collections of fruit and/or nut producing tree...
