# Derive Gauge Configuration — ausvic (FloodHubMaribyrnong)

This notebook reproduces every value in the `GAUGES` list from primary sources.
Run it end-to-end to verify or update the gauge configuration.

| Field | Source |
|-------|--------|
| `gauge_id` | Caravan convention: `ausvic_` + station number (no letters) |
| `name` | Victorian Water: Hydstra `get_site_list` API; Melbourne Water: portal |
| `lat` / `lon` | Victorian Water: Hydstra `get_site_list` API; Melbourne Water: portal |
| `area_km2` | HydroBASINS Level-12 `UP_AREA` via GEE (Keilor: official VW figure) |
| Exclusions | CAMELS AUS v2 overlap check (Zenodo 13350616) |

**Steps**
1. CAMELS AUS v2 overlap — identify which candidates are already in Caravan
2. Victorian Water metadata — lat/lon + names from Hydstra API
3. Melbourne Water metadata — station verification via flow API
4. Catchment areas — `UP_AREA` from HydroBASINS via GEE
5. Compile final `GAUGES` list

In [None]:
import json
import time
import urllib.parse
import urllib.request
from pathlib import Path

# Marker output: shows that the import cell above has been executed.
print('Ready.')

## Step 1 — Candidate Station Discovery and CAMELS AUS v2 Overlap Check

**1a — Victorian Water candidates via Hydstra `230*`**  
Query the Hydstra API for all stations with prefix `230*` (Maribyrnong catchment code).
Filter to those with a discharge variable (`141.00` = ML/day).

**1b — Melbourne Water candidates via `/locations` API**  
Call `api.melbournewater.com.au/rainfall-river-level/locations` to get all MW sites.  
Filter to those with a `230` prefix (Maribyrnong basin). Use `/summary` to confirm
each site has flow data (`flowLevels.minYear` present).

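**1c — CAMELS AUS v2 overlap**  
Any candidate already in Caravan via CAMELS AUS v2 (Zenodo 13350616) is excluded
to avoid duplicate gauge IDs across the global dataset.
The 1c cell reads `vw_candidates` (from 1a) and `mw_with_flow` (from 1b), so run
those cells first even though the 1c cell appears before the 1b cell below.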

In [None]:
# ── Step 1c: Combine all candidates then CAMELS AUS v2 overlap check ──────────
# Requires two dicts built by other cells:
#   vw_candidates : station_id -> name            (Victorian Water)
#   mw_with_flow  : station_id -> {'name': ...}   (Melbourne Water, has flow data)
# Produces ALL_CANDIDATES, EXCLUDED and INCLUDED for the later steps.
import pandas as pd

# Trailing letters are stripped before comparing with CAMELS IDs (230106A -> 230106)
STATION_SUFFIX_LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Merge VW (from Hydstra) and MW (from /locations + /summary) candidates.
# On a shared station ID the Melbourne Water name wins (later unpacking overrides).
ALL_CANDIDATES = {
    **vw_candidates,
    **{sid: v['name'] for sid, v in mw_with_flow.items()},
}
print(f'Total candidates (VW + MW, with discharge): {len(ALL_CANDIDATES)}')
for sid, name in sorted(ALL_CANDIDATES.items()):
    print(f'  {sid:<12} {name}')

# CAMELS AUS v2 overlap check
CAMELS_CSV = Path('/content/drive/MyDrive/CAMELS_AUS_Attributes&Indices_MasterTable.csv')
EXCLUDED   = set()   # candidate station IDs (as keyed in ALL_CANDIDATES) to drop

print()
if CAMELS_CSV.exists():
    camels     = pd.read_csv(CAMELS_CSV, dtype=str)
    camels_ids = set(camels['station_id'].str.strip())
    print(f'CAMELS AUS v2 loaded — {len(camels_ids)} stations')
    print()
    print(f'  {"Station":<12} {"Name":<45} Status')
    print('  ' + '-' * 70)
    for sid, name in sorted(ALL_CANDIDATES.items()):
        camels_sid = sid.rstrip(STATION_SUFFIX_LETTERS)
        if camels_sid in camels_ids:
            EXCLUDED.add(sid)
            status = 'DUPLICATE -> EXCLUDED'
        else:
            status = 'OK'
        print(f'  {sid:<12} {name:<45} {status}')
else:
    print(f'CAMELS CSV not found at {CAMELS_CSV}')
    print('Using known results (verified Feb 2026 against Zenodo record 13350616):')
    known_overlap = {'230210', '230205', '230209'}
    # Match on the letter-stripped ID, exactly as the CSV branch does, so that a
    # suffixed candidate such as '230205A' is excluded too. EXCLUDED therefore
    # holds candidate keys, which is what the `sid in EXCLUDED` tests expect.
    EXCLUDED = {
        sid for sid in ALL_CANDIDATES
        if sid.rstrip(STATION_SUFFIX_LETTERS) in known_overlap
    }
    for sid in sorted(known_overlap):
        print(f'  {sid}  EXCLUDED (CAMELS AUS v2)')

INCLUDED = {sid: name for sid, name in ALL_CANDIDATES.items() if sid not in EXCLUDED}
# The leading newline belongs inside the literal; a quoted string cannot span lines.
print(f'\nResult: {len(INCLUDED)} included, {len(EXCLUDED)} excluded: {sorted(EXCLUDED)}')

In [None]:
# ── Step 1b: Discover Melbourne Water candidates via /locations API ────────────
# api.melbournewater.com.au/rainfall-river-level/locations returns all MW sites.
# Filter to those starting with '230' (Maribyrnong catchment prefix),
# then use /summary to confirm each site has flow records (flowLevels.minYear).
#
# Produces:
#   all_locations : raw /locations payload
#   mw_candidates : siteId -> {'name', 'has_flow', 'min_year'} for every 230* site
#   mw_with_flow  : subset of mw_candidates where has_flow is True

MELBWATER_BASE = 'https://api.melbournewater.com.au/rainfall-river-level'

MW_HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Accept':     'application/json',
    'Origin':     'https://www.melbournewater.com.au',
    'Referer':    'https://www.melbournewater.com.au/',
}

# 1. Get the full locations list
req  = urllib.request.Request(f'{MELBWATER_BASE}/locations', headers=MW_HEADERS)
with urllib.request.urlopen(req, timeout=30) as resp:
    all_locations = json.loads(resp.read().decode())

print(f'Melbourne Water /locations returned {len(all_locations)} total sites')

# 2. Filter to Maribyrnong basin (prefix 230)
maribyrnong_sites = [
    loc for loc in all_locations
    if str(loc.get('siteId', '')).startswith('230')
]
# The trailing newline is written as an escape; a quoted string cannot span lines.
print(f'  {len(maribyrnong_sites)} sites with prefix 230 (Maribyrnong basin)\n')

# 3. Check each for flow data via /summary
mw_candidates = {}   # siteId -> {'name': ..., 'has_flow': bool, 'min_year': ...}

print(f'  {"Site ID":<12} {"Name":<40} {"Has flow":>10} {"Min year":>10}')
print('  ' + '-' * 76)

for loc in maribyrnong_sites:
    sid = str(loc.get('siteId', '')).strip()
    # Name may be in different fields depending on API version
    name = (loc.get('siteName') or loc.get('name') or loc.get('description') or sid).strip()

    summary_url = f'{MELBWATER_BASE}/{sid}/summary'
    req = urllib.request.Request(summary_url, headers=MW_HEADERS)
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            summary = json.loads(resp.read().decode())
        # `or {}` also covers an explicit null flowLevels, so such a site is
        # reported as "no flow" rather than as an error.
        flow     = summary.get('flowLevels') or {}
        min_yr   = flow.get('minYear')
        has_flow = min_yr is not None
    except Exception as exc:
        # Best-effort discovery: one failing site must not abort the scan, so the
        # failure is recorded in the table and the site is treated as having no flow.
        has_flow = False
        min_yr   = f'ERROR: {exc}'

    mw_candidates[sid] = {'name': name, 'has_flow': has_flow, 'min_year': min_yr}
    flag = 'YES' if has_flow else 'no'
    print(f'  {sid:<12} {name:<40} {flag:>10} {str(min_yr or ""):>10}')
    time.sleep(0.3)   # pause between /summary requests

mw_with_flow = {sid: v for sid, v in mw_candidates.items() if v['has_flow']}
print(f'\n{len(mw_with_flow)} Melbourne Water sites have flow records')

## Step 2 — Victorian Water Gauge Metadata (Hydstra API)

The five Victorian Water gauges use the Hydstra web service at  
`https://data.water.vic.gov.au/cgi/webservice.exe`

The `get_site_list` function returns station name, latitude, longitude, and  
catchment area directly. No authentication required.

In [None]:
# ── Hydstra API — site metadata ───────────────────────────────────────────────
# One get_site_list call for all Victorian Water stations; the result is stored in
# vw_meta as station_id -> {'name', 'lat', 'lon', 'hydstra_area'}.
HYDSTRA_BASE = 'https://data.water.vic.gov.au/cgi/webservice.exe'

# Victorian Water station IDs (numeric, no trailing letter)
VW_STATIONS = ['230200', '230206', '230202', '230213', '230227']

params = {
    'function':  'get_site_list',
    'version':   '2',
    'site_list': ','.join(VW_STATIONS),
    'fields':    'station,stnname,latitude,longitude,catchment_area',
    'format':    'json',
}
url = HYDSTRA_BASE + '?' + urllib.parse.urlencode(params)
print(f'Calling: {url[:90]}...')

with urllib.request.urlopen(url, timeout=30) as resp:
    data = json.loads(resp.read().decode())

vw_meta = {}   # station_id -> {name, lat, lon, hydstra_area}
sites   = data.get('return', {}).get('sites', [])

print()
print(f'  {"Station":<10} {"Name":<40} {"Lat":>12} {"Lon":>13} {"Area (Hydstra)":>15}')
print('  ' + '-' * 95)
for site in sites:
    sid   = site.get('station', '').strip()
    name  = site.get('stnname', '').strip()
    # A missing or empty coordinate becomes 0.0, which the later longitude
    # range check (100..160) rejects.
    lat   = float(site.get('latitude', 0) or 0)
    lon   = float(site.get('longitude', 0) or 0)
    area  = site.get('catchment_area', '')
    vw_meta[sid] = {'name': name, 'lat': lat, 'lon': lon, 'hydstra_area': area}
    print(f'  {sid:<10} {name:<40} {lat:>12.6f} {lon:>13.6f} {str(area):>15}')

# Flag requested stations that the API did not return, so a gap is visible here
# rather than surfacing later as an empty metadata entry.
not_returned = [sid for sid in VW_STATIONS if sid not in vw_meta]
if not_returned:
    print(f'  WARNING: no Hydstra record returned for: {", ".join(not_returned)}')

print(f'\n{len(vw_meta)} Victorian Water gauges fetched from Hydstra API')

## Step 3 — Melbourne Water Gauge Metadata

The `/locations` endpoint already returned site IDs and names in Step 1b.
Here we call `/summary` for each included Melbourne Water gauge to confirm
current status and record the `minYear` (start of flow record).

Note: The `/summary` response does not include lat/lon — those come from the
`/locations` list if the API exposes them, otherwise they are recorded from the
[Melbourne Water river data portal](https://www.melbournewater.com.au/water-data-and-education/water-data/river-data).

In [None]:
# ── Melbourne Water — summary + lat/lon from /locations response ──────────────
# The /locations response is already in `all_locations` from Step 1b.
# Check whether lat/lon are exposed in that response, then fall back to
# portal-sourced values for any fields the API doesn't provide.
#
# Reads:    all_locations, mw_with_flow, EXCLUDED
# Produces: mw_meta — siteId -> {'name', 'lat', 'lon', 'min_year'}

# Lat/lon from Melbourne Water portal (API does not currently expose coordinates)
MW_COORDS = {
    '230100A': (-37.4103,      144.9023     ),
    '230211A': (-37.4662,      144.7440     ),
    '230104A': (-37.5833,      144.7420     ),
    '230107A': (-37.5285,      144.8560     ),
    '230106A': (-37.76590000,  144.89500000 ),
}

# Build a lookup from the /locations response
loc_lookup = {str(loc.get('siteId', '')).strip(): loc for loc in all_locations}

print(f'  {"Site ID":<12} {"Name":<40} {"Min year":>10} {"Lat":>12} {"Lon":>13} {"Coord source"}')
print('  ' + '-' * 102)

mw_meta = {}
for sid, v in sorted(mw_with_flow.items()):
    if sid in EXCLUDED:
        continue

    loc  = loc_lookup.get(sid, {})
    name = v['name']

    # Prefer API lat/lon if available
    api_lat = loc.get('latitude') or loc.get('lat')
    api_lon = loc.get('longitude') or loc.get('lon')

    if api_lat is not None and api_lon is not None:
        lat, lon   = float(api_lat), float(api_lon)
        coord_src  = 'API /locations'
    else:
        # Sites missing from MW_COORDS get (None, None); the validation cell reports them.
        lat, lon   = MW_COORDS.get(sid, (None, None))
        coord_src  = 'portal (manual)'

    mw_meta[sid] = {'name': name, 'lat': lat, 'lon': lon, 'min_year': v['min_year']}
    print(f'  {sid:<12} {name:<40} {str(v["min_year"]):>10} {str(lat):>12} {str(lon):>13}  {coord_src}')

# The leading newline belongs inside the literal; a quoted string cannot span lines.
print(f'\n{len(mw_meta)} Melbourne Water gauges ready')

## Step 4 — Catchment Areas from HydroBASINS (GEE)

For each gauge, the `UP_AREA` field from the HydroBASINS Level-12 outlet cell
gives the upstream drainage area in km². This is used as `area_km2` in the
GAUGES config for all stations except Keilor (230200), which uses the official
Victorian Water figure of **1305.4 km²** (based on 586 gaugings 1908–2025).

This cell makes one GEE point query per gauge — no breadth-first search (BFS) tracing of the upstream network is needed.

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# Other cells in this notebook read and write paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Authenticate with Earth Engine, then initialise the client against the
# 'floodhubmaribyrnong' project; the HydroBASINS lookup below uses `ee`.
import ee
ee.Authenticate()
ee.Initialize(project='floodhubmaribyrnong')

In [None]:
# ── HydroBASINS UP_AREA lookup for all 10 included gauges ────────────────────
# Manually-known coordinates used here (from Hydstra + Melbourne Water portal).
# Keilor (230200) area is overridden below with the official VW figure.
#
# Produces: up_areas — gauge_id -> area_km2 (one getInfo() round-trip per gauge).

ALL_GAUGE_COORDS = [
    # station_id    gauge_id              lat              lon
    ('230100A', 'ausvic_230100',  -37.4103,        144.9023      ),
    ('230211A', 'ausvic_230211',  -37.4662,        144.7440      ),
    ('230104A', 'ausvic_230104',  -37.5833,        144.7420      ),
    ('230107A', 'ausvic_230107',  -37.5285,        144.8560      ),
    ('230200',  'ausvic_230200',  -37.727706090,   144.836476100 ),
    ('230106A', 'ausvic_230106',  -37.76590000,    144.89500000  ),
    ('230206',  'ausvic_230206',  -37.475370480,   144.572443200 ),
    ('230202',  'ausvic_230202',  -37.583217370,   144.742035600 ),
    ('230213',  'ausvic_230213',  -37.418904970,   144.584809600 ),
    ('230227',  'ausvic_230227',  -37.396121060,   144.660394900 ),
]

# Keilor official catchment area (Victorian Water, 586 gaugings 1908-2025)
KEILOR_OFFICIAL_AREA_KM2 = 1305.4

basins   = ee.FeatureCollection('WWF/HydroSHEDS/v1/Basins/hybas_12')
up_areas = {}  # gauge_id -> area_km2

print(f'  {"Station":<10} {"Gauge ID":<20} {"HydroBASINS UP_AREA":>20} {"area_km2 used":>14}')
print('  ' + '-' * 70)

for sid, gid, lat, lon in ALL_GAUGE_COORDS:
    point   = ee.Geometry.Point([lon, lat])   # Earth Engine expects [lon, lat] order
    outlet  = basins.filterBounds(point).first().getInfo()
    if outlet is None:
        # No polygon contains the point: fail with the offending gauge named
        # instead of an opaque "'NoneType' object is not subscriptable".
        raise RuntimeError(
            f'No HydroBASINS Level-12 polygon found for {sid} at lat={lat}, lon={lon}'
        )
    up_area = outlet['properties']['UP_AREA']

    # Keilor: use official Victorian Water figure
    if sid == '230200':
        area_used = KEILOR_OFFICIAL_AREA_KM2
        note      = f' <- official VW (HydroBASINS: {up_area:.1f})'
    else:
        area_used = round(up_area, 1)
        note      = ''

    up_areas[gid] = area_used
    print(f'  {sid:<10} {gid:<20} {up_area:>20.1f} {area_used:>14.1f}{note}')

# The leading newline belongs inside the literal; a quoted string cannot span lines.
print(f'\nAreas fetched for {len(up_areas)} gauges.')

## Step 5 — Compile Final GAUGES List

Combines all sources into the final `GAUGES` configuration.
The output matches `gauges_config.py` in the project repository.

In [None]:
# ── Compile GAUGES from all derived sources ───────────────────────────────────
# Reads:    vw_meta (Step 2), mw_meta (Step 3), up_areas (Step 4)
# Produces: GAUGES — list of {'gauge_id', 'name', 'lat', 'lon', 'area_km2'} dicts.
# A station missing from its source lookup yields None fields here; the summary
# table shows them as 'n/a' so the validation cell can report them.

# Full metadata table — (station_id, gauge_id, api_source)
# Names and coords below come from Hydstra API (VW) or Melbourne Water portal (MW)
GAUGE_META = [
    # Victorian Water / Hydstra — lat/lon/name from get_site_list above
    ('230200',  'ausvic_230200', 'hydstra'),
    ('230206',  'ausvic_230206', 'hydstra'),
    ('230202',  'ausvic_230202', 'hydstra'),
    ('230213',  'ausvic_230213', 'hydstra'),
    ('230227',  'ausvic_230227', 'hydstra'),
    # Melbourne Water portal — lat/lon sourced manually
    ('230100A', 'ausvic_230100', 'melbwater'),
    ('230211A', 'ausvic_230211', 'melbwater'),
    ('230104A', 'ausvic_230104', 'melbwater'),
    ('230107A', 'ausvic_230107', 'melbwater'),
    ('230106A', 'ausvic_230106', 'melbwater'),
]

# ── Canonical name overrides ──────────────────────────────────────────────────
# The Melbourne Water API returns names that don't match the gauging station's
# actual river — corrected here using the Jacobs/Melbourne Water Oct 2022
# post-event flood analysis as the authoritative source.
#   230100A: API may return "Maribyrnong River at Darraweit" — wrong river.
#            Deep Creek is a major tributary, not the mainstem.
#   230211A: API may return "Maribyrnong River at Clarkefield" — wrong river.
#            The gauge is on Bolinda Creek, not the mainstem.
NAME_OVERRIDES = {
    '230100A': 'Deep Creek at Darraweit Guim',
    '230211A': 'Bolinda Creek at Clarkefield',
}

# Lookups keyed by station ID: Hydstra API results (Step 2) and Melbourne Water (Step 3)
hydstra_lookup = dict(vw_meta)
mw_lookup      = dict(mw_meta)

GAUGES = []
for sid, gid, source in GAUGE_META:
    lookup = hydstra_lookup if source == 'hydstra' else mw_lookup
    meta   = lookup.get(sid, {})

    api_name = meta.get('name', f'Station {sid}')
    name     = NAME_OVERRIDES.get(sid, api_name)
    if name != api_name:
        print(f'  Name override for {sid}: {api_name!r} -> {name!r}')

    GAUGES.append({
        'gauge_id':  gid,
        'name':      name,
        'lat':       meta.get('lat'),
        'lon':       meta.get('lon'),
        'area_km2':  up_areas.get(gid),
    })

# Sort into the gauges_config.py order listed below; unknown IDs go last (rank 99)
ORDER = [
    'ausvic_230100', 'ausvic_230211', 'ausvic_230104', 'ausvic_230107',
    'ausvic_230200', 'ausvic_230106',
    'ausvic_230206', 'ausvic_230202', 'ausvic_230213', 'ausvic_230227',
]
order_rank = {gid: pos for pos, gid in enumerate(ORDER)}
GAUGES.sort(key=lambda g: order_rank.get(g['gauge_id'], 99))


def _num_cell(value, width, decimals):
    """Right-align a number in `width` columns, or 'n/a' when the value is None."""
    if value is None:
        return f'{"n/a":>{width}}'
    return f'{value:>{width}.{decimals}f}'


print(f'{len(GAUGES)} gauges compiled\n')
print(f'  {"gauge_id":<20} {"name":<42} {"lat":>12} {"lon":>13} {"area_km2":>10}')
print('  ' + '-' * 103)
for g in GAUGES:
    print(f"  {g['gauge_id']:<20} {g['name']:<42} {_num_cell(g['lat'], 12, 6)} "
          f"{_num_cell(g['lon'], 13, 6)} {_num_cell(g['area_km2'], 10, 1)}")

In [None]:
# ── Print as Python dict literal (copy into gauges_config.py if updated) ──────
# Each value is rendered with repr() and followed by its comma *before* padding,
# so the columns stay aligned and the output is a valid Python list of dicts.
# repr() also renders a missing (None) field as `None` instead of raising.
print('GAUGES = [')
for g in GAUGES:
    gid_txt  = f"{g['gauge_id']!r},"
    name_txt = f"{g['name']!r},"
    lat_txt  = f"{g['lat']!r},"
    lon_txt  = f"{g['lon']!r},"
    print(f"    {{'gauge_id': {gid_txt:<22} 'name': {name_txt:<46} "
          f"'lat': {lat_txt:<16} 'lon': {lon_txt:<16} 'area_km2': {g['area_km2']!r}}},")
print(']')

In [None]:
# ── Validation ────────────────────────────────────────────────────────────────
# Collects every problem found in GAUGES into `errors` and prints a report.
# A missing value is reported once and never reaches the numeric comparisons.

# Approximate bounding box of Victoria in decimal degrees, padded by roughly
# half a degree on each side.
VIC_LAT_RANGE = (-39.5, -33.5)
VIC_LON_RANGE = (140.5, 150.5)

errors = []

for g in GAUGES:
    gid = g['gauge_id']
    lat, lon, area = g['lat'], g['lon'], g['area_km2']

    if len(gid.split('_')) != 2:
        errors.append(f"{gid}: gauge_id must have exactly 2 parts")
    if not gid.startswith('ausvic_'):
        errors.append(f"{gid}: must start with 'ausvic_'")

    if lat is None or lon is None:
        errors.append(f"{gid}: missing lat/lon")
    elif not (VIC_LAT_RANGE[0] <= lat <= VIC_LAT_RANGE[1]
              and VIC_LON_RANGE[0] <= lon <= VIC_LON_RANGE[1]):
        errors.append(f"{gid}: coordinates outside Victoria bounds")

    if area is None or area <= 0:
        errors.append(f"{gid}: invalid area_km2")

if errors:
    print('ERRORS:')
    for e in errors:
        print(f'  {e}')
else:
    print(f'All {len(GAUGES)} gauges passed validation.')
    print('  - gauge_id format: OK (ausvic_XXXXXX, 2 parts)')
    print('  - lat/lon present: OK')
    print('  - area_km2 > 0:    OK')
    print('  - coords in VIC:   OK')

In [None]:
# ── Save GAUGES to Google Drive as JSON ──────────────────────────────────────
# This file is the single source of truth for the gauge list: downstream
# notebooks (0b-fetch_catchments, etc.) load it rather than hardcoding gauges.

GAUGES_JSON = Path('/content/drive/MyDrive/caravan_maribyrnong_gee/gauges_ausvic.json')

# Create the target folder (and any missing parents) before writing
GAUGES_JSON.parent.mkdir(parents=True, exist_ok=True)

with GAUGES_JSON.open('w') as json_out:
    json.dump(GAUGES, json_out, indent=2)

# Report what was written: destination, gauge count, and the per-gauge field names
print(f'GAUGES saved: {GAUGES_JSON}')
print(f'  {len(GAUGES)} gauges, fields: {list(GAUGES[0].keys())}')