In [1]:
from masphd.hsp import HSPClient, extract_service_locations

# Fetch the raw HSP service-details payload for one RID and print the
# per-location rows the parser extracts from it.
hsp = HSPClient(timeout_secs=15)
raw = hsp.get_service_details_raw("202602037672804")

# A falsy payload is not parsed at all; None is printed in that case.
if raw:
    parsed = extract_service_locations(raw)
else:
    parsed = None

print(parsed)

[{'rid': '202602037672804', 'ssd': '2026-02-03', 'toc_code': 'SW', 'is_main_journey': 0, 'tpl': 'POO', 'pta': None, 'ptd': '0650', 'ata': None, 'atd': '0649', 'late_canc_reason': None, 'tiploc2': 'POOLE', 'hsp_location': 'POO', 'hsp_gbtt_pta': None, 'hsp_gbtt_ptd': '0650', 'hsp_actual_ta': None, 'hsp_actual_td': '0649'}, {'rid': '202602037672804', 'ssd': '2026-02-03', 'toc_code': 'SW', 'is_main_journey': 0, 'tpl': 'PKS', 'pta': '0654', 'ptd': '0654', 'ata': '0651', 'atd': '0653', 'late_canc_reason': None, 'tiploc2': 'PSTONE', 'hsp_location': 'PKS', 'hsp_gbtt_pta': '0654', 'hsp_gbtt_ptd': '0654', 'hsp_actual_ta': '0651', 'hsp_actual_td': '0653'}, {'rid': '202602037672804', 'ssd': '2026-02-03', 'toc_code': 'SW', 'is_main_journey': 0, 'tpl': 'BSM', 'pta': '0657', 'ptd': '0657', 'ata': '0656', 'atd': '0658', 'late_canc_reason': None, 'tiploc2': 'BRANKSM', 'hsp_location': 'BSM', 'hsp_gbtt_pta': '0657', 'hsp_gbtt_ptd': '0657', 'hsp_actual_ta': '0656', 'hsp_actual_td': '0658'}, {'rid': '20260

In [1]:
# Notebook test cell: time conversion + actual_arr_delay sanity check (FIXED for HSP HHMM)

from zoneinfo import ZoneInfo
import sqlite3

from masphd.darwin.time_utils import combine_date_time_smart, diff_minutes_wrap
from masphd.dao.actual_arrivals_hsp import compute_actual_arrival_delay_min, make_actual_arrival_record
from masphd.io.paths import DATABASE

LONDON = ZoneInfo("Europe/London")


def hhmm_to_hh_colon_mm(v: str | None):
    """
    Convert HSP HHMM ('0657') -> Darwin HH:MM ('06:57').
    Also accepts already-colon formats and returns them unchanged.
    """
    if v is None:
        return None
    s = str(v).strip()
    if not s:
        return None
    if ":" in s:
        return s
    if len(s) == 4 and s.isdigit():
        return f"{s[:2]}:{s[2:]}"
    return None


def show_dt(ssd: str, hhmm: str, base_hhmm: str | None = None):
    """
    Resolve an HHMM time on service date `ssd` via combine_date_time_smart.

    `hhmm` and `base_hhmm` are first normalised with hhmm_to_hh_colon_mm.
    When a usable `base_hhmm` is given it is resolved first (in the LONDON
    zone) and handed to the second call as `base_dt`.
    NOTE(review): base_dt presumably anchors the midnight-rollover decision -
    confirm against masphd.darwin.time_utils.

    Returns whatever combine_date_time_smart returns for `hhmm`.
    """
    target_text = hhmm_to_hh_colon_mm(hhmm)

    anchor_dt = None
    if base_hhmm:
        anchor_text = hhmm_to_hh_colon_mm(base_hhmm)
        # An unparseable base is treated exactly like a missing one.
        if anchor_text:
            anchor_dt = combine_date_time_smart(ssd, anchor_text, tz=LONDON)

    return combine_date_time_smart(ssd, target_text, base_dt=anchor_dt, tz=LONDON)


def show_case(label: str, ssd: str, planned: str, actual: str, base: str | None = None):
    """
    Print a side-by-side comparison of two delay calculations for one case.

    The delay is computed twice: once here in the notebook (resolve planned
    and actual times, then diff_minutes_wrap) and once by the library function
    compute_actual_arrival_delay_min, which is given the raw HHMM inputs.
    Both results are printed under a `label` header; nothing is returned.

    `planned`, `actual` and the optional `base` are HSP-style HHMM strings;
    `ssd` is the service start date string.
    """
    planned_dt = show_dt(ssd, planned, base_hhmm=base)

    # The actual time is resolved relative to the planned datetime.
    actual_text = hhmm_to_hh_colon_mm(actual)
    actual_dt = combine_date_time_smart(ssd, actual_text, base_dt=planned_dt, tz=LONDON)

    notebook_delay = diff_minutes_wrap(planned_dt, actual_dt)

    # Library path: relies on the library normalising HHMM internally.
    library_delay = compute_actual_arrival_delay_min(
        ssd=ssd,
        planned_arr_hhmm=planned,
        actual_arr_hhmm=actual,
        base_hhmm=base,
        tz=LONDON,
    )

    print(f"\n--- {label} ---")

    base_dt_shown = show_dt(ssd, base, base_hhmm=None) if base else None
    print("ssd:", ssd, "base:", base, "->", base_dt_shown)

    planned_text = hhmm_to_hh_colon_mm(planned)
    print("planned:", planned, "=>", planned_text, "->", planned_dt)
    print("actual :", actual, "=>", actual_text, "->", actual_dt)

    print("diff_minutes_wrap:", notebook_delay)
    print("compute_actual_arrival_delay_min:", library_delay)


# 1) Basic sanity: same-day cases sharing one service date and base time;
#    only the label and the planned/actual pair vary.
for case_label, planned_hhmm, actual_hhmm in (
    ("Late +3 min", "0657", "0700"),
    ("On time", "0657", "0657"),
    ("Early -2 min", "0657", "0655"),
):
    show_case(case_label, "2026-02-03", planned=planned_hhmm, actual=actual_hhmm, base="0650")

# 2) Midnight wrap sanity: the actual arrival falls just after midnight.
show_case("Midnight wrap +7", "2026-02-03", planned="2358", actual="0005", base="2345")

# 3) End-to-end record creation
# A hand-built prediction row for one station pair.
pred_row = dict(
    rid="202602037672804",
    ssd="2026-02-03",
    first="POOLE",        # TIPLOC2 example
    second="PSTONE",      # TIPLOC2 example
    planned_dep="0650",   # HSP-style HHMM
    predicted_delay=4.2,
)

# Mimic hsp_by_tiploc2[second]: the HSP row for the pair's second station,
# keyed by that station's TIPLOC2.
pstone_hsp_row = dict(
    tpl="PKE",       # CRS (example only)
    pta="0705",      # planned arrival (HHMM)
    ata="0710",      # actual arrival (HHMM)
    toc_code="SW",
)
hsp_by_tiploc2 = {"PSTONE": pstone_hsp_row}

rec = make_actual_arrival_record(pred_row=pred_row, hsp_by_tiploc2=hsp_by_tiploc2, tz=LONDON)
print("\n--- make_actual_arrival_record output ---", rec, sep="\n")

# 4) Inspect DB rows (optional)
# Show the 20 most recently inserted rows of actual_arrivals_hsp.
conn = sqlite3.connect(DATABASE)
try:
    # sqlite3.Row lets each result be turned into a dict for printing.
    conn.row_factory = sqlite3.Row

    rows = conn.execute("""
SELECT rid, ssd, first, second, planned_dep, planned_arr, actual_arr, actual_arr_delay, hsp_location_crs
FROM actual_arrivals_hsp
ORDER BY id DESC
LIMIT 20
""").fetchall()
finally:
    # Always release the connection, even when the query fails (e.g. the
    # table does not exist yet); otherwise the handle stays open in the
    # kernel until it is restarted.
    conn.close()

print("\n--- last 20 actual_arrivals_hsp rows ---")
for r in rows:
    print(dict(r))



--- Late +3 min ---
ssd: 2026-02-03 base: 0650 -> 2026-02-03 06:50:00+00:00
planned: 0657 => 06:57 -> 2026-02-03 06:57:00+00:00
actual : 0700 => 07:00 -> 2026-02-03 07:00:00+00:00
diff_minutes_wrap: 3.0
compute_actual_arrival_delay_min: 3.0

--- On time ---
ssd: 2026-02-03 base: 0650 -> 2026-02-03 06:50:00+00:00
planned: 0657 => 06:57 -> 2026-02-03 06:57:00+00:00
actual : 0657 => 06:57 -> 2026-02-03 06:57:00+00:00
diff_minutes_wrap: 0.0
compute_actual_arrival_delay_min: 0.0

--- Early -2 min ---
ssd: 2026-02-03 base: 0650 -> 2026-02-03 06:50:00+00:00
planned: 0657 => 06:57 -> 2026-02-03 06:57:00+00:00
actual : 0655 => 06:55 -> 2026-02-03 06:55:00+00:00
diff_minutes_wrap: -2.0
compute_actual_arrival_delay_min: -2.0

--- Midnight wrap +7 ---
ssd: 2026-02-03 base: 2345 -> 2026-02-03 23:45:00+00:00
planned: 2358 => 23:58 -> 2026-02-03 23:58:00+00:00
actual : 0005 => 00:05 -> 2026-02-04 00:05:00+00:00
diff_minutes_wrap: 7.0
compute_actual_arrival_delay_min: 7.0

--- make_actual_arrival_rec

In [3]:
# Notebook cell: verify is_main_journey logic using your existing code (no re-implementations)

from masphd.hsp import HSPClient, extract_service_locations
from masphd.darwin.station_pairs import CRSS

# Your parser sets is_main_journey on each returned row (service-level flag copied to all rows)

# Fetch one service and parse it into per-location rows; a falsy payload
# yields an empty list so the rest of the cell still runs.
hsp = HSPClient(timeout_secs=15)

rid = "202602037673914"
raw = hsp.get_service_details_raw(rid)
parsed = extract_service_locations(raw) if raw else []

print("RID =", rid)
print("Parsed rows =", len(parsed))

# 1) Extract CRS list from parsed rows (HSP 'tpl' is CRS by your confirmation)
#    Rows with a missing or empty 'tpl' are skipped.
seen_crs = [tpl for tpl in (row.get("tpl") for row in parsed) if tpl]
seen_crs_set = set(seen_crs)

route_crs = [c for c in CRSS if c]  # remove None
route_crs_set = set(route_crs)

print("\nRoute CRS count =", len(route_crs_set))
print("Seen  CRS count =", len(seen_crs_set))

# 2) Print each route CRS and whether it is present in the service
print("\n--- Route CRS coverage check ---")
for c in route_crs:
    ok = c in seen_crs_set
    print(f"{c}: {'YES' if ok else 'NO'}")

# 3) Show the parser's computed flag (should be identical across all rows)
#    The sort key places a missing flag (None) last instead of comparing it
#    with the integer flags, which would raise TypeError and hide exactly the
#    inconsistency this check is meant to surface.
parser_flags = sorted(
    {row.get("is_main_journey") for row in parsed},
    key=lambda flag: (flag is None, flag),
)
print("\nParser is_main_journey unique values =", parser_flags)

# 4) Recompute the expected flag independently of the parser: the service is
#    a main journey only if every route CRS appears among the seen CRS codes.
#    No parser internals are called here; this is the subset rule the parser
#    is expected to implement.
expected = int(route_crs_set.issubset(seen_crs_set)) if route_crs_set else 0
print("Expected is_main_journey (route_crs ⊆ seen_crs) =", expected)

# 5) Compare expected vs parser output
if parser_flags:
    print("PASS" if (len(parser_flags) == 1 and parser_flags[0] == expected) else "MISMATCH")
else:
    print("No parsed rows, cannot validate.")


RID = 202602037673914
Parsed rows = 15

Route CRS count = 20
Seen  CRS count = 15

--- Route CRS coverage check ---
WEY: NO
UPW: NO
DCH: NO
WOO: NO
WRM: NO
HAM: NO
POO: YES
PKS: NO
BSM: NO
BMH: YES
POK: YES
CHR: YES
NWM: YES
BCU: YES
SOU: YES
SOA: YES
WIN: YES
BSK: NO
CLJ: NO
WAT: NO

Parser is_main_journey unique values = [0]
Expected is_main_journey (route_crs ⊆ seen_crs) = 0
PASS
