In [1]:
# -------------------------

# This notebook includes the "Schedule Thumbs Up" tool.

# -------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import warnings
warnings.simplefilter(action='ignore')
from tqdm import tqdm
from pathlib import Path
import seaborn as sns
import zipfile
import geopandas as gpd
from shapely.geometry import Point, LineString
from scipy import stats
import datetime
import matplotlib.dates as mdates
import plotly.graph_objects as go
from matplotlib.ticker import MaxNLocator
import re


In [2]:
# Maps a season label ("<year>-<n>") to a pandas `DataFrame.query` expression
# selecting the `service_date` window of that season. Consecutive windows are
# contiguous: each one starts the day after the previous one ends.
season_queries = {
    # --- 2023 ---
    "2023-1": "service_date >= '2022-12-18' & service_date <= '2023-03-11'",
    "2023-2": "service_date >= '2023-03-12' & service_date <= '2023-07-02'",
    "2023-3": "service_date > '2023-07-02' & service_date <= '2023-08-26'",
    "2023-4": "service_date >= '2023-08-27' & service_date <= '2023-12-16'",
    # --- 2024 ---
    "2024-1": "service_date >= '2023-12-17' & service_date <= '2024-04-06'",
    "2024-2": "service_date >= '2024-04-07' & service_date <= '2024-06-15'",
    "2024-3": "service_date >= '2024-06-16' & service_date <= '2024-08-24'",
    "2024-4": "service_date > '2024-08-24' & service_date <= '2024-12-14'",
    # --- 2025 ---
    "2025-1": "service_date > '2024-12-14' & service_date <= '2025-04-06'",
    "2025-2": "service_date > '2025-04-06' & service_date <= '2025-06-14'",
}

In [3]:
# -------------------------

# Read pre-calculated dataframes from data_process.ipynb
# df_all: cleaned AVL data
# sd_all: trip-level runtimes
# combined: HASTUS schedules from 2023-1 to 2025-3

# -------------------------


# --- Cleaned AVL records -------------------------------------------------
# Date/time columns are parsed with explicit formats; values that do not
# match the format become NaT (errors="coerce") instead of raising.
df_all = pd.read_csv('df_all.csv')
df_all = df_all.assign(
    service_date=pd.to_datetime(df_all["service_date"], format="%Y-%m-%d", errors="coerce"),
    scheduled_boston=pd.to_datetime(df_all["scheduled_boston"], format="%Y-%m-%d %H:%M:%S", errors="coerce"),
    actual_boston=pd.to_datetime(df_all["actual_boston"], format="%Y-%m-%d %H:%M:%S", errors="coerce"),
)

# One row per route, restricted to route ids made of digits only.
all_available_routes = (
    df_all
    .drop_duplicates('route_id')
    .loc[lambda frame: frame['route_id'].astype(str).str.strip().str.fullmatch(r'\d+')]
)

# --- Trip-level runtimes -------------------------------------------------
sd_all = (
    pd.read_csv('sd_all.csv')
    .assign(service_date=lambda frame: pd.to_datetime(frame['service_date']))
)

# --- Schedules -----------------------------------------------------------
combined = pd.read_csv('combined.csv')


In [4]:
# -------------------------

# One-digit routes (such as Route 1) are recorded as '01' in the database, so we assign a
# normalized route key to avoid integer/string mismatches

# -------------------------

def _route_key(x):
    if pd.isna(x):
        return np.nan
    s = str(x).strip()

    m = re.fullmatch(r'0*(\d+)([A-Za-z]*)', s)
    if m:
        num = str(int(m.group(1)))           # drop leading zeros
        suf = m.group(2).upper()             # normalize suffix casing
        return num + suf

    return s.upper()
    
# Attach the normalized route key to each frame unless it is already present.
# `assign` returns a new frame, so the originally loaded objects are not mutated.
if 'route_key' not in sd_all.columns:
    sd_all = sd_all.assign(route_key=sd_all['route_id'].map(_route_key))

if 'route_key' not in combined.columns:
    combined = combined.assign(route_key=combined['Route'].map(_route_key))


In [5]:
# -------------------------

# Main block for the tool

# -------------------------


def compare_schedule_vs_runtime(
    sd_all: pd.DataFrame,
    combined: pd.DataFrame,
    *,
    route: str | int,
    direction: str,          # 'Outbound' / 'Inbound'
    dow: str,                # e.g., 'Weekday'
    schedule_season: str,    # e.g., '2024-4' 
    schedule_type: str = 'N',# School or No school
    schedule_variant: str | None = None,   # if None, uses dominant Variant
    runtime_seasons: tuple[str, ...] = ('2024-3',),  # 1+ seasons
    runtime_type: str = 'N',     # School or No school
    rt_stat: str = 'quantile',   # 'quantile'|'median'|'mean'
    q: float = 0.9,              # used if rt_stat == 'quantile'
    bin_minutes: int = 30,       # hour_Startpoint bin size
    use_top_pair: bool = True,   # keep only most common Start–End pair
    return_frames: bool = False  # also return merged hourly table
):
    
    # -------------------------
    # Schedule side (single season)
    # -------------------------
    rk = _route_key(route)
    sch = combined[
        (combined['route_key'] == rk)
        & (combined['Direction'] == direction)
        & (combined['rat_id'] == schedule_season)
        & (combined['type_d'] == schedule_type)
        & (combined['DOW'] == dow)
    ].copy()

    if sch.empty:
        raise ValueError("No schedule rows matched the provided filters.")

    # Pick Variant: explicit or dominant
    if 'Variant' in sch.columns:
        if schedule_variant is not None:
            sch = sch[sch['Variant'] == schedule_variant].copy()
            if sch.empty:
                raise ValueError(f"No schedule rows after filtering Variant='{schedule_variant}'.")
        else:
            # dominant (mode) Variant if available
            if sch['Variant'].notna().any():
                dom = sch['Variant'].mode(dropna=True)
                if len(dom):
                    sch = sch[sch['Variant'] == dom.iloc[0]].copy()

    # Normalize HH:MM or HH:MM:SS strings
    def _normalize_hhmm(s: pd.Series) -> pd.Series:
        s = s.astype(str).str.strip()
        s = s.str.replace(r'\.\d+$', '', regex=True)
        needs_sec = s.str.count(':') == 1
        return s.where(~needs_sec, s + ':00')

    s1 = pd.to_datetime(sch['Start'], errors='coerce')
    s2 = pd.to_datetime(_normalize_hhmm(sch['Start']), format='%H:%M:%S', errors='coerce')
    sch['Start_dt'] = s1.fillna(s2)

    # hour bins (e.g., 5.0, 5.5, ...)
    sch['hour_Startpoint'] = (
        sch['Start_dt'].dt.hour
        + (sch['Start_dt'].dt.minute // bin_minutes) * (bin_minutes / 60.0)
    )

    # Trip minutes from Start/End with midnight wrap
    start_td = pd.to_timedelta(_normalize_hhmm(sch['Start']), errors='coerce')
    end_td   = pd.to_timedelta(_normalize_hhmm(sch['End']),   errors='coerce')
    dur_td   = (end_td - start_td) % pd.Timedelta(days=1)
    sch['trip_minutes'] = dur_td / pd.Timedelta(minutes=1)

    layover_col = 'Act_Layover' if 'Act_Layover' in sch.columns else (
        'Min_Layover' if 'Min_Layover' in sch.columns else None
    )
    if layover_col is None:
        raise ValueError("Neither 'Act_Layover' nor 'Min_Layover' found in schedule DataFrame.")

    sch['schLO'] = sch['trip_minutes'] + sch[layover_col].fillna(0)
    schg = sch.groupby('hour_Startpoint', as_index=False)['schLO'].mean()

    # -------------------------
    # Runtime side (1+ seasons union)
    # -------------------------
    
    rts = sd_all[
        (sd_all['route_key'] == rk)
        & (sd_all['direction_id'] == direction)
        & (sd_all['season'].isin(runtime_seasons))
        & (sd_all['DOW'] == dow)
        & (sd_all['schedule_type'] == runtime_type)
    ].copy()

    if rts.empty:
        raise ValueError("No runtime rows matched the provided filters.")

    if use_top_pair:
        # most common Start–End timepoint pair across selected runtime seasons
        pair_counts = (rts.groupby(['timepointid_Startpoint','timepointid_Endpoint'], dropna=False)
                         .size().reset_index(name='count'))
        top_count = pair_counts['count'].max()
        top_pairs = pair_counts.loc[pair_counts['count'] == top_count,
                                    ['timepointid_Startpoint','timepointid_Endpoint']]
        # keep only those trips whose pair is one of the top ones
        rts = rts.merge(top_pairs,
                        on=['timepointid_Startpoint','timepointid_Endpoint'],
                        how='inner')

    # Group runtime to hourly bins with chosen stat
    if rt_stat == 'quantile':
        rtg = (rts.groupby('hour_Startpoint', as_index=False)['actual_run']
                  .quantile(q))
    elif rt_stat == 'median':
        rtg = rts.groupby('hour_Startpoint', as_index=False)['actual_run'].median()
    elif rt_stat == 'mean':
        rtg = rts.groupby('hour_Startpoint', as_index=False)['actual_run'].mean()
    else:
        raise ValueError("rt_stat must be one of {'quantile','median','mean'}.")

    # -------------------------
    # Merge + metrics
    # -------------------------
    comp = schg.merge(rtg, on='hour_Startpoint', how='inner')
    if comp.empty:
        raise ValueError("No overlapping hour bins between schedule and runtime.")

    comp['delta'] = comp['actual_run'] - comp['schLO']
    denom = comp['schLO'].sum()
    if denom == 0 or pd.isna(denom):
        raise ValueError("Sum of schLO is zero/NaN; cannot compute ratios.")

    deficiency_ratio = comp.loc[comp['delta'] > 0, 'delta'].sum() / denom
    surplus_ratio    = comp.loc[comp['delta'] < 0, 'delta'].sum() / denom  # negative

    out = {
        'route': str(route),
        'direction': direction,
        'dow': dow,
        'schedule_season': schedule_season,
        'schedule_type': schedule_type,
        'schedule_variant_used': (schedule_variant
                                  if schedule_variant is not None
                                  else (sch['Variant'].iloc[0] if 'Variant' in sch.columns and len(sch) else None)),
        'runtime_seasons': list(runtime_seasons),
        'runtime_type': runtime_type,
        'rt_stat': rt_stat,
        'q': q if rt_stat == 'quantile' else None,
        'bin_minutes': bin_minutes,
        'use_top_pair': use_top_pair,
        'n_runtime_rows': int(len(rts)),
        'n_overlap_hours': int(len(comp)),
        'deficiency_ratio': float(deficiency_ratio),
        'surplus_ratio': float(surplus_ratio),
    }
    return (out, comp) if return_frames else out


def batch_compare(
    sd_all: pd.DataFrame,
    combined: pd.DataFrame,
    *,
    route: str | int,
    direction: str,
    dow: str,
    schedule_season: str,
    schedule_type: str = 'N',
    schedule_variant: str | None = None,
    runtime_groups: list[tuple[str, tuple[str, ...]]],  # list of (label, seasons_tuple)
    **kwargs
) -> pd.DataFrame:
    """
    Run multiple runtime bundles against the same schedule season.
    runtime_groups example:
        [
          ("2024-3", ("2024-3",)),
          ("2024-3+2024-2+2024-1", ("2024-3","2024-2","2024-1")),
          ("2022-4+2023-4", ("2022-4","2023-4")),
        ]
    """
    rows = []
    for label, seasons in runtime_groups:
        res = compare_schedule_vs_runtime(
            sd_all, combined,
            route=route, direction=direction, dow=dow,
            schedule_season=schedule_season, schedule_type=schedule_type,
            schedule_variant=schedule_variant,
            runtime_seasons=seasons,
            **kwargs
        )
        res['runtime_label'] = label
        rows.append(res)
    return pd.DataFrame(rows)


In [20]:
# -------------------------

# Set the scenarios to compare with the given schedule

# -------------------------

# Each entry is (label shown in the result table, seasons pooled for runtimes).
runtime_groups = [
    ("2025-1", ("2025-1",)),
    ("2024-4", ("2024-4",)),
    # label corrected: the pooled seasons run from 2024-1 through 2024-4
    ("2024-1 to 2024-4", ("2024-1", "2024-2", "2024-3", "2024-4")),
    ("2023-1 and 2024-1", ("2023-1", "2024-1")),
]

In [21]:
# -------------------------

# The algorithm's schedule input is in this format

# -------------------------

# First rows of the Route 23 schedule for season 2025-1.
combined.loc[combined['Route'].eq('23') & combined['rat_id'].eq('2025-1')].head()

Unnamed: 0.1,Unnamed: 0,DOW,Route,Variant,Direction,Start,End,Min_Layover,Act_Layover,year,rating,type_d,rat_id,route_key
590374,590374,Weekday,23,_,Inbound,04:49,05:14,3,2.0,2025,1,NS,2025-1,23
590375,590375,Weekday,23,_,Inbound,05:02,05:27,3,4.0,2025,1,NS,2025-1,23
590376,590376,Weekday,23,_,Inbound,05:15,05:40,3,5.0,2025,1,NS,2025-1,23
590377,590377,Weekday,23,_,Inbound,05:28,05:53,4,11.0,2025,1,NS,2025-1,23
590378,590378,Weekday,23,_,Inbound,05:39,06:04,4,6.0,2025,1,NS,2025-1,23


In [22]:
# Route 23 outbound weekday: 2025-1 schedule vs. each runtime scenario.
table = batch_compare(
    sd_all, combined,
    # what to evaluate
    route='23',
    direction='Outbound',        # 'Outbound' / 'Inbound'
    dow='Weekday',               # 'Sunday' / 'Weekday' / 'Saturday'
    # schedule side
    schedule_season='2025-1',    # e.g., '2024-4'
    schedule_type='N',           # School or No school (NS)
    # runtime side
    runtime_groups=runtime_groups,
    rt_stat='quantile',          # 'quantile'|'median'|'mean'
    q=0.9,                       # 90th vs 95th quantile
    use_top_pair=True,           # keep only most common Start–End pair
)

In [27]:
# Headline ratios per runtime scenario.
table.loc[:, ['runtime_label', 'deficiency_ratio', 'surplus_ratio']]

Unnamed: 0,runtime_label,deficiency_ratio,surplus_ratio
0,2025-1,0.005219,-0.071819
1,2024-4,0.008231,-0.068137
2,2024-1 to 2025-4,0.001986,-0.085233
3,2023-1 and 2024-1,0.000687,-0.130683


In [24]:
# Route 34 outbound weekday: 2024-2 schedule vs. earlier runtime scenarios.
# Note: this rebinds `runtime_groups`, replacing the list defined for Route 23.
runtime_groups = [
    ("2024-2", ("2024-2",)),
    ("2024-1", ("2024-1",)),
    ("2023-2 to 2024-1", ("2023-2", "2023-3", "2023-4", "2024-1")),
    ("2023-2", ("2023-2",)),
]

table2 = batch_compare(
    sd_all,
    combined,
    route='34',
    direction='Outbound',
    dow='Weekday',
    schedule_season='2024-2',
    schedule_type='N',
    runtime_groups=runtime_groups,
    rt_stat='quantile',
    q=0.9,
    use_top_pair=True,
)

In [28]:
# Headline ratios, skipping the first scenario row (the "2024-2" runtime group).
table2.loc[:, ['runtime_label', 'deficiency_ratio', 'surplus_ratio']].iloc[1:]

Unnamed: 0,runtime_label,deficiency_ratio,surplus_ratio
1,2024-1,0.0,-0.255768
2,2023-2 to 2024-1,0.0,-0.249994
3,2023-2,0.0,-0.254986
