In [1]:

# -------------------------

# This notebook cleans and processes the AVL data (df_all) and calculates trip-level runtimes (sd_all).

# -------------------------

import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from tqdm import tqdm
from pathlib import Path
import seaborn as sns
import zipfile
import geopandas as gpd
from shapely.geometry import Point, LineString
from scipy import stats
import datetime
import matplotlib.dates as mdates
import plotly.graph_objects as go
from matplotlib.ticker import MaxNLocator
import shapely
import zipfile, io, os, re
import re
import duckdb


In [2]:


# -------------------------

# Calendar days for holiday, “school” weeks, and “no school” weeks, obtained from http://opstech.mbta.com/schrpts/Calendar/

# -------------------------


# `DataFrame.query` expressions selecting the service dates of each season,
# keyed "<year>-<n>" (the same form as the `rat_id` built from the HASTUS files).
# Both bounds are strict (> and <), so the boundary dates themselves are excluded.
# NOTE(review): consecutive windows are not contiguous — e.g. 2023-03-11 and
# 2023-03-12 belong to no season — confirm these gaps are intentional.
season_queries = {
    "2023-1": "service_date > '2022-12-18' & service_date < '2023-03-11'",
    "2023-2": "service_date > '2023-03-12' & service_date < '2023-07-02'",
    "2023-3": "service_date > '2023-07-02' & service_date < '2023-08-26'",
    "2023-4"  : "service_date > '2023-08-27' & service_date < '2023-12-16'",
    "2024-1": "service_date > '2023-12-17' & service_date < '2024-04-06'",
    "2024-2": "service_date > '2024-04-07' & service_date < '2024-06-15'",
    "2024-3": "service_date > '2024-06-16' & service_date < '2024-08-24'",
    "2024-4"  : "service_date > '2024-08-24' & service_date < '2024-12-14'",
    "2025-1": "service_date > '2024-12-14' & service_date < '2025-04-06'",
    "2025-2": "service_date > '2025-04-06' & service_date < '2025-06-14'",
}

# Per-year service calendar. Each year maps to:
#   "holidays"          – ISO date strings of holidays / special-schedule days
#   "weekday_no_school" – ISO date strings of weekdays run on the no-school schedule
# 2023 has no "weekday_no_school" key; consumers read it with `.get(..., [])`.
calendars = {
    "2023": {
        "holidays": [
            "2023-01-01",  # New Year's Day – Sunday Schedule
            "2023-01-02",  # New Year's Day (observed) – Sunday Schedule
            "2023-01-16",  # Martin Luther King, Jr. Day – Saturday Schedule
            "2023-02-20",  # President's Day – Saturday Schedule
            "2023-04-17",  # Patriot's Day – Modified Saturday/Weekday service
            "2023-05-29",  # Memorial Day – Sunday Schedule
            "2023-06-19",  # Juneteenth – Weekday No School
            "2023-07-04",  # July 4th – Sunday Schedule
            "2023-09-04",  # Labor Day – Sunday Schedule
            "2023-10-09",  # Columbus/Indigenous People's Day – Saturday Schedule
            "2023-11-10",  # Veteran's Day (Observed) – Weekday service
            "2023-11-23",  # Thanksgiving Day – Sunday Schedule
            "2023-11-24",  # Day after Thanksgiving – TODO confirm which schedule ran
            "2023-12-25",  # Christmas Day – Sunday Schedule
            "2023-12-31",  # New Year's Eve – Sunday-service with enhanced evening
        ]
    },
    "2024": {
        "holidays": [
            "2024-01-01",  # New Year's Day – Sunday Schedule
            "2024-01-15",  # Martin Luther King, Jr. Day – Saturday Schedule
            "2024-02-19",  # President's Day – Saturday Schedule
            "2024-04-15",  # Patriot's Day – Modified Saturday/Weekday service
            "2024-05-27",  # Memorial Day – Sunday Schedule
            "2024-07-04",  # July 4th – Modified Sunday Schedule
            "2024-09-02",  # Labor Day – Sunday Schedule
            "2024-10-14",  # Columbus/Indigenous People's Day – Saturday Schedule
            "2024-11-11",  # Veteran's Day – Weekday service
            "2024-11-28",  # Thanksgiving Day – Sunday Schedule
            "2024-12-25",  # Christmas Day – Sunday Schedule
            "2024-12-31",  # New Year's Eve – Weekday No School / Enhanced service
            "2024-03-17",  # St. Patrick's Day Parade – Red Line Special Schedule
        ],
        "weekday_no_school": [
            # February bus no-school week
            "2024-02-20", "2024-02-21", "2024-02-22", "2024-02-23",
            # April bus no-school week
            "2024-04-16", "2024-04-17", "2024-04-18", "2024-04-19",
            # June bus no-school day
            "2024-06-19",
            # November bus no-school day (Day after Thanksgiving)
            "2024-11-29",
            # December bus no-school days
            "2024-12-23", "2024-12-24", "2024-12-26", "2024-12-27", "2024-12-30",
        ],
    },
    "2025": {
        "holidays": [
            "2025-01-01",  # New Year's Day – Sunday Schedule
            "2025-01-20",  # Martin Luther King, Jr. Day – Saturday Schedule
            "2025-02-17",  # President's Day – Saturday Schedule
            "2025-04-21",  # Patriot's Day – Modified Saturday/Weekday service
            "2025-05-26",  # Memorial Day – Sunday Schedule
            "2025-07-04",  # July 4th – Modified Sunday Schedule
            "2025-09-01",  # Labor Day – Sunday Schedule
            "2025-10-13",  # Columbus/Indigenous People's Day – Saturday Schedule
            "2025-11-11",  # Veteran's Day – Weekday service
            "2025-11-27",  # Thanksgiving Day – Sunday Schedule
            "2025-12-25",  # Christmas Day – Sunday Schedule
            "2025-12-31",  # New Year's Eve – Weekday No School / Enhanced service
             "2025-03-16"  # St. Patrick's Day Parade – Red Line Special Schedule
        ],

        "weekday_no_school": [
            # February bus no-school week
            "2025-02-18", "2025-02-19", "2025-02-20", "2025-02-21",
            # April bus no-school period
            "2025-04-22", "2025-04-23", "2025-04-24", "2025-04-25",
            # June bus no-school day
            "2025-06-19",
            # November bus no-school day (Day after Thanksgiving)
            "2025-11-28",
            # December bus no-school days
            "2025-12-24", "2025-12-26", "2025-12-29", "2025-12-30",
        ],
    },
}
# Service-day override: ISO date -> day-of-week label ('Saturday' / 'Sunday') to use
# for that date instead of its calendar weekday (applied when `DOW` is assigned to df_all).
# NOTE(review): not every date in `calendars[...]["holidays"]` has an entry here
# (e.g. '2023-06-19', '2023-11-10', '2024-04-15', '2024-11-11'); those dates keep
# their calendar weekday label — confirm that is intended.
day_switch= { '2023-01-01':'Sunday',
  '2023-01-02':'Sunday',
  '2023-01-16':'Saturday',
  '2023-02-20':'Saturday',
  '2023-04-17':'Saturday',
  '2023-05-29':'Sunday',
  '2023-07-04':'Sunday',
  '2023-09-04':'Sunday',
  '2023-10-09':'Saturday',
  '2023-11-23':'Sunday',
  '2023-12-25':'Sunday',
  '2023-12-31':'Sunday',
  '2024-01-01':'Sunday',
  '2024-01-15':'Saturday',
  '2024-02-19':'Saturday',
  '2024-05-27':'Sunday',
  '2024-07-04':'Sunday',
  '2024-09-02':'Sunday',
  '2024-10-14':'Saturday',
  '2024-11-28':'Sunday',
  '2024-12-25':'Sunday',
  '2024-03-17':'Sunday',
  '2025-01-01':'Sunday',
  '2025-01-20':'Saturday',
  '2025-02-17':'Saturday',
  '2025-04-21':'Saturday',
  '2025-05-26':'Sunday',
  '2025-07-04':'Sunday',
  '2025-09-01':'Sunday',
  '2025-10-13':'Saturday',
  '2025-11-27':'Sunday',
  '2025-12-25':'Sunday'}

In [3]:
# 1. Extract holidays into its own dict:
# Holiday dates per calendar year, taken directly from `calendars`.
holidays = {}
for cal_year, cal_info in calendars.items():
    holidays[cal_year] = cal_info["holidays"]

# Inclusive (first day, last day) of the summer break, per year.
summer_periods = {
    "2024": ("2024-06-24", "2024-08-23"),
    "2025": ("2025-06-24", "2025-08-22"),
}

# No-school days per year: the explicitly listed dates plus every
# Monday-Friday inside that year's summer break (if one is defined).
no_school_days = {}
for cal_year, cal_info in calendars.items():
    dates = set(cal_info.get("weekday_no_school", []))

    if cal_year in summer_periods:
        first, last = (
            datetime.datetime.strptime(bound, "%Y-%m-%d")
            for bound in summer_periods[cal_year]
        )
        span_days = (last - first).days + 1   # inclusive of both ends
        every_day = (
            first + datetime.timedelta(days=offset) for offset in range(span_days)
        )
        # weekday() is 0-4 for Monday-Friday
        dates.update(
            day.strftime("%Y-%m-%d") for day in every_day if day.weekday() < 5
        )

    # ISO strings sort chronologically
    no_school_days[cal_year] = sorted(dates)


In [4]:
# -------------------------

# Bus routes per garage 

# -------------------------


# Route ids (as strings) grouped by garage.
# NOTE(review): some routes appear under more than one garage ('16' in southhampton
# and cabot; '28' in southhampton, cabot and arborway) — confirm this is intended.
# NOTE(review): `southhampton` is presumably the Southampton garage; the variable
# name is left untouched because other code may reference it.
southhampton = ['16', '28', '39', '741', '742', '743', '746', '749', '751']
somerville   = ['61', '62', '64', '67', '68', '69', '70', '71', '73', '74', '75', '76', '77', '78', '80', '83', '85', '86', '87', '88', '350', '351']
quincy       = ['201', '202', '210', '211', '215', '216', '217', '220', '222', '225', '226', '230', '236', '238', '240', '245']
lynn         = ['114', '116', '119', '120', '121', '424', '426', '428', '429', '435', '436', '439', '441', '442', '450', '451', '455', '456']
charlestown  = ['89', '90', '91', '92', '93', '94', '95', '96', '97', '99', '100', '101', '104', '105', '106', '108', '109', '110', '111', '112', '131', '132', '134', '137', '194', '354', '411', '430']
cabot        = ['1', '4', '7', '8', '9', '10', '11', '15', '16', '17', '18', '19', '22', '23', '28', '43', '44', '45', '47', '55', '57', '59', '60', '65', '66', '171', '191', '193', '501', '504', '505', '553', '554', '556', '558', '708', '747']
arborway     = ['14', '21', '24', '26', '28', '29', '30', '31', '32', '33', '34', '34E', '35', '36', '37', '38', '40', '41', '42', '50', '51', '52', '192']


In [5]:

# -------------------------

# Read all the HASTUS outputs (Trips-<rating>-<year>[-NS].xlsx) for every schedule in the archive (2023-1 through 2025-4)

# -------------------------

ZIP_PATH  = "hastus2.zip"

# Columns kept from every "Trips" sheet (after renaming the two layover columns).
KEEP_COLS = [
    "DOW", "Route", "Variant", "Direction",
    "Start", "End", "Min_Layover", "Act_Layover",
]

# Strict file-name patterns: "Trips-<rating>-<year>.xlsx" is the regular schedule,
# "Trips-<rating>-<year>-NS.xlsx" the no-school schedule. Anything else is ignored.
clean_re = re.compile(r"^Trips-(\d)-(\d{4})\.xlsx$", re.I)
clean_re2 = re.compile(r"^Trips-(\d)-(\d{4})-NS\.xlsx$", re.I)   # '.' escaped so only a literal dot matches

dfs = []

with zipfile.ZipFile(ZIP_PATH) as z:
    for member in z.namelist():
        base = os.path.basename(member)
        m = clean_re.match(base)              # regular schedule
        m2 = clean_re2.match(base)            # no-school schedule
        if not m and not m2:
            continue                          # ignore "summary", etc.

        print(base)

        # Decide the schedule type from which pattern matched. The flag is set
        # on every iteration, so it can never be undefined or stale (the previous
        # try/except version raised NameError when the first file was a regular one).
        is_no_school = m is None
        rating, year = map(int, (m or m2).groups())   # rating, year from filename

        with z.open(member) as f:
            df = pd.read_excel(
                io.BytesIO(f.read()),
                sheet_name="Trips",
                engine="openpyxl"             # all matched files are .xlsx
            )

        df = df.rename(
            columns={
                "Min Layover": "Min_Layover",
                "Act Layover": "Act_Layover",
            }
        )

        # Guarantee a uniform schema: missing columns are added as all-NA.
        for col in KEEP_COLS:
            if col not in df.columns:
                df[col] = pd.NA
        df = df[KEEP_COLS].copy()

        df["year"]   = year
        df["rating"] = rating
        df["type_d"] = 'NS' if is_no_school else 'N'

        dfs.append(df)

if not dfs:
    # Same exception type pd.concat would raise, with an actionable message.
    raise ValueError(f"No 'Trips-<rating>-<year>[-NS].xlsx' members found in {ZIP_PATH}")

combined = pd.concat(dfs, ignore_index=True)
# Rating id "<year>-<rating>", e.g. "2024-4"; same form as the season labels.
combined['rat_id'] = combined['year'].astype(str) + '-' + combined['rating'].astype(str)


Trips-1-2024-NS.xlsx
Trips-1-2024.xlsx
Trips-4-2025.xlsx
Trips-4-2025-NS.xlsx
Trips-3-2025.xlsx
Trips-3-2025-NS.xlsx
Trips-1-2023.xlsx
Trips-2-2025.xlsx
Trips-2-2025-NS.xlsx
Trips-3-2024.xlsx
Trips-4-2024.xlsx
Trips-4-2024-NS.xlsx
Trips-2-2024.xlsx
Trips-2-2024-NS.xlsx
Trips-2-2023.xlsx
Trips-3-2023.xlsx
Trips-4-2023.xlsx
Trips-1-2025-NS.xlsx
Trips-1-2025.xlsx


In [6]:
# 'H:MM' / 'HH:MM' with optional ':SS' and surrounding whitespace.
time_pat = re.compile(r"^\s*(\d{1,2}):(\d{2})(?::\d{2})?\s*$")

def to_hhmm(val):
    """Convert a Start/End cell to a zero-padded 'HH:MM' string.

    Handles, in this order: missing values, numbers (interpreted as a
    fraction of a day), ``time`` / ``datetime`` objects, and clock strings
    matching ``time_pat``. Anything else is handed to ``pd.to_datetime``;
    if that cannot parse it either, ``pd.NA`` is returned.
    """
    if pd.isna(val):
        return pd.NA

    # Numeric cell: fraction of a day -> wall-clock time.
    if isinstance(val, (int, float, np.number)):
        elapsed = datetime.timedelta(seconds=val * 24 * 60 * 60)
        clock = (datetime.datetime.min + elapsed).time()
        return f"{clock.hour:02d}:{clock.minute:02d}"

    # Already a time-like object.
    if isinstance(val, (dt.time, dt.datetime)):
        return val.strftime("%H:%M")

    # Clock string such as '05:06:00' or '5:06'; the hour is not range-checked.
    if isinstance(val, str):
        hit = time_pat.match(val)
        if hit:
            hours, minutes = int(hit.group(1)), int(hit.group(2))
            return f"{hours:02d}:{minutes:02d}"

    # Last resort: let pandas try to parse it.
    parsed = pd.to_datetime(val, errors="coerce")
    return pd.NA if pd.isna(parsed) else parsed.strftime("%H:%M")

# Bring both time columns to a uniform 'HH:MM' text form (pd.NA when unparseable).
combined["Start"] = combined["Start"].apply(to_hhmm)
combined["End"]   = combined["End"].apply(to_hhmm)
# Remove stray whitespace around the day-of-week labels.
combined['DOW'] = combined['DOW'].str.strip()

In [7]:
# ---------------------------------------------------------

# Save the scheduled times

# ---------------------------------------------------------
# Persist the scheduled trips. With default arguments `to_csv` also writes the
# row index as the first column.
combined.to_csv('combined.csv')

# Preview the first rows.
combined.head()

Unnamed: 0,DOW,Route,Variant,Direction,Start,End,Min_Layover,Act_Layover,year,rating,type_d,rat_id
0,Weekday,1,_,Inbound,05:06,05:30,2,7.0,2024,1,NS,2024-1
1,Weekday,1,_,Inbound,05:21,05:45,2,4.0,2024,1,NS,2024-1
2,Weekday,1,_,Inbound,05:36,06:00,3,7.0,2024,1,NS,2024-1
3,Weekday,1,_,Inbound,05:51,06:20,3,5.0,2024,1,NS,2024-1
4,Weekday,1,_,Inbound,06:06,06:35,3,8.0,2024,1,NS,2024-1


In [8]:

# -------------------------

# Read all the available AVL data from https://mbta-massdot.opendata.arcgis.com/search?tags=bus

# -------------------------

def load_months(year: int, months: list[int], data_dir=".") -> pd.DataFrame:
    """Read the monthly AVL CSV files of one year and stack them.

    Parameters
    ----------
    year : int
        Four-digit year used in the file names.
    months : list[int]
        Month numbers (1-12) to load, in the order they should be stacked.
    data_dir : str or Path, default "."
        Directory holding the ``MBTA-Bus-Arrival-Departure-Times_<year>-<mm>.csv``
        files. The default (current working directory) matches the previous
        hard-coded behaviour.

    Returns
    -------
    pd.DataFrame
        All requested months concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``months`` is empty.
    FileNotFoundError
        If one of the monthly files does not exist (raised by ``pd.read_csv``).
    """
    months = list(months)
    if not months:
        raise ValueError("months must contain at least one month number")

    base = Path(data_dir)
    frames = [
        pd.read_csv(base / f"MBTA-Bus-Arrival-Departure-Times_{year}-{month:02d}.csv")
        for month in months
    ]
    return pd.concat(frames, ignore_index=True)

# Load every available month: all of 2023 and 2024, January-June of 2025.
df_23 = load_months(2023, list(range(1, 13)))
df_24 = load_months(2024, list(range(1, 13)))
df_25 = load_months(2025, list(range(1, 7)))


# The timestamp convention changes during 2024: rows with service_date on or before
# 2024-05-31 are treated as already being in Boston local time, later rows as UTC
# (see the `correction` flag passed to process_arrivals).
# service_date is compared as text here, which presumably relies on ISO 'YYYY-MM-DD'
# strings in the CSV files — TODO confirm.
# NOTE: the concat keeps each yearly frame's own index, so index labels repeat.

df_all = pd.concat([df_23,df_24,df_25])
no_correction = df_all[df_all['service_date']<='2024-05-31']
yes_correction = df_all[df_all['service_date']>'2024-05-31']


In [29]:
def process_arrivals(df: pd.DataFrame, correction = True) -> pd.DataFrame:
    """Attach full local timestamps and calendar fields to raw arrival rows.

    The `scheduled` / `actual` columns are parsed as UTC timestamps; only their
    day-of-month offset and clock time are kept and re-anchored on `service_date`.
    When `correction == True` a fixed 5 hours is subtracted from both results.
    NOTE(review): the shift is a constant 5 h, i.e. no daylight-saving handling.

    Parameters
    ----------
    df : DataFrame with `scheduled`, `actual` and `service_date` columns.
    correction : when equal to True, shift both timestamps back by 5 hours.

    Returns
    -------
    A copy of `df`, sorted by `scheduled_boston`, with the parsed input columns and
    the added columns `scheduled_boston`, `actual_boston`, `hour` (half-hour bin of
    the scheduled time), `day`, `month`, `year`. The input frame is not modified.
    """
    frame = df.copy()

    for stamp_col in ('scheduled', 'actual'):
        frame[stamp_col] = pd.to_datetime(frame[stamp_col], utc=True)
    frame['service_date'] = pd.to_datetime(frame['service_date'])

    def elapsed_since_anchor(col):
        # Seconds since midnight of the first day of the month of the parsed
        # timestamp; (day - 1) keeps times that roll past midnight on the next day.
        parts = frame[col].dt
        secs = (
            parts.second
            + parts.minute * 60
            + parts.hour * 3600
            + (parts.day - 1) * 86400
        )
        return pd.to_timedelta(secs, unit='s')

    shift = pd.Timedelta(hours=5)
    for source, target in (('scheduled', 'scheduled_boston'),
                           ('actual', 'actual_boston')):
        stamped = frame['service_date'] + elapsed_since_anchor(source)
        if correction==True:
            stamped = stamped - shift
        frame[target] = stamped

    frame = frame.sort_values('scheduled_boston')

    sched = frame['scheduled_boston'].dt
    frame['hour'] = sched.hour + (sched.minute // 30) * 0.5   # e.g. 08:45 -> 8.5
    frame['day'] = sched.day
    frame['month'] = sched.month
    frame['year'] = sched.year

    return frame

# Rows up to 2024-05-31 are used as-is; later rows get the 5-hour shift.
no_correction  = process_arrivals(no_correction,False)
yes_correction  = process_arrivals(yes_correction,True)
df_all = pd.concat([no_correction,yes_correction])

# Number of distinct half-trips before any cleaning; denominator of the
# retention ratios printed after each cleaning step.
original_l1 = len(df_all.half_trip_id.unique())

In [30]:
# Scheduled clock time as 'HH:MM' text (later matched against the HASTUS Start times).
df_all['scheduledhm'] = df_all['scheduled_boston'].dt.strftime('%H:%M')

df_all["service_date"] = pd.to_datetime(df_all["service_date"])

# pandas dayofweek: Monday=0 ... Sunday=6
dow_map = {0: "Weekday", 1: "Weekday", 2: "Weekday", 3: "Weekday", 4: "Weekday",
           5: "Saturday", 6: "Sunday"}

# Calendar-based label first ...
df_all["DOW"] = df_all["service_date"].dt.dayofweek.map(dow_map)

# ... then dates listed in `day_switch` take the schedule that actually ran;
# every other date keeps its calendar label.
service_day_text = df_all['service_date'].dt.strftime('%Y-%m-%d')
df_all['DOW'] = service_day_text.map(day_switch).fillna(df_all['DOW'])

In [11]:
# ---------------------------------------------------------

# DATA CLEANING 1.

# We drop rows (any Startpoint, Midpoint or Endpoint) that didn't record a timestamp (actual_boston)
# If a unique trip recorded multiple timestamps in a specific stop we keep only the first record

# ---------------------------------------------------------

# duckdb reads the pandas frame `df_all` directly by its variable name.
# The WHERE clause drops rows without an actual timestamp; QUALIFY then keeps,
# per (service_date, route, direction, half trip, stop), only the row with the
# earliest actual_boston.
df_all = duckdb.sql("""
    SELECT *
    FROM df_all
    WHERE actual_boston IS NOT NULL
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY service_date, route_id, direction_id, half_trip_id, stop_id
        ORDER BY actual_boston
    ) = 1
""").df()
# Share of the original half-trips that still have at least one row.
print(df_all['half_trip_id'].nunique() / original_l1)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

0.9703087619757549


In [12]:
# ---------------------------------------------------------

# DATA CLEANING 2.

# This code filters unique trips to keep only those with exactly one Startpoint and one Endpoint

# ---------------------------------------------------------


def multiple_pt_remove_duck(df):
    """Keep only half-trips that have exactly one Startpoint and one Endpoint row.

    Rows whose `actual_boston` is NULL are discarded first; the start/end counts
    are taken per `half_trip_id` on the remaining rows, and all remaining rows of
    the qualifying half-trips are returned.

    Parameters
    ----------
    df : pandas.DataFrame with `half_trip_id`, `point_type`, `actual_boston` columns.

    Returns
    -------
    pandas.DataFrame with the surviving rows (all input columns).
    """
    con = duckdb.connect()
    try:
        # Expose the frame to SQL under the table name "df".
        con.register("df", df)
        return con.sql("""
        WITH base AS (
            SELECT *
            FROM df
            WHERE actual_boston IS NOT NULL
        ),
        trip_counts AS (
            SELECT
                half_trip_id,
                COUNT(*) FILTER (WHERE point_type = 'Startpoint') AS n_start,
                COUNT(*) FILTER (WHERE point_type = 'Endpoint')  AS n_end
            FROM base
            GROUP BY half_trip_id
        ),
        good_trips AS (
            SELECT half_trip_id
            FROM trip_counts
            WHERE n_start = 1 AND n_end = 1
        )
        SELECT b.*
        FROM base b
        SEMI JOIN good_trips g USING (half_trip_id)
    """).df()
    finally:
        # Release the connection even if the query fails; the result has
        # already been materialised as a pandas frame by .df().
        con.close()

# Apply the one-start / one-end filter and report the share of half-trips retained.
df_all = multiple_pt_remove_duck(df_all)
print(df_all['half_trip_id'].nunique() / original_l1)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

0.8241541691581242


In [13]:
# ---------------------------------------------------------

# DATA CLEANING 3.

# Keep only half_trip_ids that occur on exactly one date

# ---------------------------------------------------------

# Rename once here; downstream code uses `timepointid`.
df_all = df_all.rename(columns={'time_point_id':'timepointid'})
# Plain date (no time component) used to count distinct service dates per trip id.
df_all['service_date2'] = pd.to_datetime(df_all['service_date']).dt.date

# Drop every half_trip_id that shows up on more than one service date.
df_all = df_all[
    df_all.groupby('half_trip_id')['service_date2'].transform('nunique') == 1
].copy()

# Share of the original half-trips retained so far.
print(df_all['half_trip_id'].nunique() / original_l1)


0.8238366926672047


In [14]:
# ---------------------------------------------------------

# Trip IDs whose Startpoint is at most 15 minutes late and less than 10 minutes early

# ---------------------------------------------------------

# Signed schedule deviation in minutes: positive = late, negative = early.
df_all['delay_min'] = (df_all['actual_boston'] - df_all['scheduled_boston'])\
                        .dt.total_seconds() / 60

# Only the Startpoint deviation decides whether a trip is kept.
starts = df_all[(df_all['point_type']=='Startpoint')]
# half_trip_ids with -10 < delay <= 15 minutes at the Startpoint. This only builds
# the id list; df_all itself is not filtered here.
id_keeps = starts[(starts['delay_min']<=15)&(starts['delay_min']>-10)]['half_trip_id']

In [21]:
# ---------------------------------------------------------

# Save the cleaned runtimes

# ---------------------------------------------------------
# Persist the cleaned stop-level AVL records (despite the header above, this is
# df_all, not the trip-level runtimes).
df_all.to_csv('df_all.csv')

# Preview the first rows.
df_all.head()

Unnamed: 0,service_date,route_id,direction_id,half_trip_id,stop_id,timepointid,time_point_order,point_type,standard_type,scheduled,...,headway,scheduled_boston,actual_boston,hour,day,month,year,scheduledhm,DOW,service_date2
0,2024-03-18,222,Outbound,62627698.0,13844,plwas,6,Endpoint,Schedule,1900-01-01 13:41:00-04:56,...,,2024-03-18 18:37:00,2024-03-18 18:43:14,18.5,18,3,2024,18:37,Weekday,2024-03-18
1,2024-03-18,222,Outbound,62627766.0,3539,bknsq,4,Midpoint,Schedule,1900-01-01 11:16:00-04:56,...,,2024-03-18 16:12:00,2024-03-18 16:12:55,16.0,18,3,2024,16:12,Weekday,2024-03-18
2,2024-03-18,222,Outbound,62627782.0,32004,qnctr,1,Startpoint,Schedule,1900-01-01 06:59:00-04:56,...,,2024-03-18 11:55:00,2024-03-18 11:57:59,11.5,18,3,2024,11:55,Weekday,2024-03-18
3,2024-03-18,222,Outbound,62627796.0,3525,wassa,3,Midpoint,Schedule,1900-01-01 10:36:00-04:56,...,,2024-03-18 15:32:00,2024-03-18 15:38:55,15.5,18,3,2024,15:32,Weekday,2024-03-18
4,2024-03-18,225,Inbound,62627506.0,32001,qnctr,7,Endpoint,Schedule,1900-01-01 15:04:00-04:56,...,,2024-03-18 20:00:00,2024-03-18 19:56:47,20.0,18,3,2024,20:00,Weekday,2024-03-18


In [15]:

# ---------------------------------------------------------

# Calculate trip-level runtimes for schedule deficiency calculations

# ---------------------------------------------------------


def sched_def(df):
    """Compute scheduled and actual run time (minutes) for every half-trip.

    Only Startpoint and Endpoint rows are used. They are pivoted to one row per
    `half_trip_id`, and each run time is the Endpoint timestamp minus the
    Startpoint timestamp.

    Parameters
    ----------
    df : DataFrame with `half_trip_id`, `point_type`, `scheduled_boston`,
         `actual_boston`, `hour`, `scheduledhm`, `timepointid`. Each half-trip
         must have at most one row per point type (pivot raises otherwise).

    Returns
    -------
    DataFrame with columns half_trip_id, scheduled_run, actual_run,
    hour_Startpoint, hour_Endpoint, scheduledhm_Startpoint,
    timepointid_Startpoint, timepointid_Endpoint.
    """
    terminals = df.loc[df['point_type'].isin(['Startpoint','Endpoint'])]

    value_cols = ['scheduled_boston','actual_boston','hour','scheduledhm','timepointid']
    wide = terminals.pivot(index='half_trip_id',
                           columns='point_type',
                           values=value_cols)

    # Flatten ('scheduled_boston', 'Endpoint') -> 'scheduled_Endpoint', etc.
    wide.columns = [
        f"{field.split('_')[0]}_{point}" for field, point in wide.columns
    ]

    # Endpoint minus Startpoint, expressed in minutes.
    for kind in ('scheduled', 'actual'):
        span = wide[f'{kind}_Endpoint'] - wide[f'{kind}_Startpoint']
        wide[f'{kind}_run'] = span.dt.total_seconds() / 60

    keep = ['scheduled_run','actual_run','hour_Startpoint','hour_Endpoint',
            'scheduledhm_Startpoint','timepointid_Startpoint','timepointid_Endpoint']
    return wide[keep].reset_index()


def build_season_df(df_full, query, season_label):
    """Trip-level run times for one season, enriched with trip attributes.

    Parameters
    ----------
    df_full : stop-level AVL frame.
    query : `DataFrame.query` expression selecting the season's rows.
    season_label : value written to the `season` column of the result.

    Returns
    -------
    The `sched_def` output for the selected rows, left-joined on `half_trip_id`
    with the first-seen route / direction / date attributes of each trip, plus
    a constant `season` column.
    """
    season_rows = df_full.query(query).copy()
    trip_runs = sched_def(season_rows)

    # Attributes carried over to the trip level; any that the frame lacks
    # (e.g. weekday_num) are skipped silently.
    wanted = (
        "half_trip_id",      # merge key
        'route_id',
        "direction_id",
        "service_date",
        "month", "day",
        "weekday_num", 'DOW',
    )
    present = [name for name in wanted if name in season_rows.columns]

    # One attribute row per trip: the first occurrence wins.
    trip_attrs = season_rows[present].drop_duplicates("half_trip_id")

    trip_runs = trip_runs.merge(trip_attrs, on="half_trip_id", how="left")
    trip_runs["season"] = season_label
    return trip_runs


In [16]:

# One trip-level runtime frame per season window defined in `season_queries`.
all_frames = [
    build_season_df(df_all, q, season)  
    for season, q in season_queries.items()
]

# Stack the seasons; if a half_trip_id appears in more than one frame, the first
# occurrence (earliest season in dict order) is kept.
sd_all = (
    pd.concat(all_frames, ignore_index=True)
      .drop_duplicates("half_trip_id") 
      .sort_values(["season", "service_date", "half_trip_id"])
)

In [17]:
# ---------------------------------------------------------

# Assign holiday, school and no school days to trip level runtimes.

# ---------------------------------------------------------


# Keep purely numeric route ids only; the `astype(int)` further down requires it
# (this drops ids such as '34E').
sd_all = sd_all[sd_all['route_id'].astype(str).str.strip().str.fullmatch(r'\d+')]

# Flatten the per-year lists into lookup sets of ISO date strings.
holiday_set   = {d for dates in holidays.values()     for d in dates}
no_school_set = {d for dates in no_school_days.values() for d in dates}

sd_all['date_str'] = sd_all['service_date'].dt.strftime('%Y-%m-%d')

# 'NS' = no-school schedule day, 'N' = regular schedule day.
sd_all['schedule_type'] = (
    sd_all['date_str']
      .map(lambda d: 'NS'  if d in no_school_set
                    else 'N')
)
# 'H' = holiday, 'NH' = not a holiday.
sd_all['holiday'] = (
    sd_all['date_str']
      .map(lambda d: 'H'  if d in holiday_set
                    else 'NH')
)
# Season 2024-3 is forced to 'N' — presumably because that rating has no separate
# "-NS" HASTUS file (only Trips-3-2024.xlsx is listed) — TODO confirm.
sd_all.loc[sd_all['season'] == '2024-3', 'schedule_type'] = 'N'

# Composite text key: start time + direction + season + DOW + schedule type + route.
# It mirrors the `time_dir` key built on the HASTUS side for the merge.
sd_all["time_dir"] = (
    sd_all["scheduledhm_Startpoint"].astype(str)
    + sd_all["direction_id"].astype(str)+ sd_all["season"].astype(str)+ sd_all["DOW"].astype(str)+ sd_all["schedule_type"].astype(str)+ sd_all["route_id"].astype(int).astype(str)
)

In [18]:
# ---------------------------------------------------------

# Save the calculated run times

# ---------------------------------------------------------
# NOTE(review): the export is disabled, and its target name 'df_all.csv' is the same
# file the cleaned AVL records were written to earlier — use a distinct file name
# before re-enabling it.
#sd_all.to_csv('df_all.csv')

# Preview the first rows.
sd_all.head()

Unnamed: 0,half_trip_id,scheduled_run,actual_run,hour_Startpoint,hour_Endpoint,scheduledhm_Startpoint,timepointid_Startpoint,timepointid_Endpoint,route_id,direction_id,service_date,month,day,DOW,season,date_str,schedule_type,holiday,time_dir
79,58061087.0,16.0,8.866667,8.5,9.0,08:55,ashmt,fhill,21,Inbound,2023-01-01,1,1,Sunday,2023-1,2023-01-01,N,H,08:55Inbound2023-1SundayN21
80,58061088.0,12.0,10.416667,9.0,9.5,09:25,fhill,nubn,42,Inbound,2023-01-01,1,1,Sunday,2023-1,2023-01-01,N,H,09:25Inbound2023-1SundayN42
81,58061089.0,11.0,10.316667,9.5,9.5,09:45,nubn,fhill,42,Outbound,2023-01-01,1,1,Sunday,2023-1,2023-01-01,N,H,09:45Outbound2023-1SundayN42
82,58061090.0,18.0,11.866667,10.0,10.0,10:05,fhill,matpn,30,Outbound,2023-01-01,1,1,Sunday,2023-1,2023-01-01,N,H,10:05Outbound2023-1SundayN30
83,58061091.0,19.0,13.633333,10.5,10.5,10:30,matpn,fhill,30,Inbound,2023-01-01,1,1,Sunday,2023-1,2023-01-01,N,H,10:30Inbound2023-1SundayN30


In [31]:

# ---------------------------------------------------------

# Route 22 cleaning example

# ---------------------------------------------------------


# Route 22 rows of the (already cleaned) df_all, restricted to season 2024-4.
rt_22 = df_all[df_all['route_id']=='22']
rt_22 = rt_22.query(season_queries['2024-4'])

# NOTE(review): this overwrites the global `original_l1` that the df_all cleaning
# cells use as their denominator; re-running those cells after this one would
# report ratios against the Route 22 count.
original_l1 = len(rt_22.half_trip_id.unique())

In [32]:
# Snapshot of the Route 22 rows before the route-specific cleaning steps below.
rt_22.to_csv('rt_22_raw.csv')

In [33]:
# Number of distinct Route 22 half-trips before cleaning (denominator of the
# retention ratios printed below).
original_l1

20085

In [34]:
# ---------------------------------------------------------

# DATA CLEANING 1.

# ---------------------------------------------------------

# Same rule as DATA CLEANING 1 on df_all: drop rows without an actual timestamp and
# keep, per (service_date, route, direction, half trip, stop), the earliest record.
rt_22 = duckdb.sql("""
    SELECT *
    FROM rt_22
    WHERE actual_boston IS NOT NULL
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY service_date, route_id, direction_id, half_trip_id, stop_id
        ORDER BY actual_boston
    ) = 1
""").df()
# Share of the Route 22 half-trips retained.
print(rt_22['half_trip_id'].nunique() / original_l1)


0.9743091859596714


In [35]:
# ---------------------------------------------------------

# DATA CLEANING 2.

# ---------------------------------------------------------

# NOTE(review): this re-defines the function already defined in the DATA CLEANING 2
# cell for df_all; keep the two definitions in sync (or delete this copy).
def multiple_pt_remove_duck(df):
    """Keep only half-trips that have exactly one Startpoint and one Endpoint row.

    Rows whose `actual_boston` is NULL are discarded first; the start/end counts
    are taken per `half_trip_id` on the remaining rows, and all remaining rows of
    the qualifying half-trips are returned.

    Parameters
    ----------
    df : pandas.DataFrame with `half_trip_id`, `point_type`, `actual_boston` columns.

    Returns
    -------
    pandas.DataFrame with the surviving rows (all input columns).
    """
    con = duckdb.connect()
    try:
        # Expose the frame to SQL under the table name "df".
        con.register("df", df)
        return con.sql("""
        WITH base AS (
            SELECT *
            FROM df
            WHERE actual_boston IS NOT NULL
        ),
        trip_counts AS (
            SELECT
                half_trip_id,
                COUNT(*) FILTER (WHERE point_type = 'Startpoint') AS n_start,
                COUNT(*) FILTER (WHERE point_type = 'Endpoint')  AS n_end
            FROM base
            GROUP BY half_trip_id
        ),
        good_trips AS (
            SELECT half_trip_id
            FROM trip_counts
            WHERE n_start = 1 AND n_end = 1
        )
        SELECT b.*
        FROM base b
        SEMI JOIN good_trips g USING (half_trip_id)
    """).df()
    finally:
        # Release the connection even if the query fails; the result has
        # already been materialised as a pandas frame by .df().
        con.close()

# Apply the one-start / one-end filter and report the share of half-trips retained.
rt_22 = multiple_pt_remove_duck(rt_22)
print(rt_22['half_trip_id'].nunique() / original_l1)


0.9003734129947722


In [36]:
# ---------------------------------------------------------

# DATA CLEANING 3.

# ---------------------------------------------------------

# rt_22 was sliced from df_all after that frame's column had already been renamed,
# so this rename has nothing left to change when the cells run top to bottom.
rt_22 = rt_22.rename(columns={'time_point_id':'timepointid'})
rt_22['service_date2'] = pd.to_datetime(rt_22['service_date']).dt.date

# Drop every half_trip_id that shows up on more than one service date.
rt_22 = rt_22[
    rt_22.groupby('half_trip_id')['service_date2'].transform('nunique') == 1
].copy()

print(rt_22['half_trip_id'].nunique() / original_l1)


0.9003734129947722


In [37]:

# Expected timepoint sequence of a complete Inbound Route 22 trip.
target_seq = ['ashmt','codmn','bltal','frnpk', 'svrhm','egles','jasst','roxbs','rugg']

# Inbound: timepoints of each half-trip in travel order.
inbound_rows = rt_22[rt_22['direction_id']=='Inbound']
inbound_rows = inbound_rows.sort_values(['half_trip_id', 'time_point_order'])
seqs = inbound_rows.groupby('half_trip_id')['timepointid'].apply(list)

bad_trips_in = [trip for trip, stops in seqs.items() if stops != target_seq]


# Outbound trips must visit the same timepoints in the opposite order.
target_seq.reverse()

outbound_rows = rt_22[rt_22['direction_id']=='Outbound']
outbound_rows = outbound_rows.sort_values(['half_trip_id', 'time_point_order'])
seqs = outbound_rows.groupby('half_trip_id')['timepointid'].apply(list)

bad_trips_out = [trip for trip, stops in seqs.items() if stops != target_seq]

# Remove every half-trip whose sequence deviates, in either direction.
rt_22 = rt_22[~rt_22['half_trip_id'].isin(bad_trips_in)]
rt_22 = rt_22[~rt_22['half_trip_id'].isin(bad_trips_out)]
print(rt_22['half_trip_id'].nunique() / original_l1)


0.845108289768484


In [38]:
def process_apc(apc_data):
    """Clean a raw APC ridecheck export.

    Steps: assign canonical column names (by position), parse the date/time
    columns, drop rows carrying the sentinel values 99 / 99.9 in the deviation
    and run-time columns, add the `id`, `hour` and `schd_Stop` helper columns,
    and drop every record whose (scheduled time, stop) key occurs more than once.

    Parameters
    ----------
    apc_data : DataFrame with exactly 25 columns in the order of `new_cols`.
        The caller's frame is not modified.

    Returns
    -------
    DataFrame with the cleaned records.

    Raises
    ------
    ValueError
        If the input does not have 25 columns (raised by the column assignment).
    """
    new_cols = ["TRIP_DATE","BUS","ROUTE","VARIATION",'DAY_OF_WK',"BLOCK","DIRECTION","TRIP","STOP_ID","STOP_SEQ_ID","STOP_NAME",
    "ACT_STOP_TIME","ACT_DEP_TIME","ACT_MOVE_TIME","PSGR_ON","PSGR_OFF","PSGR_LOAD","LATITUDE","LONGITUDE","SCH_TIME",
    "DEVIATION_MINS","SCH_RUN_MINS","ACT_RUN_MINS","DOOR_CYCLES","GPS_ERROR_FT"]

    # Work on a copy so the caller's frame is neither renamed nor extended in place.
    apc_data = apc_data.copy()
    apc_data.columns = new_cols

    # Key built from the raw (unparsed) scheduled time text plus the stop id.
    apc_data['schd_Stop'] = apc_data['SCH_TIME'].astype(str) + apc_data['STOP_ID'].astype(str)

    for col in ('TRIP_DATE', 'SCH_TIME', 'ACT_STOP_TIME', 'ACT_DEP_TIME', 'ACT_MOVE_TIME'):
        apc_data[col] = pd.to_datetime(apc_data[col])

    # 99 / 99.9 are treated as sentinel values and removed.
    valid = (
        (apc_data['DEVIATION_MINS'] != 99)
        & (apc_data['SCH_RUN_MINS'] != 99.9)
        & (apc_data['ACT_RUN_MINS'] != 99.9)
    )
    apc_data = apc_data[valid].copy()

    apc_data['id'] = (
        apc_data['TRIP_DATE'].astype(str) + apc_data['BUS'].astype(str)
        + apc_data['BLOCK'].astype(str) + apc_data['TRIP'].astype(str)
    )

    # Half-hour bin of the scheduled time, e.g. 08:45 -> 8.5
    apc_data['hour'] = (
        apc_data['SCH_TIME'].dt.hour
      + (apc_data['SCH_TIME'].dt.minute // 30) * 0.5)

    # A (scheduled time, stop) key recorded more than once is ambiguous: drop ALL
    # of its records. The previous `count == 2` test let keys seen three or more
    # times through.
    repeated = apc_data['schd_Stop'].duplicated(keep=False)
    apc_data = apc_data[~repeated]

    return apc_data


# Route 22 APC ridecheck export covering fall 2024 / winter 2025 (per the file name).
apc_data22 = pd.read_csv('Fall 24 Winter 25 Route 22 Ridecheck.csv')
apc_data = process_apc(apc_data22)

# Restrict to 2024-08-24 .. 2024-12-14, both ends inclusive.
# NOTE(review): the '2024-4' entry of season_queries uses strict bounds on the same
# two dates, so the two windows differ at the boundary days.
fall24_apc = apc_data[(apc_data['TRIP_DATE']>='2024-08-24')&(apc_data['TRIP_DATE']<='2024-12-14')]

In [39]:
# ---------------------------------------------------------

# Keep the AVL records whose timestamp agrees with the APC departure time within a 5-minute threshold

# ---------------------------------------------------------

# Recode the APC direction (1 / 2) to the AVL labels.
fall24_apc['DIRECTION'] = fall24_apc['DIRECTION'].replace(1, 'Inbound')
fall24_apc['DIRECTION'] = fall24_apc['DIRECTION'].replace(2, 'Outbound')
fall24_apc = fall24_apc[['SCH_TIME','ACT_DEP_TIME','DIRECTION','STOP_ID']]

rt_22_match = rt_22[['half_trip_id','scheduled_boston','actual_boston','direction_id','stop_id']]

# Join key on both sides: scheduled timestamp text + stop id + direction.
# This presumably relies on both timestamps rendering to the same text format — TODO confirm.
fall24_apc['id'] = fall24_apc['SCH_TIME'].astype(str)+fall24_apc['STOP_ID'].astype(str)+fall24_apc['DIRECTION'].astype(str)
rt_22_match['id'] = rt_22_match['scheduled_boston'].astype(str)+rt_22_match['stop_id'].astype(str)+rt_22_match['direction_id'].astype(str)

# Inner join: only stop events present in both sources survive.
fall24_apc = fall24_apc.merge(rt_22_match,on='id')
# AVL time minus APC departure time, in minutes.
fall24_apc['diff'] = (fall24_apc['actual_boston'] - fall24_apc['ACT_DEP_TIME']) \
             .dt.total_seconds() / 60

# Agreement threshold: strictly within +/- 5 minutes.
fall24_apc = fall24_apc[(fall24_apc['diff']<5)&(fall24_apc['diff']>-5)]

rt_22['key'] = (
    rt_22['half_trip_id'].astype(str) 
    + rt_22['stop_id'].astype(str) 
    + rt_22['direction_id'].astype(str)
)

fall24_apc['key'] = (
    fall24_apc['half_trip_id'].astype(str) 
    + fall24_apc['stop_id'].astype(str) 
    + fall24_apc['direction_id'].astype(str)
)

# NOTE(review): the key includes stop_id, so this filters individual stop records,
# not whole trips — a trip can lose its Startpoint or Endpoint row here while its
# other rows remain.
rt_22= rt_22[rt_22['key'].isin(fall24_apc['key'])]

In [339]:
# ---------------------------------------------------------

# Alternative filtering for maximum 15 minutes late or 10 minutes early

# ---------------------------------------------------------

# Keep only trips whose Startpoint passed the delay filter.
# NOTE(review): relies on `id_keeps` from the earlier delay-filter cell, and this
# cell's execution count is out of sequence with its neighbours — verify it is
# meant to run as part of Restart & Run All.
rt_22 = rt_22[rt_22['half_trip_id'].isin(id_keeps)]

In [40]:
# Share of the original Route 22 half-trips left after all filters.
print(rt_22['half_trip_id'].nunique() / original_l1)


0.8195170525267612


In [41]:
# Persist the fully cleaned Route 22 stop-level records.
rt_22.to_csv('rt_22_cleaned.csv')

In [42]:
# Trip-level runtimes for Route 22, season 2024-4 only.
all_frames_22 = [
    build_season_df(rt_22, season_queries['2024-4'] , '2024-4')  
]

sd_all_22 = (
    pd.concat(all_frames_22, ignore_index=True)
      .drop_duplicates("half_trip_id") 
      .sort_values(["season", "service_date", "half_trip_id"])
)

# The steps below repeat the labelling applied to sd_all (numeric routes only,
# NS/N schedule type, H/NH holiday flag, composite `time_dir` key).
sd_all_22 = sd_all_22[sd_all_22['route_id'].astype(str).str.strip().str.fullmatch(r'\d+')]

holiday_set   = {d for dates in holidays.values()     for d in dates}
no_school_set = {d for dates in no_school_days.values() for d in dates}

sd_all_22['date_str'] = sd_all_22['service_date'].dt.strftime('%Y-%m-%d')

# 'NS' = no-school schedule day, 'N' = regular schedule day.
sd_all_22['schedule_type'] = (
    sd_all_22['date_str']
      .map(lambda d: 'NS'  if d in no_school_set
                    else 'N')
)
# 'H' = holiday, 'NH' = not a holiday.
sd_all_22['holiday'] = (
    sd_all_22['date_str']
      .map(lambda d: 'H'  if d in holiday_set
                    else 'NH')
)

# Composite key: start time + direction + season + DOW + schedule type + route.
sd_all_22["time_dir"] = (
    sd_all_22["scheduledhm_Startpoint"].astype(str)
    + sd_all_22["direction_id"].astype(str)+ sd_all_22["season"].astype(str)+ sd_all_22["DOW"].astype(str)+ sd_all_22["schedule_type"].astype(str)+ sd_all_22["route_id"].astype(int).astype(str)
)

In [43]:
# Keep HASTUS rows with purely numeric route ids (required by astype(int) below).
combined = combined[combined['Route'].astype(str).str.strip().str.fullmatch(r'\d+')]

# Scheduled trips of the routes present in sd_all_22 (despite the "_28" suffix the
# selection follows sd_all_22, i.e. Route 22 here).
combined_28 = combined[combined['Route'].astype(int).isin(sd_all_22['route_id'].astype(int).unique())]

# Same composite key as on the AVL side: start + direction + rating + DOW + type + route.
combined_28['time_dir'] = combined_28['Start'].astype(str)+combined_28['Direction'].astype(str)+ \
         combined_28['rat_id'].astype(str)+combined_28['DOW'].astype(str)+combined_28['type_d'].astype(str)+combined_28['Route'].astype(str)

# Keep only keys that identify exactly one scheduled trip, so the merge cannot fan out.
cnts = combined_28.value_counts('time_dir').reset_index()
cnts = cnts[cnts['count']==1]
combined_28 = combined_28[combined_28['time_dir'].isin(cnts['time_dir'])]

In [44]:
# Inner merge with no `on=`: pandas joins on the shared column(s) — presumably
# only `time_dir`, unless sd_all_22 also has Act_Layover or Variant — TODO confirm.
sd_all_mrg = sd_all_22.merge(combined_28[['time_dir','Act_Layover','Variant']])


# Drop rows whose layover is not numeric, then store it as float.
sd_all_mrg = (
    sd_all_mrg[pd.to_numeric(sd_all_mrg['Act_Layover'], errors='coerce').notna()]
      .assign(Act_Layover=lambda df: df['Act_Layover'].astype(float))
)

# Scheduled run time plus the layover from the HASTUS Act_Layover column.
sd_all_mrg['schdLO']  = sd_all_mrg['scheduled_run'] + sd_all_mrg['Act_Layover']
# Regular (school) schedule days only.
sd_all_mrg = sd_all_mrg[sd_all_mrg['schedule_type']=='N']

In [45]:
# Weekday inbound trips only.
wo_22 = sd_all_mrg[(sd_all_mrg['direction_id']=='Inbound')&(sd_all_mrg['DOW']=='Weekday')]
# Drops a row if ANY column is missing, not just the runtime columns.
wo_22=wo_22.dropna()
len(wo_22)

3294

In [46]:
# Aggregate per scheduled start time (`hour_Startpoint` would be the coarser,
# half-hour alternative grouping key).
agg = wo_22.groupby('scheduledhm_Startpoint').agg(
    actual_run_q90=('actual_run', lambda x: x.quantile(0.90)),
    mean_schd     =('scheduled_run',    'mean'),
    schdLO     =('schdLO',    'mean')).reset_index()

# Start times whose 90th-percentile actual run time exceeds scheduled run + layover.
defs = agg[agg['actual_run_q90']>agg['schdLO']]
# Total excess minutes over those start times, as a fraction of the total
# scheduled run + layover minutes across all start times.
np.sum((defs['actual_run_q90']-defs['schdLO']))/np.sum(agg['schdLO'])

0.04085323145495067