This notebook shows how to collect GTFS Realtime trip_update data and check, agency by agency, whether the feed provides lateness (delay) estimates.


In [None]:
import pandas as pd
from datetime import datetime
from itertools import chain

from pathlib import Path
import json


In [None]:
class TripDelayData:
    """Delay fields of a single trip update, with its per-stop updates.

    Instance attributes (these are also the columns produced by ``flatten``):
        trip_id: only set when a truthy ``trip_id`` is passed
        timestamp: ``datetime.fromtimestamp(timestamp)``, or ``pd.NaT`` when
            ``timestamp`` is falsy
        delay: stored exactly as given
        stop_time_updates: list of dicts, one per stop time update, see
            ``_normalize_stop_time_update`` for the keys

    Args:
        delay: trip-level delay value
        stop_time_updates: iterable of dict-like stop time updates; ``None`` is
            treated as "no stop time updates"
        timestamp: POSIX timestamp of the update (optional)
        trip_id: identifier of the trip (optional)
    """

    def __init__(self, delay, stop_time_updates, timestamp=None, trip_id=None):
        if trip_id:
            self.trip_id = trip_id
        self.timestamp = datetime.fromtimestamp(timestamp) if timestamp else pd.NaT
        self.delay = delay
        # callers pass `fields.get("stop_time_updates")`, which is None when the
        # key is missing, so fall back to an empty list instead of raising
        self.stop_time_updates = [
            self._normalize_stop_time_update(st) for st in (stop_time_updates or [])
        ]

    @staticmethod
    def _optional(st, key):
        """Return ``{key: st[key]}`` when the key is present, else an empty dict."""
        return {key: st[key]} if key in st else {}

    @staticmethod
    def _as_datetime(st, key):
        """Return ``st[key]`` converted with ``datetime.fromtimestamp``, or ``pd.NaT`` if absent."""
        return datetime.fromtimestamp(st[key]) if key in st else pd.NaT

    @classmethod
    def _normalize_stop_time_update(cls, st):
        """Build the stored dict for one stop time update.

        The time and delay keys are always present (``pd.NaT`` / ``None`` when
        missing); ``stop_id``, ``stop_sequence`` and the uncertainty keys are
        only included when the update has them. Key order is kept stable
        because it determines the column order after flattening.
        """
        return {
            **cls._optional(st, "stop_id"),
            **cls._optional(st, "stop_sequence"),
            "arrival_time": cls._as_datetime(st, "arrival_time"),
            "arrival_delay": st.get("arrival_delay"),
            **cls._optional(st, "arrival_uncertainty"),
            "departure_time": cls._as_datetime(st, "departure_time"),
            "departure_delay": st.get("departure_delay"),
            **cls._optional(st, "departure_uncertainty"),
        }

    def flatten(self):
        """Return one dict per stop time update, each merged with the trip-level attributes.

        Returns an empty list when there are no stop time updates.
        """
        parent_vars = {k: v for k, v in vars(self).items() if k != "stop_time_updates"}
        return [dict(parent_vars, **st) for st in self.stop_time_updates]

    def __repr__(self):
        type_name = type(self).__name__
        attr_string = ", ".join([f"{k}={v!r}" for k, v in vars(self).items()])
        return f"{type_name}({attr_string})"

    def __str__(self):
        # one attribute per line, names padded to the longest attribute name
        type_name = type(self).__name__
        max_attr_len = len(max(vars(self), key=len))
        attr_strings = [f"{k:{max_attr_len}}  {v}" for k, v in vars(self).items()]
        return "\n  ".join([type_name] + attr_strings)


Poll the trip updates from the GTFS Realtime TripUpdates feed directly


In [None]:
from tsp_gtfs_realtime.gtfs_realtime_api_poller import GTFSRealtimeAPIPoller, GTFSRealtimeConfig


Point `config_folder` to a folder with agency json config files, or set `config_files` manually


In [None]:
# collect every agency json config in the folder, ordered by path
config_folder = Path("../tsp_gtfs_realtime/config/agencies")
config_files = list(config_folder.glob("*.json"))
config_files.sort()


Poll each feed `num_samples` times

Instantiating the pollers outside the loop allows them each to manage the polling rate. This could be parallelized.


In [None]:
# one poller per agency config, keyed by the config file name without extension
agency_gtfs_realtime_api_poller = {
    agency_config.stem: GTFSRealtimeAPIPoller(
        GTFSRealtimeConfig.from_inputs(config_file=agency_config)
    )
    for agency_config in config_files
}
# matching dict that accumulates the TripDelayData collected for each agency
agency_trip_delay_data = {agency: [] for agency in agency_gtfs_realtime_api_poller}


In [None]:
# number of times every agency feed is polled
num_samples = 10

for sample_idx in range(num_samples):
    for agency, gtfs_realtime_api_poller in agency_gtfs_realtime_api_poller.items():
        print(f"Sample {sample_idx}, Polling {agency}")
        try:
            gtfs_realtime_api_poller.poll_trip_updates()
            # build the whole batch before extending, so a failure part-way
            # through adds nothing for this agency/sample
            sampled_trip_delays = [
                TripDelayData(
                    trip_id=fields.get("trip_id"),
                    timestamp=fields.get("timestamp"),
                    delay=fields.get("delay"),
                    # .get() returns None when the key is missing; pass an empty
                    # list so one trip without stop time updates does not make
                    # the whole sample for this agency get skipped
                    stop_time_updates=fields.get("stop_time_updates") or [],
                )
                for _, fields in gtfs_realtime_api_poller.trip_updates
            ]
            agency_trip_delay_data[agency].extend(sampled_trip_delays)
        except Exception as e:
            # best effort: report the problem and move on to the next agency
            print(e)
            print(f"Error, skipping {agency} {sample_idx}")


In [None]:
# drop every agency for which no trip delay data was collected at all
agencies_without_data = [
    agency for agency, trip_delays in agency_trip_delay_data.items() if not trip_delays
]
for agency in agencies_without_data:
    print(f"{agency} never received good data, deleting")
    del agency_gtfs_realtime_api_poller[agency]
    del agency_trip_delay_data[agency]


In [None]:
# one DataFrame per agency, with one row per flattened stop time update
df_agency_trip_delay_data = {
    agency: pd.DataFrame(
        [row for trip_delay in trip_delays for row in trip_delay.flatten()]
    )
    for agency, trip_delays in agency_trip_delay_data.items()
}

The data can also be saved to a csv file for further investigation, or appended to an existing csv file to accumulate data from different polling times


In [None]:
# write one csv per agency; with should_append the rows already stored in an
# existing csv are kept and the new rows are added to them
should_save_csv = True
should_append = True

csv_folder = Path("./output/csv_trip_delay/")
csv_folder.mkdir(parents=True, exist_ok=True)
if should_save_csv:
    for agency in df_agency_trip_delay_data.keys():
        csv_file = csv_folder.joinpath(f"{agency}.csv")
        if should_append and csv_file.is_file():
            df_previous = pd.read_csv(csv_file)
            # read_csv returns the time columns as plain strings; convert them
            # back to datetimes so drop_duplicates can match old and new rows
            # and the later timestamp arithmetic works on a single dtype
            for time_column in ("timestamp", "arrival_time", "departure_time"):
                if time_column in df_previous:
                    df_previous[time_column] = pd.to_datetime(df_previous[time_column])
            df_agency_trip_delay_data[agency] = pd.concat(
                [df_previous, df_agency_trip_delay_data[agency]],
                ignore_index=True
            ).drop_duplicates()
        df_agency_trip_delay_data[agency].to_csv(csv_file, index=False)


Create a markdown summary of the agencies

This notes if any fields are missing, splitting these agencies to another section

It also notes the location with a link to the developer resources provided by the agency, and makes a table from a sample excerpt of the data

This is mainly to make copying to confluence easy, which can either be pasted directly, or require right-click > "Paste and Match Style" while editing a document.


In [None]:
def _split_by_population(df, arrival_col, departure_col):
    """Split rows by how an arrival/departure column pair is populated.

    Returns a tuple ``(populated, different, either)``:
        populated: rows with the arrival or the departure column populated
        different: rows with both populated and not equal to each other
        either: rows with only one of the two populated
    """
    columns = [arrival_col, departure_col]
    df_populated = df.dropna(how="all", subset=columns)
    df_different = df_populated[
        df_populated[arrival_col] != df_populated[departure_col]
    ].dropna(how="any", subset=columns)
    df_either = df_populated[df_populated[columns].isna().any(axis=1)]
    return df_populated, df_different, df_either


def _report_field_coverage(df_all, df_populated, df_different, df_either, kind):
    """Print how the ``arrival_<kind>``/``departure_<kind>`` columns are used.

    ``kind`` is "delay" or "time"; the frames are the full data and the three
    subsets as returned by ``_split_by_population``.
    """
    arrival_col = f"arrival_{kind}"
    departure_col = f"departure_{kind}"
    if df_populated.empty:
        print(f"  {kind}s not provided")
        return
    if len(df_populated) < len(df_all):
        # capped so that partial coverage is never rounded up to 100.0%
        pct = min(.999, len(df_populated)/len(df_all))
        print(f"  {kind}s only {pct:.1%} populated")
    # disjoint: no row has both arrival and departure populated
    if len(df_populated) == len(df_either):
        if df_either[departure_col].count() == len(df_populated):
            print(f"  only {departure_col} used")
        elif df_either[arrival_col].count() == len(df_populated):
            print(f"  only {arrival_col} used")
        elif 0 < df_either[departure_col].count() < df_either[arrival_col].count():
            print(f"  departure {kind} is provided if currently at stop")
    # fully overlapping
    elif df_populated[departure_col].count() == df_populated[arrival_col].count():
        if df_populated[departure_col].equals(df_populated[arrival_col]):
            print(f"  departure and arrival {kind}s all equal")
        else:
            pct = min(.999, 1-(len(df_different)/len(df_populated)))
            print(f"  departure and arrival {kind}s completely overlap, but only {pct:.1%} equal")
    # partially overlapping
    else:
        print(f"  departure and arrival {kind}s partially overlap")


for agency, df_trip_delay_data in df_agency_trip_delay_data.items():
    has_arrival_time = "arrival_time" in df_trip_delay_data

    df_delays, df_different_delays, df_either_delays = _split_by_population(
        df_trip_delay_data, "arrival_delay", "departure_delay"
    )
    if has_arrival_time:
        df_times, df_different_times, df_either_times = _split_by_population(
            df_trip_delay_data, "arrival_time", "departure_time"
        )
    else:
        # without an arrival_time column only departure_time can be populated
        df_times = df_either_times = df_trip_delay_data.dropna(how="all", subset=["departure_time"])
        df_different_times = pd.DataFrame(columns=df_times.columns)

    # print statements about the data to try to see patterns
    print(f"{agency} - {len(df_trip_delay_data)} ({len(df_delays)}, {len(df_different_delays)}, {len(df_either_delays)}) ({len(df_times)}, {len(df_different_times)}, {len(df_either_times)})")
    if df_delays.empty and df_times.empty:
        print("  no times or delays provided")
        continue
    _report_field_coverage(df_trip_delay_data, df_delays, df_different_delays, df_either_delays, "delay")
    _report_field_coverage(df_trip_delay_data, df_times, df_different_times, df_either_times, "time")

    # add the difference between each predicted time and the update timestamp
    # (per row, and relative to the earliest and latest timestamp); the
    # arrival columns are skipped when there is no arrival_time column, which
    # previously raised a KeyError here
    timestamps = df_trip_delay_data["timestamp"]
    time_columns = ["arrival_time", "departure_time"] if has_arrival_time else ["departure_time"]
    for time_column in time_columns:
        df_trip_delay_data[f"{time_column}_diff"] = df_trip_delay_data[time_column] - timestamps
        df_trip_delay_data[f"{time_column}_diff_min"] = df_trip_delay_data[time_column] - timestamps.min()
        df_trip_delay_data[f"{time_column}_diff_max"] = df_trip_delay_data[time_column] - timestamps.max()


In [None]:
# get all the data, determine which agencies populate the delay data (at the
# trip level, at the stop level, or not at all), and collect the markdown lines
# of each group; the lines are written to md_file in the next cell
md_file = csv_folder.joinpath("summary.md")
trip_level_md_lines = []
some_populated_md_lines = []
none_populated_md_lines = []
# upper bound on the number of rows shown in each sample table
MAX_SAMPLE_ROWS = 10

for agency, df_trip_delay_data in df_agency_trip_delay_data.items():
    agency_name = agency.title().replace("_", " ")
    # get agency feed documentation link
    with open(config_folder.joinpath(f"{agency}.json")) as fp:
        data = json.load(fp)
        documentation_url = data.get("developer_resources")
        location = data.get("location")
        location_url_md = f"[{location}]({documentation_url})"

    md_lines = [f"### {agency_name}", location_url_md]

    # get trip-level summary if trip.delay used
    if df_trip_delay_data["delay"].any():
        df_trip_data = df_trip_delay_data.drop_duplicates(
            subset=["trip_id", "timestamp", "delay"]
        )
        total_trip_count = df_trip_delay_data["trip_id"].nunique()
        any_trip_delay_count = (
            df_trip_data.groupby("trip_id")["delay"].any().sum()
        )
        mean_trip_delay_count_coverage = (
            df_trip_data.groupby("trip_id")["delay"]
            .apply(lambda x: x.notnull().mean())
            .mean()
        )
        trip_coverage_summary = pd.DataFrame(
            {
                f"trip_delay_counts, total={total_trip_count}": [f"{any_trip_delay_count}"],
                "mean_trip_delay_coverage": [f"{mean_trip_delay_count_coverage:.1%}"],
                "any_trip_delay_coverage": [f"{any_trip_delay_count/total_trip_count:.1%}"],
            },
        )
        md_lines += [
            "#### trip-level summary",
            trip_coverage_summary.to_markdown(index=False, disable_numparse=True),
            "",
            "#### Sample of trip-level delay data",
            # sample() raises when asked for more rows than the frame has
            df_trip_data.sample(min(MAX_SAMPLE_ROWS, len(df_trip_data))).to_markdown(index=False),
            "",
        ]
        trip_level_md_lines += md_lines
        continue
    # else
    # get stop-level summary using each stop.delay
    total_stop_count = len(df_trip_delay_data[["arrival_delay", "departure_delay"]])
    # "Either" is populated when at least one of the two delays is populated
    df_trip_delay_data["Either"] = df_trip_delay_data[["arrival_delay", "departure_delay"]].mean(axis=1)
    stop_delay_keys = ["arrival_delay", "departure_delay", "Either"]
    stop_delay_counts = df_trip_delay_data[stop_delay_keys].count()
    stop_coverage_summary = pd.DataFrame(
        {
            f"stop_delay_counts, total={total_stop_count}": [f"{x}" for x in stop_delay_counts],
            "total_stop_delay_coverage": [f"{x/total_stop_count:.1%}" for x in stop_delay_counts],
        },
        index=stop_delay_keys,
    )

    # get trip-level summary using stop.delay per trip
    any_trip_stop_delay_counts = (
        df_trip_delay_data.groupby("trip_id")[stop_delay_keys]
        .apply(lambda x: x.notnull().any())
        .sum()
    )
    mean_trip_stop_delay_count_coverage = (
        df_trip_delay_data.groupby("trip_id")[stop_delay_keys]
        .apply(lambda x: x.notnull().mean())
        .mean()
    )
    total_trip_count = df_trip_delay_data["trip_id"].nunique()
    trip_coverage_summary = pd.DataFrame(
        {
            f"any_trip_stop_delay_counts, total={total_trip_count}": [f"{x}" for x in any_trip_stop_delay_counts],
            "mean_stop_delay_coverage_per_trip": [f"{x:.1%}" for x in mean_trip_stop_delay_count_coverage],
            "any_trip_delay_coverage": [f"{x/total_trip_count:.1%}" for x in any_trip_stop_delay_counts],
        },
        index=stop_delay_keys,
    )

    md_lines += [
        "#### stop-level summary",
        stop_coverage_summary.to_markdown(disable_numparse=True),
        "",
        "#### stop-level summary per trip",
        trip_coverage_summary.to_markdown(disable_numparse=True),
        "",
        "#### Sample of stop-level delay data",
        # sample() raises when asked for more rows than the frame has
        df_trip_delay_data.sample(min(MAX_SAMPLE_ROWS, len(df_trip_delay_data))).to_markdown(index=False),
        "",
    ]

    # split the remaining agencies into those with some and those with no
    # stop-level delay data
    if any_trip_stop_delay_counts["Either"] > 0:
        some_populated_md_lines += md_lines
    else:
        none_populated_md_lines += md_lines


In [None]:
# write the three agency groups to the markdown file, each under its own heading
trip_level_note = " ".join(
    [
        "This only seems to be used by MTA_NYC, though it does seem like the simplest",
        "way to get provide current schedule adherence that is not as granular as",
        "the predicted delay for each stop. Table includes first StopTimeUpdate.",
    ]
)
md_sections = [
    ("## Agencies with Trip-Level trip_delay_data", trip_level_note, trip_level_md_lines),
    ("## Agencies with trip_delay_data partially provided in StopTimeUpdates", None, some_populated_md_lines),
    ("## Agencies with trip_delay_data not directly provided", None, none_populated_md_lines),
]
with open(md_file, "w") as fp:
    for heading, note, section_lines in md_sections:
        fp.write(heading + "\n")
        if note is not None:
            fp.write(note + "\n")
        # an empty group still produces a single blank line
        fp.write("\n".join(map(str, section_lines)) + "\n")


In [None]:
# Deliberate stop: raising SystemExit ends a "Run All" at this cell without
# closing the kernel, so the cells below are only executed when run manually
# (they need the redis-based setup described in the following section).
raise SystemExit("Done, the following cells can be executed manually")


#

Alternatively, you could get the data from redis, with the api poller putting it there


In [None]:
from tsp_gtfs_realtime import AWSConfig
from tsp_gtfs_realtime.vehicle_manager import VehicleSubscriber


In [None]:
# create vehicle subscriber and subscribe to all vehicle updates,
# using a redis instance on localhost
aws_cfg = AWSConfig(local_development=True, redis_url="localhost", redis_port=6379)
vehicle_subscriber = VehicleSubscriber(aws_cfg)


In [None]:
# the first message is only the subscription notification, so it is read
# and immediately replaced by the next one
msg = vehicle_subscriber.pubsub.get_message()

# the next message is a new vehicle_position message; the vehicle_id is the
# last ":"-separated part of its channel name
msg = vehicle_subscriber.pubsub.get_message()
channel_name = msg["channel"]
vehicle_id = channel_name.rsplit(":", 1)[-1]


In [None]:
# get new vehicle_position data from cache
vehicle_subscriber.get_updated_vehicle_position(vehicle_id)

# get fields for trip_delay_data AirLink Telemetry Protocol (ATP)
# NOTE(review): GPSData is not defined or imported in any of the cells shown in
# this notebook, so this cell raises NameError unless it is provided elsewhere.
# The fields read here are vehicle position fields (latitude, longitude,
# bearing, speed), not trip delay fields - presumably this cell was carried
# over from a vehicle position notebook; TODO confirm and define/import GPSData.
trip_delay_data = GPSData(
    vehicle_id=vehicle_subscriber.vehicle_positions[vehicle_id].get("vehicle_id"),
    timestamp=vehicle_subscriber.vehicle_positions[vehicle_id].get("timestamp"),
    latitude=vehicle_subscriber.vehicle_positions[vehicle_id].get("latitude"),
    longitude=vehicle_subscriber.vehicle_positions[vehicle_id].get("longitude"),
    bearing=vehicle_subscriber.vehicle_positions[vehicle_id].get("bearing"),
    speed=vehicle_subscriber.vehicle_positions[vehicle_id].get("speed"),
)

# show the collected fields for this vehicle
print(f"{vehicle_id}: {trip_delay_data!r}")
