GPS data is normally sent to the CreateRTRADIOMsg lambda in AirLink Telemetry Protocol (ATP)

For this solution, it will be filled in with data from GTFS Realtime feeds, with estimated values compensating for the slower message rate

This notebook shows how to extract that GPS data from the feeds, and how to check each agency to ensure its feed fully populates the GPS fields


In [None]:
import pandas as pd
from datetime import datetime

from pathlib import Path
import json


In [None]:
class GPSData:
    """Single GPS fix for a vehicle.

    ``vehicle_id`` and ``timestamp`` become attributes only when a truthy
    value is supplied, so ``vars(instance)`` holds just the fields that were
    actually provided. A supplied ``timestamp`` is passed through
    ``datetime.fromtimestamp`` and stored as the resulting ``datetime``.
    """

    def __init__(
        self, latitude, longitude, bearing, speed, timestamp=None, vehicle_id=None
    ):
        # Optional fields are assigned first so they lead the attribute listing
        # produced by __repr__ / __str__ (which follow insertion order).
        if vehicle_id:
            self.vehicle_id = vehicle_id
        if timestamp:
            self.timestamp = datetime.fromtimestamp(timestamp)
        self.latitude, self.longitude = latitude, longitude
        self.bearing, self.speed = bearing, speed

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` using each value's repr."""
        pairs = (f"{name}={value!r}" for name, value in vars(self).items())
        return f"{type(self).__name__}({', '.join(pairs)})"

    def __str__(self):
        """Return the class name followed by one aligned line per attribute."""
        attrs = vars(self)
        # Pad every attribute name to the width of the longest one
        width = max(map(len, attrs))
        lines = [type(self).__name__]
        for name, value in attrs.items():
            lines.append(f"{name:{width}}  {value}")
        return "\n  ".join(lines)


Poll the vehicle positions from the GTFS Realtime VehiclePosition feed directly


In [None]:
from tsp_gtfs_realtime.core.gtfs_realtime import GTFSRealtimeAPIPoller, GTFSRealtimeConfig


Point `config_folder` to a folder with agency json config files, or set `config_files` manually


In [None]:
# Collect every agency config (*.json) found in the folder, ordered by path
config_folder = Path("../tsp_gtfs_realtime/config/agencies")
config_files = list(config_folder.glob("*.json"))
config_files.sort()


Poll each feed `num_samples` times

Instantiating the pollers outside the loop allows them each to manage the polling rate. This could be parallelized.


In [None]:
# One poller per agency config, keyed by the config file's stem (the agency name)
agency_gtfs_realtime_api_poller = {
    cfg_path.stem: GTFSRealtimeAPIPoller(
        GTFSRealtimeConfig.from_inputs(config_file=cfg_path)
    )
    for cfg_path in config_files
}
# Matching (initially empty) list of collected GPS samples for every agency
agency_gps_data = {name: [] for name in agency_gtfs_realtime_api_poller}


In [None]:
num_samples = 10

# Fields copied from each polled vehicle position into a GPSData record
gps_field_names = (
    "vehicle_id",
    "timestamp",
    "latitude",
    "longitude",
    "bearing",
    "speed",
)

# Poll every agency once per sample round; a failing agency is reported and
# skipped for that round only.
for sample_idx in range(num_samples):
    for agency, gtfs_realtime_api_poller in agency_gtfs_realtime_api_poller.items():
        print(f"Sample {sample_idx}, Polling {agency}")
        try:
            gtfs_realtime_api_poller.poll_vehicle_positions()
            samples = agency_gps_data[agency]
            # Build the whole batch before extending, so an error on any
            # record leaves the agency's collected samples untouched.
            # presumably vehicle_positions yields (key, fields) pairs where
            # fields is dict-like - confirm against the poller
            polled = [
                GPSData(**{name: fields.get(name) for name in gps_field_names})
                for _, fields in gtfs_realtime_api_poller.vehicle_positions
            ]
            samples.extend(polled)
        except Exception as err:
            print(err)
            print(f"Error, skipping {agency} {sample_idx}")


In [None]:
# Drop agencies that produced no samples at all, removing both their poller
# and their (empty) sample list. Iterate over a snapshot since entries are
# removed along the way.
for agency in tuple(agency_gps_data):
    if agency_gps_data[agency]:
        continue
    print(f"{agency} never received good data, deleting")
    agency_gtfs_realtime_api_poller.pop(agency)
    agency_gps_data.pop(agency)


In [None]:
# One DataFrame per agency, one row per GPSData sample (columns come from the
# attributes each sample actually carries)
# TODO: this probably should just replace agency_gps_data entirely
df_agency_gps_data = {
    name: pd.DataFrame(list(map(vars, samples)))
    for name, samples in agency_gps_data.items()
}


The data can also be saved to a csv file for further investigation or just appending data from different polling times


In [None]:
import io

should_save_csv = True
should_append = True

# Output folder; created even when saving is disabled
csv_folder = Path("./output/csv/")
csv_folder.mkdir(parents=True, exist_ok=True)


def _as_csv_text(df):
    """Return df with every value in its CSV text form (all columns str)."""
    if df.columns.empty:
        # Nothing to serialise; read_csv would reject an empty document
        return df
    return pd.read_csv(io.StringIO(df.to_csv(index=False)), dtype=str)


# Write one <agency>.csv per agency. When appending, the new rows are merged
# with the rows already on disk and exact duplicates are dropped.
if should_save_csv:
    for agency, df_gps_data in df_agency_gps_data.items():
        csv_file = csv_folder.joinpath(f"{agency}.csv")
        if should_append and csv_file.is_file():
            # Rows loaded from disk come back with re-inferred dtypes
            # (timestamps as strings, re-parsed floats), so they never compare
            # equal to the in-memory rows and re-running this cell would
            # append everything again. Compare both sides as CSV text instead.
            df_gps_data = pd.concat(
                [pd.read_csv(csv_file, dtype=str), _as_csv_text(df_gps_data)],
                ignore_index=True,
            ).drop_duplicates()
        df_gps_data.to_csv(csv_file, index=False)


Create a markdown summary of the agencies

This notes if any fields are missing, splitting these agencies to another section

It also notes the location with a link to the developer resources provided by the agency, and makes a table from a sample excerpt of the data

This is mainly to make copying to Confluence easy: the output can either be pasted directly, or may require right-click > "Paste and Match Style" while editing a document.


In [None]:
# Determine which agencies fully populate the gps data and save a markdown
# summary: agencies with a never-populated field are listed in their own
# section, ahead of the agencies that populate every field.
md_file = csv_folder.joinpath("summary.md")
missing_md_lines = []
populated_md_lines = []
gps_fields = ["latitude", "longitude", "bearing", "speed"]

for agency, df_gps_data in df_agency_gps_data.items():
    agency_name = agency.title().replace("_", " ")
    # get agency feed documentation link from the agency's config file
    with open(config_folder.joinpath(f"{agency}.json"), encoding="utf-8") as fp:
        data = json.load(fp)
    documentation_url = data.get("developer_resources")
    location = data.get("location")
    location_url_md = f"[{location}]({documentation_url})"

    # A value is unusable when it is zero or absent (None/NaN); a field is
    # missing for the agency when no sample ever carries a usable value.
    gps_values = df_gps_data[gps_fields]
    unusable = gps_values.isna() | (gps_values == 0)
    missing_fields = unusable.all()
    if missing_fields.any():
        # bold missing fields
        missing_fields_str = ", ".join(
            f"**{field}**" for field in missing_fields[missing_fields].index
        )
        missing_md_lines += [
            f"### {agency_name}",
            location_url_md,
            "",
            f"Missing {missing_fields_str}",
            df_gps_data.head(10).to_markdown(index=False),
            "",
        ]
    else:
        # sample only from rows with no zero column and no absent gps field
        populated_rows = (df_gps_data != 0).all(axis=1) & ~unusable.any(axis=1)
        populated_md_lines += [
            f"### {agency_name}",
            location_url_md,
            df_gps_data.loc[populated_rows].head(10).to_markdown(index=False),
            "",
        ]

with open(md_file, "w", encoding="utf-8") as fp:
    print("## Agencies with Missing gps_data Fields", file=fp)
    print(*missing_md_lines, sep="\n", file=fp)
    print("## Agencies with Fully Populated gps_data", file=fp)
    print(*populated_md_lines, sep="\n", file=fp)


In [None]:
# Halt a "Run All" at this cell by raising SystemExit, without closing the
# kernel, so the cells below are skipped but can still be run manually.
raise SystemExit("Done, the following cells can be executed manually")


#

Alternatively, you could get the data from Redis, where the API poller puts it


In [None]:
from tsp_gtfs_realtime import AWSConfig
from tsp_gtfs_realtime.vehicle_manager import VehicleSubscriber


In [None]:
# Create the vehicle subscriber (subscribed to all vehicle updates) against a
# local development Redis instance
redis_settings = {
    "local_development": True,
    "redis_url": "localhost",
    "redis_port": 6379,
}
aws_cfg = AWSConfig(**redis_settings)
vehicle_subscriber = VehicleSubscriber(aws_cfg)


In [None]:
# The first message is only the subscription notification; read and discard it
vehicle_subscriber.pubsub.get_message()

# The next message is a vehicle_position update; the vehicle_id is the last
# ":"-separated part of its channel name
# NOTE(review): get_message() may return None when nothing is pending, which
# would make the lookup below fail - confirm, and re-run this cell if so
msg = vehicle_subscriber.pubsub.get_message()
vehicle_id = msg["channel"].rpartition(":")[2]


In [None]:
# Refresh the subscriber's cached position for this vehicle
vehicle_subscriber.get_updated_vehicle_position(vehicle_id)

# Copy the fields needed for the AirLink Telemetry Protocol (ATP) gps_data
# out of the cached position
position = vehicle_subscriber.vehicle_positions[vehicle_id]
atp_fields = ("vehicle_id", "timestamp", "latitude", "longitude", "bearing", "speed")
gps_data = GPSData(**{name: position.get(name) for name in atp_fields})

print(f"{vehicle_id}: {gps_data!r}")
