# How well do operators report their buses in BODS?

- download the timetables for a given day.
- download the live location zip file for a given day.
- count how many `trip_id` values from the timetable appear at least once in the live location data
- group by operator
- make a chart by region and operator 
- put the chart online and set up a GH action to update this daily

Notes:
- only save the output data file, do everything else in memory
- include a retry for getting the data

In [None]:
# Base URL of the archived GTFS-RT zip files. Individual archives are addressed
# as <base>/YYYY/MM/DD/gtfsrt-YYYYMMDD.zip, as in the example below.
GTFSRT_URL = "https://data.datalibrary.uk/transport/BODS-ARCHIVE/gtfsrt/"
# https://data.datalibrary.uk/transport/BODS-ARCHIVE/gtfsrt/2025/07/22/gtfsrt-20250722.zip

In [None]:
from datetime import datetime
from datetime import timedelta
import os
import requests
from pathlib import Path
import pandas as pd

In [None]:
# Build the archive file name and its download URL for yesterday's data.
yesterday = (datetime.today() - timedelta(days=1)).date()
fname = f"gtfsrt-{yesterday:%Y%m%d}.zip"
# URLs always use "/" as separator, so join the parts explicitly rather than
# with os.path.join, which uses the OS path separator ("\\" on Windows).
file_to_download = f"{GTFSRT_URL.rstrip('/')}/{yesterday:%Y/%m/%d}/{fname}"

In [None]:
def download_file(url, retries=3, timeout=60, backoff=2.0):
    """Stream ``url`` to a file in the current directory and return its name.

    The local file is named after the last path segment of ``url``. Data is
    first written to ``<name>.part`` and only renamed to the final name once
    the transfer has completed, so an interrupted download never leaves a
    truncated file under the final name.

    Args:
        url: Address of the file to fetch.
        retries: Total number of attempts before giving up (must be >= 1).
        timeout: Seconds to wait for the server, passed to ``requests.get``.
        backoff: Base delay in seconds between attempts; the wait grows
            linearly with the attempt number.

    Returns:
        The local file name the content was saved under.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        requests.RequestException: If every attempt failed; the error of the
            last attempt is re-raised.
    """
    import time

    if retries < 1:
        raise ValueError("retries must be at least 1")

    local_filename = url.split('/')[-1]
    partial_filename = local_filename + '.part'

    for attempt in range(1, retries + 1):
        try:
            # stream=True avoids holding the whole archive in memory
            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(partial_filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.RequestException:
            # Drop whatever was written so far; it is incomplete.
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
            if attempt == retries:
                raise
            time.sleep(backoff * attempt)
        else:
            os.replace(partial_filename, local_filename)
            return local_filename

# Only fetch the archive when no local copy is present yet.
if not Path(fname).exists():
    print(f"Downloading {file_to_download}...")
    download_file(file_to_download)
else:
    print(f"{fname} already exists, skipping download.")

In [None]:
from zipfile import ZipFile

# Unpack every file of the downloaded archive into ./temp, flattening any
# folder structure inside the zip (only the base name of each member is kept).
extract_dir = Path("temp")
extract_dir.mkdir(parents=True, exist_ok=True)

# Open the archive that was downloaded above rather than a fixed date.
with ZipFile(fname) as zf:
    for member in zf.infolist():
        # Directory entries carry no data and would become empty files.
        if member.is_dir():
            continue
        target = extract_dir / Path(member.filename).name
        with zf.open(member, 'r') as f, open(target, 'wb') as out_file:
            out_file.write(f.read())

In [None]:
from gtfs_realtime_utils import get_gtfs_entities_from_directory
from gtfsrt_to_csv import entities_to_dataframe, remove_duplicate_reports
from operator import attrgetter

# Load the extracted real-time entities and keep only the trip_id of each one.
entities = get_gtfs_entities_from_directory("temp")
getter = attrgetter('vehicle.trip.trip_id')
trip_ids = [getter(entity) for entity in entities]
print("got records")
df = pd.DataFrame({'trip_id': trip_ids})
print("Added to df")

In [None]:
# Distinct trip_ids seen in the real-time feed, plus how many there are.
realtime_trip_ids = df['trip_id']
unique_ids_realtime = realtime_trip_ids.unique()
num_unique_ids_realtime = realtime_trip_ids.nunique()
print("Number of unique trip_ids: ", num_unique_ids_realtime)

In [None]:
from gtfs_utils import GTFSTimetable
# NOTE(review): the timetable archive is pinned to one region and one date;
# confirm whether it should follow the date of the real-time data instead.
TIMETABLE_ZIP = "timetables/itm_london_gtfs_20250722.zip"
timetable = GTFSTimetable(TIMETABLE_ZIP)

In [None]:
# Pull the four timetable tables needed to link operators to their trips.
agency = timetable.dfs['agency']
routes = timetable.dfs['routes']
trips = timetable.dfs['trips']
calendar = timetable.dfs['calendar']

# agency -> routes -> trips -> calendar, keeping only rows that match at
# every step (inner joins).
full = agency.merge(routes, on='agency_id', how='inner')
full = full.merge(trips, on='route_id', how='inner')
full = full.merge(calendar, on='service_id', how='inner')

In [None]:
# The real-time archive covers `yesterday`, so the timetable has to be filtered
# for that same service day (its date range and weekday), not for the day the
# notebook happens to run. The `today_as_int` / `full_for_today` names are kept
# so later cells keep working; they describe the service day being analysed.
service_date = yesterday
today_as_int = int(service_date.strftime("%Y%m%d"))
day_of_week = service_date.strftime("%A").lower()
print(today_as_int, day_of_week)

# Keep a trip when its service started on or before the service day, ends on or
# after it, and runs on that weekday.
# NOTE(review): assumes start_date/end_date hold integer YYYYMMDD values, as the
# integer comparison requires; only the weekly calendar pattern is applied here.
runs_on_service_date = (
    (full.start_date <= today_as_int)
    & (full.end_date >= today_as_int)
    & (full[day_of_week] == 1)
)
full_for_today = full[runs_on_service_date]

In [None]:
# Timetabled trips whose trip_id was seen at least once in the real-time data,
# counted per operator.
report_columns = ['agency_id', 'agency_name', 'route_short_name', 'trip_id']
seen_in_realtime = full_for_today.trip_id.isin(unique_ids_realtime)
filtered = full_for_today[seen_in_realtime].filter(items=report_columns)
number_in_real = (
    filtered.groupby(['agency_name', 'agency_id'])['trip_id']
    .size()
    .reset_index()
    .rename(columns={'trip_id': 'real'})
)

In [None]:
# Number of timetabled trips per operator for the selected day.
trips_per_operator = full_for_today.groupby(['agency_name', 'agency_id'])['trip_id'].size()
number_in_timetable = trips_per_operator.reset_index().rename(columns={'trip_id': 'timetable'})

In [None]:
# Combine both counts per operator. The merge starts from the timetable counts
# so that operators with no trip seen in real time stay in the result with
# real == 0 (0 %) instead of being dropped.
operator_keys = ['agency_name', 'agency_id']
res = number_in_timetable.merge(number_in_real, on=operator_keys, how='left')
res['real'] = res['real'].fillna(0).astype(int)
# Keep the column order: keys, real, timetable.
res = res[operator_keys + ['real', 'timetable']].copy()
res['percentage_real_in_timetable'] = (res['real'] / res['timetable']).mul(100).round(2)
res = res.sort_values(by=['percentage_real_in_timetable'], ascending=True)

In [None]:
# The five best and five worst operators by 'percentage_real_in_timetable'.
top_bottom_df = pd.concat([
    res.nlargest(5, 'percentage_real_in_timetable'),
    res.nsmallest(5, 'percentage_real_in_timetable'),
])
# With fewer than ten operators both selections overlap; keep each row once.
top_bottom_df = top_bottom_df[~top_bottom_df.index.duplicated(keep='first')]
top_bottom_df = top_bottom_df.reset_index(drop=True)

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the selected operators; the figure grows by half an
# inch per bar so the labels stay readable.
fig_height = 0.5 * len(top_bottom_df.agency_name)

fig = plt.figure(figsize=(8, fig_height))  # fixed width, variable height
ax = fig.add_subplot()
ax.barh(top_bottom_df.agency_name, top_bottom_df.percentage_real_in_timetable)
ax.set_xlabel('Percentage of timetable trips observed at least once in BODS real-time data')
ax.set_title(yesterday)
fig.tight_layout()
ax.grid(axis='x')
ax.set_xlim(0, 100)
plt.show()