The point of this notebook is to evaluate how good the data is for the LCRCA area, in terms of coverage compared to the timetable and coverage of individual trips.

## Loading packages and setting paths

In [82]:
from gtfs_realtime_utils import *
from utils import *
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
# Project root, expressed relative to the notebook's working directory; every
# data path below is built from it.
# NOTE(review): `Path` is not imported explicitly in this cell; presumably it
# arrives through one of the star imports above — confirm.
ROOT = Path("../")
# Last expression of the cell: displays the absolute path ROOT points at.
# The result is not stored, so ROOT itself stays relative.
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/bus-tracking')

## Loading the timetables for a specific region

In [83]:
# Choose the timetable region and the service date (YYYYMMDD) to evaluate.
region = 'north_west'
date = '20240915'

# Read every GTFS table for that region from the downloaded zip archive.
timetable_zip = ROOT / f"18SepGB_GTFS_Timetables_Downloaded/itm_{region}_gtfs.zip"
(
    tt_agencies,
    tt_routes,
    tt_trips,
    tt_stops,
    tt_stop_times,
    tt_calendar,
    tt_calendar_dates,
) = load_full_gtfs(timetable_zip)

File "../18SepGB_GTFS_Timetables_Downloaded/itm_north_west_gtfs.zip" is a zip file. Unzipping and reading...


In [84]:
def unique_timetabled_trips_for_date(trips, calendar, date):
    '''Return the timetabled trips whose service runs on a given date.

    Joins GTFS ``trips`` to ``calendar`` on ``service_id`` and keeps the rows
    where the calendar's weekday flag for ``date`` equals 1 and ``date`` lies
    within ``[start_date, end_date]`` (inclusive).

    Parameters
    ----------
    trips : pd.DataFrame
        GTFS trips table; must contain ``trip_id`` (and ``service_id`` for the join).
    calendar : pd.DataFrame
        GTFS calendar table; must contain ``service_id``, the seven weekday
        columns (``monday`` ... ``sunday``), ``start_date`` and ``end_date``.
        The date columns are compared against an integer, so they are
        expected to hold YYYYMMDD integers.
    date : str or int
        Service date in YYYYMMDD form, e.g. ``'20240915'`` or ``20240915``.

    Returns
    -------
    pd.DataFrame
        The merged trips/calendar rows that run on ``date``. Pass the result to
        ``count_unique_trip_id`` to get the number of unique trips.

    Raises
    ------
    AssertionError
        If an input is not a dataframe or a required id column is missing.
    ValueError
        If ``date`` is not a valid YYYYMMDD date.

    Notes
    -----
    Only ``calendar`` is consulted; ``calendar_dates`` exceptions (added or
    removed service days) are not applied here.
    '''
    # Imported locally so the function does not depend on `datetime` leaking in
    # through one of the notebook's star imports.
    from datetime import datetime

    int_date = int(date)
    assert isinstance(trips, pd.DataFrame), '"trips" is not a dataframe.'
    assert isinstance(calendar, pd.DataFrame), '"calendar" is not a dataframe.'
    assert 'service_id' in calendar.columns, f"'service_id' not in {calendar.columns}"
    assert 'trip_id' in trips.columns, f"'trip_id' not in {trips.columns}"

    # Parse with an explicit format: datetime.fromisoformat only accepts the
    # compact YYYYMMDD form from Python 3.11 onwards.
    date_obj = datetime.strptime(str(int_date), '%Y%m%d')

    # Map the weekday index (Monday == 0) to the GTFS calendar column name.
    # strftime('%A') is locale-dependent and would not match the column names
    # under a non-English locale.
    weekday_columns = (
        'monday', 'tuesday', 'wednesday', 'thursday',
        'friday', 'saturday', 'sunday',
    )
    day_of_week = weekday_columns[date_obj.weekday()]

    p = trips.merge(calendar, on='service_id', how='inner')
    # Services flagged as running on that day of the week...
    runs_on_day = p[day_of_week] == 1
    # ...whose validity window covers the requested date.
    in_date_range = (p['start_date'] <= int_date) & (p['end_date'] >= int_date)
    # Return the matching rows (not a count).
    return p[runs_on_day & in_date_range]

def count_unique_trip_id(df):
    '''Return how many distinct values the ``trip_id`` column of ``df`` holds.

    Missing values, if any, count as one distinct value. Raises
    AssertionError when ``df`` has no ``trip_id`` column.
    '''
    assert 'trip_id' in df.columns
    # dropna=False keeps missing values in the tally.
    return df['trip_id'].nunique(dropna=False)

In [85]:
# count_unique_trip_id(unique_timetabled_trips_for_date(tt_trips, tt_calendar, date=date))

## Loading all bus data for England (excl. London) on a specific day

In [86]:
# Read the real-time vehicle positions captured for the chosen date.
rt_csv_path = ROOT / f"data/gtfs-rt/csv/{date}.csv"
rt_data = pd.read_csv(rt_csv_path, low_memory=False)

# Represent a missing trip_id as an empty string rather than NaN.
rt_data['trip_id'] = rt_data['trip_id'].fillna('')

## Recovering trip_ids that drop out and reappear for the same vehicle_id

In [87]:
# How many trip_ids can we recover
# Flatten the trip_id column into a plain Python list, in row order.
trip_id_list = rt_data.trip_id.to_list()
# fill_trip_ids is a project helper; presumably it returns a list in which
# trip_ids that dropped out and reappeared have been filled in — confirm.
filled_dropouts = fill_trip_ids(trip_id_list)
# NOTE(review): "before" is measured after fill_trip_ids has already run on
# trip_id_list. If that helper mutates its argument in place, "before" and
# "after" would be identical — confirm it returns a new list.
# NOTE(review): the report prints these values with a '%' sign and subtracts
# one from 100, so fraction_with_trip_id presumably returns a percentage
# (0-100) rather than a 0-1 fraction — confirm.
before_filling_gaps = fraction_with_trip_id(trip_id_list)
after_filling_gaps = fraction_with_trip_id(filled_dropouts)

## How many of the Trip IDs in the North West timetable have at least one stop inside the Liverpool City Region CA boundary?

In [88]:
# Load the boundary of LCRCA
# Read from a GeoJSON file under ROOT into a GeoDataFrame; it is used later as
# the right-hand side of the spatial join that decides which stops lie inside.
bndry = gpd.read_file(ROOT / "data/geojson/LCRCA_May2023_Boundary_EN_BGC.geojson")

In [89]:
def count_cancelled_buses(data):
    '''Count the rows of ``data`` whose ``schedule_relationship`` is not 0.

    NOTE(review): this counts rows (data points), not distinct trips or
    vehicles, and treats every non-zero value as a cancellation — confirm
    that matches how ``schedule_relationship`` is encoded in the data.
    '''
    not_scheduled = data.schedule_relationship != 0
    return int(not_scheduled.sum())

In [90]:
# Minimum number of live data points a trip needs to count as well tracked.
MIN_POINTS_PER_TRIP = 10

# --- Timetabled trips that run on the input date -------------------------

# Get a dataframe of trips that run on input date
trips_on_this_date_df = unique_timetabled_trips_for_date(tt_trips, tt_calendar, date=date)

# Get a list of trip_ids that run on input date
list_of_trip_ids_on_this_date = trips_on_this_date_df.trip_id.unique()
# Use that to filter the stop_times down to only ones on input date
tt_stop_times_this_date = tt_stop_times[tt_stop_times.trip_id.isin(list_of_trip_ids_on_this_date)]

# Get all the stops that are on trips that run on input date
full_stop_info = tt_stops.merge(tt_stop_times_this_date, on='stop_id', how='inner').loc[:, ['trip_id', 'stop_lat', 'stop_lon']]

# --- Which of those trips have a stop inside the boundary? ---------------

# Create Point geometries for the stop coordinates (x = longitude, y = latitude).
# points_from_xy builds them in one vectorised call instead of a Python loop
# over every stop_time row.
stop_points = gpd.points_from_xy(full_stop_info['stop_lon'], full_stop_info['stop_lat'])

# Create a GeoDataFrame of the stops that are stopped at on the input date
points_gdf = gpd.GeoDataFrame(full_stop_info, geometry=stop_points, crs='EPSG:4326')

# The spatial join is only meaningful when both layers share a CRS, so
# reproject a copy of the boundary if it is declared in a different one.
# `bndry` itself is left untouched.
boundary_for_join = bndry
if bndry.crs is not None and bndry.crs != points_gdf.crs:
    boundary_for_join = bndry.to_crs(points_gdf.crs)

# Perform spatial join with the boundary based on whether each stop is within the boundary or not.
joined_gdf = gpd.sjoin(points_gdf, boundary_for_join, how="left", predicate="within")

# Filter the geo-df to only stops that are within the boundary (right index is not NA)
points_inside_bndry = joined_gdf[joined_gdf.index_right.notna()]

# Get a list of the unique Trip IDs that have at least one stop in the boundary
list_of_trips_in_boundary = points_inside_bndry.trip_id.unique()

# Determine how many of the above there are.
number_in_boundary = len(list_of_trips_in_boundary)

# --- Compare against the live data ---------------------------------------

# How many of the trip_ids that have at least 1 stop inside the boundary are in the live data?
# NOTE(review): this matches on the raw rt_data.trip_id column; the recovered
# ids in `filled_dropouts` are not applied to rt_data here — confirm intended.
real_data_in_boundary = rt_data[rt_data.trip_id.isin(list_of_trips_in_boundary)]

number_in_boundary_and_live_data = len(real_data_in_boundary.trip_id.unique())

# Count the occurrences of each trip_id
trip_counts = real_data_in_boundary['trip_id'].value_counts()

# Filter to get trip_ids that appear at least MIN_POINTS_PER_TRIP times
filtered_trip_ids = trip_counts[trip_counts >= MIN_POINTS_PER_TRIP]

# Total number of unique trip_ids
total_unique_trips = real_data_in_boundary['trip_id'].nunique()

# Calculate the fraction. With no matched trips there is nothing to divide
# by, so report NaN instead of raising ZeroDivisionError.
fraction = len(filtered_trip_ids) / total_unique_trips if total_unique_trips else float('nan')

# Share of timetabled in-boundary trips seen at least once in the live data;
# NaN when no timetabled trip has a stop inside the boundary.
tracked_percentage = (
    100 * number_in_boundary_and_live_data / number_in_boundary
    if number_in_boundary
    else float('nan')
)

# --- Write the report ----------------------------------------------------

report_path = ROOT / f"data/evaluate/{date}.txt"
# Create the output directory if it is missing so the write cannot fail on a
# fresh checkout.
report_path.parent.mkdir(parents=True, exist_ok=True)

# One entry per output line; the empty entry produces the blank line before
# the LCRCA section. The file is written without a trailing newline.
report_lines = [
    f"Date: {date}",
    "--BODS (all England)--",
    f"Percentage of de-duplicated BODS data with trip_id(s) before filling gaps: {round(before_filling_gaps,2)}%",
    f"Percentage of de-duplicated BODS data with trip_id(s) after filling gaps: {round(after_filling_gaps, 2)}%",
    f"Percentage of de-duplicated BODS data without trip_id: {round(100-after_filling_gaps,2)}%",
    f"Percentage of de-duplicated BODS data we recovered a trip_id: {round(after_filling_gaps - before_filling_gaps, 2)}%",
    "",
    "--LCRCA ONLY--",
    # f"Number of unique trips timetabled in Liverpool City Region on {date}: {count_unique_trip_id(trips_on_this_date_df)}",
    # f"Number of unique trip_ids with at least one stop in Liverpool City Region CA: {number_in_boundary}",
    # f"Number of those trip_ids that are also in our live data: {number_in_boundary_and_live_data}",
    f"Percentage of timetabled buses for which we tracked at least 1 data point: {round(tracked_percentage,2)}%",
    f"Cancelled buses that were in the real data: {count_cancelled_buses(real_data_in_boundary)}",
    "Buses not tracked because they were cancelled: unknown",
    "Buses that never appear in BODS, but the buses still ran: unknown",
    f"Percentage of buses we tracked with trip_ids and at least 10 different locations/times: {round(100*fraction, 2)}",
    "-------------------------------",
]
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("\n".join(report_lines))