The point of this notebook is to evaluate how good the data is for the LCRCA (Liverpool City Region Combined Authority) area, in terms of coverage compared to the timetable and coverage of individual trips.

## Loading packages and setting paths

In [21]:
# NOTE(review): `Path` (used below) and `datetime` (used in a later cell) are never
# imported explicitly in this notebook; presumably they arrive through one of the
# star imports — prefer explicit imports so the dependency is visible.
from gtfs_realtime_utils import *
from utils import *
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
# Project root, expressed relative to the current working directory.
ROOT = Path("../")
# Last expression of the cell: displays the absolute location ROOT points to.
# The result is not assigned, so ROOT itself stays a relative path.
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/bus-tracking')

## Loading the timetables for a specific region

In [22]:
# Pick the timetable region and the service date (a YYYYMMDD string), then
# unpack the seven tables returned by load_full_gtfs for that region's zip.
region = 'north_west'
date = '20240923'
(
    tt_agencies,
    tt_routes,
    tt_trips,
    tt_stops,
    tt_stop_times,
    tt_calendar,
    tt_calendar_dates,
) = load_full_gtfs(ROOT / f"18SepGB_GTFS_Timetables_Downloaded/itm_{region}_gtfs.zip")

File "../18SepGB_GTFS_Timetables_Downloaded/itm_north_west_gtfs.zip" is a zip file. Unzipping and reading...


In [23]:
def unique_timetabled_trips_for_date(trips, calendar, date):
    '''Return the timetabled trips whose service runs on a given date.

    The trips are joined to the calendar on ``service_id`` and kept when
    (a) the calendar's weekday column for ``date`` equals 1 and
    (b) ``start_date <= date <= end_date``.

    Only the weekly pattern and date range held in ``calendar`` are applied;
    this function takes no table of per-date exceptions, so any such
    additions/removals are not reflected in the result.

    Parameters
    ----------
    trips : pd.DataFrame
        Must contain ``trip_id`` and ``service_id`` columns.
    calendar : pd.DataFrame
        Must contain ``service_id``, ``start_date``, ``end_date`` and one
        column per lower-case weekday name (``monday`` ... ``sunday``).
        Assumes ``start_date``/``end_date`` are stored as YYYYMMDD integers,
        since they are compared against ``int(date)`` — TODO confirm against
        the loader.
    date : str or int
        Service date in ``YYYYMMDD`` form, e.g. ``'20240923'``.

    Returns
    -------
    pd.DataFrame
        The merged trips/calendar rows that run on ``date`` (not a count;
        pass the result to ``count_unique_trip_id`` to count trips).

    Raises
    ------
    AssertionError
        If an input is not a DataFrame or a required id column is missing.
    ValueError
        If ``date`` cannot be parsed as ``YYYYMMDD``.
    '''
    int_date = int(date)
    str_date = str(date)
    # isinstance (rather than an exact type match) also accepts DataFrame subclasses.
    assert isinstance(trips, pd.DataFrame), '"trips" is not a dataframe.'
    assert isinstance(calendar, pd.DataFrame), '"calendar" is not a dataframe.'
    assert 'service_id' in calendar.columns, f"'service_id' not in {calendar.columns}"
    assert 'trip_id' in trips.columns, f"'trip_id' not in {trips.columns}"

    merged = trips.merge(calendar, on='service_id', how='inner')

    # Parse with an explicit format: datetime.fromisoformat only accepts the
    # compact YYYYMMDD form on Python 3.11+.
    date_obj = datetime.strptime(str_date, '%Y%m%d')
    # Look the weekday column up by index instead of strftime('%A'), whose
    # output depends on the active locale.
    weekday_columns = ('monday', 'tuesday', 'wednesday', 'thursday',
                       'friday', 'saturday', 'sunday')
    day_of_week = weekday_columns[date_obj.weekday()]

    # Rows whose service runs on that day of the week...
    runs_on_weekday = merged[day_of_week] == 1
    # ...and whose service date range covers the input date.
    covers_date = (merged.start_date <= int_date) & (merged.end_date >= int_date)

    # Return the matching rows themselves (one per trip/service pairing).
    return merged[runs_on_weekday & covers_date]

def count_unique_trip_id(df):
    '''Return how many distinct values the ``trip_id`` column of ``df`` holds.

    Missing values are counted as one distinct value, and an AssertionError
    is raised when ``df`` has no ``trip_id`` column.
    '''
    assert 'trip_id' in df.columns
    # dropna=False keeps a missing trip_id in the tally.
    return df['trip_id'].nunique(dropna=False)

In [24]:
# Number of distinct trip_ids in the whole loaded timetable whose service runs on `date`
# (no spatial filtering is applied at this point).
count_unique_trip_id(unique_timetabled_trips_for_date(tt_trips, tt_calendar, date=date))

39275

## Loading all bus data for England (excl. London) on a specific day

In [25]:
# Read the day's realtime CSV in a single pass (low_memory=False).
rt_data = pd.read_csv(
    ROOT / f"data/gtfs-rt/csv/{date}.csv",
    low_memory=False,
)
# Rows without a trip_id get an empty string in place of a missing value.
rt_data["trip_id"] = rt_data["trip_id"].fillna('')
rt_data

Unnamed: 0,trip_id,start_time,start_date,schedule_relationship,route_id,latitude,longitude,bearing,stop_sequence,status,timestamp,vehicle_id
0,,,,0,,52.87360,0.50088,0.0,0,2,1726844771,.2-YJ55BJE
1,,,,0,,52.75412,0.39976,0.0,0,2,1727084117,.2-YJ55BJE
2,,,,0,,52.75440,0.39968,38.0,0,2,1727084205,.2-YJ55BJE
3,VJ211559a568a7bd128bcd4560f90f0b7a4920f27b,09:40:00,20240923.0,0,5240.0,52.75459,0.40012,0.0,1,2,1727084268,.2-YJ55BJE
4,VJ211559a568a7bd128bcd4560f90f0b7a4920f27b,09:40:00,20240923.0,0,5240.0,52.75459,0.40012,0.0,1,2,1727084301,.2-YJ55BJE
...,...,...,...,...,...,...,...,...,...,...,...,...
10834228,VJbc5e4ca32ba5041ecbfc81c9db7dd79f58169f50,20:19:00,20240923.0,0,3982.0,53.39408,-2.61254,189.0,9,2,1727123048,zDepot__999_
10834229,VJbc5e4ca32ba5041ecbfc81c9db7dd79f58169f50,20:19:00,20240923.0,0,3982.0,53.38960,-2.61134,164.0,11,2,1727123094,zDepot__999_
10834230,VJbc5e4ca32ba5041ecbfc81c9db7dd79f58169f50,20:19:00,20240923.0,0,3982.0,53.38882,-2.60631,87.0,11,2,1727123171,zDepot__999_
10834231,,,,0,,53.38918,-2.59720,348.0,0,2,1727123231,zDepot__999_


## Recovering trip_ids that drop out and reappear for the same vehicle_id

In [26]:
# How much of the feed carries a trip_id, before and after gap filling?
# NOTE(review): only the trip_id column is handed to fill_trip_ids here — confirm
# how the helper scopes the fill to a single vehicle_id.
# NOTE(review): filled_dropouts is not written back into rt_data in this cell, and
# the boundary comparison further down filters on rt_data.trip_id — confirm intent.
# NOTE(review): the captured values (~85) look like percentages despite the
# "Fraction" wording of the labels.
trip_id_list = rt_data["trip_id"].tolist()
filled_dropouts = fill_trip_ids(trip_id_list)
print(
    "Fraction of data with trip_id(s) before filling gaps:",
    fraction_with_trip_id(trip_id_list),
)
print(
    "Fraction of data with trip_id(s) after filling gaps:",
    fraction_with_trip_id(filled_dropouts),
)

Fraction of data with trip_id(s) before filling gaps: 84.76987
Fraction of data with trip_id(s) after filling gaps: 86.61888


## How many of the Trip IDs in the North West timetable have at least one stop inside the Liverpool City Region CA boundary?

In [27]:
# Load the boundary of LCRCA
# NOTE(review): the stop points built later are declared as EPSG:4326; this assumes
# the boundary file is in the same CRS — TODO confirm via bndry.crs.
bndry = gpd.read_file(ROOT / "data/geojson/LCRCA_May2023_Boundary_EN_BGC.geojson")

In [28]:
def count_cancelled_buses(data):
    '''Count the rows of ``data`` whose ``schedule_relationship`` is not 0.

    NOTE(review): every non-zero code is counted, and the tally is per row
    rather than per vehicle or trip — confirm that this matches what
    "cancelled buses" is meant to report.
    '''
    non_zero_relationship = data.schedule_relationship != 0
    return int(non_zero_relationship.sum())

In [29]:
# Get a dataframe of trips that run on input date
trips_on_this_date_df = unique_timetabled_trips_for_date(tt_trips, tt_calendar, date=date)
print("Date:", date)
# This count covers the whole regional timetable that was loaded; no filtering to
# the LCRCA boundary has happened yet, so the label names the region instead.
print(f"Number of unique trips timetabled in the '{region}' timetable on {date}: {count_unique_trip_id(trips_on_this_date_df)}")
# Get a list of trip_ids that run on input date
list_of_trip_ids_on_this_date = trips_on_this_date_df.trip_id.unique()
# Use that to filter the stop_times down to only ones on input date
tt_stop_times_this_date = tt_stop_times[tt_stop_times.trip_id.isin(list_of_trip_ids_on_this_date)]

# Get all the stops that are on trips that run on input date (one row per stop visit)
full_stop_info = tt_stops.merge(tt_stop_times_this_date, on='stop_id', how='inner').loc[:, ['trip_id', 'stop_lat', 'stop_lon']]

# Create Point geometries for the stop coordinates (x = longitude, y = latitude).
stop_points = [Point(xy) for xy in zip(full_stop_info['stop_lon'], full_stop_info['stop_lat'])]

# Create a GeoDataFrame of the stops that are stopped at on the input date
points_gdf = gpd.GeoDataFrame(full_stop_info, geometry=stop_points, crs='EPSG:4326')

# Perform spatial join with the boundary based on whether each stop is within the boundary or not.
# NOTE(review): assumes bndry shares the EPSG:4326 CRS declared above — TODO confirm.
joined_gdf = gpd.sjoin(points_gdf, bndry, how="left", predicate="within")

# Filter the geo-df to only stops that are within the boundary (right index is not NA)
points_inside_bndry = joined_gdf[joined_gdf.index_right.notna()]

# Get a list of the unique Trip IDs that have at least one stop in the boundary
list_of_trips_in_boundary = points_inside_bndry.trip_id.unique()

# Determine how many of the above there are.
number_in_boundary = len(list_of_trips_in_boundary)

# How many of the trip_ids that have at least 1 stop inside the boundary are in the live data?
real_data_in_boundary = rt_data[rt_data.trip_id.isin(list_of_trips_in_boundary)]
number_in_boundary_and_live_data = len(real_data_in_boundary.trip_id.unique())

print("Number of unique trip_ids with at least one stop in Liverpool City Region CA:", number_in_boundary)
print("Number of those trip_ids that are also in our live data:", number_in_boundary_and_live_data)
# Guard the ratio: with no timetabled trips inside the boundary the division is undefined.
if number_in_boundary:
    print(f"Percentage: {round(100*number_in_boundary_and_live_data/number_in_boundary,2)}%")
else:
    print("Percentage: n/a (no timetabled trips have a stop inside the boundary)")
print("Cancelled buses that were in the real data:", count_cancelled_buses(real_data_in_boundary))
print("-------------------------------")

Date: 20240923
Number of unique trips timetabled in Liverpool City Region on 20240923: 39275
Number of unique trip_ids with at least one stop in Liverpool City Region CA: 9934
Number of those trip_ids that are also in our live data: 5883
Percentage: 59.22%
Cancelled buses that were in the real data: 0
-------------------------------


In [30]:
# real_data_in_boundary.groupby('trip_id')['trip_id'].value_counts() >= 10