In [None]:
# NOTE(review): nothing in the visible cells imports `Path`, `os`, `datetime`,
# `gtfs_realtime_pb2`, `entities_to_list` or `load_full_gtfs`, so they presumably
# arrive through the two star imports below — confirm, and prefer explicit imports.
# The order of the star imports matters: a name exported by both modules resolves
# to the one from `utils`.
from gtfs_realtime_utils import *
from utils import *
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
# Project root, relative to the notebook's working directory; the data paths in
# the cells below are all built from it.
ROOT = Path("../")
# Cell output only: shows the absolute path ROOT points at (the result is not stored).
ROOT.resolve()

In [None]:
def gtfsrt_filepaths_to_list(dir, date):
    """Collect the files under ``dir`` whose name starts with the given date.

    Parameters
    ----------
    dir : str or os.PathLike
        Directory that is searched recursively. (The name shadows the ``dir``
        builtin but is kept because callers pass it by keyword.)
    date : str or int
        Date as ``YYYYMMDD`` (``'20240916'`` or ``20240916``). An already
        dashed ``'YYYY-MM-DD'`` string is accepted as well.

    Returns
    -------
    list of str
        Absolute paths of every file whose name begins with ``YYYY-MM-DD``,
        sorted so the result does not depend on the filesystem walk order.

    Raises
    ------
    ValueError
        If ``date`` is not exactly eight digits once dashes are removed.
        (Previously such input silently produced an empty list.)
    """
    # Normalise first so int dates and pre-dashed dates behave like 'YYYYMMDD'.
    digits = str(date).replace("-", "")
    if len(digits) != 8 or not digits.isdigit():
        raise ValueError(f"date must look like YYYYMMDD or YYYY-MM-DD, got {date!r}")
    date_with_dashes = f"{digits[0:4]}-{digits[4:6]}-{digits[6:8]}"

    # Walk the tree and keep the files whose name carries the date prefix.
    gtfs_rt_file_paths = [
        os.path.abspath(os.path.join(root, file))
        for root, _dirs, files in os.walk(dir)
        for file in files
        if file.startswith(date_with_dashes)
    ]
    gtfs_rt_file_paths.sort()
    return gtfs_rt_file_paths
# Date (YYYYMMDD) of the realtime files to analyse.
date = '20240916'
# Directory that is searched for GTFS-RT files.
REALTIME_DATADIR = ROOT / "data" / "gtfs-rt"
# Paths of every realtime file whose name starts with that date.
gtfsrt_filepaths = gtfsrt_filepaths_to_list(REALTIME_DATADIR, date)

In [None]:
# Create an empty GTFS-realtime FeedMessage and hand it to entities_to_list.
# NOTE(review): presumably it is reused as the parse buffer for each file —
# confirm in gtfs_realtime_utils.
feed = gtfs_realtime_pb2.FeedMessage()
# Gather the entities from every file in gtfsrt_filepaths into one list to
# iterate through later.
entities = entities_to_list(feed, gtfsrt_filepaths)

In [None]:
def entity_list_to_df(entities):
    """Flatten realtime entities into a DataFrame with one row per entity.

    Every element of ``entities`` is read through its ``vehicle`` attribute,
    which must expose ``trip`` (``trip_id``, ``start_time``, ``start_date``,
    ``schedule_relationship``, ``route_id``), ``position`` (``latitude``,
    ``longitude``, ``bearing``), ``current_stop_sequence``, ``current_status``,
    ``timestamp`` and ``vehicle.id``.

    The returned frame has latitude/longitude rounded to 5 decimal places, an
    extra ``human_time`` column (``timestamp`` read as seconds since the
    epoch), and rows sorted by ``vehicle_id``, ``timestamp``, ``trip_id``.
    The index keeps each row's original position in ``entities``.
    """
    columns = [
        'trip_id', 'start_time', 'start_date', 'schedule_relationship', 'route_id',
        'latitude', 'longitude', 'bearing', 'stop_sequence', 'status', 'timestamp', 'vehicle_id',
    ]

    # One tuple per entity, in the same order as `columns`.
    rows = []
    for entity in entities:
        veh = entity.vehicle
        trip = veh.trip
        pos = veh.position
        rows.append((
            trip.trip_id,
            trip.start_time,
            trip.start_date,
            trip.schedule_relationship,
            trip.route_id,
            pos.latitude,
            pos.longitude,
            pos.bearing,
            veh.current_stop_sequence,
            veh.current_status,
            veh.timestamp,
            veh.vehicle.id,
        ))

    frame = pd.DataFrame(rows, columns=columns)

    # Derived / tidied columns.
    frame['human_time'] = pd.to_datetime(frame['timestamp'], unit='s')
    for coord in ('latitude', 'longitude'):
        frame[coord] = frame[coord].round(5)

    # Order rows per vehicle, chronologically, then by trip.
    return frame.sort_values(by=['vehicle_id', 'timestamp', 'trip_id'], ascending=True)

# Live-data table: one row per realtime entity, sorted by vehicle_id, timestamp and trip_id.
data = entity_list_to_df(entities)

In [None]:
# Load the timetable tables for the chosen region from its GTFS zip.
region = 'north_west'
timetable_zip = ROOT / f"18SepGB_GTFS_Timetables_Downloaded/itm_{region}_gtfs.zip"
(
    tt_agencies,
    tt_routes,
    tt_trips,
    tt_stops,
    tt_stop_times,
    tt_calendar,
    tt_calendar_dates,
) = load_full_gtfs(timetable_zip)

In [None]:
def unique_timetabled_trips_for_date(trips, calendar, date):
    '''Return the timetabled trips whose service runs on ``date``.

    ``trips`` is inner-joined to ``calendar`` on ``service_id`` and the result
    is filtered to rows whose weekday flag for ``date`` equals 1 and whose
    ``start_date``/``end_date`` range covers ``date``.

    Parameters
    ----------
    trips : pd.DataFrame
        Must contain ``trip_id`` and ``service_id``.
    calendar : pd.DataFrame
        Must contain ``service_id``, one column per lowercase English weekday
        name (``monday`` ... ``sunday``), ``start_date`` and ``end_date``.
        The date columns are compared against ``int(date)``, so they are
        assumed to hold integer ``YYYYMMDD`` values — TODO confirm against the loader.
    date : int or str
        Date in compact ``YYYYMMDD`` form, e.g. ``20240916`` or ``'20240916'``.

    Returns
    -------
    pd.DataFrame
        The merged trips/calendar rows that run on ``date`` (not a count; pass
        the result to ``count_unique_trip_id`` for the number of trips).

    Raises
    ------
    AssertionError
        If an input is not a DataFrame or a required id column is missing.
    ValueError
        If ``date`` is not a valid ``YYYYMMDD`` date.

    Notes
    -----
    Only the weekly pattern in ``calendar`` is applied; per-date exceptions
    (such as a calendar_dates table) are not considered here.
    '''
    int_date = int(date)
    str_date = str(date)
    assert isinstance(trips, pd.DataFrame), '"trips" is not a dataframe.'
    assert isinstance(calendar, pd.DataFrame), '"calendar" is not a dataframe.'
    assert 'service_id' in calendar.columns, f"'service_id' not in {calendar.columns}"
    assert 'trip_id' in trips.columns, f"'trip_id' not in {trips.columns}"

    # strptime parses compact YYYYMMDD on every Python 3 release, whereas
    # datetime.fromisoformat only accepts that form from Python 3.11 onwards.
    date_obj = datetime.strptime(str_date, '%Y%m%d')
    # Look the weekday column up by index instead of strftime('%A'), whose
    # output depends on the process locale.
    weekday_columns = ('monday', 'tuesday', 'wednesday', 'thursday',
                       'friday', 'saturday', 'sunday')
    day_of_week = weekday_columns[date_obj.weekday()]

    p = trips.merge(calendar, on='service_id', how='inner')
    # Services flagged as running on that weekday ...
    runs_that_weekday = p[day_of_week] == 1
    # ... whose validity window contains the requested date.
    covers_date = (p.start_date <= int_date) & (p.end_date >= int_date)
    return p[runs_that_weekday & covers_date]

def count_unique_trip_id(df):
    """Return the number of distinct values in ``df``'s ``trip_id`` column.

    Raises ``AssertionError`` when the column is absent.
    """
    assert 'trip_id' in df.columns
    distinct_trip_ids = df['trip_id'].unique()
    return len(distinct_trip_ids)

## Q1. How many of the Trip IDs in the North West timetable have at least one stop inside the Liverpool City Region CA boundary?

In [None]:
# Read the Liverpool City Region CA (LCRCA) boundary into a GeoDataFrame.
boundary_path = ROOT / "data/geojson/LCRCA_May2023_Boundary_EN_BGC.geojson"
bndry = gpd.read_file(boundary_path)

In [None]:
# For each timetable date: count the timetabled trips with at least one stop inside
# the boundary, and how many of those trip_ids also appear in the live data.
#
# The dates come from pd.date_range so the loop stays valid across month ends, and
# the loop variable is deliberately not called `date`, so the realtime `date`
# defined in an earlier cell is not overwritten.
#
# NOTE(review): `data` is built from the realtime files of the single `date` set in
# that earlier cell, yet it is compared against every date below — confirm intended.
# NOTE(review): the stop/boundary spatial join does not depend on the date and could
# be computed once over `tt_stops`; it is kept per date here so the intermediate
# variables later cells may rely on still exist.
service_dates = pd.date_range("2024-09-15", "2024-09-23").strftime("%Y%m%d")
for service_date in service_dates:
    # Trips that run on this date
    trips_on_this_date_df = unique_timetabled_trips_for_date(tt_trips, tt_calendar, date=service_date)
    print("Date:", service_date)
    print(f"Number of unique trips timetabled, {count_unique_trip_id(trips_on_this_date_df)}")
    # Get a list of trip_ids that run on this date
    list_of_trip_ids_on_this_date = trips_on_this_date_df.trip_id.unique()
    # Use that to filter the stop_times down to only ones on this date
    tt_stop_times_this_date = tt_stop_times[tt_stop_times.trip_id.isin(list_of_trip_ids_on_this_date)]

    # Get all the stops that are on trips that run on this date
    full_stop_info = tt_stops.merge(tt_stop_times_this_date, on='stop_id', how='inner').loc[:, ['trip_id', 'stop_lat', 'stop_lon']]

    # Build the stop geometries in one vectorised call rather than one Point per row.
    stop_points = gpd.points_from_xy(full_stop_info['stop_lon'], full_stop_info['stop_lat'])
    # Create a GeoDataFrame of the stops that are stopped at on this date.
    # NOTE(review): assumes `bndry` is also in EPSG:4326 — TODO confirm its CRS.
    points_gdf = gpd.GeoDataFrame(full_stop_info, geometry=stop_points, crs='EPSG:4326')

    # Perform spatial join with the boundary based on whether each stop is within the boundary or not.
    joined_gdf = gpd.sjoin(points_gdf, bndry, how="left", predicate="within")

    # Filter the geodf to only stops that are within the boundary (right index is not NA)
    points_inside_bndry = joined_gdf[joined_gdf.index_right.notna()]

    # Get a list of the unique Trip IDs that have at least one stop in the boundary
    list_of_trips_in_boundary = points_inside_bndry.trip_id.unique()

    # Determine how many of the above there are.
    number_in_boundary = len(list_of_trips_in_boundary)

    # How many of the trip_ids that have at least 1 stop inside the boundary are in the live data?
    number_in_boundary_and_live_data = len(data[data.trip_id.isin(list_of_trips_in_boundary)].trip_id.unique())

    print("Number of unique trip_ids with at least one stop in Liverpool City Region CA:", number_in_boundary)
    print("Number of those trip_ids that are also in our live data:", number_in_boundary_and_live_data)
    # A date with no timetabled trips in the boundary would otherwise raise ZeroDivisionError.
    if number_in_boundary:
        percentage = f"{round(100*number_in_boundary_and_live_data/number_in_boundary,2)}%"
    else:
        percentage = "n/a (no timetabled trips in boundary)"
    print(f"Percentage: {percentage}")
    print("-------------------------------")