# Info
This script was our original method of updating timetables with live location data. It's based on calculating the distance to stops and checking the bearing of the bus in relation to stops. This works well when the location data doesn't contain information about which stop the bus is close to.

We've since moved to a method that uses the `current_stop_status` and `current_stop_sequence` provided by the GTFS-RT format, and also utilises interpolation to fill in stops that we don't match to. This is detailed in `gtfsrt2gtfs_interpolation.ipynb`


Import modules

In [1]:
from gtfs_realtime_utils import *
from utils import *

Set the root path

In [None]:
# Project root, expressed relative to the current working directory.
ROOT = Path("../")
# Evaluate the absolute form as a sanity check; the result is not assigned, so ROOT itself stays relative.
ROOT.resolve()

Define a date and a region.
`iso8061_date` must be an ISO 8601 basic-format date (`YYYYMMDD`).
`region` must be the name of the region.
Your timetable GTFS file must follow the naming convention `<region>_<date>.gtfs.zip`.

In [3]:
# Date of the realtime data to process, written as YYYYMMDD. It is used further down to
# select the GTFS-RT files and to name the output directory, archive and metadata file.
iso8061_date = '20240923'
# Region name; used further down to build the timetable, output and metadata paths.
region = 'north_west'

Ensure the timetables are unzipped so we can read them.

In [None]:
# Zipped timetable for the chosen region.
# NOTE(review): the timetable date (20240918) is hard-coded here and is independent of
# `iso8061_date` - confirm this is the timetable that should be paired with the realtime data.
TIMETABLE_FILE = ROOT / f"data/gtfs/timetable/{region}_20240918.gtfs.zip"

# Define the directory where you want to extract files from the timetable
EXTRACT_DIR = ROOT / f"18SepGB_GTFS_Timetables_Downloaded/{region}"

# Extract the archive into EXTRACT_DIR (unzip_file is presumably provided by the star imports above).
unzip_file(os.path.abspath(TIMETABLE_FILE), EXTRACT_DIR)

### Dictionaries
To be able to match the live buses to a trip/route etc., we need a way to look up a given bus's `trip_id` and find its timetabled information. We'll do this using dictionaries because they have [O(1) lookup time thanks to hashmaps](https://dev.to/ajipelumi/how-dictionary-lookup-operations-are-o1-49pk).

We load just the parts we need into pandas dataframes, then use the `to_dict()` method to create our dictionaries for various ID->ID combinations.

We can get more detailed information about UK stops from the [National Public Transport Access Codes data](https://beta-naptan.dft.gov.uk/download). We need the bearing to help match buses to stops on the correct side of the road for their direction of travel.

In [None]:
# Load the ID parts of various files that we extracted from the GTFS timetable.
# The five results are unpacked in this order: agencies, routes, stops, stop_times, trips.
# NOTE: these names are re-bound later in the notebook by the load_full_gtfs call.
agencies, routes, stops, stop_times, trips = load_gtfs_ids(EXTRACT_DIR)

In [None]:
# Load the stop names and bearings for UK stops
# (joined on `stop_id` further down; its `Bearing` column feeds the direction check).
stop_names_bearings = get_stop_names_and_bearings(ROOT)
# Preview the first few rows.
stop_names_bearings.head()

In [29]:
# Join every route onto its agency and index the result by agency_id,
# ready to be turned into a lookup dictionary.
agency2route = agencies.merge(routes, on='agency_id')
agency2route = agency2route.set_index('agency_id')

# Keep the first row for each distinct agency_noc value, then convert to a dict.
# NOTE(review): as built, the dict is keyed by agency_id with agency_noc as the value,
# which is the reverse of what the variable name suggests - confirm against its users.
_unique_agency_nocs = agency2route['agency_noc'].drop_duplicates()
AgencyNOC2AgencyIDDict = _unique_agency_nocs.to_dict()

In [30]:
# A trip calls at many stops: pair every trip_id with its stop_ids, then
# collapse the pairs into one list of stop_ids per trip_id.
_trip_stop_pairs = trips.merge(stop_times, on='trip_id')[['trip_id', 'stop_id']]
trip2stoptimes = _trip_stop_pairs.groupby('trip_id').agg(list)

# trip_id -> [stop_id, ...]
Trip2StopIDDict = trip2stoptimes['stop_id'].to_dict()

In [31]:
# Some stop_ids in stops.txt never appear in stop_times.txt and are of no use here.
# The inner joins below drop them (and any stop without a names/bearings row).
stoptimes2stops = (
    stop_times.merge(stops, on='stop_id')
    .merge(stop_names_bearings, how='inner', on='stop_id')
    .drop_duplicates(subset='stop_id', keep='first')  # one row per stop_id is enough
    .set_index('stop_id')
)

# stop_id -> {'stop_lat': ..., 'stop_lon': ..., 'Bearing': ...}
_location_columns = ['stop_lat', 'stop_lon', 'Bearing']
StopID2StopLocDict = stoptimes2stops[_location_columns].to_dict(orient='index')

### Get the paths of the GTFS-RT files.

For each of the extracted GTFS-RT files we get the full path and save it in `gtfs_rt_file_paths`. This is to save some compute time downstream.

In [5]:
REALTIME_DATADIR = ROOT / "data/gtfs-rt"
# File names are expected to start with the date written as YYYY-MM-DD.
date_with_dashes = f"{iso8061_date[0:4]}-{iso8061_date[4:6]}-{iso8061_date[6:8]}"

# Walk the realtime directory tree and keep the absolute path of every file whose
# name starts with the requested date.
# os.walk yields entries in arbitrary order, so the paths are sorted to make the
# order in which feeds are parsed (and therefore any later "keep the first
# occurrence" de-duplication) reproducible between runs and machines.
gtfs_rt_file_paths = sorted(
    os.path.abspath(os.path.join(root, file))
    for root, dirs, files in os.walk(REALTIME_DATADIR)
    for file in files
    if file.startswith(date_with_dashes)
)

### Load the feed entities 
We load each feed entity into a list (array in most other languages) to loop through later. We could write this in one big loop, but we've split it up to save compute time and make the code easier to follow/debug.

In [34]:
# A single FeedMessage instance is handed to the helper together with the file paths
# (presumably reused while parsing each file - confirm in entities_to_list).
feed = gtfs_realtime_pb2.FeedMessage()
# The result is the collection of feed entities iterated over by the matching loop below.
entities = entities_to_list(feed, gtfs_rt_file_paths)

Details of the following algorithm can be found in the readme

In [None]:
BusDetailsBag = []
count = 0
t0 = time.time()

# Half-width of the coarse search box around the bus, in degrees of latitude.
box_size = 0.003 # 0.01 is ~1.1km


def _bearing_difference(bearing_a, bearing_b):
    """Return the smallest angle in degrees (0-180) between two compass bearings.

    Wraps around north, so 350 and 10 are 20 degrees apart rather than 340.
    A NaN input yields NaN.
    """
    diff = abs(bearing_a - bearing_b) % 360
    return min(diff, 360 - diff)


def _nearest_stop_on_route(lat, lon, bearing, stop_ids):
    """Find the closest plausible stop for a bus among the stops of its trip.

    A stop is plausible when it lies inside the `box_size` box around the bus
    and its bearing is within 90 degrees of the bus's bearing (a semi-circle's
    worth of error margin).

    Args:
        lat, lon: Bus position in decimal degrees.
        bearing: Bus bearing in degrees.
        stop_ids: stop_ids on the bus's trip.

    Returns:
        (stop_id, distance) for the closest plausible stop, with the distance in
        the unit returned by `haversine`, or None when no stop qualifies.
    """
    # Longitudes get closer together nearer the poles: multiply the longitude
    # difference by cos(latitude) so the box is roughly square on the ground.
    # np.cos works in radians, so the latitude is converted from degrees first.
    lon_scale = np.cos(np.radians(lat))

    nearest = None
    for stop_id in stop_ids:
        # Stops dropped by the inner joins that built StopID2StopLocDict have
        # no location/bearing, so they cannot be matched; skip them.
        stop = StopID2StopLocDict.get(stop_id)
        if stop is None:
            continue

        # The comparisons are written as `not (x < limit)` so that NaN values
        # (e.g. a stop with no bearing) are rejected as well.
        if not (abs(stop['stop_lat'] - lat) < box_size):
            continue
        if not (abs(stop['stop_lon'] - lon) * lon_scale < box_size):
            continue
        # Filter out stops that are pointing the "wrong direction".
        if not (_bearing_difference(stop['Bearing'], bearing) < 90):
            continue

        distance = haversine(lat, lon, stop['stop_lat'], stop['stop_lon'])
        # Strict "<" keeps the first stop encountered when distances tie.
        if nearest is None or distance < nearest[1]:
            nearest = (stop_id, distance)
    return nearest


# For the moment assuming that `vehicle` is passed. there are other mutually exclusive
# options: 'trip_update', 'alert', and 'shape'. See the DOCS https://gtfs.org/documentation/realtime/reference/#message-feedentity
for count, entity in enumerate(entities, start=1):
    BD = BusDetail()
    # This is the main part of the entity
    vehicle = entity.vehicle
    # These are sub parts
    trip = vehicle.trip
    pos = vehicle.position
    # These are individual values
    BD.feed_uid = entity.id
    BD.trip_id = trip.trip_id
    BD.route_id = trip.route_id
    BD.lat = round(pos.latitude, 6)
    BD.lon = round(pos.longitude, 6)
    # NOTE(review): an unset protobuf field reads as its default value, so a bus that
    # reports no bearing would look like it is heading due north - confirm the feed
    # always populates bearing.
    BD.bearing = pos.bearing
    BD.ts = vehicle.timestamp
    BD.v_id = vehicle.vehicle.id
    BD.occupancy_status = vehicle.occupancy_status
    BD.current_stop_sequence = vehicle.current_stop_sequence
    BD.current_status = vehicle.current_status
    BD.start_time = trip.start_time
    BD.start_date = trip.start_date

    # Only buses whose trip_id is part of the timetable can be matched to a stop.
    stops_on_route = Trip2StopIDDict.get(BD.trip_id) if BD.trip_id else None
    if stops_on_route:
        nearest = _nearest_stop_on_route(BD.lat, BD.lon, BD.bearing, stops_on_route)
        if nearest is not None:
            # Add the details to the current bus.
            BD.NearestStopOnRoute = nearest[0]
            BD.NearestStopDistance = nearest[1] * 1e3 #convert to m

            # Keep the observation only if the bus is "sufficiently" close to the stop.
            if BD.NearestStopDistance < 200:
                BusDetailsBag.append(BD)

    # Creating some info to see progress
    if count % 500000 == 0:
        t1 = time.time()
        print('Time elapsed:', round(t1-t0, 3), 's')
        print(f"{count} of {len(entities)} entities parsed.")

In [None]:
data = [{'trip_id': bus.trip_id, "route_id": bus.route_id, 'timestamp': bus.ts, 'nearest_stop_id': bus.NearestStopOnRoute, 'distance': bus.NearestStopDistance} for bus in BusDetailsBag]

# Add the data to a dataframe. Naming the columns explicitly keeps them present
# even when no bus was matched to a stop.
df = pd.DataFrame(data, columns=['trip_id', 'route_id', 'timestamp', 'nearest_stop_id', 'distance'])

# Work out the human readable time (timezone-naive UTC, from seconds since the epoch)
df['human_time'] = pd.to_datetime(df['timestamp'], unit='s')

# Convert to UK local time through the tz database instead of adding a fixed hour,
# so the result is right for both BST (UTC+1) and GMT (UTC+0) dates.
# The column keeps its historical name because later cells read it.
uk_local_time = df['human_time'].dt.tz_localize('UTC').dt.tz_convert('Europe/London')
df['uk_bst_time_only'] = uk_local_time.dt.strftime('%H:%M:%S')

# Weird rows where they aren't the right date. Binning them.
# The comparison uses the UK local date so that it agrees with the local times above.
service_date = pd.to_datetime(iso8061_date, format='%Y%m%d').date()
df = df.loc[uk_local_time.dt.date == service_date]

# Only remove duplicates that have the same trip_id, route_id and nearest_stop_id.
# Sorting by timestamp first makes "first" mean the earliest observation at each stop,
# independent of the order in which the feed files were read.
df = df.sort_values('timestamp', kind='stable').drop_duplicates(subset=['trip_id', 'route_id', 'nearest_stop_id'], keep='first')

FilteredOrderedBusLocations = df.copy()

In [None]:
# Reload the timetable tables (presumably with all their columns) so they can be filtered and written back out.
# NOTE: this re-binds agencies, routes, trips, stops and stop_times, which were loaded earlier by load_gtfs_ids.
agencies, routes, trips, stops, stop_times, calendar, calendar_dates, feed_info, shapes = load_full_gtfs(EXTRACT_DIR, include=['feed_info.txt', 'shapes.txt'])

In [None]:
# Output directory for the rebuilt ("real") GTFS files of this region and date.
REAL_DIR = ROOT / f"data/gtfs/real/{region}/{iso8061_date}"
# Create it together with any missing parents; do nothing if it already exists.
os.makedirs(os.path.abspath(REAL_DIR), exist_ok=True)

In [None]:
# Take the route IDs from the timetable's own trips rather than from the realtime feed.
# The realtime route_id is an optional string (an empty one cannot be cast to int), and
# using the timetable guarantees that every route referenced by the matched trips is
# written to routes.txt.
_matched_trip_rows = trips['trip_id'].isin(FilteredOrderedBusLocations['trip_id'])
RealRouteIDs = trips.loc[_matched_trip_rows, 'route_id'].drop_duplicates()
RealRoutes = routes[routes['route_id'].isin(RealRouteIDs)]
RealRoutes.to_csv(REAL_DIR / "routes.txt", index=False)
RealRoutes

In [None]:
# Keep only the agencies that operate one of the routes written to routes.txt.
# (Filtering on the full `routes` table would keep every agency in the timetable.)
RealAgencyIDs = RealRoutes['agency_id']
RealAgencies = agencies[agencies['agency_id'].isin(RealAgencyIDs)]
RealAgencies.to_csv(REAL_DIR / "agency.txt", index=False)
RealAgencies

In [None]:
# Keep only the timetabled trips that have at least one matched live observation.
RealTripIDs = FilteredOrderedBusLocations['trip_id']
_is_real_trip = trips['trip_id'].isin(RealTripIDs)
RealTrips = trips[_is_real_trip]

# Write the filtered trips next to the other rebuilt GTFS files, then display them.
RealTrips.to_csv(REAL_DIR / "trips.txt", index=False)
RealTrips

In [None]:
# Shapes are only needed for the trips that were kept.
RealShapeIDs = RealTrips['shape_id'].unique()
_is_real_shape = shapes['shape_id'].isin(RealShapeIDs)
RealShapes = shapes[_is_real_shape]
RealShapes.to_csv(REAL_DIR / "shapes.txt", index=False)

In [None]:
# Restrict both calendar tables to the services used by the kept trips.
RealServiceIDs = RealTrips['service_id'].unique()

_in_calendar = calendar['service_id'].isin(RealServiceIDs)
RealCalendar = calendar[_in_calendar]
RealCalendar.to_csv(REAL_DIR / "calendar.txt", index=False)

_in_calendar_dates = calendar_dates['service_id'].isin(RealServiceIDs)
RealCalendarDates = calendar_dates[_in_calendar_dates]
RealCalendarDates.to_csv(REAL_DIR / "calendar_dates.txt", index=False)

In [None]:
# Timetabled stop_times of the trips that have live observations.
RealTripIDs_list = RealTripIDs.to_list()
FilteredStopTimes = stop_times[stop_times.trip_id.isin(RealTripIDs_list)]
# Align the column name with stop_times so the two frames can be merged on it.
# NOTE: this renames the column on FilteredOrderedBusLocations in place; the stops.txt cell relies on it.
FilteredOrderedBusLocations.rename(columns={'nearest_stop_id': 'stop_id'}, inplace=True)
RealTrips2RealStopsDict = FilteredStopTimes.groupby('trip_id')['stop_id'].agg(list).to_dict()

# Keep only the timetabled stop visits that were actually observed, and replace the
# scheduled times with the observed local time.
RealStopTimes = FilteredStopTimes.merge(FilteredOrderedBusLocations, on=['trip_id', 'stop_id'], how='inner')
RealStopTimes['arrival_time'] = RealStopTimes['uk_bst_time_only']
RealStopTimes['departure_time'] = RealStopTimes['uk_bst_time_only']

# When sorted by stop_times.stop_sequence, two consecutive entries in stop_times.txt
# should have increasing distance, based on the field shape_dist_traveled.
# If the values are equal, this is considered as an error.
# Sorting comes before the duplicate removal so that "first" is the lowest stop_sequence.
RealStopTimes = RealStopTimes.sort_values(by=['trip_id', 'stop_sequence'], ascending=True)

# If pick up type == 1, no pickup is available (Guessing this means you can't get on the bus here?)
RealStopTimes = RealStopTimes[RealStopTimes.pickup_type != 1]

# Remove duplicate rows based on the specified columns but keep the first occurrence
RealStopTimes = RealStopTimes.drop_duplicates(subset=['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'shape_dist_traveled'], keep='first')

# A trip must visit more than one stop in stop_times.txt to be usable by passengers for boarding and alighting.
# This runs after the filters above, because they can reduce a trip to a single remaining stop.
stop_counts = RealStopTimes.groupby('trip_id')['stop_id'].count()
trip_ids_with_one_stop = stop_counts[stop_counts == 1].index.to_list()
RealStopTimes = RealStopTimes[~RealStopTimes.trip_id.isin(trip_ids_with_one_stop)] # Exclude trips with only 1 stop. (The "~" is negation)

# Filtering only columns we need to write stop_times.txt
RealStopTimes = RealStopTimes[['trip_id','arrival_time','departure_time','stop_id','stop_sequence','stop_headsign','pickup_type','drop_off_type','shape_dist_traveled','timepoint']]

RealStopTimes.to_csv(REAL_DIR / "stop_times.txt",index=False)

In [None]:
# Stops that at least one live bus was matched to.
RealStopIDs = FilteredOrderedBusLocations['stop_id'].to_list()
_is_real_stop = stops['stop_id'].isin(RealStopIDs)
RealStops = stops[_is_real_stop].copy()

# location_type has to be parseable as an integer; the nullable Int64 dtype
# keeps missing values while avoiding a float representation.
RealStops['location_type'] = RealStops['location_type'].astype('Int64')

# parent_station is left out of the written file.
RealStops = RealStops.drop(columns='parent_station')
RealStops.to_csv(REAL_DIR / "stops.txt", index=False)

In [None]:
# feed_info is written out as loaded from the timetable; no filtering is applied to it.
feed_info.to_csv(REAL_DIR / "feed_info.txt", index=False)

In [None]:
import shutil
from pathlib import Path


def zip_directory(folder_path, output_dir_path, output_filename):
    """Zip the contents of a directory into ``output_dir_path / output_filename``.

    Args:
        folder_path: Directory whose contents become the root of the archive.
        output_dir_path: Directory to write the archive into (``str`` or
            ``Path``). It is created if it does not exist yet.
        output_filename: Name of the archive; ``.zip`` is appended when the
            name does not already end with it.
    """
    # Ensure the output filename ends with a .zip extension
    if not output_filename.endswith('.zip'):
        output_filename += '.zip'

    # Accept plain strings as well as Path objects for the output directory.
    output_dir = Path(output_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_zip_path = output_dir / output_filename

    # make_archive appends ".zip" itself, so it is given the path without that
    # final suffix (as a str, which every Python 3 version accepts).
    shutil.make_archive(str(output_zip_path.with_suffix('')), 'zip', folder_path)
    print(f"Directory '{folder_path}' successfully zipped as '{output_filename}'.")

In [None]:
# Bundle the rebuilt feed into data/real/<region>_<date>.gtfs.zip (zip_directory appends the ".zip").
zip_directory(REAL_DIR, ROOT / 'data/real', f'{region}_{iso8061_date}.gtfs')

In [None]:
# Collect metadata about the parsed entities. The CSV path is handed to the helper
# (presumably the file the metadata is written to - confirm in get_entity_metadata).
metadata = get_entity_metadata(entities, ROOT / f'data/gtfs-rt/metadata/{region}_{iso8061_date}.csv')
# Display the result.
metadata