In [1]:
import os
import pandas as pd
import re
import numpy as np
import json
# Move the working directory up one level before importing from `pipelines`
# (presumably so the package is importable from there -- confirm).
# NOTE(review): this is not idempotent; re-running the cell moves up one more level each time.
os.chdir("../")
from pathlib import Path
from pipelines.utils import load_full_gtfs, convert_to_unix_timestamp, get_stop_names_and_bearings
# Base directory for every path built in this notebook: the working directory after the chdir above.
ROOT = Path(os.getcwd())
# Last expression of the cell, so the resolved path is displayed as the cell output.
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/bus-tracking')

In [2]:
# Read the BODS data catalogue.
# Its 'XML:Line Name', 'XML:National Operator Code' and 'OTC:Start Point' / 'OTC:Finish Point' /
# 'OTC:Via' columns are used by get_detailed_route_info to name each route.
bdc = pd.read_csv(ROOT / "web/bodsdatacatalogue/timetables_data_catalogue.csv")

In [3]:
# Region to process; it is interpolated into the real-time and timetable archive file names below.
region = 'north_west'

In [4]:
# Load the GTFSRT data
# One archive per day: range(15, 24) yields the days 15..23 of September 2024.
dates = [f'202409{i}' for i in range(15, 24)]
# The same dates reformatted as YYYY-MM-DD.
date_strs = [f"{date[0:4]}-{date[4:6]}-{date[6:8]}" for date in dates]
# Each entry is [result of load_full_gtfs, date_str]. 'shapes.txt' is passed as a second argument
# (presumably asking for the shapes table as well -- confirm against pipelines.utils).
gtfsrt_data = [[load_full_gtfs(ROOT / f"data/real/{region}_{date}.gtfs.zip", ['shapes.txt']), date_str] for date, date_str in zip(dates, date_strs)]

# Load the timetable
# Without the extra argument the loader's result is unpacked into these seven tables, in this order.
tt_agencies, tt_routes, tt_trips, tt_stops, tt_stop_times, tt_calendar, tt_calendar_dates = load_full_gtfs(ROOT / f"18SepGB_GTFS_Timetables_Downloaded/itm_{region}_gtfs.zip")

File "/Users/lukestrange/Code/bus-tracking/data/real/north_west_20240915.gtfs.zip" is a zip file. Unzipping and reading...
File "/Users/lukestrange/Code/bus-tracking/data/real/north_west_20240916.gtfs.zip" is a zip file. Unzipping and reading...
File "/Users/lukestrange/Code/bus-tracking/data/real/north_west_20240917.gtfs.zip" is a zip file. Unzipping and reading...
File "/Users/lukestrange/Code/bus-tracking/data/real/north_west_20240918.gtfs.zip" is a zip file. Unzipping and reading...
File "/Users/lukestrange/Code/bus-tracking/data/real/north_west_20240919.gtfs.zip" is a zip file. Unzipping and reading...
File "/Users/lukestrange/Code/bus-tracking/data/real/north_west_20240920.gtfs.zip" is a zip file. Unzipping and reading...
File "/Users/lukestrange/Code/bus-tracking/data/real/north_west_20240921.gtfs.zip" is a zip file. Unzipping and reading...
File "/Users/lukestrange/Code/bus-tracking/data/real/north_west_20240922.gtfs.zip" is a zip file. Unzipping and reading...
File "/Users/luk

In [5]:
def glue_data(data, item, subset=None, drop_duplicates=True, dated_item=4):
    '''
    Stack the same table from every loaded feed into one DataFrame.

    The input frames are never modified: the dated table is copied before the
    ``date_str`` column is added, and de-duplication works on the stacked copy.

    Parameters
    ----------
    data : list
        One entry per feed, each of the form ``[tables, date_str]`` where
        ``tables[item]`` is a DataFrame and ``date_str`` labels that feed.
    item : int
        Position of the table to stack within each entry's ``tables``.
    subset : str or list of str, optional
        Column label(s) used to identify duplicates.
    drop_duplicates : bool, default True
        Drop duplicate rows (keeping the first occurrence) after stacking.
    dated_item : int, default 4
        Table position whose rows are tagged with a ``date_str`` column holding
        the label of the feed they came from.

    Returns
    -------
    pandas.DataFrame
        The stacked rows with their original index labels. Empty tables are
        skipped; if every table is empty the last one seen is returned (as a
        copy), and an empty ``data`` list yields an empty DataFrame.
    '''
    frames = []
    last_seen = None
    for entry in data:
        frame = entry[0][item]
        if item == dated_item:
            # assign() returns a new frame, so the caller's table is left untouched.
            frame = frame.assign(date_str=entry[1])
        last_seen = frame
        if not frame.empty:
            frames.append(frame)

    if frames:
        # Concatenate once instead of growing the result inside the loop.
        result = pd.concat(frames)
    elif last_seen is not None:
        result = last_seen.copy()
    else:
        result = pd.DataFrame()

    if drop_duplicates and not result.empty:
        result = result.drop_duplicates(subset=subset, keep='first')

    return result

# Stack each table across every loaded day. The integer is the table's position in the
# loader's result; de-duplication is glue_data's default, so only the key columns are given.
all_agency = glue_data(gtfsrt_data, 0, subset='agency_id')
all_routes = glue_data(gtfsrt_data, 1, subset='route_id')
# Trips keep their duplicate rows.
all_trips = glue_data(gtfsrt_data, 2, drop_duplicates=False)
# Stop times are unique per trip, stop, sequence position and day.
all_stop_times = glue_data(
    gtfsrt_data,
    4,
    subset=['trip_id', 'stop_id', 'stop_sequence', 'date_str'],
)
all_stops = glue_data(gtfsrt_data, 3, subset='stop_id')
all_calendars = glue_data(gtfsrt_data, 5, subset='service_id')
all_shapes = glue_data(gtfsrt_data, 7, subset=['shape_id', 'shape_pt_sequence'])

In [6]:
def tidy_route_names(string: str):
    '''
    Simplify a route point name for display.

    Steps:
        - Remove references to bus stations ("bus station" / "city bus station", any case).
        - Replace every run of characters outside A-Z / a-z with a single space.
        - Trim leading and trailing spaces.

    Hyphenation and lower-casing are not done here; see kebab_case for that.
    '''
    bus_station_pattern = r"(city bus station|bus station)"
    string = re.sub(bus_station_pattern, '', string, flags=re.IGNORECASE)
    string = re.sub(r'[^a-zA-Z]+', ' ', string)
    # Strip last: the substitution above turns leading/trailing punctuation or digits
    # into spaces, which would otherwise leak into the returned name.
    return string.strip()

def kebab_case(string:str):
    '''
    Convert a space-separated name to lower-case kebab-case.

    Every run of spaces and/or hyphens becomes a single hyphen, hyphens at either
    end are removed, and the result is lower-cased.
    '''
    # One pass over whole runs, so three or more separators also collapse to one hyphen.
    string = re.sub(r'[ -]+', '-', string)
    string = string.strip('-')
    return string.lower()

def get_detailed_route_info(bdc, route_short_name:str, agency_noc:str):
    """
    Look up a route's start, finish and via points in the data catalogue.

    Parameters
    ----------
    bdc : pandas.DataFrame
        Catalogue with the columns 'XML:Line Name', 'XML:National Operator Code',
        'OTC:Start Point', 'OTC:Finish Point' and 'OTC:Via'. It is not modified.
    route_short_name : str
        Line name to find. A catalogue row may list several line names separated
        by spaces; any one of them may match.
    agency_noc : str
        National Operator Code the row must belong to.

    Returns
    -------
    tuple of (list, list)
        ``[start, finish, via]`` after tidy_route_names, and the same three values
        after kebab_case. Values that are not strings (e.g. NaN) are passed
        through unchanged in both lists.

    Raises
    ------
    IndexError
        If no catalogue row matches the operator and line name.
    """
    line_col = 'XML:Line Name'
    point_cols = ['OTC:Start Point', 'OTC:Finish Point', 'OTC:Via']

    # Narrow to the operator first so only its rows are copied, split and exploded.
    operator_rows = bdc.loc[
        bdc['XML:National Operator Code'] == agency_noc,
        [line_col] + point_cols,
    ].copy()
    # Split the line names by a space, then give every line name its own row.
    operator_rows[line_col] = operator_rows[line_col].str.split(' ')
    exploded = operator_rows.explode(line_col)

    matches = exploded[exploded[line_col] == route_short_name]
    if matches.empty:
        raise IndexError(
            f"No catalogue entry for line {route_short_name!r} and operator {agency_noc!r}"
        )

    # Use the first matching row, in catalogue order.
    names = [matches[col].values[0] for col in point_cols]

    tidy_names = []
    kebab_names = []
    for name in names:
        if isinstance(name, str):
            tidy_name = tidy_route_names(name)
            kebab_name = kebab_case(tidy_name)
        else:
            tidy_name = name
            kebab_name = name
        tidy_names.append(tidy_name)
        kebab_names.append(kebab_name)

    return tidy_names, kebab_names

def get_row_info(row):
    '''
    Pull the route and agency fields out of a row.

    Returns the tuple (route_id, route_short_name, agency_id, agency_noc, agency_name).
    '''
    wanted = ('route_id', 'route_short_name', 'agency_id', 'agency_noc', 'agency_name')
    return tuple(row[field] for field in wanted)

def get_route_id(agency_id, route_short_name, routes):
    '''Return the route_id of the first row of `routes` matching both the agency and the short name.'''
    same_agency = routes.agency_id == agency_id
    same_name = routes.route_short_name == route_short_name
    matching = routes[same_agency & same_name]
    return matching.route_id.values[0]

def get_trips_on_this_route(route_id:str, trips):
    '''Return the trip_id, trip_headsign and shape_id columns of the rows of `trips` on the given route.'''
    on_route = trips.route_id == route_id
    wanted_columns = ['trip_id', 'trip_headsign', 'shape_id']
    return trips.loc[on_route, wanted_columns]

def get_unique_values_from_column(data, column_name:str):
    '''Return the distinct values of `data[column_name]`, as given by the column's `unique()` method.'''
    return data[column_name].unique()

# NOTE: the helpers below are disabled. The body of format_trip_list used to be left
# uncommented under its commented-out `def` line, which made Python parse it as
# unreachable code at the end of get_unique_values_from_column; it is now commented
# out in full, like the other helpers.

# def get_items_for_unique_set(data, match_column, unique_set, slice_columns=None, rename=None):
#     matched_data = data[data[match_column].isin(unique_set)]
#     if slice_columns:
#         matched_data_sliced = matched_data.loc[:, slice_columns]
#     if rename:
#         matched_data_sliced.rename(columns=rename, inplace=True)
#     return matched_data_sliced

# def fix_shapes(data):
#     data = data.groupby('shape_id').apply(lambda x: x[['shape_pt_lon', 'shape_pt_lat']].values.round(5).tolist(), include_groups=False).reset_index(name='geometry')
#     return data

# def create_dict(data, index_col, value_col):
#     return data.set_index(index_col)[value_col].to_dict()

# def format_stops(data, stop_bearings):
#     stops_this_route = None
#     stops_this_route = data.merge(stop_bearings, on='stop_id', how='inner')
#     stops_this_route.rename(columns={'stop_name': 'name', 'stop_lat': 'lat', 'stop_lon': 'lon', 'Bearing': 'bearing'}, inplace=True)
#     stops_this_route['bearing'] = stops_this_route['bearing'].astype(int)
#     stops_this_route.set_index('stop_id', inplace=True)
#     stops = stops_this_route.to_dict(orient='index')
#     return stops

# def format_trip_list(stop_times_for_this_route):
#     # Create an empty list to store results
#     trip_list = []
#
#     # Iterate over each unique trip_id
#     for trip_id in stop_times_for_this_route['trip_id'].unique():
#
#         # Filter rows for the current trip_id
#         trip_df = stop_times_for_this_route[stop_times_for_this_route['trip_id'] == trip_id]
#         # Sort by 'real' time
#         trip_df = trip_df.sort_values(by='real')
#         # Create a list of dicts for this trip
#         current_trip_data = []
#         for i, row in trip_df.iterrows():
#             trip_data = [
#                     row['stop_id'],
#                     int(row['real']),
#                     int(row['timetable'])
#             ]
#             current_trip_data.append(trip_data)
#         # Append this trip's list to the main list
#         trip_list.append(current_trip_data)
#
#     return trip_list

def create_metadata(route_short_name, kebab_start, kebab_finish, route_start, route_via, route_finish, agency_name, agency_noc):
    """
    Build the metadata dictionary for one route.

    Parameters
    ----------
    route_short_name : route number / short name, used in both the id and the name.
    kebab_start, kebab_finish : kebab-case start and finish names, used in the id.
    route_start, route_via, route_finish : display names of the route's points.
        `route_via` may be missing (None, NaN or a blank string), in which case it
        is left out of the display name.
    agency_name, agency_noc : copied into the result unchanged.

    Returns
    -------
    dict with the keys 'id', 'name', 'agency_name' and 'agency_noc'.
    """
    # `route_via == np.nan` is always False because NaN never compares equal to
    # anything, so a missing via point has to be detected with pd.isna instead.
    if isinstance(route_via, str):
        has_via = bool(route_via.strip())
    else:
        has_via = not pd.isna(route_via)

    if has_via:
        name = f"{route_short_name} - {route_start} - {route_via} - {route_finish}"
    else:
        name = f"{route_short_name} - {route_start} - {route_finish}"

    return {
        'id': f"{route_short_name}-{kebab_start}-{kebab_finish}",
        'name': name,
        "agency_name": agency_name,
        "agency_noc": agency_noc,
    }

In [7]:
# Attach each route's agency columns (inner join on agency_id, so routes without a
# matching agency row are dropped).
routes2agency = all_routes.merge(all_agency, on='agency_id', how='inner')
# Lookup of route_id -> {column: value} for the export loop further down.
all_routes_dict = routes2agency.set_index('route_id').to_dict(orient='index')

In [8]:
# Keep only the columns needed to compare real and timetabled arrivals.
# NOTE(review): both names are reassigned in place, so this cell is not idempotent --
# after the merge `all_stop_times` no longer has an 'arrival_time' column, and
# re-running the cell without re-running the load cells will fail.
all_stop_times = all_stop_times[['trip_id', 'stop_id', 'stop_sequence', 'arrival_time', 'date_str']]
tt_stop_times = tt_stop_times[['trip_id', 'stop_id', 'stop_sequence', 'arrival_time']]
# Inner join on (trip_id, stop_id, stop_sequence): rows without a timetable match are dropped.
# The two arrival_time columns become 'arrival_time_real' and 'arrival_time_timetable'.
all_stop_times = all_stop_times.merge(tt_stop_times, on=['trip_id', 'stop_id', 'stop_sequence'], how='inner', suffixes=('_real', '_timetable'))

In [9]:
# tt_trip2stoptimes = tt_stop_times.groupby('trip_id')[['arrival_time', 'departure_time', 'stop_id', 'stop_sequence']].agg(list).to_dict(orient='index')
# trip_id -> {column: list of values}. The grouping key is trip_id alone, so each list
# holds that trip's rows from every loaded day, with 'date_str' giving the day per row.
trip2stoptimes = all_stop_times.groupby('trip_id')[['arrival_time_real', 'arrival_time_timetable', 'stop_id', 'stop_sequence', 'date_str']].agg(list).to_dict(orient='index')
# shape_id -> {'geometry': [[lon, lat], ...]} with coordinates rounded to 5 decimal places.
shape_dict = all_shapes.groupby('shape_id').apply(lambda x: x[['shape_pt_lon', 'shape_pt_lat']].values.round(5).tolist(), include_groups=False).reset_index(name='geometry').set_index('shape_id').to_dict(orient='index')

In [10]:
# Bearing for each stop_id, taken from the project helper.
stop_bearings = get_stop_names_and_bearings()[['stop_id', 'Bearing']] 
# stop_id -> {column: value}. The join is inner, so stops without a bearing row are
# absent from this lookup.
stops_dict = all_stops.merge(stop_bearings, on='stop_id', how='inner').set_index('stop_id').to_dict(orient='index')

In [None]:
# Write one JSON file per route with its metadata, line geometry, stops and trips.
for route_id, values in all_routes_dict.items():
    agency_id = values['agency_id']
    route_short_name = values['route_short_name']
    agency_noc = values['agency_noc']
    agency_name = values['agency_name']

    try:
        human_names, kebab_names = get_detailed_route_info(bdc, route_short_name, agency_noc)
        route_start, route_finish, route_via = human_names
        kebab_start, kebab_finish, _ = kebab_names
    except Exception:
        # Best effort: routes that cannot be named from the catalogue are skipped.
        # `Exception` rather than a bare except, so Ctrl-C still interrupts the loop.
        print("Unable to get route start and end. Skipping...")
        continue

    meta = create_metadata(route_short_name, kebab_start, kebab_finish, route_start, route_via, route_finish, agency_name, agency_noc)

    trips_on_this_route = get_trips_on_this_route(route_id, all_trips)
    unique_trips = get_unique_values_from_column(trips_on_this_route, 'trip_id')
    trips = []
    # Every stop id seen on this route, in first-seen order. A dict is used as an
    # ordered set so the output does not depend on set iteration order.
    stop_ids_on_route = dict()

    for trip_id in unique_trips:
        try:
            stop_info = trip2stoptimes[trip_id]
        except KeyError:
            print(f'No real time info for trip_id:{trip_id}, route_id:{route_id}, agency_name:{agency_name}, route number: {route_short_name}')
            continue
        tt_arrival_times = stop_info['arrival_time_timetable']
        real_arrival_times = stop_info['arrival_time_real']
        real_stop_ids = stop_info['stop_id']
        # Named differently from the notebook-level `date_strs` so that list is not overwritten.
        trip_date_strs = stop_info['date_str']

        real_timestamps = [convert_to_unix_timestamp(p, q) for p, q in zip(real_arrival_times, trip_date_strs)]
        tt_timestamps = [convert_to_unix_timestamp(p, q) for p, q in zip(tt_arrival_times, trip_date_strs)]
        # One [stop_id, real_timestamp, timetable_timestamp] triple per stop-time row.
        trips.append([list(triple) for triple in zip(real_stop_ids, real_timestamps, tt_timestamps)])

        stop_ids_on_route.update(dict.fromkeys(real_stop_ids))

    unique_shapes = get_unique_values_from_column(trips_on_this_route, 'shape_id')
    line = dict()
    for shape_id in unique_shapes:
        try:
            line[shape_id] = shape_dict[shape_id]['geometry']
        except KeyError:
            # Trips without a shape have a NaN shape_id, which is not in shape_dict.
            print(f'Shape ID was {shape_id}, type {type(shape_id)}')
            continue

    stops = dict()
    for stop_id in stop_ids_on_route:
        try:
            stop = stops_dict[stop_id]
        except KeyError:
            # stops_dict only holds stops that have a bearing; skip the rest instead of aborting the export.
            print(f'No stop details for stop_id:{stop_id}, route_id:{route_id}. Skipping stop...')
            continue
        # Longitude comes from stop_lon and latitude from stop_lat (these were swapped before).
        stops[stop_id] = {"name": stop['stop_name'], "lon": stop['stop_lon'], "lat": stop['stop_lat'], "bearing": stop['Bearing']}

    content = {'meta': meta, 'line': line, 'stops': stops, 'trips': trips}
    with open(ROOT / f"web/TLD/{route_short_name}-{kebab_start}-{kebab_finish}.json", "w") as f:
        json.dump(content, f, separators=(',',':'))

Unable to get route start and end. Skipping...
Unable to get route start and end. Skipping...
Unable to get route start and end. Skipping...
Unable to get route start and end. Skipping...
Unable to get route start and end. Skipping...
Unable to get route start and end. Skipping...
Shape ID was nan, type <class 'float'>
Shape ID was nan, type <class 'float'>
Shape ID was nan, type <class 'float'>
No real time info for trip_id:VJeab084f45591972a8a668b91fc0c8777db835db1, route_id:1608, agency_name:Stagecoach Manchester, route number: 192
No real time info for trip_id:VJ390a362eb00e0b6d383795a2310c8c8e837d715b, route_id:1608, agency_name:Stagecoach Manchester, route number: 192
No real time info for trip_id:VJf289a908b9d90b4b074402c4332eedeb629d8904, route_id:1608, agency_name:Stagecoach Manchester, route number: 192
No real time info for trip_id:VJ3fff8bb3db57e56df3f524bcb3bbff6521ab014e, route_id:1608, agency_name:Stagecoach Manchester, route number: 192
No real time info for trip_id:VJ3

In [None]:
# Compare how many distinct trips have a service calendar starting inside the loaded
# date window: first for the timetable, then for the real-time data.
# NOTE(review): the comparisons assume start_date is numeric in YYYYMMDD form -- confirm.
p = tt_trips.merge(tt_calendar, on='service_id', how='inner')
p = p[(p.start_date >= 20240915) & (p.start_date <= 20240923)]
print(len(p.trip_id.unique()))
# Same count for the stacked real-time trips and calendars.
d = all_trips.merge(all_calendars, on='service_id', how='inner')
d = d[(d.start_date >= 20240915) & (d.start_date <= 20240923)]
print(len(d.trip_id.unique()))

573
361
