In [None]:
import os
import pandas as pd
import re
import numpy as np
import json
# Move the working directory one level up. The path is relative, so running this
# cell a second time in the same kernel moves up again; restart the kernel
# before re-running.
# NOTE(review): presumably this makes the repository root the working directory
# so that the `pipelines` package below can be imported — confirm.
os.chdir("../")
from pathlib import Path
from pipelines.utils import load_full_gtfs, convert_to_unix_timestamp, get_stop_names_and_bearings
# The data and output paths used in the cells below are built relative to ROOT,
# which is the working directory set above.
ROOT = Path(os.getcwd())
# Last expression of the cell: evaluates to the resolved path (ROOT is not reassigned).
ROOT.resolve()

In [None]:
# Load the BODS timetables data catalogue into a DataFrame
catalogue_csv = ROOT / "web/bodsdatacatalogue/timetables_data_catalogue.csv"
bdc = pd.read_csv(catalogue_csv)

In [None]:
# One GTFS-RT archive per day, 15 to 23 September 2024.
dates = [f'202409{day}' for day in range(15, 24)]
# The same days formatted as YYYY-MM-DD.
date_strs = ["-".join((stamp[0:4], stamp[4:6], stamp[6:8])) for stamp in dates]

# Each entry is [tables returned by load_full_gtfs, date string].
# NOTE(review): presumably the ['shapes.txt'] argument asks for the shapes table
# in addition to the default ones — confirm against load_full_gtfs.
gtfsrt_data = [
    [load_full_gtfs(ROOT / f"data/real/yorkshire_{date}.gtfs.zip", ['shapes.txt']), date_str]
    for date, date_str in zip(dates, date_strs)
]

# Timetable feed: unpacked into its seven tables.
(
    tt_agencies,
    tt_routes,
    tt_trips,
    tt_stops,
    tt_stop_times,
    tt_calendar,
    tt_calendar_dates,
) = load_full_gtfs(ROOT / f"18SepGB_GTFS_Timetables_Downloaded/itm_yorkshire_gtfs.zip")

In [None]:
def glue_data(data, item, subset=None, drop_duplicates=True):
    '''
    Stack one table from every loaded feed into a single DataFrame.

    Parameters:
        data: sequence of ``[tables, date_str]`` entries, where ``tables[item]``
            is a DataFrame.
        item: position of the wanted table inside each ``tables``.
        subset: column label(s) handed to ``DataFrame.drop_duplicates``. When it
            is exactly ``'route_id'`` a ``date_str`` column is added as well.
        drop_duplicates: when True, duplicate rows are dropped and the first
            occurrence is kept.

    Returns:
        A new DataFrame. The frames held in ``data`` are never modified.
    '''
    frames = [entry[0][item] for entry in data]
    non_empty = [frame for frame in frames if not frame.empty]

    if non_empty:
        # A single concat always yields a new object, so later steps cannot
        # write through to the caller's frames, and it avoids re-copying the
        # accumulated rows on every iteration.
        result = pd.concat(non_empty)
    elif frames:
        result = frames[-1].copy()
    else:
        result = pd.DataFrame()

    if drop_duplicates and not result.empty:
        result = result.drop_duplicates(subset, keep='first')

    if subset == 'route_id':
        # NOTE(review): every row receives the date string of the LAST entry in
        # `data`, whichever feed the row came from. That is what this function
        # has always produced; confirm it is the date the callers want.
        result = result.assign(date_str=data[-1][1] if data else None)

    return result

# Table positions used below: 0 agency, 1 routes, 2 trips, 3 stops, 4 stop_times, 7 shapes.
# Duplicates are dropped on the given key columns; trips are kept as loaded.
all_agency = glue_data(gtfsrt_data, 0, subset='agency_id')
all_routes = glue_data(gtfsrt_data, 1, subset='route_id')
all_trips = glue_data(gtfsrt_data, 2, drop_duplicates=False)
all_stop_times = glue_data(gtfsrt_data, 4, subset=['trip_id', 'stop_id', 'stop_sequence'])
all_stops = glue_data(gtfsrt_data, 3, subset='stop_id')
all_shapes = glue_data(gtfsrt_data, 7, subset=['shape_id', 'shape_pt_sequence'])

In [None]:
def tidy_route_names(string: str):
    '''
    Simplify a route place name for display.

    Steps:
        - Remove references to bus stations ("city bus station" or
          "bus station", in any letter case).
        - Replace every run of characters other than A-Z / a-z with one space.
        - Strip leading and trailing whitespace.

    Hyphenation and lower-casing are not done here; kebab_case does that.

    Parameters:
        string: the raw place name.

    Returns:
        The cleaned name, containing only letters and single spaces.
    '''
    bus_station_pattern = r"(city bus station|bus station)"
    string = re.sub(bus_station_pattern, '', string, flags=re.IGNORECASE)
    string = re.sub(r'[^a-zA-Z]+', ' ', string)
    # Strip last: the substitution above turns leading or trailing punctuation
    # and digits into spaces, which stripping earlier would leave in place.
    return string.strip()

def kebab_case(string:str):
    '''
    Convert a name to lower-case kebab-case.

    Every run of whitespace and/or hyphens becomes a single hyphen, hyphens at
    either end are removed, and the result is lower-cased.

    Parameters:
        string: the text to convert.

    Returns:
        The kebab-case form, e.g. "Leeds City" -> "leeds-city".
    '''
    # One pass over whole runs: a pairwise "--" replacement would leave "--"
    # behind when three or more hyphens appear in a row.
    string = re.sub(r'[\s-]+', '-', string)
    return string.strip('-').lower()

def get_detailed_route_info(bdc, route_short_name:str, agency_noc:str):
    """
    Look up the start, finish and via points of a route in the data catalogue.

    A catalogue row matches when its 'XML:National Operator Code' equals
    ``agency_noc`` and ``route_short_name`` is one of the space-separated
    entries in its 'XML:Line Name'. The first matching row is used.

    Parameters:
        bdc: catalogue DataFrame with the columns 'XML:Line Name',
            'XML:National Operator Code', 'OTC:Start Point',
            'OTC:Finish Point' and 'OTC:Via'. It is not modified.
        route_short_name: the line name to look for.
        agency_noc: the operator code to look for.

    Returns:
        ``(tidy_names, kebab_names)``: two lists ordered [start, finish, via].
        String values are cleaned with tidy_route_names and then converted with
        kebab_case; non-string values (e.g. NaN) are passed through unchanged
        in both lists.

    Raises:
        IndexError: when no catalogue row matches.
    """
    # Narrow to the operator first so the line names of the rest of the
    # catalogue are never split; this also avoids copying the whole frame.
    operator_rows = bdc[bdc['XML:National Operator Code'] == agency_noc]

    def _serves_route(line_names):
        # Non-string cells (e.g. NaN) can never match a line name.
        return isinstance(line_names, str) and route_short_name in line_names.split(' ')

    serves_route = operator_rows['XML:Line Name'].map(_serves_route).to_numpy(dtype=bool)
    filtered_result = operator_rows[serves_route]
    if filtered_result.empty:
        raise IndexError(
            f"No catalogue entry for line {route_short_name!r} and operator {agency_noc!r}"
        )

    start = filtered_result['OTC:Start Point'].values[0]
    finish = filtered_result['OTC:Finish Point'].values[0]
    via = filtered_result['OTC:Via'].values[0]

    tidy_names = []
    kebab_names = []
    for name in (start, finish, via):
        if isinstance(name, str):
            tidy_name = tidy_route_names(name)
            kebab_name = kebab_case(tidy_name)
        else:
            tidy_name = name
            kebab_name = name
        tidy_names.append(tidy_name)
        kebab_names.append(kebab_name)

    return tidy_names, kebab_names

def get_row_info(row):
    """
    Pull the fields used by the export loop out of one row.

    Returns the tuple
    (route_id, route_short_name, agency_id, agency_noc, agency_name, date_str).
    """
    wanted = ('route_id', 'agency_id', 'route_short_name', 'agency_noc', 'agency_name', 'date_str')
    picked = {key: row[key] for key in wanted}
    return (
        picked['route_id'],
        picked['route_short_name'],
        picked['agency_id'],
        picked['agency_noc'],
        picked['agency_name'],
        picked['date_str'],
    )

def get_route_id(agency_id, route_short_name, routes):
    """
    Return the route_id of the first row in `routes` whose agency_id and
    route_short_name both match. Raises IndexError when no row matches.
    """
    same_agency = routes.agency_id == agency_id
    same_name = routes.route_short_name == route_short_name
    candidates = routes[same_agency & same_name]
    return candidates.route_id.values[0]

def get_trips_on_this_route(route_id:str, trips):
    """
    Return the trip_id, trip_headsign and shape_id columns of the rows in
    `trips` whose route_id equals `route_id`.
    """
    on_route = trips.route_id == route_id
    wanted_columns = ['trip_id', 'trip_headsign', 'shape_id']
    return trips.loc[on_route, wanted_columns]

def get_unique_values_from_column(data, column_name:str):
    """Return the distinct values of `data[column_name]`, in order of first appearance."""
    column = data[column_name]
    return column.unique()

def get_items_for_unique_set(data, match_column, unique_set, slice_columns=None, rename=None):
    """
    Select the rows of `data` whose `match_column` value is in `unique_set`.

    Parameters:
        data: the DataFrame to filter. It is not modified.
        match_column: name of the column tested against `unique_set`.
        unique_set: collection of accepted values (passed to ``Series.isin``).
        slice_columns: optional list of columns to keep; all columns are kept
            when omitted.
        rename: optional ``{old: new}`` mapping of column names, applied after
            the column selection.

    Returns:
        A new DataFrame with the selected rows and columns.
    """
    result = data[data[match_column].isin(unique_set)]
    if slice_columns:
        result = result.loc[:, slice_columns]
    if rename:
        # Not in place: the frame above can be a slice of `data`, and an
        # in-place rename on it can trigger pandas' SettingWithCopyWarning.
        result = result.rename(columns=rename)
    return result

def fix_shapes(data):
    data = data.groupby('shape_id').apply(lambda x: x[['shape_pt_lon', 'shape_pt_lat']].values.round(5).tolist(), include_groups=False).reset_index(name='geometry')
    return data

def create_dict(data, index_col, value_col):
    """Build a dict that maps each `index_col` value to its `value_col` value."""
    indexed = data.set_index(index_col)
    values = indexed[value_col]
    return values.to_dict()

def format_stops(data, stop_bearings):
    """
    Join stops to their bearings and return them keyed by stop_id.

    Only stops present in both frames are kept (inner join on 'stop_id').
    Columns are renamed to 'name', 'lat', 'lon' and 'bearing', and the bearing
    is cast to int. The result maps stop_id -> {column: value}.
    """
    column_names = {'stop_name': 'name', 'stop_lat': 'lat', 'stop_lon': 'lon', 'Bearing': 'bearing'}
    joined = data.merge(stop_bearings, on='stop_id', how='inner').rename(columns=column_names)
    joined['bearing'] = joined['bearing'].astype(int)
    return joined.set_index('stop_id').to_dict(orient='index')

def format_trip_list(stop_times_for_this_route):
    """
    Turn a stop-times table into one list of stop visits per trip.

    Parameters:
        stop_times_for_this_route: DataFrame with the columns 'trip_id',
            'stop_id', 'real' and 'timetable'.

    Returns:
        A list with one entry per trip, in order of each trip's first
        appearance in the input. Each entry is a list of
        ``[stop_id, int(real), int(timetable)]`` rows sorted by 'real'.
    """
    trip_list = []

    # A single grouping pass replaces re-filtering the whole frame once per
    # trip; sort=False keeps the trips in order of first appearance.
    for _, trip_df in stop_times_for_this_route.groupby('trip_id', sort=False):
        trip_df = trip_df.sort_values(by='real')
        current_trip_data = [
            [stop_id, int(real), int(timetable)]
            for stop_id, real, timetable in zip(
                trip_df['stop_id'], trip_df['real'], trip_df['timetable']
            )
        ]
        trip_list.append(current_trip_data)

    return trip_list

def create_metadata(route_short_name, kebab_start, kebab_finish, route_start, route_via, route_finish, agency_name, agency_noc):
    """
    Build the 'meta' dictionary for one route.

    Parameters:
        route_short_name: the route's short name.
        kebab_start, kebab_finish: kebab-case start and finish, used in the id.
        route_start, route_finish: display names of the start and finish.
        route_via: display name of the via point, or a non-string value such
            as NaN or None when there is none.
        agency_name, agency_noc: copied into the result unchanged.

    Returns:
        A dict with the keys 'id', 'name', 'agency_name' and 'agency_noc'. The
        via point is included in 'name' only when it is a non-blank string.
    """
    # A direct comparison with NaN is always False, so missing values are
    # detected by type instead: a usable via point is a non-blank string.
    has_via = isinstance(route_via, str) and route_via.strip() != ''
    if has_via:
        name = f"{route_short_name} - {route_start} - {route_via} - {route_finish}"
    else:
        name = f"{route_short_name} - {route_start} - {route_finish}"
    return {
        'id': f"{route_short_name}-{kebab_start}-{kebab_finish}",
        'name': name,
        "agency_name": agency_name,
        "agency_noc": agency_noc,
    }

In [None]:
meta = None
formatted_stops = None
trip_list = None
line = None
routes2agency = all_routes.merge(all_agency, on='agency_id', how='inner')

# The bearings lookup takes no per-route input, so fetch it once rather than
# once per route.
stop_bearings = get_stop_names_and_bearings()[['stop_id', 'Bearing']]

# Write one JSON file per route: metadata, line geometry, stops and trips.
for i, row in routes2agency.iterrows():
    route_id, route_short_name, agency_id, agency_noc, agency_name, date_str = get_row_info(row)

    # Routes without a usable catalogue entry are skipped. `except Exception`
    # (rather than a bare except) leaves KeyboardInterrupt free to stop the run.
    try:
        human_names, kebab_names = get_detailed_route_info(bdc, route_short_name, agency_noc)
        route_start, route_finish, route_via = human_names
        kebab_start, kebab_finish, kebab_via = kebab_names
    except Exception:
        print("Unable to get route start and end. Skipping...")
        continue

    # Re-resolve the route id from the agency and short name (first match).
    route_id = get_route_id(agency_id, route_short_name, all_routes)

    trips_on_this_route = get_trips_on_this_route(route_id, all_trips)
    unique_trips = get_unique_values_from_column(trips_on_this_route, 'trip_id')
    unique_shapes = get_unique_values_from_column(trips_on_this_route, 'shape_id')

    # Timetabled stop times for the trips that were actually observed.
    tt_stop_times_for_this_route = get_items_for_unique_set(
        tt_stop_times, 'trip_id', unique_trips,
        slice_columns=['trip_id', 'arrival_time', 'stop_id', 'stop_sequence'],
        rename={'arrival_time': 'timetable'},
    )

    # Shape points for the trips on this route.
    # NOTE(review): shape_pt_sequence is not selected, so the point order is
    # whatever order all_shapes already has — confirm that is sequence order.
    shapes_on_this_route = get_items_for_unique_set(
        all_shapes, 'shape_id', unique_shapes,
        slice_columns=['shape_id', 'shape_pt_lon', 'shape_pt_lat'],
    )

    # A route whose shapes cannot be built is still exported, with line = 'na'.
    try:
        shapes_on_this_route = fix_shapes(shapes_on_this_route)
        line = create_dict(shapes_on_this_route, 'shape_id', 'geometry')
    except Exception:
        print(f'No shape ID available for {route_id, route_short_name, agency_name}')
        line = 'na'

    # Observed stop times for this route.
    stop_times_for_this_route = get_items_for_unique_set(
        all_stop_times, 'trip_id', unique_trips,
        slice_columns=['trip_id', 'arrival_time', 'stop_id', 'stop_sequence'],
        rename={'arrival_time': 'real'},
    )
    unique_stops = get_unique_values_from_column(stop_times_for_this_route, 'stop_id')

    # Pair every observed stop time with its timetabled counterpart.
    stop_times_for_this_route = stop_times_for_this_route.merge(
        tt_stop_times_for_this_route,
        on=['trip_id', 'stop_id', 'stop_sequence'],
        how='inner',
    )

    # Convert both time columns with convert_to_unix_timestamp, using this
    # route's date string.
    stop_times_for_this_route['real'] = stop_times_for_this_route['real'].apply(convert_to_unix_timestamp, args=(date_str,))
    stop_times_for_this_route['timetable'] = stop_times_for_this_route['timetable'].apply(convert_to_unix_timestamp, args=(date_str,))

    trip_list = format_trip_list(stop_times_for_this_route)

    # Stops visited by this route, joined to their bearings.
    stops_this_route = get_items_for_unique_set(
        all_stops, 'stop_id', unique_stops,
        slice_columns=['stop_id', 'stop_name', 'stop_lon', 'stop_lat'],
    )
    formatted_stops = format_stops(stops_this_route, stop_bearings)

    meta = create_metadata(route_short_name, kebab_start, kebab_finish, route_start, route_via, route_finish, agency_name, agency_noc)

    content = {'meta': meta, 'line': line, 'stops': formatted_stops, 'trips': trip_list}
    with open(ROOT / f"web/TLE/{route_short_name}-{kebab_start}-{kebab_finish}.json", "w") as f:
        # Compact separators keep the exported files small.
        json.dump(content, f, separators=(',',':'))