In [None]:
import os
import glob
import pandas as pd
import json
import itertools
# Step up one directory before the project import below.
# NOTE(review): this chdir is relative, so it is not idempotent -- re-running the
# cell without restarting the kernel moves the working directory up again and
# changes ROOT.
os.chdir("../")
from pathlib import Path
# NOTE(review): the star import hides where names come from. The helpers used
# later without a local definition (load_full_gtfs, make_date_with_dashes,
# get_stop_names_and_bearings, convert_to_unix_timestamp) presumably come from
# here -- confirm.
from pipelines.utils import *
# ROOT is the working directory after the chdir above; every data and output
# path in this notebook is built from it.
ROOT = Path(os.getcwd())
# Last expression of the cell: displays the absolute path. The result is not
# assigned, so ROOT itself is unchanged.
ROOT.resolve()

In [2]:
# Load the bustimes.org route lookup. It is indexed later as
# lookup[agency_noc][route_short_name]['id' | 'name'].
with (ROOT / "web/bustimes.org.json").open() as f:
    route_detail_lookup = json.load(f)

In [3]:
# Region name as it appears in the input file names
# (data/real-interpolated/{region}_{date}.gtfs.zip and itm_{region}_gtfs.zip).
region = 'north_west'
# Short code used for the output folder web/{rgncd}/ and the error log
# web/{rgncd}-errors.txt.
# NOTE(review): presumably the statistical region code matching `region` -- confirm.
rgncd = 'TLD'

In [4]:
def delete_files(path):
    """
    Delete every file matched by a glob pattern.

    Parameters
    ----------
    path: str or os.PathLike
        Glob pattern (for example ``some/dir/*``). It is made absolute before
        being expanded.

    Notes
    -----
    Directories matched by the pattern are left in place: ``os.remove`` cannot
    delete a directory and would otherwise abort the clean-up part-way through.
    Symbolic links are removed as links, whatever they point to.
    """
    for match in glob.glob(os.path.abspath(path)):
        # Only remove things os.remove can actually handle (files and links).
        if os.path.isfile(match) or os.path.islink(match):
            os.remove(match)

In [5]:
# Clear the previous export for this region so the route files are rebuilt from
# scratch.
# NOTE(review): web/{rgncd}-errors.txt lives outside this folder, so it is not
# cleared here; the export loop opens it in append mode, so messages from earlier
# runs accumulate.
delete_files(ROOT / f"web/{rgncd}/*")

In [None]:
# Load the interpolated real-time feeds, one zip per service day.
# range(15, 24) covers 20240915 .. 20240923 inclusive.
dates = ['202409' + str(day) for day in range(15, 24)]
date_strs = list(map(make_date_with_dashes, dates))
# Each entry is [feed, date_str]; the feed is whatever load_full_gtfs returns
# when asked for the extra 'shapes.txt' table.
gtfsrt_data = [
    [
        load_full_gtfs(ROOT / f"data/real-interpolated/{region}_{yyyymmdd}.gtfs.zip", ['shapes.txt']),
        dashed,
    ]
    for yyyymmdd, dashed in zip(dates, date_strs)
]

In [None]:
# Load the scheduled timetable feed for the region. The loader's result is
# unpacked into seven tables, in this order.
(
    tt_agencies,
    tt_routes,
    tt_trips,
    tt_stops,
    tt_stop_times,
    tt_calendar,
    tt_calendar_dates,
) = load_full_gtfs(ROOT / f"18SepGB_GTFS_Timetables_Downloaded/itm_{region}_gtfs.zip")

In [8]:
# Glue all the realtime data together
def glue_data(data, item, subset=None, drop_duplicates=True):
    """
    Stack the same table from several daily feeds into one DataFrame.

    Parameters
    ----------
    data: list
        Sequence of ``[feed, date_str]`` entries, where ``feed`` is an indexable
        collection of DataFrames and ``date_str`` labels the service day.
    item: int
        Position of the table to extract from each feed.
    subset: str or list of str, optional
        Column(s) used to identify duplicates; passed to ``drop_duplicates``.
    drop_duplicates: bool
        When True, keep only the first occurrence of each duplicate row.

    Returns
    -------
    pandas.DataFrame
        The concatenated table (original row indexes are kept). For the
        stop-times table (``item == 4``) a ``date_str`` column records which
        day each row came from. An empty ``data`` yields an empty DataFrame.

    Notes
    -----
    The input frames are never modified: the ``date_str`` column is added to a
    copy and duplicates are dropped on a new frame.
    """
    stop_times_item = 4  # only the stop-times table is tagged with its date

    frames = []
    for entry in data:
        feed, date_str = entry[0], entry[1]
        frame = feed[item]
        if item == stop_times_item:
            # assign() returns a new frame, leaving the caller's data untouched.
            frame = frame.assign(date_str=date_str)

        if frames and not frames[0].empty:
            frames.append(frame)
        else:
            # Nothing useful collected yet: start (again) from this frame so
            # leading empty tables do not take part in the concatenation.
            frames = [frame]

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the result inside the loop.
    result = pd.concat(frames) if len(frames) > 1 else frames[0].copy()

    if drop_duplicates:
        result = result.drop_duplicates(subset=subset, keep='first')

    return result

# Combine each table across all loaded days.
# NOTE(review): positions 0-6 appear to follow the order the timetable feed is
# unpacked in (agencies, routes, trips, stops, stop_times, calendar,
# calendar_dates); 7 is presumably the extra shapes.txt table -- confirm against
# load_full_gtfs.
# drop_duplicates defaults to True, so only the de-duplication key is given.
all_agency = glue_data(gtfsrt_data, 0, subset='agency_id')
all_routes = glue_data(gtfsrt_data, 1, subset='route_id')
# Trips are stacked as-is, without de-duplication.
all_trips = glue_data(gtfsrt_data, 2, drop_duplicates=False)
all_stop_times = glue_data(
    gtfsrt_data,
    4,
    subset=['trip_id', 'stop_id', 'stop_sequence', 'date_str'],
)
all_stops = glue_data(gtfsrt_data, 3, subset='stop_id')
all_calendars = glue_data(gtfsrt_data, 5, subset='service_id')
all_shapes = glue_data(gtfsrt_data, 7, subset=['shape_id', 'shape_pt_sequence'])

In [9]:
def get_detailed_route_info(lookup:dict, route_short_name:str, agency_noc:str):
    """
    Look up the human-readable id and name of a route.

    Parameters
    ----------
    lookup: dict
        Nested mapping ``lookup[agency_noc][route_short_name]`` whose entries
        hold an ``'id'`` and a ``'name'``.
    route_short_name: str
        Usually the display name of the service. For example "13A".
    agency_noc: str
        Agency National Operator Code.

    Returns
    -------
    id:
        human-readable unique id of the bus route.
    name:
        human-readable name of the bus route.

    Raises
    ------
    KeyError
        If the agency, the route, or one of the two fields is missing.
    """
    route_entry = lookup[agency_noc][route_short_name]
    return route_entry['id'], route_entry['name']

def get_row_info(row):
    """
    Pull the route and agency fields out of a row.

    Parameters
    ----------
    row:
        Mapping-like object indexable by column name.

    Returns
    -------
    tuple
        ``(route_id, route_short_name, agency_id, agency_noc, agency_name)``.
    """
    # Fields are read in this order; the returned order differs (see Returns).
    wanted = ('route_id', 'agency_id', 'route_short_name', 'agency_noc', 'agency_name')
    route_id, agency_id, route_short_name, agency_noc, agency_name = (row[field] for field in wanted)
    return route_id, route_short_name, agency_id, agency_noc, agency_name

def get_route_id(agency_id, route_short_name, routes):
    """
    Return the ``route_id`` of the first row in ``routes`` matching both the
    given agency id and route short name.

    Raises
    ------
    IndexError
        If no row matches.
    """
    same_agency = routes.agency_id == agency_id
    same_short_name = routes.route_short_name == route_short_name
    matching_ids = routes[same_agency & same_short_name].route_id
    return matching_ids.values[0]

def get_trips_on_this_route(route_id:str, trips):
    """
    Return the ``trip_id``, ``trip_headsign`` and ``shape_id`` columns of the
    rows in ``trips`` that belong to ``route_id``.
    """
    wanted_columns = ['trip_id', 'trip_headsign', 'shape_id']
    on_route = trips.route_id == route_id
    return trips[on_route][wanted_columns]

def get_unique_values_from_column(data, column_name:str):
    """Return the distinct values of ``data[column_name]`` in order of first appearance."""
    column = data[column_name]
    return column.unique()

def create_metadata(human_route_name, human_route_id, agency_name, agency_noc, bustimesorg):
    """
    Bundle the route description into the metadata dictionary stored with a route.

    Returns
    -------
    dict
        Keys, in order: ``id``, ``name``, ``agency_name``, ``agency_noc``,
        ``bustimesorg``.
    """
    keys = ('id', 'name', 'agency_name', 'agency_noc', 'bustimesorg')
    values = (human_route_id, human_route_name, agency_name, agency_noc, bustimesorg)
    return dict(zip(keys, values))

In [10]:
# Attach each route's agency columns, then index the result by route_id so the
# export loop can read agency_id, route_short_name, agency_noc and agency_name
# per route.
routes2agency = pd.merge(all_routes, all_agency, on='agency_id', how='inner')
all_routes_dict = (
    routes2agency
    .set_index('route_id')
    .to_dict(orient='index')
)

In [11]:
# Pair every observed stop event with its scheduled arrival time.
# NOTE(review): this cell rebinds all_stop_times and tt_stop_times, so it only
# runs once per kernel -- after the merge there is no plain 'arrival_time'
# column left to select on a re-run.
all_stop_times = all_stop_times.loc[
    :, ['trip_id', 'stop_id', 'stop_sequence', 'arrival_time', 'date_str', 'interpolated']
]
tt_stop_times = tt_stop_times.loc[
    :, ['trip_id', 'stop_id', 'stop_sequence', 'arrival_time']
]
# Inner join: stop events without a timetable counterpart are dropped.
all_stop_times = pd.merge(
    all_stop_times,
    tt_stop_times,
    on=['trip_id', 'stop_id', 'stop_sequence'],
    how='inner',
    suffixes=('_real', '_timetable'),
)

In [12]:
# (trip_id, date_str) -> {column: list of values for that trip on that day}.
# NOTE(review): the lists follow the row order of all_stop_times; nothing here
# sorts by stop_sequence.
trip2stoptimes = (
    all_stop_times
    .groupby(['trip_id', 'date_str'])[
        ['arrival_time_real', 'arrival_time_timetable', 'stop_id', 'stop_sequence', 'date_str', 'interpolated']
    ]
    .agg(list)
    .to_dict(orient='index')
)
# shape_id -> {'geometry': [[lon, lat], ...]} with coordinates rounded to 5 decimals.
# NOTE(review): points keep the row order of all_shapes; nothing here sorts by
# shape_pt_sequence.
shape_dict = (
    all_shapes
    .groupby('shape_id')
    .apply(
        lambda points: points[['shape_pt_lon', 'shape_pt_lat']].values.round(5).tolist(),
        include_groups=False,
    )
    .reset_index(name='geometry')
    .set_index('shape_id')
    .to_dict(orient='index')
)

In [13]:
# stop_id -> stop attributes plus its 'Bearing'.
stop_bearings = get_stop_names_and_bearings(ROOT)[['stop_id', 'Bearing']]
# Left join so that stops without a bearing record are kept (Bearing becomes NaN,
# which the export loop turns into None). With an inner join those stops vanished
# from stops_dict and the export loop failed with a KeyError when it looked up
# their name and coordinates.
stops_dict = (
    all_stops
    .merge(stop_bearings, on='stop_id', how='left')
    .set_index('stop_id')
    .to_dict(orient='index')
)

In [14]:
# Export one JSON file per route into web/{rgncd}/. Routes that resolve to the
# same human-readable id share a file: their trips, stops and shapes are merged.
errors_path = ROOT / f'web/{rgncd}-errors.txt'
# Make sure the output folder exists before the first file is written.
(ROOT / f"web/{rgncd}").mkdir(parents=True, exist_ok=True)

for key, values in all_routes_dict.items():
    route_id = key
    agency_id, route_short_name, agency_noc, agency_name = values['agency_id'], values['route_short_name'], values['agency_noc'], values['agency_name']
    # Ensure that route short name is upper case to match with IDs from bustimes.org
    route_short_name = route_short_name.upper()
    try:
        human_route_id, human_route_name = get_detailed_route_info(route_detail_lookup, route_short_name, agency_noc)
        bus_times_org = True

    except (KeyError, TypeError):
        # Lookup miss (unknown operator/route, or a malformed entry): log it and
        # fall back to names built from the feed itself.
        msg = f"Unable to find route name details via bustimes.org for {route_short_name, agency_noc, agency_name}. Using route_short_name and agency_name/noc instead.\n"
        with open(errors_path, 'a') as f:
            f.write(msg)

        human_route_id = f"{route_short_name}-{agency_noc}"
        human_route_name = f"{route_short_name} - {agency_name}"
        bus_times_org = False

    # Write the metadata to a dictionary
    meta = create_metadata(human_route_name, human_route_id, agency_name, agency_noc, bus_times_org)

    trips_on_this_route = get_trips_on_this_route(route_id, all_trips)
    unique_trips = get_unique_values_from_column(trips_on_this_route, 'trip_id')
    trips = []
    # Every stop id seen on this route, one list per exported trip; de-duplicated below.
    all_stop_ids = []

    for trip_id, ds in itertools.product(unique_trips, date_strs):
        stop_info = trip2stoptimes.get((trip_id, ds))
        if stop_info is None:
            # No real-time observations for this trip on this day.
            continue
        tt_arrival_times = stop_info['arrival_time_timetable']
        real_arrival_times = stop_info['arrival_time_real']
        real_stop_ids = stop_info['stop_id']
        stop_dates = stop_info['date_str']
        interpolated_status = stop_info['interpolated']

        real_timestamps = [convert_to_unix_timestamp(p, q) for p, q in zip(real_arrival_times, stop_dates)]
        tt_timestamps = [convert_to_unix_timestamp(p, q) for p, q in zip(tt_arrival_times, stop_dates)]
        # One [stop_id, real_timestamp, timetable_timestamp, interpolated] entry per stop.
        trips.append([[i, j, k, p] for i, j, k, p in zip(real_stop_ids, real_timestamps, tt_timestamps, interpolated_status)])
        all_stop_ids.append(real_stop_ids)

    unique_shapes = get_unique_values_from_column(trips_on_this_route, 'shape_id')
    line = dict()
    for shape_id in unique_shapes:
        try:
            line[shape_id] = shape_dict[shape_id]['geometry']
        except KeyError:
            # Trip references a shape that is not in the loaded shapes (or has none).
            continue

    flat_stop_list = [v for j in all_stop_ids for v in j]
    # De-duplicate while keeping first-seen order so the output is reproducible.
    unique_stops = list(dict.fromkeys(flat_stop_list))
    stops = dict()
    for s in unique_stops:
        stop_details = stops_dict.get(s)
        if stop_details is None:
            # Without this guard a single unknown stop aborted the whole export.
            with open(errors_path, 'a') as f:
                f.write(f"No stop details found for stop {s} on route {human_route_id}. Stop omitted from the stops list.\n")
            continue
        try:
            b = int(stop_details['Bearing'])
        except (KeyError, TypeError, ValueError, OverflowError):
            # Missing, NaN or non-numeric bearing.
            b = None
        stops[s] = {"name": stop_details['stop_name'], "lon": stop_details['stop_lon'], "lat": stop_details['stop_lat'], "bearing": b}

    # Fall back to the short name when the lookup returned an empty id.
    fname = f"web/{rgncd}/{human_route_id or route_short_name}.json"
    out_path = ROOT / fname

    # Check the same absolute path that is read and written below.
    if out_path.is_file():
        # Another route already produced this file: merge into it.
        with open(out_path, 'r') as f:
            content = json.load(f)
        content['trips'] = content['trips'] + trips
        # Merge the new stops in case some are only on one route. Uses | (OR) operator for dictionaries.
        content['stops'] = content['stops'] | stops
        # Same for shapes: keep the geometry of every route sharing this file.
        content['line'] = content.get('line', {}) | line
    else:
        content = {'meta': meta, 'line': line, 'stops': stops, 'trips': trips}

    with open(out_path, "w") as f:
        json.dump(content, f, separators=(',', ':'))