In [1]:
from gtfs_realtime_utils import *
from utils import *
import pandas as pd
# Repository root, expressed relative to this notebook's working directory.
# NOTE(review): `Path` is not imported explicitly in this cell; presumably it
# arrives through one of the star imports above — confirm.
ROOT = Path("../")
# Last expression of the cell, so the resolved absolute path is shown as the
# cell output; the result is not stored and ROOT itself stays relative.
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/bus-tracking')

### Loading all bus data for England (excl. London) on a specific day

In [None]:
def gtfsrt_filepaths_to_list(dir, date):
    """Collect absolute paths of all files under ``dir`` recorded on ``date``.

    A file is selected when its file name begins with the date written as
    ``YYYY-MM-DD``. The directory tree is searched recursively.

    Args:
        dir: Directory to walk. The name shadows the builtin ``dir`` but is
            kept because callers pass it by keyword.
        date: The day to select in ``YYYYMMDD`` form, as a string or an
            integer (it is converted with ``str``).

    Returns:
        A sorted list of absolute file paths (strings).

    Raises:
        ValueError: If ``date`` is not exactly eight digits. Without this
            check a malformed date would silently match no files.
    """
    date = str(date)
    if len(date) != 8 or not date.isdigit():
        raise ValueError(f"date must be in YYYYMMDD form, got {date!r}")
    date_with_dashes = f"{date[0:4]}-{date[4:6]}-{date[6:8]}"

    gtfs_rt_file_paths = []
    # Walk through the directory tree; sub-directory names are not needed.
    for root, _dirs, files in os.walk(dir):
        for file in files:
            # The date was validated above, so the prefix is always the
            # 10-character YYYY-MM-DD form.
            if file.startswith(date_with_dashes):
                full_path = os.path.abspath(os.path.join(root, file))
                gtfs_rt_file_paths.append(full_path)

    # os.walk yields entries in arbitrary order; sort so repeated runs process
    # the files in the same sequence.
    gtfs_rt_file_paths.sort()
    return gtfs_rt_file_paths

def entity_list_to_df(entities, date):
    """Flatten vehicle-position entities into a sorted, de-duplicated DataFrame.

    Each entity is read through its ``vehicle`` attribute; trip, position,
    stop and vehicle fields are copied into one row per entity. Latitude and
    longitude are rounded to 5 decimal places, rows are sorted by
    ``vehicle_id``, ``timestamp`` and ``trip_id``, and rows that agree on
    longitude, latitude, timestamp, vehicle_id and trip_id are collapsed to
    the first occurrence.

    The number of entities and the fraction of rows removed as duplicates are
    printed as a side effect.

    Args:
        entities: Sized iterable of objects exposing ``vehicle.trip``,
            ``vehicle.position``, ``vehicle.current_stop_sequence``,
            ``vehicle.current_status``, ``vehicle.timestamp`` and
            ``vehicle.vehicle.id``. May be empty.
        date: Label for the day being processed; used only in the printed
            message.

    Returns:
        pandas.DataFrame with columns trip_id, start_time, start_date,
        schedule_relationship, route_id, latitude, longitude, bearing,
        stop_sequence, status, timestamp, vehicle_id. The index is not reset
        after sorting and de-duplication.
    """
    columns = {
        'trip_id': [], 'start_time': [], 'start_date': [],
        'schedule_relationship': [], 'route_id': [],
        'latitude': [], 'longitude': [], 'bearing': [],
        'stop_sequence': [], 'status': [], 'timestamp': [], 'vehicle_id': [],
    }
    print(len(entities))
    # Column-wise lists keep memory lower than one dict per row when there are
    # tens of millions of entities.
    for e in entities:
        v = e.vehicle
        columns['trip_id'].append(v.trip.trip_id)
        columns['start_time'].append(v.trip.start_time)
        columns['start_date'].append(v.trip.start_date)
        columns['schedule_relationship'].append(v.trip.schedule_relationship)
        columns['route_id'].append(v.trip.route_id)
        columns['latitude'].append(v.position.latitude)
        columns['longitude'].append(v.position.longitude)
        columns['bearing'].append(v.position.bearing)
        columns['stop_sequence'].append(v.current_stop_sequence)
        columns['status'].append(v.current_status)
        columns['timestamp'].append(v.timestamp)
        columns['vehicle_id'].append(v.vehicle.id)

    data = pd.DataFrame(columns)

    # Empty lists produce object-dtype columns, which cannot be rounded
    # reliably; force the coordinates to float so the empty case works too.
    data = data.astype({'latitude': 'float64', 'longitude': 'float64'})
    data['latitude'] = data['latitude'].round(5)
    data['longitude'] = data['longitude'].round(5)

    # Sort the data
    data = data.sort_values(by=['vehicle_id', 'timestamp', 'trip_id'], ascending=True)
    with_duplicates = len(data)
    # keep='first' vs 'last' should not matter: timestamp is one of the
    # de-duplication keys, so dropped rows describe the same moment in time.
    data = data.drop_duplicates(
        subset=['longitude', 'latitude', 'timestamp', 'vehicle_id', 'trip_id'],
        keep='first',
    )
    without_duplicates = len(data)
    # Guard the division: a day with no entities has no duplicates.
    if with_duplicates:
        fraction_duplicated = 1 - without_duplicates/with_duplicates
    else:
        fraction_duplicated = 0.0
    print(f"Fraction of data for {date} that was duplictaed in 'longitude', 'latitude', 'timestamp', 'vehicle_id', 'trip_id':{fraction_duplicated}")
    return data

In [3]:
# The source directory is the same for every day, so build it once.
REALTIME_DATADIR = ROOT / "data/gtfs-rt"
# Make sure the output directory exists before the first (expensive) day is
# processed, so the final to_csv cannot fail on a missing folder.
(REALTIME_DATADIR / "csv").mkdir(parents=True, exist_ok=True)

# Iterate over real calendar days (inclusive at both ends) rather than over
# consecutive integers, which would produce non-dates such as 20240931 as soon
# as the span crosses a month boundary.
for date in pd.date_range(start="2024-09-15", end="2024-09-23", freq="D").strftime("%Y%m%d"):
    # Drop references to the previous day's objects before loading the next
    # day — presumably to let their memory be reclaimed; confirm.
    data = None
    gtfsrt_filepaths = None
    entities = None

    gtfsrt_filepaths = gtfsrt_filepaths_to_list(dir=REALTIME_DATADIR, date=date)

    # Initialise the feed object
    feed = gtfs_realtime_pb2.FeedMessage()
    # Add all the entities (bus location objects) to a list to iterate through later.
    entities = entities_to_list(feed, gtfsrt_filepaths)
    print(f"Loaded data for {date}")
    # Add data to dataframe and sort
    data = entity_list_to_df(entities, date)

    print(f"Created dataframe for {date}")

    data.to_csv(ROOT / f"data/gtfs-rt/csv/{date}.csv", index=False)
    print(f'Finished writing {date}')

Loaded data for 20240915
28584494
Fraction of data for 20240915 that was duplictaed in 'longitude', 'latitude', 'timestamp', 'vehicle_id', 'trip_id':0.8568668383634848
Created dataframe for 20240915
Finished writing 20240915
Loaded data for 20240916
25628104
Fraction of data for 20240916 that was duplictaed in 'longitude', 'latitude', 'timestamp', 'vehicle_id', 'trip_id':0.6518878649782286
Created dataframe for 20240916
Finished writing 20240916
Loaded data for 20240917
28830261
Fraction of data for 20240917 that was duplictaed in 'longitude', 'latitude', 'timestamp', 'vehicle_id', 'trip_id':0.62329643841934
Created dataframe for 20240917
Finished writing 20240917
Loaded data for 20240918
28835781
Fraction of data for 20240918 that was duplictaed in 'longitude', 'latitude', 'timestamp', 'vehicle_id', 'trip_id':0.6373314806351178
Created dataframe for 20240918
Finished writing 20240918
Loaded data for 20240919
29227725
Fraction of data for 20240919 that was duplictaed in 'longitude', 'l