In this notebook we explore the differences between the Siri-VM and GTFS-RT feeds from the Bus Open Data Service (BODS).

Things we need to do:

- Parse all the GTFS-RT data.
- Parse all the Siri-VM data.
- Compare the coordinates reported for the same vehicle ID at the same timestamp.

In [1]:
import os
from pathlib import Path

# Step up one directory before importing the project helpers (presumably the
# repository root, where the `pipelines` package lives).
# Guarded so that re-running this cell does not climb a further directory each
# time: once `pipelines/` is visible from the working directory we stay put.
if not Path("pipelines").is_dir():
    os.chdir("../")

# NOTE(review): star imports hide where names come from (`pd`, used further
# down, presumably arrives through one of these) — prefer explicit imports once
# the required names are confirmed.
from pipelines.utils import *
from pipelines.gtfs_realtime_utils import *
from pipelines.sirivm_utils import *

# All data paths below are expressed relative to the working directory.
ROOT = Path("./")
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/bus-tracking')

In [2]:
# Day of feed data under investigation, written as an integer in YYYYMMDD form
# (digit separators are for readability only; the value is 20241210).
date = 2024_12_10

In [3]:
# Parse the raw GTFS-RT files for the chosen day into a single dataframe.
gtfsrt_data = gtfsrt_to_dataframe(
    ROOT / "investigations/test/GTFSRT/binary",
    date,
    round=True,
    # NOTE(review): this keyword is spelled differently from the
    # `drop_duplicates` passed to sirivm_to_dataframe — confirm against the
    # helper's signature before renaming it.
    drop_duplictaes=True,
    nth_file=None,
)

Loaded data for 20241210
There are 9142925 entities.
Fraction of data that was duplictaed in 'longitude', 'latitude', 'timestamp', 'vehicle_id', 'trip_id':0.8203147242266562
Created dataframe for 20241210


Read the SiriVM files

In [4]:
# Parse the raw Siri-VM XML files into a single dataframe.
sirivm_data = sirivm_to_dataframe(
    'investigations/test/sirivm/xml/',
    round=True,
    drop_duplicates=True,
    nth_file=None,
)
# Optional export of the parsed frame (left disabled):
# sirivm_data.to_csv(ROOT / f"data/sirivm/{date}.csv", index=False)

Fraction of data that was duplictaed in 'longitude', 'latitude', 'timestamp', 'vehicle_id', 'trip_id':0.6786969427208746


In [5]:
# Read the data
# sirivm_data = pd.read_csv(ROOT / "data/sirivm/20241210.csv")
# gtfsrt_data = pd.read_csv(ROOT  / "data/gtfs-rt/csv/20241210.csv")

In [6]:
# Report how many rows each parsed feed contains. Both frames were loaded with
# duplicate dropping enabled, so each row is a distinct reported bus location.
print(f"Number of Siri-VM unique reported bus locations: {len(sirivm_data)}")
print(f"Number of GTFSRT unique reported bus locations: {len(gtfsrt_data)}")

Number of Siri-VM unique reported bus locations: 2172071
Nunber of GTFSRT unique reported bus locations: 1642849


In [7]:
def join_data(left, right, on:list, how:str):
    '''
    Merge two dataframes and print how many rows the comparison produced.

    The merge is run with ``indicator=True``, so the result carries a
    ``_merge`` column recording whether each row's key was found in the left
    frame only, the right frame only, or both.

    Params
    ------
      left: pd.DataFrame
         left frame
      right: pd.DataFrame
         right frame
      on: list
         column names to merge on
      how: str
         how to merge:
           - 'inner': keep rows whose key appears in both frames
           - 'left': keep only rows whose key appears in left but not in right
           - 'right': keep only rows whose key appears in right but not in left
           - any other value accepted by ``DataFrame.merge`` (e.g. 'outer'):
             the merged frame is returned unfiltered
    Returns
    -------
      pd.DataFrame
         the merged frame (filtered as described above for 'left' and
         'right'), including the ``_merge`` indicator column
    '''
    result = left.merge(right, on=on, how=how, indicator=True)

    if how == 'inner':
        msg = "Number of reported bus locations that appear in both left and right"
    elif how == 'left':
        msg = "Number of reported bus locations that appear only in left"
        # A left merge also returns the matched rows; drop them so only the
        # left-exclusive ones remain.
        result = result[result['_merge'] == 'left_only']
    elif how == 'right':
        msg = "Number of reported bus locations that appear only in right"
        # Likewise, keep only the right-exclusive rows.
        result = result[result['_merge'] == 'right_only']
    else:
        # Previously `msg` was left unbound here, so e.g. how='outer' raised
        # UnboundLocalError after the (potentially expensive) merge had run.
        msg = f"Number of rows in the {how} merge of left and right"

    print(f"{msg}: {len(result)}")
    return result

In [27]:
# A report counts as "the same" in both feeds only when all of these columns
# match. Defined once so the three comparisons below cannot drift apart.
join_keys = ['timestamp', 'vehicle_id', 'longitude', 'latitude']

# Common rows
common_rows = join_data(sirivm_data, gtfsrt_data, on=join_keys, how='inner')
# Siri only rows
siri_only = join_data(sirivm_data, gtfsrt_data, on=join_keys, how='left')
# GTFSRT only rows
gtfsrt_only = join_data(sirivm_data, gtfsrt_data, on=join_keys, how='right')

Number of reported bus locations that appear in both left and right: 1383845
Number of reported bus locations that appear only in left: 792786
Number of reported bus locations that appear only in right: 259004


In [9]:
# Count the Siri-only rows that are missing each of the key fields.
for field in ("latitude", "longitude", "timestamp"):
    print(f"No {field}", int(siri_only[field].isna().sum()))

No latitude 0
No longitude 0
No timestamp 561


In [10]:
# Count the GTFS-RT-only rows that are missing each of the key fields.
for field in ("latitude", "longitude", "timestamp"):
    print(f"No {field}", int(gtfsrt_only[field].isna().sum()))

No latitude 0
No longitude 0
No timestamp 0


In [11]:
# Number of distinct vehicles with at least one GTFS-RT-only report.
gtfsrt_only["vehicle_id"].nunique()

29137

In [12]:
# Number of distinct vehicles with at least one Siri-only report.
siri_only["vehicle_id"].nunique()

22604

In [38]:
# Render floats with plain default formatting so timestamps are shown in full.
pd.set_option('display.float_format', '{:}'.format)

# GTFS-RT-only reports for a single example vehicle, in time order.
is_example_vehicle = gtfsrt_only['vehicle_id'] == '.3-YJ55BKG'
gtfsrt_only.loc[is_example_vehicle, ['timestamp', 'latitude', 'longitude', 'vehicle_id']].sort_values('timestamp')

Unnamed: 0,timestamp,latitude,longitude,vehicle_id
8,1733848137.0,52.7544,0.3993,.3-YJ55BKG
11,1733848239.0,52.7564,0.4002,.3-YJ55BKG
14,1733848333.0,52.7567,0.4049,.3-YJ55BKG
36,1733849044.0,52.7562,0.4345,.3-YJ55BKG
47,1733849725.0,52.7547,0.4451,.3-YJ55BKG
48,1733849951.0,52.7491,0.4919,.3-YJ55BKG


In [36]:
# Render floats with plain default formatting so timestamps are shown in full.
pd.set_option('display.float_format', '{:}'.format)

# Siri-only reports for the same example vehicle, in time order.
is_example_vehicle = siri_only['vehicle_id'] == '.3-YJ55BKG'
siri_only.loc[is_example_vehicle, ['timestamp', 'latitude', 'longitude', 'vehicle_id']].sort_values('timestamp')

Unnamed: 0,timestamp,latitude,longitude,vehicle_id
3,1733847976.0,52.7544,0.3993,.3-YJ55BKG
4,1733847980.0,52.7544,0.3993,.3-YJ55BKG
7,1733848041.0,52.7544,0.3993,.3-YJ55BKG
11,1733848129.0,52.7544,0.3993,.3-YJ55BKG
12,1733848157.0,52.7544,0.3995,.3-YJ55BKG
15,1733848213.0,52.7553,0.4008,.3-YJ55BKG
16,1733848229.0,52.7559,0.4004,.3-YJ55BKG
17,1733848247.0,52.7568,0.4004,.3-YJ55BKG
19,1733848284.0,52.7563,0.4028,.3-YJ55BKG
20,1733848289.0,52.7564,0.4033,.3-YJ55BKG


In [37]:
# NOTE(review): scratch arithmetic — neither operand appears in the cells or
# outputs visible here; confirm what they represent, or remove this cell.
50017-47975

2042