In [1]:
import os
import pandas as pd
import json
from datetime import datetime
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from tqdm import tqdm

# reading the first json from the realtime data folder
# parent_folder = 'washington_GTFS_data 2024.1.29-2024.2.11/washington_GTFS_data - Copy/output/' # smaller dataset
parent_folder = 'washington_GTFS_data 2024.2.12-2024.3.24\washington_GTFS_data 2.12-3.24/' # bigger dataset

trip_updates = 'RAIL_RT_TRIP_UPDATES/'

file_path = parent_folder + trip_updates + '2024_01_03_00_00_00.json'
try:
	with open(file_path, 'r') as file:
		tripJson = json.load(file)
except FileNotFoundError:
	print(f"File not found: {file_path}")

# extracting the links data from static data
links = pd.read_csv('static/links.csv')
stopTimes = pd.read_csv('static/stop_times.csv')
uniqueLinks = pd.read_csv('static/unique_links.csv')


In [2]:
tripJson

{'header': {'gtfs_realtime_version': '2.0',
  'incrementality': 0,
  'timestamp': 1709251192},
 'entity': [{'id': '0',
   'trip_update': {'trip': {'trip_id': '5480025_19778',
     'start_time': '19:06:00',
     'start_date': '20240229',
     'schedule_relationship': 0,
     'route_id': 'RED',
     'direction_id': 0},
    'stop_time_update': [{'stop_sequence': 1,
      'departure': {'time': 1709251560, 'uncertainty': 0},
      'stop_id': 'PF_A15_C',
      'schedule_relationship': 0},
     {'stop_sequence': 2,
      'arrival': {'time': 1709251752, 'uncertainty': 0},
      'stop_id': 'PF_A14_C',
      'schedule_relationship': 0},
     {'stop_sequence': 3,
      'arrival': {'time': 1709251946, 'uncertainty': 0},
      'stop_id': 'PF_A13_C',
      'schedule_relationship': 0},
     {'stop_sequence': 4,
      'arrival': {'time': 1709252092, 'uncertainty': 0},
      'stop_id': 'PF_A12_C',
      'schedule_relationship': 0},
     {'stop_sequence': 5,
      'arrival': {'time': 1709252249, 'uncert

In [3]:
def get_trip_data(tripJson: dict) -> pd.DataFrame:
    data = []
    for entity in tripJson.get('entity', []):  # Safely get 'entity' list
        trip_update = entity.get('trip_update', {})
        stop_time_updates = trip_update.get('stop_time_update', [])
        
        for stop_time_update in stop_time_updates:
            # Extract arrival and departure times
            arrival_time = stop_time_update.get('arrival', {}).get('time', None)
            departure_time = stop_time_update.get('departure', {}).get('time', None)
            
            # Ensure the timestamps are numeric and within a valid range
            if isinstance(arrival_time, (int, float)):
                if arrival_time < 0 or arrival_time > 2**31 - 1:  # Limit is 2038
                    arrival_time = None
            else:
                arrival_time = None
            
            if isinstance(departure_time, (int, float)):
                if departure_time < 0 or departure_time > 2**31 - 1:  # Limit is 2038
                    departure_time = None
            else:
                departure_time = None
            
            # Append the cleaned data
            data.append({
                'trip_id': trip_update.get('trip', {}).get('trip_id', None),
                'stop_id': stop_time_update.get('stop_id', None),
                'stop_sequence': stop_time_update.get('stop_sequence', None),
                'arrival_time': arrival_time,
                'arrival_uncertainty': stop_time_update.get('arrival', {}).get('uncertainty', None),
                'departure_time': departure_time,
                'departure_uncertainty': stop_time_update.get('departure', {}).get('uncertainty', None),
            })
    
    # Create a DataFrame from the cleaned data
    currentStopTimes = pd.DataFrame(data)
    
    return currentStopTimes

def calculateLinkParams(currentStopTimes: pd.DataFrame, lengthdf: pd.DataFrame) -> pd.DataFrame:
    currentStopTimes = currentStopTimes.sort_values(by=['trip_id', 'stop_sequence'])
    next_stop_times = currentStopTimes.shift(-1)

    mask = currentStopTimes['trip_id'] == next_stop_times['trip_id']
    link_data = currentStopTimes[mask].copy()
    link_data['end_stop'] = next_stop_times['stop_id']
    link_data['end_sequence'] = next_stop_times['stop_sequence']
    link_data['end_time'] = next_stop_times['arrival_time']

    # Ensure the trips are in sequence order
    link_data = link_data[link_data['stop_sequence'] == link_data['end_sequence'] - 1]

    # Ensure start_stop and end_stop are not the same
    link_data = link_data[link_data['stop_id'] != link_data['end_stop']]

    link_data['start_time'] = link_data['departure_time'].fillna(link_data['arrival_time'])
    link_data['real_time_taken [mins]'] = (link_data['end_time'] - link_data['start_time']) / 60

    # Drop rows where the time taken is negative
    link_data = link_data[link_data['real_time_taken [mins]'] >= 0]

    # Inner join to keep only links available in lengthdf
    link_data = link_data.merge(
        lengthdf,
        left_on=['stop_id', 'end_stop'],
        right_on=['start_stop', 'end_stop'],
        how='inner'
    )
    link_data['real_speed [km/h]'] = link_data['length'] / (link_data['real_time_taken [mins]'] / 60)
    
    return link_data[['trip_id', 'stop_id', 'end_stop', 'stop_sequence', 'end_sequence', 
                      'real_time_taken [mins]', 'start_time', 'end_time', 'length', 'real_speed [km/h]']].rename(
        columns={'stop_id': 'start_stop', 'stop_sequence': 'start_sequence'}
    )

def calculateLinkKpis(currentLinks: pd.DataFrame, uniqueLinks: pd.DataFrame, stopTimes: pd.DataFrame) -> pd.DataFrame:
    for index, row in uniqueLinks.iterrows():
        link = currentLinks[(currentLinks['start_stop'] == row['start_stop']) & (currentLinks['end_stop'] == row['end_stop'])]

        mean_speed = link['real_speed [km/h]'].mean()
        covariance_speed = link['real_speed [km/h]'].std() / mean_speed if mean_speed else 0
        buffer_speed = link['real_speed [km/h]'].quantile(0.95) - link['real_speed [km/h]'].quantile(0.5)

        mean_time = link['real_time_taken [mins]'].mean()
        covariance_time = link['real_time_taken [mins]'].std() / mean_time if mean_time else 0
        buffer_time = link['real_time_taken [mins]'].quantile(0.95) - link['real_time_taken [mins]'].quantile(0.5)

        no_of_trips = len(link)
        uniqueLinks.loc[index, 'no_of_trips'] = no_of_trips

        uniqueLinks.loc[index, 'mean_speed [km/h]'] = mean_speed
        uniqueLinks.loc[index, 'covariance_speed [%]'] = covariance_speed
        uniqueLinks.loc[index, 'buffer_speed [km/h]'] = buffer_speed
        uniqueLinks.loc[index, 'mean_time [mins]'] = mean_time
        uniqueLinks.loc[index, 'covariance_time [%]'] = covariance_time
        uniqueLinks.loc[index, 'buffer_time [mins]'] = buffer_time

        trips = stopTimes[(stopTimes['stop_id'] == row['end_stop']) & (stopTimes['stop_sequence'] == row['end_sequence'])].copy()
        trips['departure_time'] = trips['departure_time'].fillna(trips['arrival_time'])
        headways = trips['departure_time'].diff().dropna().abs() / 60

        if not headways.empty:
            mean_headway = headways.mean()
            covariance_headway = headways.std() / mean_headway if mean_headway else 0
            buffer_headway = headways.quantile(0.95) - headways.quantile(0.5)
            uniqueLinks.loc[index, 'mean_headway [mins]'] = mean_headway
            uniqueLinks.loc[index, 'covariance_headway [%]'] = covariance_headway
            uniqueLinks.loc[index, 'buffer_headway [mins]'] = buffer_headway
        else:
            uniqueLinks.loc[index, 'mean_headway [mins]'] = None
            uniqueLinks.loc[index, 'covariance_headway [%]'] = None
            uniqueLinks.loc[index, 'buffer_headway [mins]'] = None

    return uniqueLinks


In [4]:
currentStopTimes = get_trip_data(tripJson)
currentStopTimes

Unnamed: 0,trip_id,stop_id,stop_sequence,arrival_time,arrival_uncertainty,departure_time,departure_uncertainty
0,5480025_19778,PF_A15_C,1,,,1.709252e+09,0.0
1,5480025_19778,PF_A14_C,2,1.709252e+09,0.0,,
2,5480025_19778,PF_A13_C,3,1.709252e+09,0.0,,
3,5480025_19778,PF_A12_C,4,1.709252e+09,0.0,,
4,5480025_19778,PF_A11_C,5,1.709252e+09,0.0,,
...,...,...,...,...,...,...,...
3658,5644119_19778,PF_N08_C,30,1.709250e+09,0.0,,
3659,5644119_19778,PF_N09_C,31,1.709251e+09,0.0,,
3660,5644119_19778,PF_N10_C,32,1.709251e+09,0.0,,
3661,5644119_19778,PF_N11_C,33,1.709251e+09,0.0,,


In [5]:
uniqueLinks

Unnamed: 0,start_stop,end_stop,start_sequence,end_sequence,length,mean_speed [km/h],covariance_speed [%],buffer_speed [km/h],mean_time [mins],covariance_time [%],buffer_time [mins],mean_headway [mins],covariance_headway [%],buffer_headway [mins]
0,PF_A15_C,PF_A14_C,1,2,2.6710,40.065000,1.774532e-16,0.0,4.0,0.0,0.0,13.954708,5.889077,9.00
1,PF_A14_C,PF_A13_C,2,3,2.1048,42.096000,1.688916e-16,0.0,3.0,0.0,0.0,13.954708,5.889077,9.00
2,PF_A13_C,PF_A12_C,3,4,1.0988,32.964000,0.000000e+00,0.0,2.0,0.0,0.0,13.954708,5.889077,9.00
3,PF_A12_C,PF_A11_C,4,5,1.4536,29.072000,0.000000e+00,0.0,3.0,0.0,0.0,13.954708,5.889077,9.00
4,PF_A11_C,PF_A10_C,5,6,2.1612,32.418000,2.193121e-16,0.0,4.0,0.0,0.0,13.954708,5.889077,9.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,PF_N11_C,PF_N10_C,2,3,2.8733,34.479600,0.000000e+00,0.0,5.0,0.0,0.0,40.306034,3.493899,41.85
198,PF_N10_C,PF_N09_C,3,4,2.0799,31.198628,4.425913e-05,0.0,4.0,0.0,0.0,40.306034,3.493899,41.85
199,PF_N09_C,PF_N08_C,4,5,1.7224,34.447983,5.344577e-06,0.0,3.0,0.0,0.0,40.306034,3.493899,41.85
200,PF_N08_C,PF_N07_C,5,6,1.3477,26.954000,5.260984e-16,0.0,3.0,0.0,0.0,41.942149,3.397802,48.00


In [6]:
lengthdf = uniqueLinks[['start_stop','end_stop','length']]
lengthdf

Unnamed: 0,start_stop,end_stop,length
0,PF_A15_C,PF_A14_C,2.6710
1,PF_A14_C,PF_A13_C,2.1048
2,PF_A13_C,PF_A12_C,1.0988
3,PF_A12_C,PF_A11_C,1.4536
4,PF_A11_C,PF_A10_C,2.1612
...,...,...,...
197,PF_N11_C,PF_N10_C,2.8733
198,PF_N10_C,PF_N09_C,2.0799
199,PF_N09_C,PF_N08_C,1.7224
200,PF_N08_C,PF_N07_C,1.3477


In [7]:
currentLinks = calculateLinkParams(currentStopTimes, lengthdf)
currentLinks

Unnamed: 0,trip_id,start_stop,end_stop,start_sequence,end_sequence,real_time_taken [mins],start_time,end_time,length,real_speed [km/h]
0,5479823_19778,PF_A15_C,PF_A14_C,1,2.0,4.066667,1.709248e+09,1.709248e+09,2.6710,39.408197
1,5479823_19778,PF_A14_C,PF_A13_C,2,3.0,3.266667,1.709248e+09,1.709248e+09,2.1048,38.659592
2,5479823_19778,PF_A13_C,PF_A12_C,3,4.0,2.450000,1.709248e+09,1.709248e+09,1.0988,26.909388
3,5479823_19778,PF_A12_C,PF_A11_C,4,5.0,2.450000,1.709248e+09,1.709248e+09,1.4536,35.598367
4,5479823_19778,PF_A11_C,PF_A10_C,5,6.0,3.500000,1.709248e+09,1.709249e+09,2.1612,37.049143
...,...,...,...,...,...,...,...,...,...,...
3010,NR335,PF_D07_C,PF_D08_C,18,19.0,2.066667,1.709253e+09,1.709253e+09,0.7653,22.218387
3011,NR335,PF_D08_C,PF_D09_C,19,20.0,4.250000,1.709253e+09,1.709253e+09,2.0172,28.478118
3012,NR335,PF_D09_C,PF_D10_C,20,21.0,2.033333,1.709253e+09,1.709254e+09,0.9767,28.820656
3013,NR335,PF_D10_C,PF_D11_1,21,22.0,2.300000,1.709254e+09,1.709254e+09,1.1506,30.015652


In [8]:
# calculating the current unique links
currentUniqueLinks = currentLinks.drop_duplicates(subset=['start_stop', 'end_stop'])[['start_stop', 'end_stop', 'start_sequence', 'end_sequence']]
currentUniqueLinks.reset_index(drop=True, inplace=True)
currentUniqueLinks

Unnamed: 0,start_stop,end_stop,start_sequence,end_sequence
0,PF_A15_C,PF_A14_C,1,2.0
1,PF_A14_C,PF_A13_C,2,3.0
2,PF_A13_C,PF_A12_C,3,4.0
3,PF_A12_C,PF_A11_C,4,5.0
4,PF_A11_C,PF_A10_C,5,6.0
...,...,...,...,...
195,PF_D13_C,PF_D12_C,1,2.0
196,PF_D12_C,PF_D11_2,2,3.0
197,PF_D11_2,PF_D10_C,3,4.0
198,PF_D10_C,PF_D09_C,4,5.0


In [9]:
kpis = calculateLinkKpis(currentLinks, currentUniqueLinks, currentStopTimes)
kpis

Unnamed: 0,start_stop,end_stop,start_sequence,end_sequence,no_of_trips,mean_speed [km/h],covariance_speed [%],buffer_speed [km/h],mean_time [mins],covariance_time [%],buffer_time [mins],mean_headway [mins],covariance_headway [%],buffer_headway [mins]
0,PF_A15_C,PF_A14_C,1,2.0,15.0,41.506784,0.247898,11.200224,4.258889,0.443116,2.546667,12.443137,0.915950,29.860000
1,PF_A14_C,PF_A13_C,2,3.0,15.0,37.277533,0.220786,4.636398,3.726667,0.483377,2.355000,12.407843,0.921523,29.923333
2,PF_A13_C,PF_A12_C,3,4.0,15.0,25.374447,0.203750,0.228961,2.923333,0.592416,2.358333,12.390196,0.923199,29.856667
3,PF_A12_C,PF_A11_C,4,5.0,15.0,32.649997,0.259012,7.607167,3.030000,0.571775,2.266667,12.290196,0.932767,29.803333
4,PF_A11_C,PF_A10_C,5,6.0,15.0,35.026146,0.189858,0.000000,3.990000,0.419464,2.565000,12.234314,0.932287,29.590000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,PF_D13_C,PF_D12_C,1,2.0,7.0,45.178676,0.218696,7.978809,2.080952,0.239759,0.860000,17.072222,0.992027,35.926667
196,PF_D12_C,PF_D11_2,2,3.0,9.0,33.423173,0.193768,0.813304,3.527778,0.284202,2.160000,16.811111,0.996763,35.670000
197,PF_D11_2,PF_D10_C,3,4.0,9.0,26.802098,0.236328,5.068115,2.809259,0.409426,1.980000,16.637037,0.997063,35.183333
198,PF_D10_C,PF_D09_C,4,5.0,9.0,25.855517,0.249284,2.594519,2.527778,0.462503,2.306667,16.531481,0.992534,34.653333


In [None]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def process_file(filename: str, lengthdf:pd.DataFrame) -> pd.DataFrame:
    file_path = parent_folder + trip_updates + filename
    try:
        with open(file_path, 'r') as file:
            tripJson = json.load(file)
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        return None

    datetime_str = filename.split('.')[0]
    try:
        file_datetime = datetime.strptime(datetime_str, '%Y_%d_%m_%H_%M_%S')
    except Exception as e:
        logging.error(f"Error parsing datetime from {filename}: {e}")
        return None

    logging.info(f"Processing file: {file_datetime}")

    currentStopTimes = get_trip_data(tripJson)
    if currentStopTimes.empty:
        logging.info("No valid data found in file.")
        return None

    currentLinks = calculateLinkParams(currentStopTimes, lengthdf)
    currentUniqueLinks = currentLinks.drop_duplicates(subset=['start_stop', 'end_stop'])[['start_stop', 'end_stop', 'start_sequence', 'end_sequence']]
    currentUniqueLinks.reset_index(drop=True, inplace=True)
    kpis = calculateLinkKpis(currentLinks, currentUniqueLinks, currentStopTimes)
    
    kpis['datetime'] = file_datetime
    
    preset_columns = [
        'start_stop', 'end_stop', 'start_sequence', 'end_sequence', 'no_of_trips',
        'mean_speed [km/h]', 'covariance_speed [%]', 'buffer_speed [km/h]',
        'mean_time [mins]', 'covariance_time [%]', 'buffer_time [mins]',
        'mean_headway [mins]', 'covariance_headway [%]', 'buffer_headway [mins]',
        'datetime'
    ]
    kpis = kpis.reindex(columns=preset_columns)
    
    return kpis

# Directory for output file
link_file = 'output-single/link_kpis_second.csv'
output_folder = 'output-single/'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

files = [f for f in os.listdir(parent_folder + trip_updates) if f.endswith('.json')]

results = []
with ThreadPoolExecutor(max_workers=1000) as executor:
    futures = [executor.submit(process_file, filename, lengthdf) for filename in files]
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            result = future.result()
            if result is not None:
                results.append(result)
        except Exception as e:
            logging.error(f"Error: {e}")

if results:
    final_kpis = pd.concat(results, ignore_index=True)
    final_kpis.to_csv(link_file, index=False)

2025-02-22 21:33:05,446 - INFO - Processing file: 2024-03-01 00:00:00
2025-02-22 21:33:05,457 - INFO - Processing file: 2024-03-01 00:01:00
2025-02-22 21:33:05,465 - INFO - Processing file: 2024-03-01 00:02:00
2025-02-22 21:33:05,485 - INFO - Processing file: 2024-03-01 00:03:00
2025-02-22 21:33:05,541 - INFO - Processing file: 2024-03-01 00:04:00
2025-02-22 21:33:05,550 - INFO - Processing file: 2024-03-01 00:05:00
2025-02-22 21:33:05,571 - INFO - Processing file: 2024-03-01 00:06:00
2025-02-22 21:33:05,585 - INFO - Processing file: 2024-03-01 00:07:00
2025-02-22 21:33:05,614 - INFO - Processing file: 2024-03-01 00:08:00
2025-02-22 21:33:05,636 - INFO - Processing file: 2024-03-01 00:09:00
2025-02-22 21:33:05,646 - INFO - Processing file: 2024-03-01 00:10:00
2025-02-22 21:33:05,662 - INFO - Processing file: 2024-03-01 00:12:00
2025-02-22 21:33:05,664 - INFO - Processing file: 2024-03-01 00:11:00
2025-02-22 21:33:05,686 - INFO - Processing file: 2024-03-01 00:13:00
2025-02-22 21:33:05,

In [None]:
print("Done!")

Done!
