# Overview

Calculates four metrics at the daily level:
| Metric | Calculation | Captures | Ignores |
|--------|-------------|----------|---------|
| Mean Delay | simple avg of arrival delay of each stop in each trip -> avg of all trips for each day | Overall | correlation between delays for all stops in a trip, recovery |
| Net Additional Delay | `arrival_delay(n) - arrival_delay(n-1)` for each stop **->** avg of all stops in each trip **->** avg for all trips in a day | Schedule drift along a trip, or systematic lateness or earliness | Recovery behavior, delay at stop 1 |
| Absolute Additional Delay | `abs(net_additional_delay)` **->** avg of all stops in a trip **->** avg for all trips in a day | Early AND late arrivals | Net effect |
| Delay Recovery Index | `mean_abs_additional_delay - abs(mean_net_additional_delay)` for each trip **->** avg for all trips in a day | Recovery: instability minus drift | Final outcome |

# SETUP

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the control-day weather table, the snow-day weather table and the
# stop-level trip records (in that order) from their CSV files.
control_weather, snow_weather, trip_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './control_days_DEPTH_BASED.csv',
        './snow_days_DEPTH_BASED.csv',
        'all_trip_data.csv',
    )
)

# Notebook display of the raw trip records.
trip_data

Unnamed: 0,trip_id,stop_sequence,stop_id,arrival_time,arrival_delay,arrival_uncertainty,departure_time,departure_delay,departure_uncertainty,update_timestamp,trip_schedule_relationship,date,arrival_time_iso,arrival_time_local,trip_day_id
0,14010000572359051,1,9022001010098003,1610602254,54,,1610602254,54,0.0,1610602789,0,2021-01-14,2021-01-14 05:30:54+00:00,2021-01-14 06:30:54+01:00,18
1,14010000572359051,2,9022001010627001,1610602337,78,,1610602340,81,,1610602849,0,2021-01-14,2021-01-14 05:32:17+00:00,2021-01-14 06:32:17+01:00,18
2,14010000572359051,3,9022001010203002,1610602373,74,0.0,1610602373,74,0.0,1610602789,0,2021-01-14,2021-01-14 05:32:53+00:00,2021-01-14 06:32:53+01:00,18
3,14010000572359051,4,9022001010045001,1610602488,137,0.0,1610602488,137,0.0,1610602789,0,2021-01-14,2021-01-14 05:34:48+00:00,2021-01-14 06:34:48+01:00,18
4,14010000572359051,5,9022001010040001,1610602608,185,0.0,1610602608,185,0.0,1610602789,0,2021-01-14,2021-01-14 05:36:48+00:00,2021-01-14 06:36:48+01:00,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84934,14010000649240506,21,9022001010198002,1707119585,22,0.0,1707119605,42,0.0,1707119856,0,2024-02-05,2024-02-05 07:53:05+00:00,2024-02-05 08:53:05+01:00,858
84935,14010000649240506,22,9022001010045002,1707119773,-10,0.0,1707119773,-10,0.0,1707119856,0,2024-02-05,2024-02-05 07:56:13+00:00,2024-02-05 08:56:13+01:00,858
84936,14010000649240506,23,9022001010203001,1707119837,-15,0.0,1707119848,-4,0.0,1707119856,0,2024-02-05,2024-02-05 07:57:17+00:00,2024-02-05 08:57:17+01:00,858
84937,14010000649240506,24,9022001010627002,1707119925,-21,0.0,1707119943,-3,0.0,1707119951,0,2024-02-05,2024-02-05 07:58:45+00:00,2024-02-05 08:58:45+01:00,858


# Calculations


In [3]:
# stolen from Marina's notebook 🙏
def calc_additional_arrival_delay(df):
    """Add per-stop additional-delay columns to ``df`` in place and return it.

    Two columns are written:

    * ``net_additional_arrival_delay`` -- ``arrival_delay`` at a stop minus
      ``arrival_delay`` at the previous stop (by ``stop_sequence``) of the
      same ``trip_day_id``. Positive values mean delay grew since the
      previous stop, negative values mean it shrank.
    * ``abs_additional_arrival_delay`` -- absolute value of the column above,
      so that shrinking and growing delay both count as movement.

    Rows with no previous stop in their trip, rows whose difference cannot be
    computed (missing delay), and every row with ``stop_sequence == 1`` get 0.

    Args:
        df: DataFrame with ``trip_day_id``, ``stop_sequence`` and
            ``arrival_delay`` columns. Rows may be in any order and the index
            may be arbitrary; neither is modified.

    Returns:
        The same ``df`` object, with the two columns added or overwritten.
    """
    # Take the difference along stop order rather than along whatever order
    # the rows happen to be stored in. Work on a positionally indexed copy of
    # the needed columns so the caller's row order and index stay untouched.
    in_stop_order = (
        df[['trip_day_id', 'stop_sequence', 'arrival_delay']]
        .reset_index(drop=True)
        .sort_values(['trip_day_id', 'stop_sequence'], kind='stable')
    )
    net_delay = in_stop_order.groupby('trip_day_id')['arrival_delay'].diff().fillna(0)

    # sort_index() restores the original row positions; assigning the raw
    # array avoids index alignment, so duplicate index labels are harmless.
    df['net_additional_arrival_delay'] = net_delay.sort_index().to_numpy()
    df.loc[df['stop_sequence'] == 1, 'net_additional_arrival_delay'] = 0

    # absolute value of the net delay to "penalize" recovery
    df['abs_additional_arrival_delay'] = df['net_additional_arrival_delay'].abs()

    return df

In [4]:
# The function adds its columns to the frame it is given and hands that same
# frame back, so rebinding the name is equivalent to calling it for effect.
trip_data = calc_additional_arrival_delay(trip_data)
trip_data

Unnamed: 0,trip_id,stop_sequence,stop_id,arrival_time,arrival_delay,arrival_uncertainty,departure_time,departure_delay,departure_uncertainty,update_timestamp,trip_schedule_relationship,date,arrival_time_iso,arrival_time_local,trip_day_id,net_additional_arrival_delay,abs_additional_arrival_delay
0,14010000572359051,1,9022001010098003,1610602254,54,,1610602254,54,0.0,1610602789,0,2021-01-14,2021-01-14 05:30:54+00:00,2021-01-14 06:30:54+01:00,18,0.0,0.0
1,14010000572359051,2,9022001010627001,1610602337,78,,1610602340,81,,1610602849,0,2021-01-14,2021-01-14 05:32:17+00:00,2021-01-14 06:32:17+01:00,18,24.0,24.0
2,14010000572359051,3,9022001010203002,1610602373,74,0.0,1610602373,74,0.0,1610602789,0,2021-01-14,2021-01-14 05:32:53+00:00,2021-01-14 06:32:53+01:00,18,-4.0,4.0
3,14010000572359051,4,9022001010045001,1610602488,137,0.0,1610602488,137,0.0,1610602789,0,2021-01-14,2021-01-14 05:34:48+00:00,2021-01-14 06:34:48+01:00,18,63.0,63.0
4,14010000572359051,5,9022001010040001,1610602608,185,0.0,1610602608,185,0.0,1610602789,0,2021-01-14,2021-01-14 05:36:48+00:00,2021-01-14 06:36:48+01:00,18,48.0,48.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84934,14010000649240506,21,9022001010198002,1707119585,22,0.0,1707119605,42,0.0,1707119856,0,2024-02-05,2024-02-05 07:53:05+00:00,2024-02-05 08:53:05+01:00,858,-14.0,14.0
84935,14010000649240506,22,9022001010045002,1707119773,-10,0.0,1707119773,-10,0.0,1707119856,0,2024-02-05,2024-02-05 07:56:13+00:00,2024-02-05 08:56:13+01:00,858,-32.0,32.0
84936,14010000649240506,23,9022001010203001,1707119837,-15,0.0,1707119848,-4,0.0,1707119856,0,2024-02-05,2024-02-05 07:57:17+00:00,2024-02-05 08:57:17+01:00,858,-5.0,5.0
84937,14010000649240506,24,9022001010627002,1707119925,-21,0.0,1707119943,-3,0.0,1707119951,0,2024-02-05,2024-02-05 07:58:45+00:00,2024-02-05 08:58:45+01:00,858,-6.0,6.0


In [None]:
# Step 1 - one row per (date, trip): average each delay measure over the
# trip's stops, and record the highest stop_sequence seen as num_stops.
trip_delay_metrics = (
    trip_data
    .groupby(['date', 'trip_day_id'])
    .agg(
        mean_arrival_delay=('arrival_delay', 'mean'),
        mean_net_additional_delay=('net_additional_arrival_delay', 'mean'),
        mean_abs_additional_delay=('abs_additional_arrival_delay', 'mean'),
        num_stops=('stop_sequence', 'max'),
    )
    .reset_index()
)

# Step 2 - "delay recovery index" per trip:
#   mean(|stop-to-stop change|) - |mean(stop-to-stop change)|
# near 0      -> the changes all pushed in one direction (delay kept drifting)
# large value -> the changes mostly cancelled out (fluctuations were recovered)
trip_delay_metrics['delay_recovery_index'] = trip_delay_metrics['mean_abs_additional_delay'].sub(
    trip_delay_metrics['mean_net_additional_delay'].abs()
)

# Step 3 - one row per date: plain average of the per-trip values.
daily_delay_metrics = (
    trip_delay_metrics
    .groupby(['date'])
    .agg(
        mean_arrival_delay=('mean_arrival_delay', 'mean'),
        mean_net_additional_delay=('mean_net_additional_delay', 'mean'),
        mean_abs_additional_delay=('mean_abs_additional_delay', 'mean'),
        mean_delay_recovery_index=('delay_recovery_index', 'mean'),
    )
    .reset_index()
)

# Notebook display of the daily table.
daily_delay_metrics

Unnamed: 0,date,mean_arrival_delay,mean_net_additional_delay,mean_abs_additional_delay,mean_delay_recovery_index,proportion_late_stops
0,2021-01-04,72.178571,0.224490,24.295918,22.785714,0.000000
1,2021-01-05,32.130580,-1.726562,32.414062,26.805804,0.012277
2,2021-01-07,56.006122,-2.540816,33.985714,27.871429,0.026531
3,2021-01-08,42.988095,-1.333333,33.235931,25.796537,0.002165
4,2021-01-11,151.557604,3.520737,36.624424,28.092166,0.152074
...,...,...,...,...,...,...
100,2024-02-07,94.467500,0.387500,39.935000,34.932500,0.083750
101,2024-02-09,142.500645,3.360000,41.589677,35.393548,0.138065
102,2024-02-13,121.844000,4.421333,41.173333,32.898667,0.128000
103,2024-02-14,198.947097,5.169032,40.714839,33.081290,0.232258


In [9]:
def _with_daily_delay(weather):
    """Left-join the daily delay metrics onto a weather table by ``date``.

    Every weather row is kept; dates without delay metrics get missing values.
    The ``Unnamed: 0`` and ``date_dt`` columns are dropped and ``date`` is
    moved to the front, with the remaining columns keeping their order.
    """
    joined = weather.merge(daily_delay_metrics, how='left', on="date")
    joined = joined.drop(columns=["Unnamed: 0", "date_dt"])
    trailing = [name for name in joined.columns if name != 'date']
    return joined[['date', *trailing]]


# Same treatment for the snow-day and the control-day weather tables.
snow_weather_with_delay = _with_daily_delay(snow_weather)
control_weather_with_delay = _with_daily_delay(control_weather)

In [10]:
# Save both merged tables as CSV, leaving the row index out of the files.
for merged_frame, destination in (
    (snow_weather_with_delay, "./weather_delay_SNOW.csv"),
    (control_weather_with_delay, "./weather_delay_CONTROL.csv"),
):
    merged_frame.to_csv(destination, index=False)