In [51]:
import joblib
import os
import pandas as pd
import random

In [2]:
import sys
# Import custom code
sys.path.insert(0, '..')
from src.constants import ROOT_DIR, DATA_DIR, DOWNLOAD_DIR, LOCAL_TIMEZONE
from src.helper_functions import fetch_weather, parse_gtfs_time, get_route_bearing

In [3]:
# Base directories: project data folder and the raw GTFS download folder inside it
data_path = os.path.join(ROOT_DIR, DATA_DIR)
download_path = os.path.join(data_path, DOWNLOAD_DIR)

# Raw GTFS feed files live in the download folder
routes_path, trips_path, stop_times_path, calendar_path = (
    os.path.join(download_path, gtfs_file)
    for gtfs_file in ('routes.txt', 'trips.txt', 'stop_times.txt', 'calendar.txt')
)

# Derived datasets live directly in the data folder
stops_path, avg_delay_path = (
    os.path.join(data_path, derived_file)
    for derived_file in ('stops_with_clusters.csv', 'hist_avg_delays.csv')
)

In [4]:
# Load the GTFS tables and the derived stop / delay datasets
routes_df, trips_df, stop_times_df, stops_df, avg_delay_df = (
    pd.read_csv(csv_path)
    for csv_path in (routes_path, trips_path, stop_times_path, stops_path, avg_delay_path)
)
# The calendar dates are parsed with the YYYYMMDD format while reading
calendar_df = pd.read_csv(
    calendar_path,
    parse_dates=['start_date', 'end_date'],
    date_format='%Y%m%d',
)
# joblib.load unpickles the file; only load artifacts from a trusted source
sch_rel_weights = joblib.load('../models/sch_rel_weights.pkl')

In [5]:
# Make the service window timezone-aware so it can be compared with local timestamps.
# end_date is pushed forward by one day, so the stored value is the midnight that
# follows the last service day.
# NOTE: re-running this cell fails, because tz_localize rejects already tz-aware data.
service_start = calendar_df['start_date'].dt.tz_localize(LOCAL_TIMEZONE)
service_end = calendar_df['end_date'].dt.tz_localize(LOCAL_TIMEZONE)
calendar_df['start_date'] = service_start
calendar_df['end_date'] = service_end + pd.Timedelta(days=1)

In [6]:
 # Accumulator for the values that later cells compute for the chosen trip
 trip_data = dict()

In [9]:
# Reference time for the lookup: the current wall-clock time in the local timezone.
# The value changes on every run, so downstream outputs are not reproducible as-is.
chosen_time_local = pd.Timestamp.now(LOCAL_TIMEZONE)
chosen_time_local

Timestamp('2025-05-17 16:00:20.588307-0400', tz='Canada/Eastern')

In [10]:
# Create day of week filter.
# pandas numbers weekdays Monday=0 ... Sunday=6, so the tuple index maps straight
# to the matching calendar column; a value of 1 flags the service as running that day.
weekday_columns = ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday')
day_of_week = chosen_time_local.day_of_week
day_mask = calendar_df[weekday_columns[day_of_week]] == 1

In [None]:
# Filter calendar: keep the services active on the chosen weekday and date.
# end_date has already been advanced to the midnight following the last service
# day, so it is an exclusive upper bound and is compared with a strict '<'
# (using '<=' would also match the first instant of the day after the service ends).
date_mask = (calendar_df['start_date'] <= chosen_time_local) & (chosen_time_local < calendar_df['end_date'])
filtered_calendar = calendar_df[day_mask & date_mask]
# De-duplicated list of the service ids that run at the chosen time
service_ids = filtered_calendar['service_id'].unique().tolist()

In [None]:
# Keep only the trips that belong to one of the active services
runs_today = trips_df['service_id'].isin(service_ids)
filtered_trips_df = trips_df[runs_today]

In [13]:
# Attach trip attributes to every stop time; the inner join drops stop times
# whose trip is not among the filtered trips
merged_stop_times = stop_times_df.merge(filtered_trips_df, how='inner', on='trip_id')

In [14]:
  # Stamp every row with the chosen time, then derive its calendar date
  # as a midnight timestamp without timezone
  merged_stop_times['chosen_time'] = chosen_time_local
  chosen_dates = merged_stop_times['chosen_time'].dt.date
  merged_stop_times['start_date'] = pd.to_datetime(chosen_dates)

In [None]:
# Inspect the derived start_date column
merged_stop_times['start_date']

0        2025-05-17
1        2025-05-17
2        2025-05-17
3        2025-05-17
4        2025-05-17
            ...    
496820   2025-05-17
496821   2025-05-17
496822   2025-05-17
496823   2025-05-17
496824   2025-05-17
Name: start_date, Length: 496825, dtype: datetime64[ns]

In [17]:
# Parse scheduled arrival time: the project helper combines the start_date column
# with the arrival_time column into a timestamp
merged_stop_times['sch_arrival_time'] = parse_gtfs_time(
    merged_stop_times,
    'start_date',
    'arrival_time',
    unit='ns',
)

In [18]:
# Inspect the parsed scheduled arrival timestamps
merged_stop_times['sch_arrival_time']

0        2025-05-17 05:13:00-04:00
1        2025-05-17 05:14:51-04:00
2        2025-05-17 05:16:12-04:00
3        2025-05-17 05:17:00-04:00
4        2025-05-17 05:17:31-04:00
                    ...           
496820   2025-05-18 01:04:00-04:00
496821   2025-05-18 01:06:00-04:00
496822   2025-05-18 01:07:00-04:00
496823   2025-05-18 01:08:00-04:00
496824   2025-05-18 01:10:00-04:00
Name: sch_arrival_time, Length: 496825, dtype: datetime64[ns, Canada/Eastern]

In [19]:
# Add arrival hour: the scheduled arrival truncated to the start of its hour
sch_arrivals = merged_stop_times['sch_arrival_time']
merged_stop_times['arrival_hour'] = sch_arrivals.dt.floor('h')

In [20]:
# Attach stop attributes; the left join keeps every stop time even when the
# stop has no match in stops_df
scheduled_trips_df = merged_stop_times.merge(stops_df, how='left', on='stop_id')

In [25]:
# Query parameters for the lookup below
route_id = 146  # compared against the 'route_id' column
direction = 'Ouest'  # compared against the 'trip_headsign' column
stop_id = 62173  # compared against the 'stop_id' column

In [23]:
# Keep the stop times of the chosen route travelling in the chosen direction
on_route = scheduled_trips_df['route_id'] == route_id
in_direction = scheduled_trips_df['trip_headsign'] == direction
trip_mask = on_route & in_direction
filtered_schedule_df = scheduled_trips_df.loc[trip_mask]

In [26]:
# Keep only the chosen stop's rows, ordered by scheduled arrival
at_stop = filtered_schedule_df['stop_id'] == stop_id
filtered_by_stop_df = filtered_schedule_df[at_stop].sort_values('sch_arrival_time')

In [27]:
# Add hourly frequency: each row gets the number of arrivals sharing its arrival_hour
hourly_groups = filtered_by_stop_df.groupby('arrival_hour')
filtered_by_stop_df['arrivals_per_hour'] = hourly_groups.transform('size')

In [28]:
# Upcoming arrivals: scheduled at or after the chosen time, earliest first
next_arrivals_filter = filtered_by_stop_df['sch_arrival_time'].ge(filtered_by_stop_df['chosen_time'])
next_arrivals = filtered_by_stop_df.loc[next_arrivals_filter].sort_values('sch_arrival_time')

In [29]:
# Get next arrival: the earliest row of the upcoming arrivals.
# Fail with a clear message when nothing is left to serve (e.g. after the last
# trip of the day) instead of a cryptic conversion error further down.
if next_arrivals.empty:
    raise ValueError(
        f'No scheduled arrival found at stop {stop_id} for route {route_id} '
        f'({direction}) at or after {chosen_time_local}'
    )
next_arrival = next_arrivals.iloc[0]
next_trip_id = int(next_arrival['trip_id'])
next_arrival_time = next_arrival['sch_arrival_time']

In [None]:
# Get current trip: every stop of the next arriving trip, in stop order
is_next_trip = filtered_schedule_df['trip_id'] == next_trip_id
current_trip = filtered_schedule_df[is_next_trip].sort_values('stop_sequence')

In [33]:
# Get current stop: the current trip's row for the chosen stop, squeezed to a
# Series when exactly one row matches
at_chosen_stop = current_trip['stop_id'] == stop_id
current_stop = current_trip.loc[at_chosen_stop].squeeze()

In [30]:
# Inspect the selected next arrival
next_arrival

trip_id                                         284301447
arrival_time                                     16:09:46
departure_time                                   16:09:46
stop_id                                             62173
stop_sequence                                           8
route_id                                              146
service_id                              25M-H58M000A-81-A
trip_headsign                                       Ouest
direction_id                                            1
shape_id                                          1460136
wheelchair_accessible                                   2
note_fr                                               NaN
note_en                                               NaN
chosen_time              2025-05-17 16:00:20.588307-04:00
start_date                            2025-05-17 00:00:00
sch_arrival_time                2025-05-17 16:09:46-04:00
arrival_hour                    2025-05-17 16:00:00-04:00
stop_cluster  

In [34]:
# Inspect the current trip's row at the chosen stop
current_stop

trip_id                                         284301447
arrival_time                                     16:09:46
departure_time                                   16:09:46
stop_id                                             62173
stop_sequence                                           8
route_id                                              146
service_id                              25M-H58M000A-81-A
trip_headsign                                       Ouest
direction_id                                            1
shape_id                                          1460136
wheelchair_accessible                                   2
note_fr                                               NaN
note_en                                               NaN
chosen_time              2025-05-17 16:00:20.588307-04:00
start_date                            2025-05-17 00:00:00
sch_arrival_time                2025-05-17 16:09:46-04:00
arrival_hour                    2025-05-17 16:00:00-04:00
stop_cluster  

In [41]:
# Add stop cluster, stored as a float
stop_cluster = float(next_arrival['stop_cluster'])
trip_data['stop_cluster'] = stop_cluster
trip_data

{'stop_cluster': 2.0}

In [38]:
# Coordinates of the trip's first and last stop (current_trip is ordered by stop_sequence)
first_stop, last_stop = current_trip.iloc[0], current_trip.iloc[-1]
origin_lat, origin_lon = first_stop['stop_lat'], first_stop['stop_lon']
dest_lat, dest_lon = last_stop['stop_lat'], last_stop['stop_lon']

In [47]:
# Calculate route bearing from the coordinates of the trip's first and last stop
# NOTE(review): the arguments are passed as (dest_lon, origin_lon, dest_lat, origin_lat);
# confirm this order against get_route_bearing's signature in src.helper_functions.
bearing = get_route_bearing(dest_lon, origin_lon, dest_lat, origin_lat)
trip_data['route_bearing'] = bearing
trip_data

{'stop_cluster': 2.0,
 'bearing': 183.78786897310056,
 'exp_trip_duration': 1740.0,
 'hist_avg_delay': 0.0,
 'route_bearing': 183.78786897310056}

In [43]:
# Add expected trip duration: seconds between the earliest and the latest
# scheduled arrival of the current trip
trip_arrivals = current_trip['sch_arrival_time']
trip_start, trip_end = trip_arrivals.min(), trip_arrivals.max()
one_second = pd.Timedelta(seconds=1)
trip_data['exp_trip_duration'] = (trip_end - trip_start) / one_second
trip_data

{'stop_cluster': 2.0,
 'bearing': 183.78786897310056,
 'exp_trip_duration': 1740.0}

In [46]:
# Add historical average delay for this route and stop at the arrival's hour of day.
# NOTE: .iloc[0] raises IndexError when no row matches the route/stop/hour combination.
hour = next_arrival['sch_arrival_time'].hour
same_route = avg_delay_df['route_id'] == route_id
same_stop = avg_delay_df['stop_id'] == stop_id
same_hour = avg_delay_df['hour'] == hour
hist_filter = same_route & same_stop & same_hour
filtered_avg_delay = avg_delay_df.loc[hist_filter]
hist_avg_delay = float(filtered_avg_delay['hist_avg_delay'].iloc[0])
trip_data['hist_avg_delay'] = hist_avg_delay
trip_data

{'stop_cluster': 2.0,
 'bearing': 183.78786897310056,
 'exp_trip_duration': 1740.0,
 'hist_avg_delay': 0.0}

In [49]:
# Add arrivals per hour: the count attached to the next arrival's row, as an int
trip_data['arrivals_per_hour'] = arrivals_per_hour = int(next_arrival['arrivals_per_hour'])
trip_data

{'stop_cluster': 2.0,
 'bearing': 183.78786897310056,
 'exp_trip_duration': 1740.0,
 'hist_avg_delay': 0.0,
 'route_bearing': 183.78786897310056,
 'arrivals_per_hour': 2}

In [None]:

# Inspect the loaded schedule-relationship weights
sch_rel_weights

{'Scheduled': 0.9743501832186678, 'NotScheduled': 0.025649816781332144}

In [58]:
# Inspect the values collected so far
trip_data

{'stop_cluster': 2.0,
 'bearing': 183.78786897310056,
 'exp_trip_duration': 1740.0,
 'hist_avg_delay': 0.0,
 'route_bearing': 183.78786897310056,
 'arrivals_per_hour': 2,
 'schedule_relationship_Scheduled': 1}

In [57]:
  # Sample the schedule-relationship flag: 1 for 'Scheduled', 0 for 'NotScheduled'.
  # The weights are looked up by key so that each outcome is always paired with its
  # own weight, regardless of the order in which the dict was built.
  # No seed is set in this cell, so the sampled value can differ between runs.
  sch_rel_outcomes = [1, 0]
  sch_rel_probs = [sch_rel_weights['Scheduled'], sch_rel_weights['NotScheduled']]
  sch_rel = random.choices(sch_rel_outcomes, weights=sch_rel_probs, k=1)[0]
  trip_data['schedule_relationship_Scheduled'] = sch_rel