## Beginning

In [89]:
import geopandas as gpd
import pandas as pd
import sys

In [None]:
# Read the three GTFS schedule files and the stop / historical-delay CSVs.
# Paths are listed in the same order as the names they are unpacked into.
calendar_df, stop_times_df, trips_df, stops_df, avg_delay_df = map(
    pd.read_csv,
    (
        '../data/download/calendar.txt',
        '../data/download/stop_times.txt',
        '../data/download/trips.txt',
        '../data/stops_with_clusters.csv',
        '../data/hist_avg_delays.csv',
    ),
)

In [3]:
# Import custom code
sys.path.insert(0, '..')
from src.constants import LOCAL_TIMEZONE
from src.helper_functions import parse_gtfs_time

In [4]:
# Candidate request times: one timestamp every 15 minutes, starting at
# min_date and ending two weeks after "now", all in the local timezone.
min_date = pd.Timestamp(
    year=2025, month=5, day=6, hour=20, minute=45, second=0, tz=LOCAL_TIMEZONE
)
today_local = pd.Timestamp.now(tz=LOCAL_TIMEZONE)
two_weeks_later = today_local + pd.Timedelta(days=14)
date_grid = pd.date_range(start=min_date, end=two_weeks_later, freq='15min', tz=LOCAL_TIMEZONE)
# Re-key by position (0..n-1) instead of by the timestamps themselves
correct_dates = date_grid.to_series(index=list(range(len(date_grid))))
correct_dates

0      2025-05-06 20:45:00-04:00
1      2025-05-06 21:00:00-04:00
2      2025-05-06 21:15:00-04:00
3      2025-05-06 21:30:00-04:00
4      2025-05-06 21:45:00-04:00
                  ...           
2077   2025-05-28 12:00:00-04:00
2078   2025-05-28 12:15:00-04:00
2079   2025-05-28 12:30:00-04:00
2080   2025-05-28 12:45:00-04:00
2081   2025-05-28 13:00:00-04:00
Length: 2082, dtype: datetime64[ns, Canada/Eastern]

In [None]:
# Required features

# exp_trip_duration
# hist_avg_delay
# route_direction_South - done
# frequency_normal
# stop_location_group
# trip_phase_middle
# frequency_very_rare
# route_direction_North - done
# route_direction_West - done
# frequency_rare
# stop_distance
# trip_phase_start

In [5]:
# Endpoint parameters
route_id = 48
direction = 'Ouest'
stop_id = 55043
# Seed the draw so the sampled request time is repeatable for a given date
# grid; change SAMPLE_SEED to exercise a different request time.
SAMPLE_SEED = 42
current_time = correct_dates.sample(n=1, random_state=SAMPLE_SEED).iloc[0]

In [7]:
# Add direction features to trip data
trip_data = {}

# One-hot flags in (South, North, West) order for each supported direction.
# 'Est' has no column of its own and is encoded as all zeros.
direction_flags = {
    'Nord': (0, 1, 0),
    'Sud': (1, 0, 0),
    'Ouest': (0, 0, 1),
    'Est': (0, 0, 0),
}

# Fail loudly on an unsupported value instead of silently leaving the
# direction features out of trip_data.
if direction not in direction_flags:
    raise ValueError(
        f"Unsupported direction {direction!r}; expected one of {sorted(direction_flags)}"
    )

south_flag, north_flag, west_flag = direction_flags[direction]
trip_data['route_direction_South'] = south_flag
trip_data['route_direction_North'] = north_flag
trip_data['route_direction_West'] = west_flag

In [8]:
trip_data

{'route_direction_South': 0,
 'route_direction_North': 0,
 'route_direction_West': 1}

In [9]:
# Convert calendar start and end dates
# start_date becomes local midnight at the start of the first service day.
calendar_df['start_date'] = pd.to_datetime(calendar_df['start_date'], format='%Y%m%d').dt.tz_localize(LOCAL_TIMEZONE)
# end_date becomes local midnight *after* the last service day. The extra day
# is added while the value is still timezone-naive: adding a 24-hour Timedelta
# to a tz-aware timestamp would land an hour off midnight whenever the added
# day spans a daylight-saving change.
calendar_df['end_date'] = (
    pd.to_datetime(calendar_df['end_date'], format='%Y%m%d') + pd.Timedelta(days=1)
).dt.tz_localize(LOCAL_TIMEZONE)
calendar_df.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,25J-H50J000S-80-S,1,1,1,1,1,0,0,2025-01-06 00:00:00-05:00,2025-03-22 00:00:00-04:00
1,25J-H50J000S-80-SV,0,0,0,0,1,0,0,2025-01-06 00:00:00-05:00,2025-03-22 00:00:00-04:00
2,25J-H54J000S-80-S,1,1,1,1,1,0,0,2025-01-06 00:00:00-05:00,2025-03-22 00:00:00-04:00
3,25J-H55J000S-81-S,1,1,1,1,1,0,0,2025-01-06 00:00:00-05:00,2025-03-22 00:00:00-04:00
4,25J-H55J000S-81-SV,0,0,0,0,1,0,0,2025-01-06 00:00:00-05:00,2025-03-22 00:00:00-04:00


In [10]:
# Create day of week filters
# Calendar flag columns ordered so that position matches day_of_week
# (0 = Monday ... 6 = Sunday).
weekday_columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

day_of_week = current_time.day_of_week
day_mask = calendar_df[weekday_columns[day_of_week]] == 1

In [11]:
# Create date filter
# end_date has one day added when the calendar is converted, so it marks the
# midnight after the last service day and must be excluded (strict <).
# Otherwise a request at exactly 00:00 would still match a service that ended
# the day before.
date_mask = (current_time >= calendar_df['start_date']) & (current_time < calendar_df['end_date'])

In [12]:
# Keep the services that run on this weekday and whose date window
# contains current_time
active_service_mask = day_mask & date_mask
filtered_calendar = calendar_df.loc[active_service_mask]
filtered_calendar

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
66,25M-H50M000S-81-S,1,1,1,1,1,0,0,2025-03-24 00:00:00-04:00,2025-06-14 00:00:00-04:00
68,25M-H54M000S-80-S,1,1,1,1,1,0,0,2025-03-24 00:00:00-04:00,2025-06-14 00:00:00-04:00
69,25M-H55M000S-80-S,1,1,1,1,1,0,0,2025-03-24 00:00:00-04:00,2025-06-14 00:00:00-04:00
73,25M-H57M000S-81-S,1,1,1,1,1,0,0,2025-03-24 00:00:00-04:00,2025-06-14 00:00:00-04:00
74,25M-H58M000S-80-S,1,1,1,1,1,0,0,2025-03-24 00:00:00-04:00,2025-06-14 00:00:00-04:00
75,25M-H59M000S-81-S,1,1,1,1,1,0,0,2025-03-24 00:00:00-04:00,2025-06-14 00:00:00-04:00
77,25M-H60M000S-80-S,1,1,1,1,1,0,0,2025-03-24 00:00:00-04:00,2025-06-14 00:00:00-04:00
78,25M-H62M000S-80-S,1,1,1,1,1,0,0,2025-03-24 00:00:00-04:00,2025-06-14 00:00:00-04:00
79,25M-GLOBAUX-01-S,1,1,1,1,1,0,0,2025-03-24 00:00:00-04:00,2025-06-14 00:00:00-04:00
100,25M-H56M000S-83-S,1,1,1,1,1,0,0,2025-03-31 00:00:00-04:00,2025-06-14 00:00:00-04:00


In [1]:
# Distinct service_id values of the active services, in first-seen order
service_ids = filtered_calendar['service_id'].drop_duplicates().tolist()
service_ids

NameError: name 'filtered_calendar' is not defined

## Merge stop times with trips

In [None]:
# Attach the trip-level columns to every stop time; the left join keeps each
# stop_times row even if its trip_id has no match in trips_df.
merged_stop_times = stop_times_df.merge(trips_df, how='left', on='trip_id')

In [25]:
merged_stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,current_time,start_date,sch_arrival_time,arrival_hour,arrivals_per_hour,route_id,service_id,trip_headsign,direction_id,shape_id,wheelchair_accessible,note_fr,note_en
0,281570788,05:58:00,05:58:00,51095,1,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 05:58:00-04:00,2025-05-28 05:00:00-04:00,6,16,25J-H50J000A-80-A,Ouest,1,160088,1,,
1,281570788,05:59:39,05:59:39,51126,2,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 05:59:39-04:00,2025-05-28 05:00:00-04:00,6,16,25J-H50J000A-80-A,Ouest,1,160088,1,,
2,281570788,06:00:06,06:00:06,51113,3,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:00:06-04:00,2025-05-28 06:00:00-04:00,51,16,25J-H50J000A-80-A,Ouest,1,160088,1,,
3,281570788,06:00:44,06:00:44,51084,4,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:00:44-04:00,2025-05-28 06:00:00-04:00,51,16,25J-H50J000A-80-A,Ouest,1,160088,1,,
4,281570788,06:01:17,06:01:17,51063,5,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:01:17-04:00,2025-05-28 06:00:00-04:00,51,16,25J-H50J000A-80-A,Ouest,1,160088,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6629620,283978309,24:24:00,24:24:00,58,8,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-29 00:24:00-04:00,2025-05-29 00:00:00-04:00,120,5,25M-GLOBAUX-01-F3,Station Snowdon,1,51762,1,,
6629621,283978309,24:25:00,24:25:00,57,9,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-29 00:25:00-04:00,2025-05-29 00:00:00-04:00,117,5,25M-GLOBAUX-01-F3,Station Snowdon,1,51762,1,,
6629622,283978309,24:27:00,24:27:00,56,10,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-29 00:27:00-04:00,2025-05-29 00:00:00-04:00,119,5,25M-GLOBAUX-01-F3,Station Snowdon,1,51762,1,,
6629623,283978309,24:28:00,24:28:00,55,11,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-29 00:28:00-04:00,2025-05-29 00:00:00-04:00,119,5,25M-GLOBAUX-01-F3,Station Snowdon,1,51762,1,,


In [26]:
# Add endpoint time to trips dataframe
# The scalar current_time is broadcast to every row so later cells can derive
# the service date from it and compare it with each scheduled arrival.
merged_stop_times['current_time'] = current_time
merged_stop_times['current_time']

0         2025-05-28 07:15:00-04:00
1         2025-05-28 07:15:00-04:00
2         2025-05-28 07:15:00-04:00
3         2025-05-28 07:15:00-04:00
4         2025-05-28 07:15:00-04:00
                     ...           
6629620   2025-05-28 07:15:00-04:00
6629621   2025-05-28 07:15:00-04:00
6629622   2025-05-28 07:15:00-04:00
6629623   2025-05-28 07:15:00-04:00
6629624   2025-05-28 07:15:00-04:00
Name: current_time, Length: 6629625, dtype: datetime64[ns, Canada/Eastern]

In [27]:
# Service date (timezone-naive midnight) taken from the request time.
# Read current_time from merged_stop_times: that is the frame the column was
# added to, and stop_times_df has no such column on a fresh kernel.
merged_stop_times['start_date'] = pd.to_datetime(merged_stop_times['current_time'].dt.date)
merged_stop_times['start_date']

0         2025-05-28
1         2025-05-28
2         2025-05-28
3         2025-05-28
4         2025-05-28
             ...    
6629620   2025-05-28
6629621   2025-05-28
6629622   2025-05-28
6629623   2025-05-28
6629624   2025-05-28
Name: start_date, Length: 6629625, dtype: datetime64[ns]

In [28]:
# Build the scheduled arrival timestamp from the service date column and the
# GTFS arrival_time string using the project helper parse_gtfs_time.
# NOTE(review): presumably rolls times of 24:00:00 and later into the next day
# and returns local tz-aware timestamps - confirm in src.helper_functions.
merged_stop_times['sch_arrival_time'] = parse_gtfs_time(merged_stop_times, 'start_date', 'arrival_time', unit='ns')

In [None]:
# Truncate each scheduled arrival to the start of its hour; this column is the
# grouping key for the arrivals-per-hour count computed further down.
merged_stop_times['arrival_hour'] = merged_stop_times['sch_arrival_time'].dt.floor('h')
merged_stop_times['arrival_hour'] 

0         2025-05-28 05:00:00-04:00
1         2025-05-28 05:00:00-04:00
2         2025-05-28 06:00:00-04:00
3         2025-05-28 06:00:00-04:00
4         2025-05-28 06:00:00-04:00
                     ...           
6629620   2025-05-29 00:00:00-04:00
6629621   2025-05-29 00:00:00-04:00
6629622   2025-05-29 00:00:00-04:00
6629623   2025-05-29 00:00:00-04:00
6629624   2025-05-29 00:00:00-04:00
Name: arrival_hour, Length: 6629625, dtype: datetime64[ns, Canada/Eastern]

## Merge stops

In [38]:
# Attach the stop attributes to every scheduled stop time; the left join keeps
# rows whose stop_id has no match in stops_df.
scheduled_trips_df = merged_stop_times.merge(stops_df, how='left', on='stop_id')

In [39]:
scheduled_trips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6629625 entries, 0 to 6629624
Data columns (total 25 columns):
 #   Column                 Dtype                         
---  ------                 -----                         
 0   trip_id                int64                         
 1   arrival_time           object                        
 2   departure_time         object                        
 3   stop_id                int64                         
 4   stop_sequence          int64                         
 5   current_time           datetime64[ns, Canada/Eastern]
 6   start_date             datetime64[ns]                
 7   sch_arrival_time       datetime64[ns, Canada/Eastern]
 8   arrival_hour           datetime64[ns, Canada/Eastern]
 9   arrivals_per_hour      int64                         
 10  route_id               int64                         
 11  service_id             object                        
 12  trip_headsign          object                        
 1

## Filter trips

In [47]:
# Filter trips by service_id, route_id and direction
service_match = scheduled_trips_df['service_id'].isin(service_ids)
route_match = scheduled_trips_df['route_id'] == route_id
# Plain substring match on the headsign: regex=False keeps the direction text
# from being interpreted as a pattern, and na=False turns missing headsigns
# into a clean False instead of letting NaN leak into the boolean mask.
direction_match = scheduled_trips_df['trip_headsign'].str.contains(direction, na=False, regex=False)
trip_mask = service_match & route_match & direction_match
filtered_trips_df = scheduled_trips_df[trip_mask]
filtered_trips_df

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,current_time,start_date,sch_arrival_time,arrival_hour,arrivals_per_hour,...,wheelchair_accessible,note_fr,note_en,stop_location_group,stop_name,neighbourhood,stop_lat,stop_lon,location_type,wheelchair_boarding
3626428,285029373,06:10:00,06:10:00,61605,1,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:10:00-04:00,2025-05-28 06:00:00-04:00,21,...,1,Ce bus passe par Gouin et Ozias-Leduc.,This bus runs via Gouin and Ozias-Leduc.,6.0,Gare Rivière-des-Prairies,,45.661934,-73.538434,0.0,1.0
3626429,285029373,06:10:47,06:10:47,60316,2,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:10:47-04:00,2025-05-28 06:00:00-04:00,21,...,1,Ce bus passe par Gouin et Ozias-Leduc.,This bus runs via Gouin and Ozias-Leduc.,6.0,Maurice-Duplessis / Saint-Jean-Baptiste,,45.659521,-73.541707,0.0,1.0
3626430,285029373,06:13:00,06:13:00,54284,3,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:13:00-04:00,2025-05-28 06:00:00-04:00,18,...,1,Ce bus passe par Gouin et Ozias-Leduc.,This bus runs via Gouin and Ozias-Leduc.,6.0,Maurice-Duplessis / 70e Avenue,,45.654312,-73.554343,0.0,1.0
3626431,285029373,06:13:39,06:13:39,54270,4,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:13:39-04:00,2025-05-28 06:00:00-04:00,18,...,1,Ce bus passe par Gouin et Ozias-Leduc.,This bus runs via Gouin and Ozias-Leduc.,6.0,Maurice-Duplessis / 62e Avenue,,45.652137,-73.559007,0.0,1.0
3626432,285029373,06:14:00,06:14:00,54397,5,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:14:00-04:00,2025-05-28 06:00:00-04:00,18,...,1,Ce bus passe par Gouin et Ozias-Leduc.,This bus runs via Gouin and Ozias-Leduc.,6.0,Maurice-Duplessis / Rivière-des-Prairies,,45.650896,-73.561423,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5798159,285283538,08:51:04,08:51:04,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 08:51:04-04:00,2025-05-28 08:00:00-04:00,35,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5798160,285283538,08:54:00,08:54:00,50513,44,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 08:54:00-04:00,2025-05-28 08:00:00-04:00,33,...,2,,,2.0,Henri-Bourassa / Saint-Michel,,45.584230,-73.650413,0.0,1.0
5798161,285283538,08:57:41,08:57:41,50413,45,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 08:57:41-04:00,2025-05-28 08:00:00-04:00,36,...,2,,,2.0,Henri-Bourassa / De Saint-Firmin,Sault-au-Récollet,45.573080,-73.657268,0.0,1.0
5798162,285283538,09:01:00,09:01:00,50350,46,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 09:01:00-04:00,2025-05-28 09:00:00-04:00,37,...,2,,,2.0,Henri-Bourassa / Christophe-Colomb,Sault-au-Récollet,45.562812,-73.662532,0.0,1.0


In [49]:
# Keep only the visits to the requested stop, earliest scheduled arrival first
at_requested_stop = filtered_trips_df['stop_id'].eq(stop_id)
filtered_by_stop_df = filtered_trips_df.loc[at_requested_stop].sort_values(by='sch_arrival_time')
filtered_by_stop_df

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,current_time,start_date,sch_arrival_time,arrival_hour,arrivals_per_hour,...,wheelchair_accessible,note_fr,note_en,stop_location_group,stop_name,neighbourhood,stop_lat,stop_lon,location_type,wheelchair_boarding
5289962,284737632,04:52:42,04:52:42,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 04:52:42-04:00,2025-05-28 04:00:00-04:00,3,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5290302,284737649,05:11:42,05:11:42,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 05:11:42-04:00,2025-05-28 05:00:00-04:00,27,...,1,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5291063,284737669,05:26:42,05:26:42,55043,46,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 05:26:42-04:00,2025-05-28 05:00:00-04:00,9,...,2,Ce bus passe par Gouin et Ozias-Leduc.,This bus runs via Gouin and Ozias-Leduc.,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5291541,284737690,05:35:42,05:35:42,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 05:35:42-04:00,2025-05-28 05:00:00-04:00,27,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5294967,284737811,05:44:42,05:44:42,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 05:44:42-04:00,2025-05-28 05:00:00-04:00,27,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5296074,284737857,05:54:42,05:54:42,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 05:54:42-04:00,2025-05-28 05:00:00-04:00,27,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5297348,284737930,06:03:42,06:03:42,55043,46,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:03:42-04:00,2025-05-28 06:00:00-04:00,18,...,2,Ce bus passe par Gouin et Ozias-Leduc.,This bus runs via Gouin and Ozias-Leduc.,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5300841,284738068,06:10:42,06:10:42,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:10:42-04:00,2025-05-28 06:00:00-04:00,36,...,1,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5301995,284738118,06:17:42,06:17:42,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:17:42-04:00,2025-05-28 06:00:00-04:00,36,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5095323,285002349,06:23:42,06:23:42,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:23:42-04:00,2025-05-28 06:00:00-04:00,36,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0


In [50]:
# Calculate hourly frequency
# Each row gets the number of rows that share its arrival_hour, i.e. how many
# arrivals are scheduled at this stop within that hour.
hourly_counts = filtered_by_stop_df.groupby('arrival_hour')['arrival_hour'].transform('size')
filtered_by_stop_df['arrivals_per_hour'] = hourly_counts
filtered_by_stop_df['arrivals_per_hour'].value_counts()

arrivals_per_hour
6     24
5     20
4     16
3     12
10    10
9      9
1      3
2      2
Name: count, dtype: int64

In [51]:
filtered_by_stop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96 entries, 5289962 to 5327504
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype                         
---  ------                 --------------  -----                         
 0   trip_id                96 non-null     int64                         
 1   arrival_time           96 non-null     object                        
 2   departure_time         96 non-null     object                        
 3   stop_id                96 non-null     int64                         
 4   stop_sequence          96 non-null     int64                         
 5   current_time           96 non-null     datetime64[ns, Canada/Eastern]
 6   start_date             96 non-null     datetime64[ns]                
 7   sch_arrival_time       96 non-null     datetime64[ns, Canada/Eastern]
 8   arrival_hour           96 non-null     datetime64[ns, Canada/Eastern]
 9   arrivals_per_hour      96 non-null     int64                 

In [52]:
# Filter by endpoint time
# Keep arrivals scheduled at or after the request time, soonest first
next_arrivals_filter = filtered_by_stop_df['current_time'] <= filtered_by_stop_df['sch_arrival_time']
next_arrivals = filtered_by_stop_df.loc[next_arrivals_filter].sort_values(by='sch_arrival_time')
next_arrivals

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,current_time,start_date,sch_arrival_time,arrival_hour,arrivals_per_hour,...,wheelchair_accessible,note_fr,note_en,stop_location_group,stop_name,neighbourhood,stop_lat,stop_lon,location_type,wheelchair_boarding
5312862,284738591,07:15:20,07:15:20,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:15:20-04:00,2025-05-28 07:00:00-04:00,9,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5305627,284738295,07:22:20,07:22:20,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:22:20-04:00,2025-05-28 07:00:00-04:00,9,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5314275,284738659,07:27:20,07:27:20,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:27:20-04:00,2025-05-28 07:00:00-04:00,9,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5314782,284738696,07:31:20,07:31:20,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:31:20-04:00,2025-05-28 07:00:00-04:00,9,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5316194,284738750,07:38:20,07:38:20,55043,46,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:38:20-04:00,2025-05-28 07:00:00-04:00,9,...,2,Ce bus passe par Gouin et Ozias-Leduc.,This bus runs via Gouin and Ozias-Leduc.,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5291633,284737694,07:43:20,07:43:20,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:43:20-04:00,2025-05-28 07:00:00-04:00,9,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5320920,284738934,07:53:20,07:53:20,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:53:20-04:00,2025-05-28 07:00:00-04:00,9,...,1,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5302201,284738129,08:03:20,08:03:20,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 08:03:20-04:00,2025-05-28 08:00:00-04:00,6,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5332094,284739312,08:14:42,08:14:42,55043,46,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 08:14:42-04:00,2025-05-28 08:00:00-04:00,6,...,2,Ce bus passe par Gouin et Ozias-Leduc.,This bus runs via Gouin and Ozias-Leduc.,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5302091,284738122,08:22:04,08:22:04,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 08:22:04-04:00,2025-05-28 08:00:00-04:00,6,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0


In [75]:
# Get current trip
# next_arrivals is sorted by scheduled arrival, so its first row is the next
# bus. If nothing is scheduled at or after the request time (for example a
# request after the last trip of the day), stop with a clear message instead
# of failing on the int() conversion of an empty selection.
if next_arrivals.empty:
    raise ValueError(
        f"No scheduled arrival at stop {stop_id} for route {route_id} ({direction}) "
        f"at or after {current_time}"
    )

next_arrival = next_arrivals.iloc[0]
next_trip_id = int(next_arrival['trip_id'])
next_arrival_time = next_arrival['sch_arrival_time']
# All stops of that trip, in travel order
current_trip = filtered_trips_df[filtered_trips_df['trip_id'] == next_trip_id].sort_values('stop_sequence')
current_trip

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,current_time,start_date,sch_arrival_time,arrival_hour,arrivals_per_hour,...,wheelchair_accessible,note_fr,note_en,stop_location_group,stop_name,neighbourhood,stop_lat,stop_lon,location_type,wheelchair_boarding
5312820,284738591,06:37:00,06:37:00,61605,1,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:37:00-04:00,2025-05-28 06:00:00-04:00,39,...,2,,,6.0,Gare Rivière-des-Prairies,,45.661934,-73.538434,0.0,1.0
5312821,284738591,06:37:47,06:37:47,60316,2,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:37:47-04:00,2025-05-28 06:00:00-04:00,39,...,2,,,6.0,Maurice-Duplessis / Saint-Jean-Baptiste,,45.659521,-73.541707,0.0,1.0
5312822,284738591,06:40:00,06:40:00,54284,3,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:40:00-04:00,2025-05-28 06:00:00-04:00,45,...,2,,,6.0,Maurice-Duplessis / 70e Avenue,,45.654312,-73.554343,0.0,1.0
5312823,284738591,06:41:18,06:41:18,54270,4,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:41:18-04:00,2025-05-28 06:00:00-04:00,45,...,2,,,6.0,Maurice-Duplessis / 62e Avenue,,45.652137,-73.559007,0.0,1.0
5312824,284738591,06:42:00,06:42:00,54397,5,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:42:00-04:00,2025-05-28 06:00:00-04:00,45,...,2,,,6.0,Maurice-Duplessis / Rivière-des-Prairies,,45.650896,-73.561423,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5312862,284738591,07:15:20,07:15:20,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:15:20-04:00,2025-05-28 07:00:00-04:00,44,...,2,,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0
5312863,284738591,07:19:00,07:19:00,50513,44,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:19:00-04:00,2025-05-28 07:00:00-04:00,47,...,2,,,2.0,Henri-Bourassa / Saint-Michel,,45.584230,-73.650413,0.0,1.0
5312864,284738591,07:22:41,07:22:41,50413,45,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:22:41-04:00,2025-05-28 07:00:00-04:00,44,...,2,,,2.0,Henri-Bourassa / De Saint-Firmin,Sault-au-Récollet,45.573080,-73.657268,0.0,1.0
5312865,284738591,07:26:00,07:26:00,50350,46,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:26:00-04:00,2025-05-28 07:00:00-04:00,47,...,2,,,2.0,Henri-Bourassa / Christophe-Colomb,Sault-au-Récollet,45.562812,-73.662532,0.0,1.0


In [103]:
# Get data for time of day
# Morning covers hours 6-11 and evening covers hour 18 onwards; every other
# hour (before 6, and 12-17) leaves both flags at 0.
hour = next_arrival_time.hour
trip_data['time_of_day_morning'] = int(6 <= hour < 12)
trip_data['time_of_day_evening'] = int(hour >= 18)

In [107]:
next_arrival_time

Timestamp('2025-05-28 07:15:20-0400', tz='Canada/Eastern')

In [105]:
# Get data for peak hour
day_of_week = next_arrival_time.day_of_week
is_weekday = day_of_week < 5  # 5 and 6 are Saturday and Sunday
in_peak_window = hour in (6, 7, 8, 9, 15, 16, 17, 18)
trip_data['is_peak_hour'] = int(is_weekday and in_peak_window)

In [106]:
trip_data

{'route_direction_South': 0,
 'route_direction_North': 0,
 'route_direction_West': 1,
 'exp_trip_duration': 3300.0,
 'hist_avg_delay': 74.1146085048364,
 'frequency_normal': 0,
 'frequency_very_rare': 0,
 'frequency_rare': 0,
 'stop_location_group': 2.0,
 'stop_distance': 515.7914726928796,
 'trip_phase_middle': 0,
 'trip_phase_start': 0,
 'time_of_day_morning': 1,
 'time_of_day_evening': 0,
 'is_peak_hour': 1}

In [None]:
# Get previous coordinates
# Shift each coordinate column down one row so every stop also carries the
# position of the stop before it; the first stop of the trip gets NaN.
for source_col, shifted_col in (('stop_lat', 'prev_lat'), ('stop_lon', 'prev_lon')):
    current_trip[shifted_col] = current_trip[source_col].shift(periods=1)

In [88]:
current_trip

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,current_time,start_date,sch_arrival_time,arrival_hour,arrivals_per_hour,...,note_en,stop_location_group,stop_name,neighbourhood,stop_lat,stop_lon,location_type,wheelchair_boarding,prev_lat,prev_lon
5312820,284738591,06:37:00,06:37:00,61605,1,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:37:00-04:00,2025-05-28 06:00:00-04:00,39,...,,6.0,Gare Rivière-des-Prairies,,45.661934,-73.538434,0.0,1.0,,
5312821,284738591,06:37:47,06:37:47,60316,2,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:37:47-04:00,2025-05-28 06:00:00-04:00,39,...,,6.0,Maurice-Duplessis / Saint-Jean-Baptiste,,45.659521,-73.541707,0.0,1.0,45.661934,-73.538434
5312822,284738591,06:40:00,06:40:00,54284,3,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:40:00-04:00,2025-05-28 06:00:00-04:00,45,...,,6.0,Maurice-Duplessis / 70e Avenue,,45.654312,-73.554343,0.0,1.0,45.659521,-73.541707
5312823,284738591,06:41:18,06:41:18,54270,4,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:41:18-04:00,2025-05-28 06:00:00-04:00,45,...,,6.0,Maurice-Duplessis / 62e Avenue,,45.652137,-73.559007,0.0,1.0,45.654312,-73.554343
5312824,284738591,06:42:00,06:42:00,54397,5,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 06:42:00-04:00,2025-05-28 06:00:00-04:00,45,...,,6.0,Maurice-Duplessis / Rivière-des-Prairies,,45.650896,-73.561423,0.0,1.0,45.652137,-73.559007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5312862,284738591,07:15:20,07:15:20,55043,43,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:15:20-04:00,2025-05-28 07:00:00-04:00,44,...,,2.0,Henri-Bourassa / Saint-Vital,,45.590915,-73.646863,0.0,1.0,45.593724,-73.644549
5312863,284738591,07:19:00,07:19:00,50513,44,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:19:00-04:00,2025-05-28 07:00:00-04:00,47,...,,2.0,Henri-Bourassa / Saint-Michel,,45.584230,-73.650413,0.0,1.0,45.590915,-73.646863
5312864,284738591,07:22:41,07:22:41,50413,45,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:22:41-04:00,2025-05-28 07:00:00-04:00,44,...,,2.0,Henri-Bourassa / De Saint-Firmin,Sault-au-Récollet,45.573080,-73.657268,0.0,1.0,45.584230,-73.650413
5312865,284738591,07:26:00,07:26:00,50350,46,2025-05-28 07:15:00-04:00,2025-05-28,2025-05-28 07:26:00-04:00,2025-05-28 07:00:00-04:00,47,...,,2.0,Henri-Bourassa / Christophe-Colomb,Sault-au-Récollet,45.562812,-73.662532,0.0,1.0,45.573080,-73.657268


In [90]:
# Create GeoDataFrames for previous and current stop
def _project_stop_points(trip_df, lon_col, lat_col):
    """Return the given lon/lat columns of trip_df as point geometries in EPSG:3857.

    The coordinates are read as EPSG:4326 (WGS84 longitude/latitude) and then
    reprojected to EPSG:3857. The result keeps trip_df's index.
    """
    points = gpd.points_from_xy(trip_df[lon_col], trip_df[lat_col])
    lonlat_gdf = gpd.GeoDataFrame(trip_df[[lon_col, lat_col]], geometry=points, crs='EPSG:4326')
    return lonlat_gdf.to_crs(epsg=3857)


# NOTE(review): EPSG:3857 is Web Mercator, whose distances are stretched away
# from the equator. Confirm the model was trained on distances computed the
# same way before switching to a local projected CRS.
sch_gdf1 = _project_stop_points(current_trip, 'prev_lon', 'prev_lat')
sch_gdf2 = _project_stop_points(current_trip, 'stop_lon', 'stop_lat')

In [91]:
# Calculate distance from previous stop
# Row-wise distance between each stop's projected point (sch_gdf2) and the
# projected point of the stop before it (sch_gdf1), in EPSG:3857 units.
current_trip['stop_distance'] = sch_gdf1.distance(sch_gdf2)
current_trip['stop_distance'].describe()

count      46.000000
mean      521.836066
std       392.676250
min       180.874635
25%       294.508249
50%       406.245776
75%       529.348748
max      1930.555864
Name: stop_distance, dtype: float64

In [92]:
# Replace null value with 0
# The first stop of the trip has no previous stop (its shifted coordinates
# are NaN), so its missing distance is set to 0.
current_trip['stop_distance'] = current_trip['stop_distance'].fillna(0)

In [96]:
# Distance from the previous stop for the requested stop on this trip
at_requested_stop = current_trip['stop_id'] == stop_id
stop_distance = float(current_trip.loc[at_requested_stop, 'stop_distance'].squeeze())

In [97]:
# Store the distance from the previous stop as a model feature
trip_data['stop_distance'] = stop_distance

In [99]:
# Get trip progress
# Fraction of the trip completed at each stop: its stop_sequence divided by
# the number of stops on the trip.
# NOTE(review): assumes stop_sequence runs 1..N without gaps - confirm for this feed.
stop_count = len(current_trip)
current_trip['trip_progress'] = current_trip['stop_sequence'].div(stop_count)

In [100]:
current_trip['trip_progress']

5312820    0.021277
5312821    0.042553
5312822    0.063830
5312823    0.085106
5312824    0.106383
             ...   
5312862    0.914894
5312863    0.936170
5312864    0.957447
5312865    0.978723
5312866    1.000000
Name: trip_progress, Length: 47, dtype: float64

In [101]:
# Trip progress at the requested stop, as a plain float
at_requested_stop = current_trip['stop_id'] == stop_id
trip_progress = float(current_trip.loc[at_requested_stop, 'trip_progress'].squeeze())
trip_progress

0.9148936170212766

In [102]:
# Get one-hot features from trip progress
# Below 0.33 is the start phase and 0.33 up to (not including) 0.67 is the
# middle phase; anything from 0.67 on leaves both flags at 0.
trip_data['trip_phase_middle'] = int(0.33 <= trip_progress < 0.67)
trip_data['trip_phase_start'] = int(trip_progress < 0.33)

In [None]:
# Required features

# exp_trip_duration - done
# hist_avg_delay - done
# route_direction_South - done
# frequency_normal - done
# stop_location_group - done
# trip_phase_middle - done
# frequency_very_rare - done
# route_direction_North - done
# route_direction_West - done
# frequency_rare - done
# stop_distance - done
# trip_phase_start - done

In [None]:
# Calculate expected trip duration
# Seconds between the earliest and the latest scheduled arrival on the trip
first_scheduled_arrival = current_trip['sch_arrival_time'].min()
last_scheduled_arrival = current_trip['sch_arrival_time'].max()
exp_trip_duration = (last_scheduled_arrival - first_scheduled_arrival).total_seconds()
exp_trip_duration

3300.0

In [66]:
# Store the scheduled trip duration (in seconds) as a model feature
trip_data['exp_trip_duration'] = exp_trip_duration

In [None]:
# Get historical average delay
# Look up the row for this route, stop and hour of the next arrival, and take
# the first match.
hist_filter = (
    avg_delay_df['route_id'].eq(route_id)
    & avg_delay_df['stop_id'].eq(stop_id)
    & avg_delay_df['hour'].eq(hour)
)
filtered_avg_delay = avg_delay_df.loc[hist_filter]
hist_avg_delay = float(filtered_avg_delay['hist_avg_delay'].iloc[0])
hist_avg_delay

74.1146085048364

In [67]:
# Store the historical average delay lookup as a model feature
trip_data['hist_avg_delay'] = hist_avg_delay

In [76]:
# Get frequency
# Number of arrivals scheduled at the requested stop in the same hour as the
# next arrival (the arrivals_per_hour value of that row).
frequency = int(next_arrival['arrivals_per_hour'])
frequency

9

In [98]:
trip_data

{'route_direction_South': 0,
 'route_direction_North': 0,
 'route_direction_West': 1,
 'exp_trip_duration': 3300.0,
 'hist_avg_delay': 74.1146085048364,
 'frequency_normal': 0,
 'frequency_very_rare': 0,
 'frequency_rare': 0,
 'stop_location_group': 2.0,
 'stop_distance': 515.7914726928796}

In [79]:
# Add one-hot frequency attributes to trip_data
# Buckets by arrivals per hour: 1-2 is very rare, 3-4 is rare, 5-6 is normal.
# Any other value leaves all three flags at 0.
# The 3-4 bucket used to set frequency_very_rare (a copy of the 1-2 branch),
# so frequency_rare could never be 1.
trip_data['frequency_normal'] = int(frequency in (5, 6))
trip_data['frequency_very_rare'] = int(frequency in (1, 2))
trip_data['frequency_rare'] = int(frequency in (3, 4))

In [83]:
# Get stop location group
# Taken from the next-arrival row, i.e. the group of the requested stop
stop_location_group = float(next_arrival['stop_location_group'])
stop_location_group

2.0

In [84]:
# Store the stop location group as a model feature
trip_data['stop_location_group'] = stop_location_group

In [109]:
# Bundle the next scheduled arrival time with the assembled feature dict
result = dict(next_arrival_time=next_arrival_time, trip_data=trip_data)
result

{'next_arrival_time': Timestamp('2025-05-28 07:15:20-0400', tz='Canada/Eastern'),
 'trip_data': {'route_direction_South': 0,
  'route_direction_North': 0,
  'route_direction_West': 1,
  'exp_trip_duration': 3300.0,
  'hist_avg_delay': 74.1146085048364,
  'frequency_normal': 0,
  'frequency_very_rare': 0,
  'frequency_rare': 0,
  'stop_location_group': 2.0,
  'stop_distance': 515.7914726928796,
  'trip_phase_middle': 0,
  'trip_phase_start': 0,
  'time_of_day_morning': 1,
  'time_of_day_evening': 0,
  'is_peak_hour': 1}}

## End