# STM Transit Delay Feature Engineering and Data Preprocessing

This notebook preprocesses data about STM trip updates and historical weather data.

## Data Description

`trip_id` unique identifier of a trip<br>
`route_id` bus or metro line<br>
`stop_id` stop number<br>
`stop_lat` stop latitude<br>
`stop_lon` stop longitude<br>
`stop_sequence` sequence of the stop, for ordering<br>
`wheelchair_boarding` indicates whether the stop is accessible to people in wheelchairs, 1 being true and 2 being false<br>
`realtime_arrival_time` actual arrival time, as a Unix timestamp in milliseconds<br>
`scheduled_arrival_time` planned arrival time, as a Unix timestamp in milliseconds<br>
`temperature` air temperature at 2 meters above ground, in Celsius<br>
`precipitation` total precipitation (rain, showers, snow) sum of the preceding hour, in millimeters<br>
`windspeed` wind speed at 10 meters above ground, in km/h<br>
`weathercode` World Meteorological Organization (WMO) code<br>

## Imports

In [43]:
import numpy as np
import pandas as pd
import pickle
#from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
import sys

In [44]:
# Import custom code
sys.path.insert(0, '..')
from scripts.custom_functions import LOCAL_TIMEZONE, WEATHER_CODES

In [45]:
# Read the merged STM trip-update / weather dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/stm_weather_merged.csv')

## Feature Engineering

In [46]:
# Order rows by trip and then by stop sequence, so the per-trip shift() calls
# further down see each trip's stops in visiting order
df = df.sort_values(['trip_id', 'stop_sequence'], ascending=True)

In [47]:
# Parse both arrival columns from Unix-epoch milliseconds into UTC datetimes
for time_col in ('realtime_arrival_time', 'scheduled_arrival_time'):
	df[time_col] = pd.to_datetime(df[time_col], origin='unix', unit='ms', utc=True)

In [None]:
# Delay in seconds: actual arrival minus scheduled arrival
# (positive = late, negative = early)
arrival_gap = df['realtime_arrival_time'].sub(df['scheduled_arrival_time'])
df['delay'] = arrival_gap.dt.total_seconds()

In [49]:
# Delay observed at the preceding row of the same trip
# (NaN on the first row of each trip, where there is nothing to shift from)
df['delay_previous_stop'] = df['delay'].groupby(df['trip_id']).shift(1)

In [50]:
# Rows without a previous-stop delay are treated as having no prior delay (0)
df['delay_previous_stop'] = df['delay_previous_stop'].fillna(value=0)

In [51]:
# Change in delay since the previous stop: positive = delay got worse,
# negative = delay improved
df['delay_diff'] = df['delay'].sub(df['delay_previous_stop'])

In [52]:
# Coordinates of the preceding row within the same trip
for prev_col, current_col in (('prev_lat', 'stop_lat'), ('prev_lon', 'stop_lon')):
	df[prev_col] = df.groupby('trip_id')[current_col].shift(1)

In [53]:
# Where no previous coordinates exist, fall back to the stop's own coordinates
for prev_col, current_col in (('prev_lat', 'stop_lat'), ('prev_lon', 'stop_lon')):
	df[prev_col] = df[prev_col].fillna(df[current_col])

In [54]:
def get_haversine_dist(lat1, lon1, lat2, lon2) -> float:
	'''Return the great-circle (haversine) distance in meters between two points.

	Parameters
	----------
	lat1, lon1 : latitude and longitude of the first point, in decimal degrees.
	lat2, lon2 : latitude and longitude of the second point, in decimal degrees.

	Inputs may be scalars or array-likes (e.g. pandas Series); all operations
	are element-wise numpy calls, so array-like inputs give an array-like result.

	Returns
	-------
	Distance in meters, using a spherical Earth of radius 6,371,000 m.
	'''
	R = 6371000  # Earth radius in meters
	phi1 = np.radians(lat1)
	phi2 = np.radians(lat2)
	delta_phi = np.radians(lat2 - lat1)
	delta_lambda = np.radians(lon2 - lon1)

	# Haversine term: sin^2(dphi / 2) + cos(phi1) * cos(phi2) * sin^2(dlambda / 2)
	a = np.sin(delta_phi / 2.0) ** 2 + \
		np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
	# Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
	# near-antipodal points), which would make arcsin return NaN; clamp it first.
	a = np.clip(a, 0.0, 1.0)
	return R * 2 * np.arcsin(np.sqrt(a))

In [55]:
# Distance in meters from the previous stop's coordinates to this stop's
df['stop_distance'] = get_haversine_dist(
	lat1=df['prev_lat'],
	lon1=df['prev_lon'],
	lat2=df['stop_lat'],
	lon2=df['stop_lon'],
)

In [56]:
# Display the column index to check the features added so far
df.columns

Index(['trip_id', 'route_id', 'stop_id', 'stop_lat', 'stop_lon',
       'stop_sequence', 'wheelchair_boarding', 'realtime_arrival_time',
       'scheduled_arrival_time', 'temperature', 'precipitation', 'windspeed',
       'weathercode', 'delay', 'delay_previous_stop', 'delay_diff', 'prev_lat',
       'prev_lon', 'stop_distance'],
      dtype='object')

In [57]:
# Actual arrival time of the preceding row within the same trip
df['prev_actual_arrival'] = df['realtime_arrival_time'].groupby(df['trip_id']).shift(1)

In [58]:
# Where no previous arrival exists, fall back to the row's own arrival time
df['prev_actual_arrival'] = df['prev_actual_arrival'].fillna(value=df['realtime_arrival_time'])

In [67]:
# Seconds elapsed between the previous actual arrival and this one
elapsed = df['realtime_arrival_time'].sub(df['prev_actual_arrival'])
df['travel_time'] = elapsed.dt.total_seconds()

In [68]:
# Add speed between stops (in m/s).
# travel_time is exactly 0 wherever the previous arrival was filled with the row's
# own arrival time (the first stop of each trip), and possibly for stops sharing a
# timestamp. Dividing there yields NaN (0/0) or +/-inf, which would end up in the
# exported features, so the speed is set to 0 for those rows. NaN values coming
# from other causes are left untouched.
df['speed_between_stops'] = (
	(df['stop_distance'] / df['travel_time'])
	.mask(df['travel_time'] == 0, 0.0)
)

In [86]:
# Add delay per km (seconds of delay per kilometer travelled since the previous stop).
# stop_distance is 0 wherever the previous coordinates were filled with the stop's
# own coordinates (the first stop of each trip). Dividing there yields NaN or
# +/-inf, so the ratio is set to 0 for those rows. NaN values coming from other
# causes are left untouched.
df['delay_per_km'] = (
	(df['delay'] / (df['stop_distance'] / 1000))
	.mask(df['stop_distance'] == 0, 0.0)
)

## Data Preprocessing

### Encode Datetime

In [72]:
# Express the actual arrival time in the local timezone so that the day/hour
# features derived from it reflect local wall-clock time.
# NOTE(review): realtime_arrival_time was already parsed to UTC datetimes earlier
# in the notebook, so the to_datetime call here looks redundant - confirm.
rt_arrival_dt = (
	pd.to_datetime(df['realtime_arrival_time'], origin='unix', unit='ms', utc=True)
	.dt.tz_convert(LOCAL_TIMEZONE)
)
rt_arrival_dt.head()

26075   2025-04-23 04:53:00-04:00
26076   2025-04-23 04:53:56-04:00
26077   2025-04-23 04:54:42-04:00
26078   2025-04-23 04:55:08-04:00
26079   2025-04-23 04:55:35-04:00
Name: realtime_arrival_time, dtype: datetime64[ns, Canada/Eastern]

In [73]:
# Extract day of week (pandas convention: Monday=0 ... Sunday=6) and hour of day
# from the local arrival time
df = df.assign(
	day=rt_arrival_dt.dt.dayofweek,
	hour=rt_arrival_dt.dt.hour,
)

In [74]:
# Cyclical (sin/cos) encoding of day and hour: maps each value onto a circle so
# the end of a cycle sits next to its start (e.g. hour 23 next to hour 0)
for feature, period in (('day', 7), ('hour', 24)):
	angle = 2 * np.pi * df[feature] / period
	df[f'{feature}_sin'] = np.sin(angle)
	df[f'{feature}_cos'] = np.cos(angle)

In [75]:
# Flag weekend rows (day codes 5 and 6) as 1, all other days as 0
weekend_mask = df['day'].isin([5, 6])
df['is_weekend'] = weekend_mask.astype(int)

In [76]:
# Flag peak-hour rows: non-weekend days whose local hour is one of
# 7, 8, 9, 16, 17 or 18 (i.e. 07:00-09:59 and 16:00-18:59)
peak_hours = [7, 8, 9, 16, 17, 18]
peak_hour_mask = ~weekend_mask & df['hour'].isin(peak_hours)
df['is_peak_hour'] = peak_hour_mask.astype(int)

### Convert boolean columns to integer

In [77]:
# Cast to 64-bit integers; only the dtype changes, no value remapping is applied.
# NOTE(review): the data description gives the codes as 1 (true) / 2 (false), so
# this column is not a 0/1 flag after the cast - confirm that is intended.
df['wheelchair_boarding'] = df['wheelchair_boarding'].astype(np.int64)

### Use Label Encoding for route_id and stop_id

In [78]:
# Fit a label encoder on the route ids, then replace them with their integer codes.
# The fitted encoder is kept so the mapping can be reused later.
le_route = LabelEncoder().fit(df['route_id'])
df['route_id'] = le_route.transform(df['route_id'])

In [79]:
# Fit a label encoder on the stop ids, then replace them with their integer codes.
# The fitted encoder is kept so the mapping can be reused later.
le_stop = LabelEncoder().fit(df['stop_id'])
df['stop_id'] = le_stop.transform(df['stop_id'])

### Convert weathercode Into Categories

In [80]:
# Build parallel lists for np.select: one boolean mask per weather code present
# in the data, and the matching label looked up in WEATHER_CODES
weathercodes = df['weathercode'].sort_values().unique()
condition_list = [df['weathercode'] == code for code in weathercodes]
label_list = [WEATHER_CODES[code] for code in weathercodes]

In [81]:
# Assign each row the label whose mask matches; rows matching no mask get 'Unknown'
df['weather'] = np.select(
	condlist=condition_list,
	choicelist=label_list,
	default='Unknown',
)

In [82]:
# One-hot encode the weather label (first category dropped as the baseline),
# then swap the raw code and label columns for the indicator columns
one_hot = pd.get_dummies(
	df['weather'],
	prefix='weather',
	drop_first=True,
	dtype='int64',
)
df = df.drop(columns=['weathercode', 'weather']).join(one_hot)

### Reduce station coordinates to one feature (PCA)

## Export Data

In [87]:
# Display the full column index before selecting the columns to export
df.columns

Index(['trip_id', 'route_id', 'stop_id', 'stop_lat', 'stop_lon',
       'stop_sequence', 'wheelchair_boarding', 'realtime_arrival_time',
       'scheduled_arrival_time', 'temperature', 'precipitation', 'windspeed',
       'delay', 'delay_previous_stop', 'delay_diff', 'prev_lat', 'prev_lon',
       'stop_distance', 'prev_actual_arrival', 'speed_between_stops',
       'travel_time', 'delay_per_m', 'day', 'hour', 'day_sin', 'day_cos',
       'hour_sin', 'hour_cos', 'is_weekend', 'is_peak_hour',
       'weather_Light drizzle', 'weather_Mainly clear',
       'weather_Moderate drizzle', 'weather_Moderate rain', 'weather_Overcast',
       'weather_Partly cloudy', 'weather_Slight rain', 'delay_per_km'],
      dtype='object')

In [None]:
# Keep only the model-ready columns, in a fixed order, with the target `delay` last.
# ('pca_coords' is intentionally absent: that feature is not computed in this notebook.)
# NOTE(review): delay_diff is computed as delay - delay_previous_stop and
# delay_per_km is computed from delay, so these features are derived from the
# target column - confirm they are meant to be model inputs.
export_columns = [
	# identifiers and coordinates
	'route_id', 'stop_id', 'stop_lat', 'stop_lon', 'prev_lat', 'prev_lon',
	'wheelchair_boarding',
	# time features
	'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'is_weekend', 'is_peak_hour',
	# delay / movement features
	'delay_previous_stop', 'delay_diff', 'stop_distance', 'speed_between_stops',
	'travel_time', 'delay_per_km',
	# weather features
	'temperature', 'precipitation', 'windspeed',
	'weather_Light drizzle', 'weather_Mainly clear', 'weather_Moderate drizzle',
	'weather_Moderate rain', 'weather_Overcast', 'weather_Partly cloudy',
	'weather_Slight rain',
	# target
	'delay',
]
df = df[export_columns]

In [89]:
# Summarize dtypes and non-null counts of the columns selected for export
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1678987 entries, 26075 to 373539
Data columns (total 30 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   route_id                  1678987 non-null  int64  
 1   stop_id                   1678987 non-null  int64  
 2   stop_lat                  1678987 non-null  float64
 3   stop_lon                  1678987 non-null  float64
 4   prev_lat                  1678987 non-null  float64
 5   prev_lon                  1678987 non-null  float64
 6   wheelchair_boarding       1678987 non-null  int64  
 7   day_sin                   1678987 non-null  float64
 8   day_cos                   1678987 non-null  float64
 9   hour_sin                  1678987 non-null  float64
 10  hour_cos                  1678987 non-null  float64
 11  is_weekend                1678987 non-null  int64  
 12  is_peak_hour              1678987 non-null  int64  
 13  delay_previous_stop       167

In [90]:
# Persist the fitted label encoders so the same id -> code mapping can be reloaded
encoders = dict(le_route=le_route, le_stop=le_stop)
with open('../models/label_encoders.pickle', 'wb') as handle:
	pickle.dump(encoders, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [91]:
# Write the preprocessed dataset to CSV without the index column
df.to_csv(path_or_buf='../data/preprocessed.csv', index=False)

## End