# STM Transit Delay Feature Engineering and Data Preprocessing

This notebook engineers features from, and preprocesses, STM trip-update data merged with historical weather data.

## Data Description

`trip_id` unique identifier of a trip<br>
`route_id` bus or metro line<br>
`stop_id` stop number<br>
`stop_lat` stop latitude<br>
`stop_lon` stop longitude<br>
`stop_sequence` sequence of the stop, for ordering<br>
`wheelchair_boarding` indicates whether the stop is accessible to people using a wheelchair (1 = accessible, 2 = not accessible)<br>
`realtime_arrival_time` actual arrival time, as a Unix timestamp in milliseconds<br>
`scheduled_arrival_time` planned arrival time, as a Unix timestamp in milliseconds<br>
`temperature` air temperature at 2 meters above ground, in Celsius<br>
`precipitation` total precipitation (rain, showers, snow) sum of the preceding hour, in millimeters<br>
`windspeed` wind speed at 10 meters above ground, in km/h<br>
`weathercode` World Meteorological Organization (WMO) code<br>

## Imports

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
import sys

In [None]:
# Import custom code
sys.path.insert(0, '..')
from scripts.custom_functions import LOCAL_TIMEZONE, WEATHER_CODES

In [None]:
# Read the merged STM / weather CSV into the working dataframe
merged_csv_path = '../data/stm_weather_merged.csv'
df = pd.read_csv(merged_csv_path)

## Feature Engineering

In [None]:
# Order rows by trip, then by the stop's position within that trip
sort_keys = ['trip_id', 'stop_sequence']
df = df.sort_values(sort_keys)

In [None]:
# Parse both arrival columns from Unix epoch milliseconds into UTC-aware datetimes
for time_col in ('realtime_arrival_time', 'scheduled_arrival_time'):
    df[time_col] = pd.to_datetime(df[time_col], origin='unix', unit='ms', utc=True)

In [None]:
# Calculate delay in seconds (real - scheduled); a positive value means the
# arrival happened later than planned
df['delay'] = (df['realtime_arrival_time'] - df['scheduled_arrival_time']).dt.total_seconds()

# The export step selects a `delay_previous_stop` column that no cell in this
# notebook creates, so derive it here unless the input data already provides it.
# Rows are sorted by trip_id and stop_sequence, so shifting within each trip
# yields the delay observed at the preceding stop of the same trip.
# NOTE(review): the first stop of a trip has no predecessor and is filled with 0 - confirm.
# NOTE(review): assumes each trip_id appears for a single run in the data; if the
# same trip_id recurs (e.g. on several days), the grouping key needs a date too.
if 'delay_previous_stop' not in df.columns:
    df['delay_previous_stop'] = df.groupby('trip_id')['delay'].shift(1).fillna(0)

## Data Preprocessing

### Encode Datetime

In [None]:
# Re-express the actual arrival timestamps in the local timezone so the
# calendar features derived from them reflect local time
local_arrival = df['realtime_arrival_time'].dt.tz_convert(LOCAL_TIMEZONE)
df['realtime_arrival_time'] = local_arrival
df.head()

In [None]:
# Extract the weekday number (Monday=0 ... Sunday=6) and the hour of day
# from the actual arrival time
arrival_dt = df['realtime_arrival_time'].dt
df['day'] = arrival_dt.dayofweek
df['hour'] = arrival_dt.hour

In [None]:
# Cyclical encoding: project day and hour onto a circle so that the last value
# of a cycle sits next to the first one (e.g. hour 23 next to hour 0)
day_angle = 2 * np.pi * df['day'] / 7
hour_angle = 2 * np.pi * df['hour'] / 24

df['day_sin'] = np.sin(day_angle)
df['day_cos'] = np.cos(day_angle)

df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

In [None]:
# Flag arrivals on day 5 or 6 as weekend (1) and everything else as 0;
# the mask itself is reused by the peak-hour cell
weekend_mask = df['day'].isin([5, 6])
df['is_weekend'] = weekend_mask.astype(int)

In [None]:
# Flag weekday arrivals whose hour falls in the morning or evening rush.
# The listed hours cover 7:00-9:59 and 16:00-18:59.
rush_hours = [7, 8, 9, 16, 17, 18]
peak_hour_mask = ~weekend_mask & df['hour'].isin(rush_hours)
df['is_peak_hour'] = peak_hour_mask.astype(int)

### Convert boolean columns to integer

In [None]:
# Store the accessibility flag as a 64-bit integer.
# NOTE(review): the data description lists the values as 1 (accessible) and
# 2 (not accessible); this cast keeps the values as they are rather than
# mapping them to 0/1 - confirm that is intended.
accessibility = df['wheelchair_boarding']
df['wheelchair_boarding'] = accessibility.astype(np.int64)

### Use Label Encoding for route_id and stop_id

In [None]:
# Replace each route identifier with an integer label; the fitted encoder
# is kept so it can be exported with the other artifacts
le_route = LabelEncoder()
le_route.fit(df['route_id'])
df['route_id'] = le_route.transform(df['route_id'])

In [None]:
# Replace each stop identifier with an integer label; the fitted encoder
# is kept so it can be exported with the other artifacts
le_stop = LabelEncoder()
le_stop.fit(df['stop_id'])
df['stop_id'] = le_stop.transform(df['stop_id'])

### Convert weathercode Into Categories

In [None]:
# Build two parallel lists for np.select: one boolean mask per distinct
# weather code present in the data, and the label WEATHER_CODES gives that code.
# NOTE(review): a code absent from WEATHER_CODES would presumably fail on the
# lookup below - confirm the mapping covers every code in the data.
weathercodes = df['weathercode'].sort_values().unique()
condition_list = [df['weathercode'] == code for code in weathercodes]
label_list = [WEATHER_CODES[code] for code in weathercodes]

In [None]:
# Assign each row the label whose mask matches it; rows matching no mask
# fall back to 'Unknown'
df['weather'] = np.select(
    condlist=condition_list,
    choicelist=label_list,
    default='Unknown',
)

In [None]:
# One-hot encode the weather label into `weather_<label>` integer columns.
# drop_first removes the first category, which becomes the implicit baseline.
one_hot = pd.get_dummies(df['weather'], prefix='weather', drop_first=True, dtype='int64')

# Swap the raw code and label columns for the indicator columns
df = df.drop(columns=['weathercode', 'weather']).join(one_hot)

### Reduce station coordinates to one feature (PCA)

In [None]:
# Pull out the latitude / longitude pair of every row for the PCA step
coord_columns = ['stop_lat', 'stop_lon']
stop_coords = df[coord_columns]
stop_coords

In [None]:
# Standardize latitude and longitude before PCA so that neither axis
# dominates because of its scale; the fitted scaler is kept for export
scaler_coord = StandardScaler()
scaler_coord.fit(stop_coords)
coords_scaled = scaler_coord.transform(stop_coords)
coords_scaled

In [None]:
# Collapse the two scaled coordinates into a single principal component
# and store it as a new feature column
n_coord_components = 1
pca = PCA(n_components=n_coord_components)
pca_coords = pca.fit_transform(coords_scaled)
df['pca_coords'] = pca_coords

## Export Data

In [None]:
# Display the current column names before selecting the ones to export
df.columns

In [None]:
# Keep encoded columns and reorder them, with `delay` as the last column.
# Every name listed below must exist in df at this point, including the
# hard-coded weather indicator columns.
location_cols = ['route_id', 'stop_id', 'pca_coords', 'stop_sequence', 'wheelchair_boarding']
time_cols = ['day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'is_weekend', 'is_peak_hour']
history_cols = ['delay_previous_stop']
weather_cols = [
    'temperature', 'precipitation', 'windspeed',
    'weather_Dense drizzle', 'weather_Light drizzle', 'weather_Mainly clear',
    'weather_Moderate drizzle', 'weather_Moderate rain', 'weather_Overcast',
    'weather_Partly cloudy', 'weather_Slight rain',
]
df = df[location_cols + time_cols + history_cols + weather_cols + ['delay']]

In [None]:
# Persist the fitted route and stop label encoders together in one pickle
encoders = dict(le_route=le_route, le_stop=le_stop)
with open('../models/label_encoders.pickle', 'wb') as encoder_file:
    pickle.dump(encoders, encoder_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the fitted coordinate scaler
with open('../models/coord_scaler.pickle', 'wb') as scaler_file:
    pickle.dump(scaler_coord, scaler_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Export the fitted coordinates PCA.
# The fitted estimator (`pca`) is pickled, not `pca_coords`: the latter is only
# the transformed array for this dataset and cannot transform new coordinates.
with open('../models/coord_pca.pickle', 'wb') as handle:
    pickle.dump(pca, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Print the dataframe summary (columns, dtypes, non-null counts) before export
df.info()

In [None]:
# Write the first 10 rows to a small sample file, without the index column
sample_rows = df.iloc[:10]
sample_rows.to_csv('sample.csv', index=False)

In [None]:
# Write the full preprocessed dataframe to disk, without the index column
preprocessed_csv_path = '../data/preprocessed.csv'
df.to_csv(preprocessed_csv_path, index=False)

## End