# SE-2200E Model Generation

Get inspired: https://machinelearningmastery.com/start-here/#process

We have CSVs with actual stop times, scheduled stop times, and we want to combine them to run a regression to predict future times. 

In [1]:
import pandas as pd
import pytz
from sklearn.linear_model import LinearRegression

REALTIME_DATA_PATH = 'C:/Users/nings/OneDrive - The University of Western Ontario/Scholar\'s 2200E/result_data/result.csv'
SCHEDULE_DATA_PATH = 'C:/Users/nings/Documents/GitHub/se-2200e/raw_data/schedule/stop_times.txt'

## Data Preparation
Remove excess features, identify outliers, regularization, transformation

In [7]:
realtime_df = pd.read_csv(REALTIME_DATA_PATH, sep=',', error_bad_lines=False, index_col=False)
realtime_df = realtime_df.drop(columns=['stop_id', 'vehicle_label', 'timestamp'])
realtime_df = realtime_df.rename(columns={'departure_time': 'realtime_departure_time'})

schedule_df = pd.read_csv(SCHEDULE_DATA_PATH, sep=',', error_bad_lines=False, index_col=False)
schedule_df = schedule_df.drop(columns=['arrival_time', 'drop_off_type', 'pickup_type', 'stop_headsign', 'stop_id', 'timepoint'])
schedule_df = schedule_df.rename(columns={'departure_time': 'scheduled_departure_time'})

combined_df = realtime_df.merge(schedule_df, how='inner', on=['trip_id', 'stop_sequence'])
combined_df.head()



Unnamed: 0,trip_id,start_date,start_time,route_id,stop_sequence,realtime_departure_time,vehicle_id,scheduled_departure_time
0,1346803,20201104,11:18:00,24,41,1604508849,3140,11:53:34
1,1346803,20201105,11:18:00,24,41,1604595120,3177,11:53:34
2,1346803,20201106,11:18:00,24,41,1604681520,3143,11:53:34
3,1346803,20201109,11:18:00,24,41,1604940720,3144,11:53:34
4,1346803,20201110,11:18:00,24,41,1605027120,3178,11:53:34


In [16]:
# Categorical
combined_df['trip_id'] = pd.Categorical(combined_df['trip_id'])

# Categorical
combined_df['route_id'] = pd.Categorical(combined_df['route_id'])

# Numeric
combined_df['stop_sequence'] = pd.to_numeric(combined_df['stop_sequence'], errors='coerce').fillna(0).astype(int)

# Convert UTC to EST
try:
    combined_df['realtime_departure_time'] = pd.to_datetime(combined_df['realtime_departure_time'], unit='s', errors='coerce').dt.tz_localize('utc').dt.tz_convert('America/Toronto')
except TypeError:
    pass

# Categorical
combined_df['vehicle_id'] = pd.Categorical(combined_df['vehicle_id'])

# Time
combined_df['scheduled_departure_time'] = pd.to_datetime(combined_df['scheduled_departure_time'], format='%H:%M:%S', errors='coerce')

combined_df.info()
combined_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3128386 entries, 0 to 3128385
Data columns (total 8 columns):
 #   Column                    Dtype                          
---  ------                    -----                          
 0   trip_id                   category                       
 1   start_date                object                         
 2   start_time                object                         
 3   route_id                  category                       
 4   stop_sequence             int32                          
 5   realtime_departure_time   datetime64[ns, America/Toronto]
 6   vehicle_id                category                       
 7   scheduled_departure_time  datetime64[ns]                 
dtypes: category(3), datetime64[ns, America/Toronto](1), datetime64[ns](1), int32(1), object(2)
memory usage: 226.4+ MB


Unnamed: 0,trip_id,start_date,start_time,route_id,stop_sequence,realtime_departure_time,vehicle_id,scheduled_departure_time
0,1346803,20201104,11:18:00,24,41,2020-11-04 11:54:09-05:00,3140.0,1900-01-01 11:53:34
1,1346803,20201105,11:18:00,24,41,2020-11-05 11:52:00-05:00,3177.0,1900-01-01 11:53:34
2,1346803,20201106,11:18:00,24,41,2020-11-06 11:52:00-05:00,3143.0,1900-01-01 11:53:34
3,1346803,20201109,11:18:00,24,41,2020-11-09 11:52:00-05:00,3144.0,1900-01-01 11:53:34
4,1346803,20201110,11:18:00,24,41,2020-11-10 11:52:00-05:00,3178.0,1900-01-01 11:53:34


## Algorithm Spot Check
Test and train, is it necessary for linear regression?

In [18]:
combined_df['realtime_departure_time'] = combined_df['realtime_departure_time'].values.astype(int)
combined_df['scheduled_departure_time'] = combined_df['scheduled_departure_time'].values.astype(int)

In [19]:
# import seaborn as sns
# sns.relplot(data=combined_df, x='departure_time', y='actual_departure_time')

In [22]:
X = combined_df[['stop_sequence', 'realtime_departure_time']]
Y = combined_df['scheduled_departure_time']
reg = LinearRegression().fit(X, Y)
reg.score(X, Y)

7.00048850532653e-06

In [24]:
import numpy as np
reg.predict(np.array([41, 1604508849]).reshape(1, -1))

array([-3070230.64344943])

## Model Usage
Get a trained model paramaters and use it to predict live data from transit feeds using the AWS vm.