In [None]:
# We have a CSV with the actual stop times.
# We have another CSV with the scheduled stop times.
# We want to combine them so that we can run a linear regression.
# We use pandas for the data handling.

import os

import pandas as pd
import pytz
from sklearn.linear_model import LinearRegression
# Directory locations used by the loading cells below.
# Each one can be overridden with an environment variable so the notebook can
# run on a machine other than the one it was written on; when the variable is
# not set, the original hard-coded location is used unchanged.
RESULT_DIRECTORY = os.environ.get(
    'SE2200E_RESULT_DIRECTORY',
    "C:/Users/nings/OneDrive - The University of Western Ontario/Scholar's 2200E/result_data",
)
LTC_DIRECTORY = os.environ.get(
    'SE2200E_LTC_DIRECTORY',
    'C:/Users/nings/Documents/GitHub/se-2200e/raw_data',
)


In [None]:
# Read the scheduled stop times and discard the columns that are not
# referenced by the cells visible further down in this notebook.
schedule_csv_path = f'{LTC_DIRECTORY}/schedule/stop_times.txt'
unused_schedule_columns = [
    'stop_headsign',
    'arrival_time',
    'pickup_type',
    'drop_off_type',
    'timepoint',
]
schedule_df = pd.read_csv(schedule_csv_path).drop(columns=unused_schedule_columns)
schedule_df.head()


In [None]:
# Normalise the schedule columns to the dtypes used by the merge further down.
# Values that cannot be parsed as numbers become NaN and are then stored as 0.
schedule_df['trip_id'] = pd.to_numeric(schedule_df['trip_id'], errors='coerce').fillna(0).astype(int)

# Schedule times are parsed as a duration since midnight and anchored on
# 1900-01-01, the date pandas assigns when parsing with a time-only format, so
# every time before 24:00:00 gets the same value as with
# pd.to_datetime(..., format='%H:%M:%S').  The difference is for times at or
# past 24:00:00 (which GTFS uses for trips running past midnight): they roll
# over to the next day instead of being silently coerced to NaT.
# The dtype guard keeps this cell safe to re-run after the column is converted.
if not pd.api.types.is_datetime64_any_dtype(schedule_df['departure_time']):
    schedule_df['departure_time'] = (
        pd.Timestamp('1900-01-01')
        + pd.to_timedelta(schedule_df['departure_time'], errors='coerce')
    )

schedule_df['stop_sequence'] = pd.to_numeric(schedule_df['stop_sequence'], errors='coerce').fillna(0).astype(int)
schedule_df['stop_id'] = schedule_df['stop_id'].astype('category')
# NOTE(review): `eastern` is not used by any cell visible here; kept in case a
# later cell relies on it.
eastern = pytz.timezone('America/Toronto')

schedule_df.info()
schedule_df.head()

In [None]:
# Load the recorded (actual) stop times.
csv_path = RESULT_DIRECTORY + '/result.csv'
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0,
# where it raises a TypeError.  `on_bad_lines='warn'` is its replacement:
# malformed rows are skipped and a warning is emitted for each one, so dropped
# rows stay visible.
actual_df = pd.read_csv(csv_path, sep=',', on_bad_lines='warn', index_col=False)
# Rename so the column does not collide with the schedule's `departure_time`
# when the two frames are merged.
actual_df = actual_df.rename(columns={'departure_time': 'actual_departure_time'})
actual_df.info()
actual_df.head()


In [None]:
# Coerce the identifier/sequence columns to plain integers.
# Anything that cannot be parsed as a number becomes NaN and is stored as 0.
for integer_column in ('trip_id', 'route_id', 'stop_sequence', 'vehicle_id', 'vehicle_label'):
    parsed_values = pd.to_numeric(actual_df[integer_column], errors='coerce')
    actual_df[integer_column] = parsed_values.fillna(0).astype(int)
actual_df.info()
actual_df.head()

In [None]:
# Treat the two epoch-second columns as UTC instants and express them in
# Toronto local time; values that cannot be parsed become NaT.
for epoch_column in ('actual_departure_time', 'timestamp'):
    utc_times = pd.to_datetime(actual_df[epoch_column], unit='s', errors='coerce').dt.tz_localize('utc')
    actual_df[epoch_column] = utc_times.dt.tz_convert('America/Toronto')

# Same dtype as the schedule frame's `stop_id`.
actual_df['stop_id'] = actual_df['stop_id'].astype('category')
# actual_df['start_date'] = pd.to_datetime(actual_df['start_date'], infer_datetime_format=True, errors='coerce')
# actual_df['start_time'] = pd.to_datetime(actual_df['start_time'], infer_datetime_format=True, errors='coerce')

actual_df.info()
actual_df.head(50)

In [None]:
# Pair each recorded stop event with its scheduled counterpart.
# The inner join keeps only rows whose key triple appears in both frames.
merge_keys = ['trip_id', 'stop_id', 'stop_sequence']
combined_df = pd.merge(actual_df, schedule_df, how='inner', on=merge_keys)
combined_df.info()
combined_df.head(500)

In [None]:
# Turn both time columns into integer tick counts so they can be fed to the
# regression below.
time_columns = ['actual_departure_time', 'departure_time']

# Rows with an unparsed time (NaT) are removed first: converting NaT to an
# integer yields the minimum int64 value, which would otherwise enter the
# regression as if it were a real timestamp.  `.copy()` makes the filtered
# frame independent so the column assignments below are unambiguous.
combined_df = combined_df.dropna(subset=time_columns).copy()

for time_column in time_columns:
    # Use an explicit 64-bit type: plain `int` maps to a 32-bit integer on
    # Windows with NumPy < 2.0, which overflows for nanosecond timestamps.
    combined_df[time_column] = combined_df[time_column].values.astype('int64')

In [None]:
# import seaborn as sns
# sns.relplot(data=combined_df, x='departure_time', y='actual_departure_time')

In [None]:
# Fit an ordinary least-squares model with the scheduled departure time as the
# target, then report the model's score on the same rows it was fitted on.
feature_columns = ['trip_id', 'route_id', 'stop_sequence', 'actual_departure_time', 'vehicle_id']
X = combined_df[feature_columns]
Y = combined_df['departure_time']
reg = LinearRegression()
reg.fit(X, Y)
reg.score(X, Y)

In [None]:
# reg.predict([])