In [36]:
# We have a csv with the actual stop times.
# We have another CSV with the scheduled stop times.
# We want to combine them so we can run a linear regression.
# We use pandas

import pandas as pd 
import pytz
RESULT_DIRECTORY = 'C:/Users/nings/OneDrive - The University of Western Ontario/Scholar\'s 2200E/result_data'
LTC_DIRECTORY = 'C:/Users/nings/Documents/GitHub/se-2200e/raw_data'


In [37]:
schedule_csv_path = LTC_DIRECTORY + '/schedule/stop_times.txt'
schedule_df = pd.read_csv(schedule_csv_path)
schedule_df = schedule_df.drop(columns=['stop_headsign', 'arrival_time', 'pickup_type', 'drop_off_type', 'timepoint'])
schedule_df.head()


Unnamed: 0,trip_id,departure_time,stop_id,stop_sequence
0,1342560,6:13:00,KIPPADEL,1
1,1342560,6:13:51,KIPPBELF,2
2,1342560,6:14:41,KIPPBARK,3
3,1342560,6:15:13,KIPPARBO,4
4,1342560,6:16:04,KIPPBRIA,5


In [39]:
schedule_df['trip_id'] = pd.to_numeric(schedule_df['trip_id'], errors='coerce').fillna(0).astype(int)
schedule_df['departure_time'] = pd.to_datetime(schedule_df['departure_time'], format='%H:%M:%S', errors='coerce').dt.time
schedule_df['stop_sequence'] = pd.to_numeric(schedule_df['stop_sequence'], errors='coerce').fillna(0).astype(int)
schedule_df['stop_id'] = schedule_df['stop_id'].astype('category')
eastern = pytz.timezone('America/Toronto')

schedule_df.info()
schedule_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314334 entries, 0 to 314333
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   trip_id         314334 non-null  int32   
 1   departure_time  309366 non-null  object  
 2   stop_id         314334 non-null  category
 3   stop_sequence   314334 non-null  int32   
dtypes: category(1), int32(2), object(1)
memory usage: 5.5+ MB


Unnamed: 0,trip_id,departure_time,stop_id,stop_sequence
0,1342560,06:13:00,KIPPADEL,1
1,1342560,06:13:51,KIPPBELF,2
2,1342560,06:14:41,KIPPBARK,3
3,1342560,06:15:13,KIPPARBO,4
4,1342560,06:16:04,KIPPBRIA,5


In [40]:
csv_path = RESULT_DIRECTORY + '/result.csv'
actual_df = pd.read_csv(csv_path, sep=',', error_bad_lines=False, index_col=False, infer_datetime_format=True)
actual_df.info()
actual_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4790499 entries, 0 to 4790498
Data columns (total 10 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   trip_id         object
 1   start_date      object
 2   start_time      object
 3   route_id        object
 4   stop_sequence   object
 5   departure_time  object
 6   stop_id         object
 7   vehicle_id      object
 8   vehicle_label   object
 9   timestamp       object
dtypes: object(10)
memory usage: 365.5+ MB


Unnamed: 0,trip_id,start_date,start_time,route_id,stop_sequence,departure_time,stop_id,vehicle_id,vehicle_label,timestamp
0,1346803,20201104,11:18:00,24,41,1604508849,WESTMAL1,3140,140,1604509170
1,1346803,20201104,11:18:00,24,42,1604508850,VISCFARN,3140,140,1604509170
2,1346803,20201104,11:18:00,24,43,1604508889,VISCVIGR,3140,140,1604509170
3,1346803,20201104,11:18:00,24,44,1604508928,VISCCRAN,3140,140,1604509170
4,1346803,20201104,11:18:00,24,45,1604508982,CRANBAR2,3140,140,1604509170


In [41]:
actual_df['trip_id'] = pd.to_numeric(actual_df['trip_id'], errors='coerce').fillna(0).astype(int)
actual_df['route_id'] = pd.to_numeric(actual_df['route_id'], errors='coerce').fillna(0).astype(int)
actual_df['stop_sequence'] = pd.to_numeric(actual_df['stop_sequence'], errors='coerce').fillna(0).astype(int)
actual_df['vehicle_id'] = pd.to_numeric(actual_df['vehicle_id'], errors='coerce').fillna(0).astype(int)
actual_df['vehicle_label'] = pd.to_numeric(actual_df['vehicle_label'], errors='coerce').fillna(0).astype(int)
actual_df.info()
actual_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4790499 entries, 0 to 4790498
Data columns (total 10 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   trip_id         int32 
 1   start_date      object
 2   start_time      object
 3   route_id        int32 
 4   stop_sequence   int32 
 5   departure_time  object
 6   stop_id         object
 7   vehicle_id      int32 
 8   vehicle_label   int32 
 9   timestamp       object
dtypes: int32(5), object(5)
memory usage: 274.1+ MB


Unnamed: 0,trip_id,start_date,start_time,route_id,stop_sequence,departure_time,stop_id,vehicle_id,vehicle_label,timestamp
0,1346803,20201104,11:18:00,24,41,1604508849,WESTMAL1,3140,140,1604509170
1,1346803,20201104,11:18:00,24,42,1604508850,VISCFARN,3140,140,1604509170
2,1346803,20201104,11:18:00,24,43,1604508889,VISCVIGR,3140,140,1604509170
3,1346803,20201104,11:18:00,24,44,1604508928,VISCCRAN,3140,140,1604509170
4,1346803,20201104,11:18:00,24,45,1604508982,CRANBAR2,3140,140,1604509170


In [47]:
actual_df['departure_time'] = pd.to_datetime(actual_df['departure_time'], unit='s', errors='coerce').dt.tz_localize('America/Toronto')
actual_df['timestamp'] = pd.to_datetime(actual_df['timestamp'], unit='s', errors='coerce').dt.tz_localize('America/Toronto')
actual_df['stop_id'] = actual_df['stop_id'].astype('category')
# actual_df['start_date'] = pd.to_datetime(actual_df['start_date'], infer_datetime_format=True, errors='coerce')
# actual_df['start_time'] = pd.to_datetime(actual_df['start_time'], infer_datetime_format=True, errors='coerce')

actual_df.info()
actual_df.head(50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4790499 entries, 0 to 4790498
Data columns (total 10 columns):
 #   Column          Dtype                          
---  ------          -----                          
 0   trip_id         int32                          
 1   start_date      object                         
 2   start_time      object                         
 3   route_id        int32                          
 4   stop_sequence   int32                          
 5   departure_time  datetime64[ns, America/Toronto]
 6   stop_id         category                       
 7   vehicle_id      int32                          
 8   vehicle_label   int32                          
 9   timestamp       datetime64[ns, America/Toronto]
dtypes: category(1), datetime64[ns, America/Toronto](2), int32(5), object(2)
memory usage: 246.8+ MB


Unnamed: 0,trip_id,start_date,start_time,route_id,stop_sequence,departure_time,stop_id,vehicle_id,vehicle_label,timestamp
0,1346803,20201104,11:18:00,24,41,2020-11-04 16:54:09-05:00,WESTMAL1,3140,140,2020-11-04 16:59:30-05:00
1,1346803,20201104,11:18:00,24,42,2020-11-04 16:54:10-05:00,VISCFARN,3140,140,2020-11-04 16:59:30-05:00
2,1346803,20201104,11:18:00,24,43,2020-11-04 16:54:49-05:00,VISCVIGR,3140,140,2020-11-04 16:59:30-05:00
3,1346803,20201104,11:18:00,24,44,2020-11-04 16:55:28-05:00,VISCCRAN,3140,140,2020-11-04 16:59:30-05:00
4,1346803,20201104,11:18:00,24,45,2020-11-04 16:56:22-05:00,CRANBAR2,3140,140,2020-11-04 16:59:30-05:00
5,1346803,20201104,11:18:00,24,46,2020-11-04 16:57:02-05:00,CRANBIRC,3140,140,2020-11-04 16:59:30-05:00
6,1346803,20201104,11:18:00,24,47,2020-11-04 16:57:30-05:00,CRANSCHR,3140,140,2020-11-04 16:59:30-05:00
7,1346803,20201104,11:18:00,24,48,2020-11-04 16:58:07-05:00,TILLSOUT,3140,140,2020-11-04 16:59:30-05:00
8,1346803,20201104,11:18:00,24,49,2020-11-04 16:58:35-05:00,TILLRALE,3140,140,2020-11-04 16:59:30-05:00
9,1346803,20201104,11:18:00,24,50,2020-11-04 16:59:07-05:00,TILBPOME,3140,140,2020-11-04 16:59:30-05:00


In [48]:
# need to fix timezone issues
combined_df = actual_df.merge(schedule_df, how='inner', on=['trip_id', 'stop_id', 'stop_sequence'])
combined_df.info()
combined_df.head(50)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1864497 entries, 0 to 1864496
Data columns (total 11 columns):
 #   Column            Dtype                          
---  ------            -----                          
 0   trip_id           int32                          
 1   start_date        object                         
 2   start_time        object                         
 3   route_id          int32                          
 4   stop_sequence     int32                          
 5   departure_time_x  datetime64[ns, America/Toronto]
 6   stop_id           object                         
 7   vehicle_id        int32                          
 8   vehicle_label     int32                          
 9   timestamp         datetime64[ns, America/Toronto]
 10  departure_time_y  object                         
dtypes: datetime64[ns, America/Toronto](2), int32(5), object(4)
memory usage: 135.1+ MB


Unnamed: 0,trip_id,start_date,start_time,route_id,stop_sequence,departure_time_x,stop_id,vehicle_id,vehicle_label,timestamp,departure_time_y
0,1346628,20201104,11:39:00,19,9,2020-11-04 16:47:57-05:00,SUNNYMCA,3141,141,2020-11-04 16:59:31-05:00,11:40:27
1,1346628,20201105,11:39:00,19,9,2020-11-05 16:43:38-05:00,SUNNYMCA,3317,317,2020-11-05 16:52:43-05:00,11:40:27
2,1346628,20201106,11:39:00,19,9,2020-11-06 16:43:52-05:00,SUNNYMCA,3108,108,2020-11-06 16:52:34-05:00,11:40:27
3,1346628,20201109,11:39:00,19,9,2020-11-09 16:42:28-05:00,SUNNYMCA,3153,153,2020-11-09 16:50:47-05:00,11:40:27
4,1346628,20201110,11:39:00,19,9,2020-11-10 16:42:13-05:00,SUNNYMCA,3303,303,2020-11-10 16:50:33-05:00,11:40:27
5,1346628,20201111,11:39:00,19,9,2020-11-11 16:48:50-05:00,SUNNYMCA,3317,317,2020-11-11 16:56:37-05:00,11:40:27
6,1346628,20201112,11:39:00,19,9,2020-11-12 16:42:40-05:00,SUNNYMCA,3148,148,2020-11-12 16:50:40-05:00,11:40:27
7,1346628,20201113,11:39:00,19,9,2020-11-13 16:42:24-05:00,SUNNYMCA,3340,340,2020-11-13 16:50:24-05:00,11:40:27
8,1346628,20201116,11:39:00,19,9,2020-11-16 16:42:18-05:00,SUNNYMCA,3144,144,2020-11-16 16:50:22-05:00,11:40:27
9,1346628,20201117,11:39:00,19,9,2020-11-17 16:44:36-05:00,SUNNYMCA,3172,172,2020-11-17 16:52:06-05:00,11:40:27
