In [2]:
import os
import pandas as pd
import numpy as np
import xlsxwriter
from datetime import datetime


# Show every column when a DataFrame is rendered (None removes the column limit)
pd.options.display.max_columns = None

Hard-code the inputs and list the CSV files to load.

In [3]:
# Hard-coded inputs for this run.
folder_path = "samples/easy"
start_time_str = "1:00"
stop_time_str = "19:00"
# Bounds of the peak-hour window, parsed into datetime.time objects.
start_time = datetime.strptime(start_time_str, "%H:%M").time()
stop_time = datetime.strptime(stop_time_str, "%H:%M").time()
# Implementation date: observations earlier than this are labelled 'before'.
filter_date = "06/24/2023"
output_file = "testing.xlsx"

# List the CSV files in the input folder. os.listdir returns names in arbitrary
# order, so sort them to keep the combined row order (and every .head() preview)
# reproducible across machines and runs.
files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame()
print(files)

['NB_full_20231213.csv', 'SB_full_20231213.csv']


In [4]:
# Read each CSV into its own DataFrame, tagging every row with the file it came from
dfs = []
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    # Add a new column with the source filename
    df['source_file'] = file
    dfs.append(df)

# Fail early with a clear message instead of a confusing KeyError further down
if not dfs:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Concatenate and parse timestamps once, after the loop; doing this inside the
# loop rebuilt the whole frame and re-parsed every row for each additional file.
combined_data = pd.concat(dfs, ignore_index=True)
combined_data['local_datetime'] = pd.to_datetime(combined_data['local_datetime'])

combined_data.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv


In [8]:
# Label each observation relative to the implementation date (filter_date):
# strictly earlier timestamps are 'before', everything else is 'after'.
observation_times = pd.to_datetime(combined_data['local_datetime'])
is_before = observation_times < filter_date
combined_data['before_after'] = np.where(is_before, 'before', 'after')
combined_data.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv,before
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv,before
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv,before
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv,before
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv,before


At this point, maybe I can avoid the index part...

In [10]:
# Hour of day (0-23) taken from the local timestamp; used as a grouping key below
local_hour = combined_data['local_datetime'].dt.hour
combined_data['hour'] = local_hour
combined_data.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after,hour
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv,before,0
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv,before,4
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv,before,16
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv,before,17
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv,before,17


In [11]:
# Calculate the hourly average travel time for each hour of the day on each route (source_file)
#hourly_avg_travel_time = combined_data.groupby(['source_file', 'hour'])['avg_travel_time'].mean().rename('hourly_average')
hourly_avg_travel_time = combined_data.groupby(['before_after', 'source_file', 'hour'])['avg_travel_time'].mean().reset_index().rename(columns={'avg_travel_time': 'hourly_average'})
hourly_avg_travel_time.head(10)

Unnamed: 0,before_after,source_file,hour,hourly_average
0,after,NB_full_20231213.csv,16,46.775
1,after,NB_full_20231213.csv,17,14.725
2,after,SB_full_20231213.csv,8,49.5
3,after,SB_full_20231213.csv,9,14.71
4,before,NB_full_20231213.csv,0,45.0
5,before,NB_full_20231213.csv,4,2.0
6,before,NB_full_20231213.csv,16,14.83
7,before,NB_full_20231213.csv,17,57.58
8,before,SB_full_20231213.csv,0,26.7
9,before,SB_full_20231213.csv,8,10.0


In [12]:
# Merge the hourly averages back to the original DataFrame
merged_data = combined_data.merge(hourly_avg_travel_time, on=['before_after', 'source_file', 'hour'], suffixes=('', '_hourly'))
merged_data.head(100)

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after,hour,hourly_average
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv,before,0,45.0
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv,before,4,2.0
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv,before,16,14.83
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv,before,17,57.58
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv,before,17,57.58
5,6/27/2023 6:00,2023-06-27 17:00:00,America/Denver,14.76,mins,NB_full_20231213.csv,after,17,14.725
6,7/5/2023 6:00,2023-07-05 17:00:00,America/Denver,14.69,mins,NB_full_20231213.csv,after,17,14.725
7,6/28/2023 6:00,2023-06-28 16:00:00,America/Denver,79.0,mins,NB_full_20231213.csv,after,16,46.775
8,6/29/2023 6:00,2023-06-29 16:00:00,America/Denver,14.55,mins,NB_full_20231213.csv,after,16,46.775
9,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,2.0,mins,SB_full_20231213.csv,before,0,26.7


In [13]:
# Calculate the z-score for each travel time based on the average for its respective hour
merged_data['z_score'] = (merged_data['avg_travel_time'] - merged_data['hourly_average']) / merged_data['hourly_average'].std()
merged_data.head(20)

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after,hour,hourly_average,z_score
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv,before,0,45.0,0.0
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv,before,4,2.0,0.0
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv,before,16,14.83,0.0
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv,before,17,57.58,-2.367
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv,before,17,57.58,2.367
5,6/27/2023 6:00,2023-06-27 17:00:00,America/Denver,14.76,mins,NB_full_20231213.csv,after,17,14.725,0.001953
6,7/5/2023 6:00,2023-07-05 17:00:00,America/Denver,14.69,mins,NB_full_20231213.csv,after,17,14.725,-0.001953
7,6/28/2023 6:00,2023-06-28 16:00:00,America/Denver,79.0,mins,NB_full_20231213.csv,after,16,46.775,1.798128
8,6/29/2023 6:00,2023-06-29 16:00:00,America/Denver,14.55,mins,NB_full_20231213.csv,after,16,46.775,-1.798128
9,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,2.0,mins,SB_full_20231213.csv,before,0,26.7,-1.378239


In [14]:
# Set a threshold for outliers (e.g., z-score greater than 3 or less than -3)
outlier_threshold = 3
outliers = merged_data[abs(merged_data['z_score']) > outlier_threshold]
outliers.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after,hour,hourly_average,z_score
12,6/21/2023 6:00,2023-06-21,America/Denver,100.0,mins,SB_full_20231213.csv,before,0,26.7,4.090078


In [15]:
# Filter data based on peak hour range and remove outliers
#peak_hour_mask = (merged_data['local_datetime'].dt.time >= start_time) & (merged_data['local_datetime'].dt.time <= stop_time)
peak_hour_data = merged_data[(merged_data.local_datetime.dt.time >= start_time)&(merged_data.local_datetime.dt.time <= stop_time)]
#filtered_data = merged_data[peak_hour_mask & ~merged_data['local_datetime'].isin(outliers.index)]
#filtered_data = peak_hour_data
peak_hour_data.head(10)

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after,hour,hourly_average,z_score
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv,before,16,36.126667,-1.750172
3,6/28/2023 6:00,2023-06-28 16:00:00,America/Denver,79.0,mins,NB_full_20231213.csv,before,16,36.126667,3.523354
4,6/29/2023 6:00,2023-06-29 16:00:00,America/Denver,14.55,mins,NB_full_20231213.csv,before,16,36.126667,-1.773182
5,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv,before,17,43.306667,-2.313108
6,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv,before,17,43.306667,4.659089
7,6/27/2023 6:00,2023-06-27 17:00:00,America/Denver,14.76,mins,NB_full_20231213.csv,before,17,43.306667,-2.34598
8,7/5/2023 6:00,2023-07-05 17:00:00,America/Denver,14.69,mins,NB_full_20231213.csv,after,17,14.69,0.0


In [22]:
# Filter out the outliers z score above the z score threshold
filtered_data = peak_hour_data[peak_hour_data.z_score < outlier_threshold]
filtered_data.head(10)

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after,hour,hourly_average,z_score
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv,before,16,36.126667,-1.750172
4,6/29/2023 6:00,2023-06-29 16:00:00,America/Denver,14.55,mins,NB_full_20231213.csv,before,16,36.126667,-1.773182
5,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv,before,17,43.306667,-2.313108
7,6/27/2023 6:00,2023-06-27 17:00:00,America/Denver,14.76,mins,NB_full_20231213.csv,before,17,43.306667,-2.34598
8,7/5/2023 6:00,2023-07-05 17:00:00,America/Denver,14.69,mins,NB_full_20231213.csv,after,17,14.69,0.0


In [20]:
# Create summary table to compare before and after travel times, excluding the outliers
summary_table = (
    filtered_data
    .groupby(['source_file','before_after'])
    ['avg_travel_time']
    .mean()
    .reset_index()
    .pivot(index='source_file', columns='before_after', values='avg_travel_time')
    .reset_index()
)

# I don't want to do the t test, because sometimes the data won't have before and after.
print(summary_table)

before_after           source_file  after  before
0             NB_full_20231213.csv  14.69  14.825
