In [1]:
import os
import pandas as pd
import numpy as np
import xlsxwriter
from datetime import datetime


# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

Hard-code the inputs, then read in the data.

In [17]:
# --- Hard-coded run settings -------------------------------------------------
folder_path = "samples/easy"  # folder that is scanned for input CSV files

# Peak-hour window: kept as the raw "H:MM" text and parsed into datetime.time
# objects, which the peak-hour filter compares against each row's time of day.
start_time_str = "1:00"
stop_time_str = "19:00"
start_time = datetime.strptime(start_time_str, "%H:%M").time()
stop_time = datetime.strptime(stop_time_str, "%H:%M").time()

# Implementation date: rows strictly earlier are labelled 'before', the rest 'after'.
filter_date = "06/24/2023"
# NOTE(review): presumably the Excel report path; not referenced in the visible cells.
output_file = "testing.xlsx"
# Earliest date to keep, or "no"/"n" (any case) to skip the start-date filter.
start_date = "No"  # e.g. "06/20/2023"

# List the CSV files in the folder. os.listdir returns entries in arbitrary
# order, so sort them to make the row order of the combined data reproducible.
files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Placeholder frame; the loading cell replaces it with the combined CSV contents
combined_data = pd.DataFrame()
print(files)

['NB_full_20231213.csv', 'SB_full_20231213.csv']


In [18]:
# Read every CSV file once, tagging each row with the file it came from
dfs = []
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    # Add a new column with the source filename
    df['source_file'] = file
    dfs.append(df)

# Fail with a clear message instead of a NameError on clean_data further down
if not dfs:
    raise ValueError(f"No CSV files found in {folder_path!r}")

# Combine and parse timestamps once, after the loop; doing this inside the
# loop repeats the concatenation and the datetime parsing for every file.
combined_data = pd.concat(dfs, ignore_index=True)
combined_data['local_datetime'] = pd.to_datetime(combined_data['local_datetime'])

# Filters all data before the start date or skips this step if the answer was NO
if start_date.lower() not in ["no", "n"]:
    # start_date is a date string; pandas coerces it for the datetime comparison
    clean_data = combined_data[combined_data['local_datetime'] >= start_date]
else:
    # No start date requested: clean_data is the same object as combined_data
    clean_data = combined_data

# NOTE(review): the later cells visible in this notebook read combined_data, not
# clean_data, so the start-date filter does not affect them. Confirm which
# frame the analysis is meant to use.
clean_data.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv


In [5]:
# Label every row relative to the implementation date (filter_date):
# strictly earlier timestamps become 'before', everything else 'after'.
local_times = pd.to_datetime(combined_data['local_datetime'])
is_before = local_times < filter_date
combined_data['before_after'] = np.where(is_before, 'before', 'after')
print(filter_date)

06/24/2023


At this point, maybe I can avoid the index part...

In [10]:
# Store the hour component of each local timestamp in a new 'hour' column
hour_of_day = combined_data['local_datetime'].dt.hour
combined_data['hour'] = hour_of_day
combined_data.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,hour
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv,0
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv,4
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv,16
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv,17
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv,17


In [None]:
# Mean travel time for every (before/after period, route file, hour) group,
# returned as a flat frame with the mean in a column named 'hourly_average'.
hourly_group_keys = ['before_after', 'source_file', 'hour']
hourly_avg_travel_time = (
    combined_data
    .groupby(hourly_group_keys)['avg_travel_time']
    .mean()
    .rename('hourly_average')
    .reset_index()
)
hourly_avg_travel_time.head(10)

In [None]:
# Attach each row's group mean ('hourly_average') by joining the hourly
# averages back onto the row-level data on the three grouping keys.
merge_keys = ['before_after', 'source_file', 'hour']
merged_data = pd.merge(
    combined_data,
    hourly_avg_travel_time,
    on=merge_keys,
    suffixes=('', '_hourly'),
)
merged_data.head(100)

In [None]:
# Calculate the z-score of each travel time within its own
# (before/after, route file, hour) group: (value - group mean) / group std.
# The spread must come from the travel times inside the group; the standard
# deviation of the 'hourly_average' column is only the spread of the group
# means and does not measure how unusual a single observation is.
z_group_keys = ['before_after', 'source_file', 'hour']
hourly_std = merged_data.groupby(z_group_keys)['avg_travel_time'].transform('std')
deviation = merged_data['avg_travel_time'] - merged_data['hourly_average']

# Groups with a single observation (std is NaN) or identical values (std is 0)
# have no measurable spread. Their deviation from the group mean is zero, so
# score them 0 rather than producing NaN/inf. Rows with a missing travel time
# keep a NaN z-score.
has_spread = hourly_std > 0
z_score = deviation / hourly_std.where(has_spread)
merged_data['z_score'] = z_score.mask(~has_spread & deviation.notna(), 0.0)
merged_data.head(20)

In [None]:
# Rows whose absolute z-score exceeds the cutoff count as outliers
# (e.g., z-score greater than 3 or less than -3)
outlier_threshold = 3
is_outlier = merged_data['z_score'].abs() > outlier_threshold
outliers = merged_data[is_outlier]
outliers.head()

In [None]:
# Keep only the rows whose local time of day lies inside the peak window
# [start_time, stop_time]; both ends are inclusive.
time_of_day = merged_data['local_datetime'].dt.time
in_peak_window = (time_of_day >= start_time) & (time_of_day <= stop_time)
peak_hour_data = merged_data[in_peak_window]
peak_hour_data.head(10)

In [None]:
# Drop outliers on both sides of the mean, using the same two-sided rule that
# defines `outliers` (|z-score| > outlier_threshold). A one-sided `z < threshold`
# test would keep extreme low values and would also drop z == threshold,
# which the outlier definition does not flag. Rows with a NaN z-score are
# dropped, because the comparison is False for NaN.
filtered_data = peak_hour_data[peak_hour_data['z_score'].abs() <= outlier_threshold]
filtered_data.head(10)

In [None]:
# Summary of mean travel time per route file with one column per period
# ('before' / 'after'), computed from the outlier-filtered peak-hour rows.
period_means = filtered_data.groupby(['source_file', 'before_after'])['avg_travel_time'].mean()
summary_table = period_means.unstack('before_after').reset_index()

# No t-test here: some data sets will not contain both a before and an after period.
print(summary_table)