In [1]:
import os
import pandas as pd
import numpy as np
import xlsxwriter
from datetime import datetime


# Show every column when a DataFrame is rendered (None removes the column limit)
pd.options.display.max_columns = None

Hard-code the inputs, then read in the data.

In [11]:
# --- Hard-coded inputs for this run: edit here ---
folder_path = "data/samples"  # folder scanned for the input CSV files

# Time-of-day window, parsed from "H:MM" strings into datetime.time objects
start_time_str = "1:00"
stop_time_str = "19:00"
start_time = datetime.strptime(start_time_str, "%H:%M").time()
stop_time = datetime.strptime(stop_time_str, "%H:%M").time()

# Implementation date: rows before it are labelled 'before', the rest 'after'
filter_date = "02/01/2024"
output_file = "testing.xlsx"  # NOTE(review): presumably the Excel report path - not used in the cells shown here
# "No" / "N" (any case) skips the start-date filter; otherwise a date string such as "06/20/2023"
start_date = "No"

# List the CSV files in the folder. os.listdir returns entries in arbitrary
# order, so sort them to keep the row order of the combined data reproducible.
files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Placeholder; replaced by the real data once the files are read
combined_data = pd.DataFrame()
print(files)

['EB 2100.csv', 'EB Pioneer.csv', 'WB 2100.csv', 'WB Pioneer.csv']


In [12]:
# Combine all CSV files into a single DataFrame, tagging each row with its source file
if not files:
    # Fail with a clear message instead of an unrelated NameError further down
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

dfs = []
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    # Add a new column with the source filename
    df['source_file'] = file
    dfs.append(df)

# Concatenate and parse timestamps once, after every file has been read
# (doing this inside the loop repeats the work for each file).
combined_data = pd.concat(dfs, ignore_index=True)
combined_data['local_datetime'] = pd.to_datetime(combined_data['local_datetime'])

# Drop all rows before the start date, or keep everything if the answer was "no".
# .copy() makes clean_data an independent frame: columns added to it later do not
# leak into combined_data and do not trigger SettingWithCopyWarning.
if start_date.lower() not in ["no", "n"]:
    clean_data = combined_data[combined_data['local_datetime'] >= pd.to_datetime(start_date)].copy()
else:
    clean_data = combined_data.copy()

# TODO: filter out the excluded dates here (not implemented yet)

clean_data.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file
0,2023-12-26 07:00,2023-12-26 00:00:00,America/Denver,4.69,mins,EB 2100.csv
1,2023-12-26 07:05,2023-12-26 00:05:00,America/Denver,4.66,mins,EB 2100.csv
2,2023-12-26 07:10,2023-12-26 00:10:00,America/Denver,4.77,mins,EB 2100.csv
3,2023-12-26 07:15,2023-12-26 00:15:00,America/Denver,4.96,mins,EB 2100.csv
4,2023-12-26 07:20,2023-12-26 00:20:00,America/Denver,4.47,mins,EB 2100.csv


In [13]:
# Label each row 'before' or 'after' relative to the input implementation date.
# The mask is built from clean_data itself: building it from combined_data yields
# an array of a different length whenever the start-date filter removed rows,
# and the assignment then raises ValueError.
# NOTE(review): filter_date is parsed with pandas' default month-first rule
# ("02/01/2024" -> Feb 1, 2024) - confirm that is the intended format.
clean_data['before_after'] = np.where(
    clean_data['local_datetime'] < pd.to_datetime(filter_date), 'before', 'after'
)
clean_data

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after
0,2023-12-26 07:00,2023-12-26 00:00:00,America/Denver,4.69,mins,EB 2100.csv,before
1,2023-12-26 07:05,2023-12-26 00:05:00,America/Denver,4.66,mins,EB 2100.csv,before
2,2023-12-26 07:10,2023-12-26 00:10:00,America/Denver,4.77,mins,EB 2100.csv,before
3,2023-12-26 07:15,2023-12-26 00:15:00,America/Denver,4.96,mins,EB 2100.csv,before
4,2023-12-26 07:20,2023-12-26 00:20:00,America/Denver,4.47,mins,EB 2100.csv,before
...,...,...,...,...,...,...,...
70826,2024-03-25 17:20,2024-03-25 11:20:00,America/Denver,8.54,mins,WB Pioneer.csv,after
70827,2024-03-25 17:25,2024-03-25 11:25:00,America/Denver,8.61,mins,WB Pioneer.csv,after
70828,2024-03-25 17:30,2024-03-25 11:30:00,America/Denver,9.01,mins,WB Pioneer.csv,after
70829,2024-03-25 17:35,2024-03-25 11:35:00,America/Denver,8.40,mins,WB Pioneer.csv,after


# Outlier Analysis

In [15]:
# Derive hour-of-day and time-of-day columns from the local timestamp so that
# travel times within an hour can be compared against an hourly average
_local_dt = clean_data['local_datetime'].dt
clean_data['hour'] = _local_dt.hour
clean_data['time'] = _local_dt.time
clean_data

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after,hour,time
0,2023-12-26 07:00,2023-12-26 00:00:00,America/Denver,4.69,mins,EB 2100.csv,before,0,00:00:00
1,2023-12-26 07:05,2023-12-26 00:05:00,America/Denver,4.66,mins,EB 2100.csv,before,0,00:05:00
2,2023-12-26 07:10,2023-12-26 00:10:00,America/Denver,4.77,mins,EB 2100.csv,before,0,00:10:00
3,2023-12-26 07:15,2023-12-26 00:15:00,America/Denver,4.96,mins,EB 2100.csv,before,0,00:15:00
4,2023-12-26 07:20,2023-12-26 00:20:00,America/Denver,4.47,mins,EB 2100.csv,before,0,00:20:00
...,...,...,...,...,...,...,...,...,...
70826,2024-03-25 17:20,2024-03-25 11:20:00,America/Denver,8.54,mins,WB Pioneer.csv,after,11,11:20:00
70827,2024-03-25 17:25,2024-03-25 11:25:00,America/Denver,8.61,mins,WB Pioneer.csv,after,11,11:25:00
70828,2024-03-25 17:30,2024-03-25 11:30:00,America/Denver,9.01,mins,WB Pioneer.csv,after,11,11:30:00
70829,2024-03-25 17:35,2024-03-25 11:35:00,America/Denver,8.40,mins,WB Pioneer.csv,after,11,11:35:00


In [None]:
# Average travel time per time-of-day bin, attached to every row of clean_data.
# This works on clean_data with its real column names ('local_datetime',
# 'avg_travel_time'); `df` only holds the last CSV that was read and has no
# 'timestamp' or 'travel_time' column.

# Extract the time part from the timestamp
clean_data['time'] = clean_data['local_datetime'].dt.time

# Summary table: one row per time bin with its mean travel time
# NOTE(review): all source files are pooled in each bin; add 'source_file' to
# the groupby keys if the average should be per route - confirm intent.
time_bin_avg = (
    clean_data.groupby('time')['avg_travel_time']
    .mean()
    .rename('time_bin_avg_travel_time')
    .reset_index()
)

# Broadcast the bin average back onto each row. transform keeps the original
# row order and index, and the result gets its own column name so it does not
# clash with the existing 'avg_travel_time' column.
clean_data['time_bin_avg_travel_time'] = (
    clean_data.groupby('time')['avg_travel_time'].transform('mean')
)