In [32]:
import os
import pandas as pd
import numpy as np
import xlsxwriter
from datetime import datetime

# Show every column when a DataFrame is rendered (None removes the column cap)
pd.options.display.max_columns = None

Hard-code the inputs and list the CSV files that will be read in.

In [33]:
# --- Hard-coded inputs ---------------------------------------------------
# Folder that is scanned for input CSV files.
folder_path = "samples/easy"
# NOTE(review): start_time / stop_time are not used in the cells shown here;
# presumably an HH:MM time-of-day window applied later -- confirm.
start_time = "10:00"
stop_time = "19:00"
# Implementation date used to split rows into 'before' / 'after'.
# Written month/day/year, i.e. July 1, 2023.
filter_date = "07/01/2023"
# NOTE(review): not used in the cells shown here; presumably the Excel
# workbook the results are written to -- confirm.
output_file = "testing.xlsx"

# Read all CSV files from the specified folder.
# os.listdir returns entries in arbitrary order, so sort the names to keep the
# concatenation order (and therefore the combined row index) reproducible.
files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame()
print(files)

['NB_full_20231213.csv', 'SB_full_20231213.csv']


In [34]:
# Combine all CSV files into a single DataFrame
dfs = []
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    # Add a new column with the source filename
    df['source_file'] = file
    dfs.append(df)

# Concatenate and parse the timestamps once, after every file has been read.
# Doing this inside the loop re-concatenated and re-parsed all previously read
# rows on every iteration without changing the final result.
if dfs:
    combined_data = pd.concat(dfs, ignore_index=True)
    combined_data['local_datetime'] = pd.to_datetime(combined_data['local_datetime'])
else:
    # pd.concat raises on an empty list; fall back to an empty frame so a
    # re-run with no CSV files does not keep stale data from an earlier run.
    combined_data = pd.DataFrame()

combined_data.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv


In [35]:
# Create a new column 'before_after' based on the input implementation date.
# filter_date is written month/day/year; parse it once with an explicit format
# instead of relying on implicit string-to-timestamp coercion in the comparison,
# where the day/month order of a value such as "07/01/2023" is ambiguous.
implementation_date = pd.to_datetime(filter_date, format="%m/%d/%Y")

# 'local_datetime' was already converted to datetime when the files were
# combined, so it is compared directly. The comparison is strict: rows at or
# after midnight of the implementation date are 'after'. Rows with a missing
# timestamp (NaT) compare False and therefore also land in 'after'.
combined_data['before_after'] = np.where(
    combined_data['local_datetime'] < implementation_date, 'before', 'after'
)
combined_data.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv,before
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv,before
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv,before
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv,before
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv,before


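At this point, maybe I can avoid setting `local_datetime` as the index: the hour can be taken straight from the column with `.dt.hour`. Both approaches are shown below.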

In [36]:
# Outlier Analysis -----------------------------------
# Re-index the rows by their local timestamp (for time series calculations)
# and derive the hour of day from that datetime index.
data_index = (
    combined_data
    .set_index('local_datetime')
    .assign(hour=lambda indexed: indexed.index.hour)
)
data_index.head()

Unnamed: 0_level_0,utc_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after,hour
local_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-06-14 00:00:00,6/14/2023 6:00,America/Denver,45.0,mins,NB_full_20231213.csv,before,0
2023-06-15 04:15:00,6/15/2023 6:00,America/Denver,2.0,mins,NB_full_20231213.csv,before,4
2023-06-20 16:00:00,6/20/2023 6:00,America/Denver,14.83,mins,NB_full_20231213.csv,before,16
2023-06-21 17:00:00,6/21/2023 6:00,America/Denver,15.16,mins,NB_full_20231213.csv,before,17
2023-06-22 17:00:00,6/22/2023 6:00,America/Denver,100.0,mins,NB_full_20231213.csv,before,17


In [37]:
# Same 'hour' column as above, taken straight from the datetime column via the
# .dt accessor, so no datetime index is needed.
combined_data = combined_data.assign(
    hour=lambda frame: frame['local_datetime'].dt.hour
)
combined_data.head()

Unnamed: 0,utc_datetime,local_datetime,timezone,avg_travel_time,avg_travel_time_units,source_file,before_after,hour
0,6/14/2023 6:00,2023-06-14 00:00:00,America/Denver,45.0,mins,NB_full_20231213.csv,before,0
1,6/15/2023 6:00,2023-06-15 04:15:00,America/Denver,2.0,mins,NB_full_20231213.csv,before,4
2,6/20/2023 6:00,2023-06-20 16:00:00,America/Denver,14.83,mins,NB_full_20231213.csv,before,16
3,6/21/2023 6:00,2023-06-21 17:00:00,America/Denver,15.16,mins,NB_full_20231213.csv,before,17
4,6/22/2023 6:00,2023-06-22 17:00:00,America/Denver,100.0,mins,NB_full_20231213.csv,before,17
