In [None]:
import os
import pandas as pd
import numpy as np
import xlsxwriter
from datetime import datetime
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import plot


# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

Hard-code the inputs and read in the data.

In [None]:
# --- Analysis inputs (hard-coded; edit these per study) ---------------------
folder_path = "data/TOD/2100"

# Peak-period window, parsed to datetime.time for comparison against timestamps.
start_time_str = "7:00"
stop_time_str = "9:00"
start_time = datetime.strptime(start_time_str, "%H:%M").time()
stop_time = datetime.strptime(stop_time_str, "%H:%M").time()

# Rows before this timestamp are labelled 'before', the rest 'after'.
filter_date = "03/26/2024 00:00:00"
implementation_date = pd.to_datetime(filter_date)

output_file = "testing.xlsx"

# Earliest date to keep; "No" (or "n") disables the start-date filter.
start_date =  "No"#"06/20/2023"

primary_movement = 'Pioneer EB.csv'
category_order = [primary_movement]

# Exclusion lists used for earlier studies, kept for reference:
# 2100: dates_to_exclude = ['2024-03-20', '2024-03-21', '2024-03-22', '2024-03-25', '2024-03-26', '2024-03-27', '2024-03-28', '2024-03-29', '2024-04-01', '2024-04-02', '2024-04-03']
# Foothill: dates_to_exclude = ['2024-03-04', '2024-03-05', '2024-03-06', '2024-03-07', '2024-03-08', '2024-03-14', '2024-03-15', '2024-03-18', '2024-03-19', '2024-03-20', '2024-03-21', '2024-03-22', '2024-03-25', '2024-03-26']
dates_to_exclude = ['2024-11-04']
output_summary_table = 'output/Redwood_AM_summarytable.html'


# List the CSV files in the input folder. os.listdir returns entries in
# arbitrary order, so sort them to keep row order and plot colours reproducible.
files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame()
print(files)

In [None]:
# Read every CSV, tagging each row with the file it came from.
dfs = []
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    # Add a new column with the source filename
    df['source_file'] = file
    dfs.append(df)

# Fail with a clear message instead of a NameError further down.
if not dfs:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Concatenate and parse timestamps once, after the loop, rather than
# repeating the work for every file.
combined_data = pd.concat(dfs, ignore_index=True)
combined_data['local_datetime'] = pd.to_datetime(combined_data['local_datetime'])

# Drop everything before the start date, unless the filter is disabled ("no"/"n").
if start_date.lower() not in ["no", "n"]:
    clean_data = combined_data[combined_data['local_datetime'] >= pd.to_datetime(start_date)]
else:
    clean_data = combined_data

# Remove the excluded calendar days. The exclusion list is converted to
# datetimes explicitly: matching datetime values against raw strings in
# `isin` is deprecated in recent pandas releases.
excluded_days = pd.to_datetime(dates_to_exclude)
is_excluded = clean_data['local_datetime'].dt.normalize().isin(excluded_days)

# .copy() makes clean_data independent of combined_data, so columns added in
# later cells neither warn nor leak back into the raw frame.
clean_data = clean_data[~is_excluded].copy()
clean_data

In [None]:
# Label each record 'before' or 'after' relative to the implementation date.
# `assign` returns a new frame, so this cell can be re-run safely and does not
# write into a filtered slice (which raises SettingWithCopyWarning).
is_before = pd.to_datetime(clean_data['local_datetime']) < implementation_date
clean_data = clean_data.assign(before_after=np.where(is_before, 'before', 'after'))

clean_data

# Outlier Analysis

In [None]:
# Derive calendar fields from the timestamp. 'time' is the bin that the
# outlier analysis groups on; 'day' drives the daily averages.
# `assign` builds a new frame instead of writing into a filtered slice.
timestamps = clean_data['local_datetime'].dt
clean_data = clean_data.assign(
    hour=timestamps.hour,
    time=timestamps.time,
    day=timestamps.date,
    day_of_week=timestamps.day_of_week,
)
clean_data

In [None]:
# Mean travel time for each time-of-day bin.
# NOTE(review): the mean is pooled over every source file, not computed per route - confirm that is intended.
time_bin_avg = clean_data.groupby('time', as_index=False)['avg_travel_time'].mean()

# Attach each row's bin mean as 'time_bin_avg'. The column is renamed on the
# lookup table before the join, so no suffix handling is needed afterwards.
merged_data = clean_data.merge(
    time_bin_avg.rename(columns={'avg_travel_time': 'time_bin_avg'}),
    on='time',
)
merged_data



In [None]:
# Z-score of each travel time relative to its time-of-day bin:
#     z = (travel time - bin mean) / bin standard deviation
# The spread must come from the travel times themselves; dividing by the
# standard deviation of the bin *means* does not give a z-score.
# NOTE(review): like time_bin_avg, the spread is pooled over all source files.
bin_std = merged_data.groupby('time')['avg_travel_time'].transform('std')
# A bin with one observation (NaN std) or no variation (0 std) has no defined
# z-score; mask zeros so the division yields NaN rather than +/-inf.
bin_std = bin_std.mask(bin_std == 0)
merged_data['z_score'] = (merged_data['avg_travel_time'] - merged_data['time_bin_avg']) / bin_std

# Set a threshold for outliers (e.g., z-score greater than 3 or less than -3)
outlier_threshold = 3
# NaN z-scores compare False, so rows without a defined z-score are kept.
is_outlier = merged_data['z_score'].abs() > outlier_threshold
outliers = merged_data[is_outlier]

# Keep the peak-period rows, then drop the outliers. Using the complement of
# `is_outlier` means a row with |z| exactly at the threshold is kept instead
# of falling into neither set.
observation_time = merged_data['local_datetime'].dt.time
in_peak = (observation_time >= start_time) & (observation_time <= stop_time)
peak_hour_data = merged_data[in_peak]
filtered_data = merged_data[in_peak & ~is_outlier].copy()
filtered_data

# Summary Table

In [None]:
# Calculate Travel Times ---------------------------------
# Mean travel time per route (source file), before vs. after implementation,
# computed from the outlier-filtered peak-period data.
summary_table = (
    filtered_data
    .groupby(['source_file', 'before_after'])
    ['avg_travel_time']
    .mean()
    .reset_index()
    .pivot(index='source_file', columns='before_after', values='avg_travel_time')
    .reset_index()
)

# Guarantee both period columns exist (NaN when a period has no data), so the
# table has the same shape whichever periods are present.
for period in ('before', 'after'):
    if period not in summary_table.columns:
        summary_table[period] = np.nan

# .copy() so the new column below is not written into a slice of the pivot.
summary_table = summary_table[['source_file', 'before', 'after']].copy()

# Change in travel time; NaN when either period is missing.
# NOTE(review): the factor of 60 assumes avg_travel_time is in minutes - confirm.
summary_table['Difference (sec)'] = (summary_table['after'] - summary_table['before']) * 60

html_summary_table = summary_table.to_html()

# Write HTML content to a file, creating the output folder if needed.
output_dir = os.path.dirname(output_summary_table)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(output_summary_table, 'w', encoding='utf-8') as html_file:
    html_file.write(html_summary_table)

summary_table


In [None]:
# Store the before/after averages of the first two routes (summary_table rows
# 0 and 1). Reindexing fills NaN for a missing row or column, so a study with
# a single route, or data from only one period, does not raise a KeyError.
first_two_routes = summary_table.reindex(index=[0, 1], columns=['before', 'after'])

avg_before_1 = first_two_routes.loc[0, 'before']
avg_before_2 = first_two_routes.loc[1, 'before']

avg_after_1 = first_two_routes.loc[0, 'after']
avg_after_2 = first_two_routes.loc[1, 'after']

# Time Series Plots

## Overall Time Series (Unfiltered)

In [None]:
# Unfiltered travel-time series: one coloured line per source file.
overall_plot_options = dict(
    x='local_datetime',
    y='avg_travel_time',
    color='source_file',
    title='Average Travel Time Over Time',
)
fig_overall = px.line(clean_data, **overall_plot_options)

# Render the chart in the notebook.
fig_overall.show()

## Cleaned Time Series - showing before and after comparison

In [None]:
# Daily mean travel time per route, taken from the outlier-filtered
# peak-period data and ordered chronologically for plotting.
# (Grouping is by source file and day only; before/after is not a group key.)
daily_avg_data = (
    filtered_data
    .groupby(['source_file', 'day'], as_index=False)['avg_travel_time']
    .mean()
    .sort_values(by='day')
)
daily_avg_data

In [None]:
# Daily-average time series: one facet row per route, with dashed reference
# lines for the before/after averages and a marker at the implementation date.

# Take the route order and averages from the summary table, so each reference
# line is drawn on the facet of the route it belongs to, for any number of routes.
route_means = summary_table.set_index('source_file').reindex(columns=['before', 'after'])
routes = list(route_means.index)

fig_daily_average = px.line(
    daily_avg_data,
    x='day',
    y='avg_travel_time',
    color='source_file',
    facet_row='source_file',
    category_orders={'source_file': routes},  # pin facet order to the summary table
    title='Daily Average Travel Time Over Time',
)

# create data for horizontal line length extents
before_data = filtered_data[(filtered_data['before_after'] == 'before')]
after_data = filtered_data[(filtered_data['before_after'] == 'after')]

# A straight line only needs its two end points.
before_span = [before_data['local_datetime'].min(), before_data['local_datetime'].max()]
full_span = [filtered_data['local_datetime'].min(), filtered_data['local_datetime'].max()]
travel_time_range = [filtered_data['avg_travel_time'].min(), filtered_data['avg_travel_time'].max()]

for position, route in enumerate(routes):
    # Plotly Express numbers subplot rows from the bottom, so the first
    # (top) facet is row len(routes) and the last facet is row 1.
    grid_row = len(routes) - position
    show_legend = position == 0  # one legend entry per kind of line

    reference_lines = [
        # before line stops at the last 'before' observation
        ('Average Before Implementation', before_span, route_means.loc[route, 'before'], 'red'),
        # after line spans the whole period for comparison
        ('Average After Implementation', full_span, route_means.loc[route, 'after'], 'green'),
    ]
    for line_name, x_span, level, color in reference_lines:
        fig_daily_average.add_trace(
            go.Scatter(x=x_span, y=[level, level], mode='lines',
                       name=line_name, legendgroup=line_name, showlegend=show_legend,
                       line=dict(color=color, dash='dash')),
            row=grid_row, col=1)

    # vertical line showing the implementation date
    fig_daily_average.add_trace(
        go.Scatter(x=[implementation_date, implementation_date], y=travel_time_range, mode='lines',
                   name='Implementation Date', legendgroup='Implementation Date', showlegend=show_legend,
                   line=dict(color='black', dash='solid')),
        row=grid_row, col=1)

fig_daily_average.show()

In [None]:
# Time-of-day profile: mean travel time per time bin, before vs. after,
# with one facet row per route.
time_of_day_data = filtered_data.groupby(['source_file','before_after','time'])['avg_travel_time'].mean().reset_index()

# Take the route order and averages from the summary table, so each reference
# line is drawn on the facet of the route it belongs to, for any number of routes.
route_means = summary_table.set_index('source_file').reindex(columns=['before', 'after'])
routes = list(route_means.index)

# Pin the before/after colours so they do not depend on which label happens
# to appear first in the data.
period_colors = {
    'after': px.colors.qualitative.Plotly[0],
    'before': px.colors.qualitative.Plotly[1],
}

fig_time_of_day = px.line(
    time_of_day_data,
    x='time',
    y='avg_travel_time',
    color='before_after',
    facet_row='source_file',
    category_orders={'source_file': routes},
    color_discrete_map=period_colors,
    title='Average Travel Time Over Time-of-Day',
)

# One x value per time bin, in order, so x and y have matching lengths.
time_bins = sorted(time_of_day_data['time'].unique())

for position, route in enumerate(routes):
    # Plotly Express numbers subplot rows from the bottom, so the first
    # (top) facet is row len(routes) and the last facet is row 1.
    grid_row = len(routes) - position
    show_legend = position == 0  # one legend entry per kind of line

    reference_lines = [
        ('Average Before Implementation', route_means.loc[route, 'before'], 'red'),
        ('Average After Implementation', route_means.loc[route, 'after'], 'blue'),
    ]
    for line_name, level, color in reference_lines:
        fig_time_of_day.add_trace(
            go.Scatter(x=time_bins, y=[level] * len(time_bins), mode='lines',
                       name=line_name, legendgroup=line_name, showlegend=show_legend,
                       line=dict(color=color, dash='dash')),
            row=grid_row, col=1)


fig_time_of_day.show()

In [None]:
# Diagnostics: distribution of travel times, before vs. after, per route.
# Travel-time bins are on the x axis and frequency is on the y axis.
#
# Plotly Express assigns colours in the order the labels first appear in the
# data, so 'before' and 'after' could swap colours between figures. Pin both
# the colours and the legend order explicitly.
period_colors = {
    'after': px.colors.qualitative.Plotly[0],
    'before': px.colors.qualitative.Plotly[1],
}

fig_dist = px.histogram(
    filtered_data,
    x='avg_travel_time',
    color='before_after',
    barmode='overlay',
    facet_row='source_file',
    category_orders={'before_after': ['before', 'after']},
    color_discrete_map=period_colors,
    title='Distribution of Travel Times',
)
fig_dist.update_traces(opacity=0.9)

fig_dist.show()

In [None]:
# Export the travel-time distribution figure as a standalone HTML file.
# NOTE(review): the file name says "Foothill_PM" while the summary table is
# written to "Redwood_AM..." - confirm the intended study name.
output_plot_file_path = "output/Foothill_PM_distribution.html"
plot(figure_or_data=fig_dist, filename=output_plot_file_path)




In [None]:
# Open questions:
# - How do we know the difference is statistically significant?
# - How do we monitor the change over time (APIs, emailed report updates), or via the Plotly website?
# - I need API access to ClearGuide and the ATSPM aggregates.