In [1]:
import os
import pandas as pd
import numpy as np
import xlsxwriter
from datetime import datetime
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import plot

This notebook loads travel-time data exported from ClearGuide, flags outlying observations, and plots how the average travel time on each route varies by time of day.

In [27]:
folder_path = "data/TOD"

# Collect the CSV exports in the folder. os.listdir returns entries in
# arbitrary order, so sort them to make the row order reproducible.
files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))
if not files:
    # Fail here with a clear message instead of a KeyError further down.
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Read each CSV into its own DataFrame, tagging every row with its origin.
dfs = []
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    # Keep the source filename so rows can be traced back to their export
    df['source_file'] = file
    # routeID is the part of the filename before the first '_'
    df['routeID'] = file.split('_')[0]
    dfs.append(df)

# Concatenate and parse timestamps ONCE, after the loop. Doing this inside
# the loop re-copies and re-parses all previously loaded rows for every file.
combined_data = pd.concat(dfs, ignore_index=True)
combined_data['local_datetime'] = pd.to_datetime(combined_data['local_datetime'])
# Optional date filter (start_date is not defined in this cell):
# clean_data = combined_data[combined_data['local_datetime'] >= start_date]

# Derived time columns used by the outlier analysis and time-of-day plots
local_dt = combined_data['local_datetime'].dt
combined_data['hour'] = local_dt.hour
combined_data['time'] = local_dt.time
combined_data['day'] = local_dt.date
combined_data['day_of_week'] = local_dt.day_of_week


## Outlier Analysis

In [28]:
# Flag travel-time observations that are unusually far from the typical value
# for their time-of-day bin.
#
# NOTE(review): bins are keyed on 'time' only, so all routes share one mean/std
# per bin. If routes differ a lot in length, consider grouping by
# ['routeID', 'time'] instead -- confirm the intended behaviour.

# Average travel time for each time-of-day bin
time_bin_avg = combined_data.groupby('time')['avg_travel_time'].mean().reset_index()

# Sample standard deviation of travel time within each time-of-day bin
time_bin_std = (
    combined_data.groupby('time')['avg_travel_time']
    .std()
    .rename('time_bin_std')
    .reset_index()
)

# Attach the per-bin mean and std to every observation
merged_data = (
    combined_data
    .merge(time_bin_avg.rename(columns={'avg_travel_time': 'time_bin_avg'}), on='time')
    .merge(time_bin_std, on='time')
)

# z-score of each observation relative to its own time bin. The denominator
# must be the spread of travel times inside the bin; the spread of the bin
# means themselves says nothing about how unusual a single observation is.
# Bins with a single observation or zero variance yield NaN here.
merged_data['z_score'] = (
    (merged_data['avg_travel_time'] - merged_data['time_bin_avg'])
    / merged_data['time_bin_std']
)

# Set a threshold for outliers (e.g., z-score greater than 3 or less than -3)
outlier_threshold = 3
is_outlier = merged_data['z_score'].abs() > outlier_threshold
outliers = merged_data[is_outlier]

# Remove outliers. Using the complement of the outlier mask keeps rows that sit
# exactly on the threshold or have an undefined (NaN) z-score, so every row
# ends up in exactly one of `outliers` / `filtered_data`.
filtered_data = merged_data[~is_outlier]


## Time of Day Plots

In [30]:
# Time-of-day travel-time profile: one facet per route, one line per source file.
# Built from merged_data (all observations);
# swap in filtered_data to exclude outliers.
time_of_day_data = merged_data.groupby(
    ['routeID', 'source_file', 'time'], as_index=False
)['avg_travel_time'].mean()

fig_time_of_day = px.line(
    time_of_day_data,
    x='time',
    y='avg_travel_time',
    color='source_file',
    facet_col='routeID',
    title='Average Travel Time Over Time-of-Day',
)  # facet_row = route

fig_time_of_day.show()