In [1]:
import json
from pathlib import Path
import plotly.graph_objects as go
import numpy as np

In [2]:
def calculate_average_time(directory_path):
    """Average the timestamp of the final event across the JSON files in a directory.

    Every ``*.json`` file directly inside ``directory_path`` is parsed. A file
    contributes the ``'time'`` value of its last entry when the parsed content
    is a non-empty list whose last entry is a dict holding a ``'time'`` value.
    All other files are skipped.

    Args:
        directory_path: ``pathlib.Path`` of the directory to scan
            (non-recursive).

    Returns:
        tuple: ``(average_time, file_count)``. ``average_time`` is the mean of
        the collected values (``0`` when no file qualified) and ``file_count``
        is the number of files that contributed.
    """
    total_time = 0
    file_count = 0

    # Sorted so the summation order (and therefore float rounding) is reproducible.
    for file_path in sorted(directory_path.glob('*.json')):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if not (isinstance(data, list) and data):
            continue

        last_event = data[-1]
        # A non-dict last entry has no .get(); skip it instead of crashing.
        if not isinstance(last_event, dict):
            continue

        last_time = last_event.get('time')
        # Compare against None rather than truthiness so that a genuine
        # time of 0 is still counted.
        if last_time is None:
            continue

        total_time += last_time
        file_count += 1

    average_time = total_time / file_count if file_count else 0
    return average_time, file_count

# Event-log folders to summarise, each paired with the label used in the report
directories = [
    ('./amazon_vehicle_events', 'Amazon'),
    ('./naive_vehicle_events', 'Naive'),
    ('./optimized_vehicle_events', 'Optimized'),
]

# Print the mean final-event time (rounded to two decimals) for every folder
for directory, name in directories:
    folder = Path(directory)
    average_time, file_count = calculate_average_time(folder)
    print(f'Average time of the last event in each file ({name}): {round(average_time,2 )} seconds')

Average time of the last event in each file (Amazon): 24083.04 seconds
Average time of the last event in each file (Naive): 23571.25 seconds
Average time of the last event in each file (Optimized): 21259.05 seconds


In [3]:
def collect_times(directory_path):
    """Collect the final-event time of every JSON file in a directory, in hours.

    Every ``*.json`` file directly inside ``directory_path`` is parsed. When the
    parsed content is a non-empty list whose last entry is a dict holding a
    ``'time'`` value, that value divided by 3600 is appended to the result.
    All other files are skipped.

    Args:
        directory_path: ``pathlib.Path`` of the directory to scan
            (non-recursive).

    Returns:
        list: one value per qualifying file, ordered by file path. Empty when
        no file qualified.
    """
    times = []

    # Sorted so the returned order is reproducible between runs and platforms.
    for file_path in sorted(directory_path.glob('*.json')):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if not (isinstance(data, list) and data):
            continue

        last_event = data[-1]
        # A non-dict last entry has no .get(); skip it instead of crashing.
        if not isinstance(last_event, dict):
            continue

        last_time = last_event.get('time')
        # Compare against None rather than truthiness so that a genuine
        # time of 0 is still collected.
        if last_time is None:
            continue

        times.append(last_time / 3600)

    return times

def remove_outliers_iqr(data):
    """Drop values lying outside 1.5 * IQR of the first and third quartiles.

    Args:
        data: NumPy array or any sequence of numbers (a plain list is
            accepted and converted).

    Returns:
        numpy.ndarray: the values ``v`` with
        ``q1 - 1.5 * iqr <= v <= q3 + 1.5 * iqr``, in their original order.
        An empty input yields an empty array.
    """
    # Boolean-mask indexing below needs an ndarray; a plain list would raise.
    values = np.asarray(data)

    # Percentiles are undefined for an empty sample, so return it unchanged.
    if values.size == 0:
        return values

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return values[(values >= lower_bound) & (values <= upper_bound)]

# Data collection for each directory
directories = [
    ('./amazon_vehicle_events', 'Amazon'),
    ('./naive_vehicle_events', 'Naive'),
    ('./optimized_vehicle_events', 'Optimized')
]

fig = go.Figure()

# Define colors for each plot as (fill color, outline color)
colors = [
    ('rgba(255,0,0,0.6)', 'red'),  # Amazon in red
    ('rgba(0,255,0,0.6)', 'green'),  # Naive in green
    ('rgba(0,0,255,0.6)', 'blue')  # Optimized in blue
]

# Add one horizontal violin trace (with an inner box plot) per set of times
for (directory, name), (fillcolor, line_color) in zip(directories, colors):
    times = np.array(collect_times(Path(directory)))

    # A missing or empty directory yields no samples. The percentile-based
    # outlier filter cannot run on an empty array, so skip the trace instead
    # of failing the whole cell.
    if times.size == 0:
        print(f'No event times found for {name}; skipping its trace')
        continue

    # Remove outliers using the IQR method
    times = remove_outliers_iqr(times)

    fig.add_trace(go.Violin(x=times, name=name, points=False, line_color=line_color, fillcolor=fillcolor, box_visible=True))

fig.update_layout(
    title='Density of Times Across Different Algorithms',
    xaxis_title='Time (Hours)',
    yaxis_title='Density',
    # `barmode` only applies to bar traces; `violinmode` is the violin equivalent.
    violinmode='overlay'
)

fig.update_traces(opacity=0.6)  # Make plots slightly translucent

fig.show()

In [7]:

def calculate_average_times(directories):
    """Compute the mean of the collected times for each labelled directory.

    Args:
        directories: iterable of ``(directory, name)`` pairs, where
            ``directory`` is a path string and ``name`` its display label.

    Returns:
        list: ``(name, average)`` tuples in input order. ``average`` is
        ``np.mean`` of the values returned by ``collect_times`` for that
        directory, or ``0`` when nothing was collected.
    """
    def mean_or_zero(samples):
        # An empty sample list has no mean; use 0 as the placeholder value.
        return np.mean(samples) if samples else 0

    return [
        (label, mean_or_zero(collect_times(Path(folder))))
        for folder, label in directories
    ]

# Data collection for each directory
directories = [
    ('./amazon_vehicle_events', 'Amazon'),
    ('./naive_vehicle_events', 'Naive'),
    ('./optimized_vehicle_events', 'Optimized')
]

# Colors for each directory as (fill color, outline color); only the fill is used here
colors = [
    ('rgba(255,0,0,0.6)', 'red'),  # Amazon in red
    ('rgba(0,255,0,0.6)', 'green'),  # Naive in green
    ('rgba(0,0,255,0.6)', 'blue')  # Optimized in blue
]

# Calculate average times
average_times = calculate_average_times(directories)

# Create bar chart
fig = go.Figure()

# Upper end of the y-axis: 10% above the tallest bar for better visibility
max_value = max(avg_time for _, avg_time in average_times) * 1.1
min_value = min(avg_time for _, avg_time in average_times)

# The y-axis is deliberately truncated at 5.5 hours (19,800 seconds) so the
# gaps between the bars are easier to see.
Y_AXIS_FLOOR_HOURS = 5.5

# Only truncate when every bar rises above the floor. Otherwise the range
# would be inverted or a bar would vanish (e.g. an empty directory averaging
# 0), so leave the range unset and let Plotly pick it.
yaxis_settings = dict(type='linear')
if min_value > Y_AXIS_FLOOR_HOURS:
    yaxis_settings['range'] = [Y_AXIS_FLOOR_HOURS, max_value]

# Add bars for each directory with specific colors
for (name, avg_time), (fill_color, _) in zip(average_times, colors):
    fig.add_trace(go.Bar(x=[name], y=[avg_time], name=name, marker_color=fill_color, width=0.4))

fig.update_layout(
    title='Average Times Across Different Algorithms (Lower is Better)',
    xaxis_title='Directory',
    yaxis_title='Average Time (Hours)',
    yaxis=yaxis_settings,
    barmode='group'
)

fig.show()

In [4]:
def _last_event_time(file_path):
    """Return the 'time' value of the last entry in a JSON event file, or None if it has none."""
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if not (isinstance(data, list) and data):
        return None

    last_event = data[-1]
    # A non-dict last entry has no .get(); report it as "no time available".
    if not isinstance(last_event, dict):
        return None

    return last_event.get('time')


def calculate_time_difference(amazon_directory, optimized_directory):
    """Find the file whose final-event times differ most between two directories.

    For every ``*.json`` file in ``amazon_directory`` that has a same-named
    file in ``optimized_directory``, the ``'time'`` values of the two files'
    last entries are compared. Pairs where either file has no usable final
    time are skipped, because substituting 0 for a missing value would
    produce a spurious large difference.

    Args:
        amazon_directory: ``pathlib.Path`` of the first directory.
        optimized_directory: ``pathlib.Path`` of the second directory.

    Returns:
        tuple: ``(file_name, max_difference)`` with the name of the file that
        has the largest absolute difference and that difference. Returns
        ``("", 0)`` when no comparable pair has a non-zero difference.
    """
    max_difference = 0
    file_with_max_difference = ""

    # Sorted so that ties are always resolved in favour of the same file.
    for amazon_file_path in sorted(amazon_directory.glob('*.json')):
        optimized_file_path = optimized_directory / amazon_file_path.name
        if not optimized_file_path.exists():
            continue

        amazon_last_time = _last_event_time(amazon_file_path)
        optimized_last_time = _last_event_time(optimized_file_path)
        if amazon_last_time is None or optimized_last_time is None:
            continue

        time_difference = abs(amazon_last_time - optimized_last_time)
        if time_difference > max_difference:
            max_difference = time_difference
            file_with_max_difference = amazon_file_path.name

    return file_with_max_difference, max_difference

# The two event-log folders being compared
amazon_directory, optimized_directory = (
    Path('./amazon_vehicle_events'),
    Path('./optimized_vehicle_events'),
)

# Report the file whose final-event times disagree the most
file_name, max_diff = calculate_time_difference(
    amazon_directory,
    optimized_directory,
)
print(f'File with the largest time difference: {file_name} with a difference of {max_diff} seconds')

File with the largest time difference: RouteID_69233ee9-1ac4-4d89-b322-94fb13df54a3_vehicle_events.json with a difference of 5390.982792299736 seconds
