In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import os
from datetime import datetime

In [2]:
### All global variables that would be passed as arguments in a python script
input_folder_name = "/home/nils/Downloads/WindowManagementBM_1737308559"
output_folder_name = "/home/nils/Downloads/WindowManagementBM_1737308559"
statistics_csv_name = "all_statistics.csv"
statistics_csv_path = os.path.join(input_folder_name, statistics_csv_name)

# Set the seaborn style
sns.set(style="whitegrid")

In [3]:
# Converting 
pattern_worker_file = r"^worker_\d+\.txt$"
pattern_task_details = (r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+).*?"
       r"Task (?P<task_id>\d+) for Pipeline (?P<pipeline>\d+).*?"
       r"(?P<action>Started|Completed)(?:\. Number of Tuples: (?P<num_tuples>\d+))?")
statistic_files = [os.path.join(input_folder_name, f) for f in os.listdir(input_folder_name) if re.match(pattern_worker_file, f)]
combined_df = pd.DataFrame()
for stat_file in statistic_files:
    print(stat_file)
    with open(stat_file, 'r') as file:
        log_text = file.read()

    records = []
    tasks = {}
    for match in re.finditer(pattern_task_details, log_text):
        timestamp = pd.to_datetime(match.group("timestamp"), format="%Y-%m-%d %H:%M:%S.%f")
        task_id = int(match.group("task_id"))
        action = match.group("action")
        num_tuples = int(match.group("num_tuples")) if match.group("num_tuples") else None
        pipeline_id = int(match.group("pipeline")) if match.group("pipeline") else None
    
        if action == "Started":
            tasks[task_id] = {"start_time": timestamp, "num_tuples": num_tuples}
        elif action == "Completed" and task_id in tasks:
            task_info = tasks[task_id]
            start_time = task_info["start_time"]
            duration = (timestamp - start_time).total_seconds()
            throughput = task_info["num_tuples"] / duration if duration > 0 else 0
            records.append({
                "task_id": task_id,
                "start_time": start_time,
                "end_time": timestamp,
                "duration": duration,
                "num_tuples": task_info["num_tuples"],
                "throughput": throughput,
                "pipeline_id": pipeline_id
            })
    
    # Create DataFrame and write it to the csv file
    df = pd.DataFrame(records)
    df.to_csv(os.path.join(input_folder_name, stat_file + ".csv"), index=False)

    # Adding this dataframe to the global one
    combined_df = pd.concat([combined_df, df], ignore_index=True)


# Writing the combined dataframe to a csv file
combined_df.to_csv(statistics_csv_path, index=False)

/home/nils/Downloads/WindowManagementBM_1737308559/worker_0.txt
/home/nils/Downloads/WindowManagementBM_1737308559/worker_1.txt
/home/nils/Downloads/WindowManagementBM_1737308559/worker_3.txt
/home/nils/Downloads/WindowManagementBM_1737308559/worker_2.txt


In [4]:
# Loading all data and plotting the duration and the throughput
data = pd.read_csv(statistics_csv_path)
# Create a unique plot for each pipeline_id
for pipeline in data['pipeline_id'].unique():
    # Filter data for the current pipeline_id
    pipeline_df = data[data['pipeline_id'] == pipeline]
    
    # Create subplots for duration and throughput
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    # Plot Duration over Task ID (using scatterplot)
    sns.scatterplot(x='task_id', y='duration', data=pipeline_df, ax=axes[0])
    axes[0].set_title(f"Duration vs Task ID for Pipeline {pipeline}")
    axes[0].set_xlabel("Task ID")
    axes[0].set_ylabel("Duration")
    
    # Plot Throughput over Task ID (using scatterplot)
    sns.scatterplot(x='task_id', y='throughput', data=pipeline_df, ax=axes[1])
    axes[1].set_title(f"Throughput vs Task ID for Pipeline {pipeline}")
    axes[1].set_xlabel("Task ID")
    axes[1].set_ylabel("Throughput")
    
    # Save the plot to a file
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder_name, f"pipeline_{pipeline}_plot.pdf"))
    plt.close()

In [6]:
# Loading all data and plotting the duration and the throughput via aggregates
# Define the bin size (e.g., every 10 task_ids)
bin_size = 100

# Create a unique plot for each pipeline_id
for pipeline in data['pipeline_id'].unique():
    # Filter data for the current pipeline_id
    pipeline_df = data[data['pipeline_id'] == pipeline]
    
    # Create task_id bins
    pipeline_df['task_id_bin'] = (pipeline_df['task_id'] // bin_size) * bin_size
    
    # Calculate aggregate statistics for each task_id bin
    aggregated_data = pipeline_df.groupby('task_id_bin').agg(
        avg_duration=('duration', 'mean'),
        median_duration=('duration', 'median'),
        p90_duration=('duration', lambda x: np.percentile(x, 90)),
        avg_throughput=('throughput', 'mean'),
        median_throughput=('throughput', 'median'),
        p90_throughput=('throughput', lambda x: np.percentile(x, 90))
    ).reset_index()
    
    # Create subplots for average, median, and 90th percentile duration and throughput
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    # Plot Average, Median, and 90th Percentile Duration
    sns.lineplot(x='task_id_bin', y='avg_duration', data=aggregated_data, ax=axes[0], label='Average Duration', color='blue')
    sns.lineplot(x='task_id_bin', y='median_duration', data=aggregated_data, ax=axes[0], label='Median Duration', color='red')
    sns.lineplot(x='task_id_bin', y='p90_duration', data=aggregated_data, ax=axes[0], label='90th Percentile Duration', color='purple')
    axes[0].set_title(f"Duration (Avg, Median, & 90th Percentile) vs Task ID Bin for Pipeline {pipeline}")
    axes[0].set_xlabel("Task ID Bin")
    axes[0].set_ylabel("Duration")
    axes[0].legend()
    
    # Plot Average, Median, and 90th Percentile Throughput
    sns.lineplot(x='task_id_bin', y='avg_throughput', data=aggregated_data, ax=axes[1], label='Average Throughput', color='green')
    sns.lineplot(x='task_id_bin', y='median_throughput', data=aggregated_data, ax=axes[1], label='Median Throughput', color='orange')
    sns.lineplot(x='task_id_bin', y='p90_throughput', data=aggregated_data, ax=axes[1], label='90th Percentile Throughput', color='brown')
    axes[1].set_title(f"Throughput (Avg, Median, & 90th Percentile) vs Task ID Bin for Pipeline {pipeline}")
    axes[1].set_xlabel("Task ID Bin")
    axes[1].set_ylabel("Throughput")
    axes[1].legend()
    
    # Save the plot to a file
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder_name, f"pipeline_{pipeline}_aggregated_bin_plot.png"))
    plt.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_df['task_id_bin'] = (pipeline_df['task_id'] // bin_size) * bin_size
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_df['task_id_bin'] = (pipeline_df['task_id'] // bin_size) * bin_size
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_df['task_id_bin'] = (pipeline_df['task_id'