In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import os
from datetime import datetime

In [2]:
### All global variables that would be passed as arguments in a python script
input_folder_name = "/home/nils/Downloads/WindowManagementBM_1737492626"
output_folder_name = "/home/nils/Downloads/WindowManagementBM_1737492626"
statistics_csv_name = "all_statistics.csv"
statistics_csv_path = os.path.join(input_folder_name, statistics_csv_name)
pipeline_txt_name = "pipelines.txt"
pipeline_txt_path = os.path.join(input_folder_name, pipeline_txt_name)

# Set the seaborn style
sns.set(style="whitegrid")

In [3]:
# Converting pipelines.txt to a dict of pipeline id to a title
def extract_pipeline_data(input_text):
    pipeline_dict = {}
    # Split the input text by the delimiter
    pipeline_sections = input_text.split("############################################")
    physical_pattern = re.compile(r"\bPhysical\w+")

    for section in pipeline_sections:
        # Find the pipeline ID in the section
        pipeline_match = re.search(r"Pipeline:\s*(\d+)", section)
        if pipeline_match:
            pipeline_id = int(pipeline_match.group(1))
            if pipeline_id not in pipeline_dict:
                pipeline_dict[pipeline_id] = []
            
            # Find all words starting with 'Physical' in the section
            physical_matches = physical_pattern.findall(section)
            cleaned_matches = [match.replace("Physical", "").replace("Operator", "").replace("Stream", "") for match in physical_matches]
            pipeline_dict[pipeline_id].extend(cleaned_matches)

    
    # Concatenate multiple values with "_"
    return {key: "_".join(values) for key, values in pipeline_dict.items()}



with open(pipeline_txt_path, 'r') as input_file:
    pipeline_ids_title = extract_pipeline_data(input_file.read())

pipeline_ids_title

{2: 'JoinProbe_Emit',
 3: 'WindowTrigger',
 4: 'Scan_JoinBuild',
 5: 'Scan_WatermarkAssignment_Emit',
 7: 'WindowTrigger',
 8: 'Scan_JoinBuild',
 9: 'Scan_WatermarkAssignment_Emit'}

In [4]:
# Converting query engine statistics to statistics csv
pattern_worker_file = r"^worker_\d+\.txt$"
pattern_task_details = (r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+).*?"
       r"Task (?P<task_id>\d+) for Pipeline (?P<pipeline>\d+).*?"
       r"(?P<action>Started|Completed)(?:\. Number of Tuples: (?P<num_tuples>\d+))?")
statistic_files = [os.path.join(input_folder_name, f) for f in os.listdir(input_folder_name) if re.match(pattern_worker_file, f)]
combined_df = pd.DataFrame()
for stat_file in statistic_files:
    print(stat_file)
    with open(stat_file, 'r') as file:
        log_text = file.read()

    records = []
    tasks = {}
    for match in re.finditer(pattern_task_details, log_text):
        timestamp = pd.to_datetime(match.group("timestamp"), format="%Y-%m-%d %H:%M:%S.%f")
        task_id = int(match.group("task_id"))
        action = match.group("action")
        num_tuples = int(match.group("num_tuples")) if match.group("num_tuples") else None
        pipeline_id = int(match.group("pipeline")) if match.group("pipeline") else None
    
        if action == "Started":
            tasks[task_id] = {"start_time": timestamp, "num_tuples": num_tuples}
        elif action == "Completed" and task_id in tasks:
            task_info = tasks[task_id]
            start_time = task_info["start_time"]
            duration = (timestamp - start_time).total_seconds()
            throughput = task_info["num_tuples"] / duration if duration > 0 else 0
            records.append({
                "task_id": task_id,
                "start_time": start_time,
                "end_time": timestamp,
                "duration": duration * (1000 * 1000),
                "num_tuples": task_info["num_tuples"],
                "throughput": throughput / (1000 * 1000),
                "pipeline_id": pipeline_id
            })
    
    # Create DataFrame and write it to the csv file
    df = pd.DataFrame(records)
    df.to_csv(os.path.join(input_folder_name, stat_file + ".csv"), index=False)

    # Adding this dataframe to the global one
    combined_df = pd.concat([combined_df, df], ignore_index=True)


# Writing the combined dataframe to a csv file
combined_df.to_csv(statistics_csv_path, index=False)

/home/nils/Downloads/WindowManagementBM_1737492626/worker_0.txt
/home/nils/Downloads/WindowManagementBM_1737492626/worker_1.txt
/home/nils/Downloads/WindowManagementBM_1737492626/worker_3.txt
/home/nils/Downloads/WindowManagementBM_1737492626/worker_2.txt


In [5]:
# Loading all data and plotting the duration and the throughput
data = pd.read_csv(statistics_csv_path)
# Create a unique plot for each pipeline_id
for pipeline in data['pipeline_id'].unique():
    # Filter data for the current pipeline_id
    pipeline_df = data[data['pipeline_id'] == pipeline]
    
    # Create subplots for duration and throughput
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    # Create title if it is an operator pipeline
    title = f"{pipeline_ids_title[pipeline]}({pipeline})" if pipeline in pipeline_ids_title else f"Pipeline ({pipeline})"
    
    # Plot Duration over Task ID (using scatterplot)
    sns.scatterplot(x='task_id', y='duration', data=pipeline_df, ax=axes[0])
    axes[0].set_title(f"Duration vs Task ID for {title}")
    axes[0].set_xlabel("Task ID")
    axes[0].set_ylabel("Duration [us]")
    
    # Plot Throughput over Task ID (using scatterplot)
    sns.scatterplot(x='task_id', y='throughput', data=pipeline_df, ax=axes[1])
    axes[1].set_title(f"Throughput vs Task ID for {title}")
    axes[1].set_xlabel("Task ID")
    axes[1].set_ylabel("Throughput [Mtup/s]")
    
    # Save the plot to a file
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder_name, f"pipeline_{pipeline}_plot.png"))
    plt.close()

In [6]:
# Loading all data and plotting the duration and the throughput via aggregates
# Define the bin size (e.g., every 10 task_ids)
bin_size = 100

# Create a unique plot for each pipeline_id
for pipeline in data['pipeline_id'].unique():
    # Filter data for the current pipeline_id
    pipeline_df = data[data['pipeline_id'] == pipeline]
    
    # Create task_id bins
    pipeline_df['task_id_bin'] = (pipeline_df['task_id'] // bin_size) * bin_size
    
    # Calculate aggregate statistics for each task_id bin
    aggregated_data = pipeline_df.groupby('task_id_bin').agg(
        avg_duration=('duration', 'mean'),
        median_duration=('duration', 'median'),
        p90_duration=('duration', lambda x: np.percentile(x, 90)),
        avg_throughput=('throughput', 'mean'),
        median_throughput=('throughput', 'median'),
        p90_throughput=('throughput', lambda x: np.percentile(x, 90))
    ).reset_index()
    
    # Create subplots for average, median, and 90th percentile duration and throughput
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    # Create title if it is an operator pipeline
    title = f"{pipeline_ids_title[pipeline]}({pipeline})" if pipeline in pipeline_ids_title else f"Pipeline ({pipeline})"

    
    # Plot Average, Median, and 90th Percentile Duration
    sns.lineplot(x='task_id_bin', y='avg_duration', data=aggregated_data, ax=axes[0], label='Average Duration', color='blue')
    sns.lineplot(x='task_id_bin', y='median_duration', data=aggregated_data, ax=axes[0], label='Median Duration', color='red')
    sns.lineplot(x='task_id_bin', y='p90_duration', data=aggregated_data, ax=axes[0], label='90th Percentile Duration', color='purple')
    axes[0].set_title(f"Duration vs Task ID Binned\n for {title}")
    axes[0].set_xlabel("Task ID Bin")
    axes[0].set_ylabel("Duration [us]")
    axes[0].legend()
    
    # Plot Average, Median, and 90th Percentile Throughput
    sns.lineplot(x='task_id_bin', y='avg_throughput', data=aggregated_data, ax=axes[1], label='Average Throughput', color='green')
    sns.lineplot(x='task_id_bin', y='median_throughput', data=aggregated_data, ax=axes[1], label='Median Throughput', color='orange')
    sns.lineplot(x='task_id_bin', y='p90_throughput', data=aggregated_data, ax=axes[1], label='90th Percentile Throughput', color='brown')
    axes[1].set_title(f"Throughput vs Task ID Binned\n for {title}")
    axes[1].set_xlabel("Task ID Bin")
    axes[1].set_ylabel("Throughput [Mtup/s]")
    axes[1].legend()
    
    # Save the plot to a file
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder_name, f"pipeline_{pipeline}_aggregated_bin_plot.png"))
    plt.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_df['task_id_bin'] = (pipeline_df['task_id'] // bin_size) * bin_size
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_df['task_id_bin'] = (pipeline_df['task_id'] // bin_size) * bin_size
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_df['task_id_bin'] = (pipeline_df['task_id'

In [7]:
# Loading all data and plotting the duration and the throughput via boxplots
# Create a figure with 2 subplots
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Boxplot for Duration
sns.boxplot(x='pipeline_id', y='duration', data=data, ax=axes[0])
axes[0].set_title("Duration Distribution per Pipeline")
axes[0].set_xlabel("Pipeline ID")
axes[0].set_ylabel("Duration")

# Boxplot for Throughput
sns.boxplot(x='pipeline_id', y='throughput', data=data, ax=axes[1])
axes[1].set_title("Throughput Distribution per Pipeline")
axes[1].set_xlabel("Pipeline ID")
axes[1].set_ylabel("Throughput")

# Adjust layout to avoid overlap
plt.tight_layout()
plt.savefig(os.path.join(output_folder_name, f"pipelines_aggregated_boxplot.png"))
plt.close()
