In [2]:
import re

import pandas as pd


def data_preparation(df: pd.DataFrame, keep_ratio: float = 0.8) -> pd.DataFrame:
    """Aggregate profiling samples per function and keep the top CPU hotspots.

    Rows of ``df`` are grouped by ``("Module", "Function")``; the mean and
    standard deviation of ``"CPU Time"``, ``"Memory Bound(%)"`` and
    ``"Vectorization:% of FP Ops(%)"`` are computed for each group. Groups are
    ranked by mean CPU time and kept while their cumulative share of the
    summed mean CPU time stays below ``keep_ratio``. The top-ranked group is
    always kept, even when it alone reaches the threshold.

    Parameters
    ----------
    df:
        Raw profiling samples. Must contain the columns ``"Module"``,
        ``"Function"``, ``"CPU Time"``, ``"Memory Bound(%)"`` and
        ``"Vectorization:% of FP Ops(%)"``.
    keep_ratio:
        Fraction (0-1) of the summed mean CPU time used as the cut-off.

    Returns
    -------
    pd.DataFrame
        One row per retained function with the aggregated statistics plus:
        ``perc_makespan`` (share of the summed mean CPU time, in percent),
        ``module_cpu`` (summed mean CPU time of the retained functions of the
        same module), ``clean_func`` / ``clean_module`` (shortened labels) and
        ``cumperc_makespan`` (running sum of ``perc_makespan`` in the returned
        row order). Rows are ordered by ``module_cpu`` then ``mean_cpu``,
        both descending. The standard deviations are NaN for groups that
        contain a single sample.
    """
    stats = (
        df.groupby(["Module", "Function"])
        .agg(
            mean_cpu=pd.NamedAgg(column="CPU Time", aggfunc="mean"),
            std_cpu=pd.NamedAgg(column="CPU Time", aggfunc="std"),
            mean_mem_bound=pd.NamedAgg(column="Memory Bound(%)", aggfunc="mean"),
            std_mem_bound=pd.NamedAgg(column="Memory Bound(%)", aggfunc="std"),
            mean_fp=pd.NamedAgg(column="Vectorization:% of FP Ops(%)", aggfunc="mean"),
            std_fp=pd.NamedAgg(column="Vectorization:% of FP Ops(%)", aggfunc="std"),
        )
        .sort_values(by="mean_cpu", ascending=False)
    )
    stats["perc_makespan"] = stats["mean_cpu"] / stats["mean_cpu"].sum() * 100
    stats["cumperc_makespan"] = stats["perc_makespan"].cumsum()

    # Keep everything strictly below the threshold, and force-keep the top
    # entry: a single dominant function would otherwise leave nothing to plot.
    # Positional access goes through .iloc because the index is a MultiIndex.
    keep = stats["cumperc_makespan"] < keep_ratio * 100
    if len(keep) > 0:
        keep.iloc[0] = True
    hotspots = stats[keep].reset_index()

    # Total (mean) CPU time per module, restricted to the retained functions.
    hotspots["module_cpu"] = hotspots.groupby("Module")["mean_cpu"].transform("sum")

    def clean_module(module: str) -> str:
        """Shorten ``name.so<suffix>`` to ``name.so``; other names pass through."""
        if (m := re.match(r"^\w+\.so", module)):
            return m.group(0)
        return module

    # Drop everything between the first "<" and the last ">" of the function
    # name (the match is greedy).
    hotspots["clean_func"] = hotspots["Function"].str.replace(r"<.*>", "", regex=True)
    hotspots["clean_module"] = hotspots["Module"].apply(clean_module)

    # Order by module total, then by function, so that functions of the same
    # module sit next to each other.
    hotspots = hotspots.sort_values(by=["module_cpu", "mean_cpu"], ascending=False)
    # Recompute the cumulative share so it follows the new row order.
    hotspots["cumperc_makespan"] = hotspots["perc_makespan"].cumsum()
    return hotspots

In [7]:
import math

from bokeh.io import output_notebook
from bokeh.models import (
    ColorBar,
    ColumnDataSource, 
    FactorRange, 
    HoverTool, 
    LinearAxis, 
    LinearColorMapper,
    Range1d, 
    Whisker,
)
from bokeh.plotting import figure, show
from bokeh.transform import transform


# Configure Bokeh so that figures passed to show() are displayed inline in the notebook.
output_notebook()

def plot_results(
    data,
    *,
    colorbar_on="mean_fp",
    colorbar_title='% of FP operations',
    keep_ratio=0.8,
    title="Avg. CPU Time",
):
    """Plot the mean CPU time of the hottest functions as a Bokeh bar chart.

    ``data`` is reduced with ``data_preparation`` and drawn as one bar per
    function, grouped by module on a nested categorical x-axis. Bars are
    coloured by the ``colorbar_on`` column (mapped linearly onto 0-100),
    whiskers show mean ± standard deviation of the CPU time, and red markers
    show the cumulative makespan share against a secondary 0-100 axis.

    Parameters
    ----------
    data:
        Raw profiling samples, passed unchanged to ``data_preparation``.
    colorbar_on:
        Name of the plotted column that drives the bar colour.
    colorbar_title:
        Title displayed next to the colour bar.
    keep_ratio:
        Forwarded to ``data_preparation``.
    title:
        Figure title.

    Returns
    -------
    The value returned by ``bokeh.plotting.show``.

    Raises
    ------
    ValueError
        If ``colorbar_on`` is not one of the plotted columns.
    """
    df = data_preparation(data, keep_ratio=keep_ratio)
    # Nested factors (module, rank): the rank keeps every bar unique while the
    # module name becomes the group label.
    xs = [(module, str(i)) for i, module in enumerate(df["clean_module"], start=1)]
    source = ColumnDataSource(data=dict(
        x=xs,
        module=df["clean_module"].values,
        func=df["clean_func"].values,
        perc_makespan=df["perc_makespan"].values,
        cumperc_makespan=df["cumperc_makespan"].values,
        mean_cpu=df["mean_cpu"].values,
        std_cpu=df["std_cpu"].values,
        std_lower_cpu=df["mean_cpu"].values-df["std_cpu"].values,
        std_upper_cpu=df["mean_cpu"].values+df["std_cpu"].values,
        mean_mem_bound=df["mean_mem_bound"].values,
        std_mem_bound=df["std_mem_bound"].values,
        mean_fp=df["mean_fp"].values,
        std_fp=df["std_fp"].values,
    ))
    # Fail early in Python: an unknown column would otherwise only surface as
    # a broken plot on the browser side.
    if colorbar_on not in source.data:
        raise ValueError(
            f"colorbar_on={colorbar_on!r} is not a plotted column; "
            f"choose one of {sorted(source.data)}"
        )

    p = figure(
        x_range=FactorRange(*xs),
        title=title,
        width=1200,
        toolbar_location=None,
    )

    # Tools
    hover = HoverTool()
    hover.tooltips = [
        ("Module", "@module"),
        ("Function", "@func"),
        ("Makespan contribution", "@perc_makespan{1.0}%"),
        ("Makespan contribution (cumulative)", "@cumperc_makespan{1.0}%"),
        ("CPU Time (mean ± std)", "@mean_cpu ± @std_cpu"),
        ("% of memory bound (mean ± std)", "@mean_mem_bound ± @std_mem_bound"),
        ("% of FP ops (mean ± std)", "@mean_fp ± @std_fp"),
    ]
    p.add_tools(hover)

    # Mean
    color_mapper = LinearColorMapper(palette='Magma256', low=0, high=100)
    color_bar = ColorBar(
        color_mapper=color_mapper,
        label_standoff=14,
        location=(0, 0),
        title=colorbar_title,
    )
    p.add_layout(color_bar, 'left')
    p.vbar(
        x="x",
        top="mean_cpu",
        width=0.9,
        source=source,
        color=transform(colorbar_on, color_mapper),
    )
    # Error
    error = Whisker(base="x", upper="std_upper_cpu", lower="std_lower_cpu", source=source,
                    level="annotation", line_width=2)
    error.upper_head.size = 20
    error.lower_head.size = 20
    p.add_layout(error)

    # Cumulative makespan
    p.extra_y_ranges = {"percentage": Range1d(start=0, end=100)}
    p.add_layout(LinearAxis(y_range_name="percentage"), 'right')
    p.scatter(x="x", y="cumperc_makespan", source=source, y_range_name="percentage", color="red", size=15)

    # Style
    # The standard deviation is NaN for functions with a single sample; treat
    # it as zero here so one such function does not turn the axis limit into
    # NaN. Only set the limit when it is a usable number (e.g. not for empty data).
    upper_bound = (df["mean_cpu"] + df["std_cpu"].fillna(0)).max()
    y_limit = 1.05 * upper_bound
    if math.isfinite(y_limit):
        p.y_range.end = y_limit
    p.x_range.range_padding = 0.1
    p.xaxis.major_label_orientation = math.pi/8
    p.xaxis.group_label_orientation = math.pi/8
    p.xaxis.separator_line_alpha = 0
    p.xgrid.grid_line_color = None

    return show(p)


In [8]:
import pandas as pd
from pathlib import Path


def read_profiling_data(path: Path, *, delimiter: str = "\t") -> pd.DataFrame:
    """Load every ``*.csv`` file found under ``path`` into a single DataFrame.

    The directory is searched recursively. Files are read in sorted path
    order, so the row order of the result does not depend on the order in
    which the filesystem lists them.

    Parameters
    ----------
    path:
        Directory to search.
    delimiter:
        Field separator of the files (tab by default).

    Returns
    -------
    pd.DataFrame
        The rows of all files stacked together with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no ``*.csv`` file exists under ``path``.
    """
    csv_files = sorted(path.rglob("*.csv"))
    if not csv_files:
        # pd.concat would raise a ValueError too, but without saying where
        # the data was expected.
        raise ValueError(f"No CSV files found under {path}")
    return pd.concat(
        (pd.read_csv(csv_file, delimiter=delimiter) for csv_file in csv_files),
        ignore_index=True,
    )

# Root directory of the profiling exports; the loop below reads
# <profiling_dir>/<toolkit>/<pipeline>.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
profiling_dir = Path("/mnt", "lustre", "mathdugre", "mri-bottleneck", "vtune_output")
# Toolkit name -> pipelines to plot for that toolkit (any number per toolkit).
experiments: dict[str, tuple[str, ...]] = {
    "ants": (
        "brainExtraction",
    ),
}

# Render one hotspot chart per (toolkit, pipeline) pair.
for toolkit, pipelines in experiments.items():
    for pipeline in pipelines:
        profiling_data = read_profiling_data(profiling_dir / toolkit / pipeline)
        plot_results(profiling_data)