In [62]:
import re

import pandas as pd


def data_preparation(df: pd.DataFrame, keep_ratio: float = 0.8) -> pd.DataFrame:
    """Aggregate profiling samples per function and keep the main CPU hotspots.

    The samples are grouped by ("Module", "Function"); the mean and standard
    deviation of "CPU Time", "Memory Bound(%)" and
    "Vectorization:% of FP Ops(%)" are computed for each group. Functions are
    ranked by mean CPU time and retained while their cumulative share of the
    total mean CPU time stays below ``keep_ratio``.

    Args:
        df: Profiling samples containing the columns "Module", "Function",
            "CPU Time", "Memory Bound(%)" and "Vectorization:% of FP Ops(%)".
        keep_ratio: Fraction (0-1) of the total mean CPU time to cover. The
            top-ranked function is always kept, even when it alone reaches or
            exceeds this fraction.

    Returns:
        A DataFrame with one row per retained function, ordered by the total
        mean CPU time of its module and then by its own mean CPU time (both
        descending), with a 1-based index. On top of the aggregated
        statistics it holds "perc_makespan", "cumperc_makespan" (recomputed
        in the returned order), "module_cpu", "clean_func" and
        "clean_module". An "index" column with the row position prior to the
        final sort is also present.
    """
    stats = (
        df.groupby(["Module", "Function"])
        .agg(
            mean_cpu=pd.NamedAgg(column="CPU Time", aggfunc="mean"),
            std_cpu=pd.NamedAgg(column="CPU Time", aggfunc="std"),
            mean_mem_bound=pd.NamedAgg(column="Memory Bound(%)", aggfunc="mean"),
            std_mem_bound=pd.NamedAgg(column="Memory Bound(%)", aggfunc="std"),
            mean_fp=pd.NamedAgg(column="Vectorization:% of FP Ops(%)", aggfunc="mean"),
            std_fp=pd.NamedAgg(column="Vectorization:% of FP Ops(%)", aggfunc="std"),
        )
        .sort_values(by="mean_cpu", ascending=False)
    )
    stats["perc_makespan"] = stats["mean_cpu"] / stats["mean_cpu"].sum() * 100
    stats["cumperc_makespan"] = stats["perc_makespan"].cumsum()

    # Keep every function below the requested coverage. The first row is
    # forced in explicitly: a single dominant hotspot would otherwise be
    # dropped and leave nothing to plot. Positional access goes through
    # ``.iloc`` because the index is a (Module, Function) MultiIndex.
    keep = stats["cumperc_makespan"] < keep_ratio * 100
    if len(keep) > 0:
        keep.iloc[0] = True
    hotspots = stats[keep]

    # Total CPU Time per module
    module_mean_cpu = hotspots.groupby("Module")["mean_cpu"].sum().to_dict()
    hotspots = hotspots.reset_index()
    hotspots["module_cpu"] = hotspots["Module"].map(module_mean_cpu)

    # Clean Function and Module names
    def clean_module(module: str) -> str:
        """Trim anything following the leading ``<name>.so`` of a module name."""
        if m := re.match(r"^\w+\.so", module):
            return m.group(0)
        return module

    # Strip everything between the first "<" and the last ">" (greedy match).
    hotspots["clean_func"] = hotspots["Function"].apply(
        lambda x: re.sub(r"<.*>", "", x)
    )
    hotspots["clean_module"] = hotspots["Module"].apply(clean_module)

    # CPU Time sorted by "Module CPU Time" -> "CPU Time".
    hotspots = hotspots.sort_values(by=["module_cpu", "mean_cpu"], ascending=False)
    # Reorder the cumulative makespan for plotting.
    hotspots["cumperc_makespan"] = hotspots["perc_makespan"].cumsum()
    # The previous integer index is kept as an "index" column.
    hotspots = hotspots.reset_index()
    hotspots.index += 1
    return hotspots

In [63]:
import math

from bokeh.io import output_notebook
from bokeh.models import (
    ColorBar,
    ColumnDataSource,
    FactorRange,
    HoverTool,
    LinearAxis,
    LinearColorMapper,
    Range1d,
    Whisker,
)
from bokeh.plotting import figure, show
from bokeh.transform import transform


# Configure Bokeh so that the show() calls below render inside the notebook.
output_notebook()


def plot_hotspots(
    data: pd.DataFrame,
    *,
    y_limit: float | None = None,
    colorbar_color: str = "Plasma256",
    colorbar_on: str,
    colorbar_title: str,
    pipeline: str,
):
    """Draw the mean CPU time of each hotspot as a colored bar chart.

    Bars are grouped by module on a two-level categorical x axis, carry a
    whisker spanning mean ± std, and are colored according to the
    ``colorbar_on`` column. The cumulative makespan percentage is overlaid as
    a scatter plot on a secondary 0-100 axis on the right.

    Args:
        data: Hotspot table with the columns "clean_module", "clean_func",
            "perc_makespan", "cumperc_makespan", "mean_cpu", "std_cpu",
            "mean_mem_bound", "std_mem_bound", "mean_fp" and "std_fp".
        y_limit: Upper bound of the CPU time axis. When falsy (``None`` or
            0), it defaults to 105% of the highest ``mean_cpu + std_cpu``.
        colorbar_color: Palette handed to the linear color mapper.
        colorbar_on: Name of the data-source field driving the bar color;
            values are mapped linearly over the range 0-100.
        colorbar_title: Title displayed next to the color bar.
        pipeline: Pipeline name appended to the figure title.

    Returns:
        The value returned by ``bokeh.plotting.show``.
    """
    # Two-level factors (module, rank) so that bars are grouped per module.
    xs = [(module, str(i)) for i, module in enumerate(data["clean_module"], start=1)]
    source = ColumnDataSource(
        data=dict(
            x=xs,
            module=data["clean_module"].values,
            func=data["clean_func"].values,
            perc_makespan=data["perc_makespan"].values,
            cumperc_makespan=data["cumperc_makespan"].values,
            mean_cpu=data["mean_cpu"].values,
            std_cpu=data["std_cpu"].values,
            std_lower_cpu=data["mean_cpu"].values - data["std_cpu"].values,
            std_upper_cpu=data["mean_cpu"].values + data["std_cpu"].values,
            mean_mem_bound=data["mean_mem_bound"].values,
            std_mem_bound=data["std_mem_bound"].values,
            mean_fp=data["mean_fp"].values,
            std_fp=data["std_fp"].values,
        )
    )

    p = figure(
        x_range=FactorRange(*xs),
        title=f"Avg. CPU Time: {pipeline}",
        width=1200,
        toolbar_location=None,
    )

    # Tools
    hover = HoverTool()
    hover.tooltips = [
        ("Module", "@module"),
        ("Function", "@func"),
        ("Makespan contribution", "@perc_makespan{1.1}%"),
        ("Makespan contribution (cumulative)", "@cumperc_makespan{1.1}%"),
        ("CPU Time (mean ± std)", "@mean_cpu{1.11} ± @std_cpu{1.11}"),
        (
            "% of memory bound (mean ± std)",
            "@mean_mem_bound{1.11} ± @std_mem_bound{1.11}",
        ),
        ("% of FP ops (mean ± std)", "@mean_fp{1.11} ± @std_fp{1.11}"),
    ]
    p.tools.append(hover)

    # Mean
    color_mapper = LinearColorMapper(palette=colorbar_color, low=0, high=100)
    color_bar = ColorBar(
        color_mapper=color_mapper,
        label_standoff=14,
        location=(0, 0),
        title=colorbar_title,
    )
    p.add_layout(color_bar, "left")
    p.vbar(
        x="x",
        top="mean_cpu",
        width=0.9,
        source=source,
        color=transform(colorbar_on, color_mapper),
    )
    # Error
    error = Whisker(
        base="x",
        upper="std_upper_cpu",
        lower="std_lower_cpu",
        source=source,
        level="annotation",
        line_width=2,
    )
    error.upper_head.size = 20
    error.lower_head.size = 20
    p.add_layout(error)

    # Cumulative makespan
    p.extra_y_ranges = {"percentage": Range1d(start=0, end=100)}
    p.add_layout(LinearAxis(y_range_name="percentage"), "right")
    p.scatter(
        x="x",
        y="cumperc_makespan",
        source=source,
        y_range_name="percentage",
        color="lightgreen",
        size=15,
    )

    # Style
    if not y_limit:
        # The standard deviation is NaN for a function observed in a single
        # run; count it as 0 so that one NaN does not turn the axis limit
        # into NaN.
        upper_bound = data["mean_cpu"] + data["std_cpu"].fillna(0)
        y_limit = 1.05 * upper_bound.max()
    p.y_range.end = y_limit
    p.x_range.range_padding = 0.1
    p.xaxis.major_label_orientation = math.pi / 8
    p.xaxis.group_label_orientation = math.pi / 8
    p.xaxis.separator_line_alpha = 0
    p.xgrid.grid_line_color = None

    return show(p)

In [69]:
from pathlib import Path

from bokeh.palettes import Colorblind8
from bokeh.transform import factor_cmap


def plot_makespan(path: Path, *, nthreads: int = 1):
    """Plot the average makespan of every pipeline found under ``path``.

    Every ``*.csv`` file below ``path`` (searched recursively, read as
    tab-delimited) is treated as one run. A run is labelled with the first
    two components of its path relative to ``path``, joined by "/". The
    "CPU Time" column is summed per run and divided by ``nthreads``; the
    mean and standard deviation across the runs of a pipeline are drawn as a
    bar with a whisker.

    Args:
        path: Root directory containing the CSV files.
        nthreads: Divisor applied to the summed CPU time of each run.

    Returns:
        The value returned by ``bokeh.plotting.show``.
    """
    # Data preparation
    data = pd.concat(
        (
            pd.read_csv(filename, delimiter="\t").assign(
                filename=filename.stem,
                pipeline="/".join(filename.relative_to(path).parts[:2]),
            )
            for filename in path.rglob("*.csv")
        ),
        ignore_index=True,
    )

    group = (
        data.groupby(["pipeline", "filename"])["CPU Time"]
        .sum()
        .divide(nthreads)
        .groupby("pipeline")
        .agg(["mean", "std"])
    )

    source = ColumnDataSource(
        data=dict(
            pipeline=group.index.values,
            mean=group["mean"].values,
            std=group["std"].values,
            std_lower=group["mean"].values - group["std"].values,
            std_upper=group["mean"].values + group["std"].values,
        )
    )

    # Plotting
    cmap = factor_cmap(
        "pipeline", palette=Colorblind8, factors=sorted(data["pipeline"].unique())
    )
    p = figure(
        x_range=group.index.values,
        title="Average Makespan",
        width=1200,
        toolbar_location=None,
        y_axis_label="Makespan (seconds)",
    )

    p.vbar(
        x="pipeline",
        top="mean",
        width=0.9,
        source=source,
        line_color=cmap,
        fill_color=cmap,
    )
    # Error
    error = Whisker(
        base="pipeline",
        upper="std_upper",
        lower="std_lower",
        source=source,
        level="annotation",
        line_width=2,
    )
    error.upper_head.size = 20
    error.lower_head.size = 20
    p.add_layout(error)

    # Tools
    hover = HoverTool()
    hover.tooltips = [
        ("Pipeline", "@pipeline"),
        ("Makespan (mean ± std)", "@mean{1.11} ± @std{1.11}"),
    ]
    p.tools.append(hover)

    # Appearance
    # A pipeline with a single run has a NaN standard deviation; count it as
    # 0 so that one NaN does not turn the axis limit into NaN.
    upper_bound = group["mean"] + group["std"].fillna(0)
    p.y_range.end = 1.05 * upper_bound.max()
    p.y_range.start = 0
    p.xgrid.grid_line_color = None
    p.outline_line_color = None

    return show(p)

In [70]:
def read_profiling_data(path: Path, *, delimiter: str = "\t") -> pd.DataFrame:
    """Load every CSV file found under ``path`` into a single DataFrame.

    Args:
        path: Directory searched recursively for ``*.csv`` files.
        delimiter: Field separator of the CSV files (tab by default).

    Returns:
        The rows of all files, concatenated in sorted path order under a
        fresh 0-based index.

    Raises:
        ValueError: If no CSV file exists under ``path``.
    """
    # rglob yields entries in an unspecified order; sorting makes the row
    # order of the result reproducible across machines and runs.
    filenames = sorted(path.rglob("*.csv"))
    if not filenames:
        # pd.concat would raise the same exception type, but without any
        # hint about which directory turned out to be empty.
        raise ValueError(f"No CSV files found under {path}")
    return pd.concat(
        (pd.read_csv(filename, delimiter=delimiter) for filename in filenames),
        ignore_index=True,
    )

# Single-threaded

In [71]:
# Root directory of the profiling results for the "1-threads" runs.
profiling_dir = Path(
    "/", "mnt", "lustre", "mathdugre", "mri-bottleneck", "vtune_output", "1-threads"
)

# Average makespan per pipeline; nthreads keeps its default of 1.
plot_makespan(profiling_dir)

In [72]:
# Toolkit -> (pipeline, y-axis limit) pairs; a limit of None lets
# plot_hotspots derive the limit from the data.
experiments: dict[str, tuple[tuple[str, float | None], ...]] = {
    "ants": (
        ("brainExtraction", 700),
        ("brainExtraction-fp", 700),
        ("registrationSyN", 2000),
        ("registrationSyN-fp", 2000),
    ),
    "fsl": (("fast", None),),
}

# For each pipeline: load its runs, keep the hotspots covering up to 80% of
# the mean CPU time, plot them colored by memory-bound percentage, and print
# the first ten rows as plain text.
for toolkit, pipelines in experiments.items():
    for pipeline, y_limit in pipelines:
        profiling_data = read_profiling_data(profiling_dir / toolkit / pipeline)
        data = data_preparation(profiling_data, keep_ratio=0.8)
        plot_hotspots(
            data,
            pipeline=f"{toolkit}.{pipeline}",
            y_limit=y_limit,
            colorbar_on="mean_mem_bound",
            colorbar_title="% of memory bound",
            colorbar_color="Plasma256",
        )
        print(
            data[["clean_module", "clean_func", "mean_cpu", "std_cpu"]]
            .head(10)
            .to_string()
        )

           clean_module                                                                                   clean_func    mean_cpu    std_cpu
1   libantsUtilities.so                                       itk::LinearInterpolateImageFunction::EvaluateOptimized  560.657287  92.967420
2   libantsUtilities.so                         itk::VectorLinearInterpolateImageFunction::EvaluateAtContinuousIndex  371.581811   3.132710
3   libantsUtilities.so                                              itk::DisplacementFieldTransform::TransformPoint  126.344927   1.684579
4   libantsUtilities.so  itk::MattesMutualInformationImageToImageMetricv4GetValueAndDerivativeThreader::ProcessPoint  104.216804  15.836059
5   libantsUtilities.so                                                      itk::CompositeTransform::TransformPoint  101.391055  12.556387
6   libantsUtilities.so                                    itk::ImageToImageMetricv4::TransformAndEvaluateFixedPoint   81.464469   2.872815
7   libantsUtilities

           clean_module                                                                                   clean_func    mean_cpu    std_cpu
1   libantsUtilities.so                         itk::VectorLinearInterpolateImageFunction::EvaluateAtContinuousIndex  574.707728   4.523207
2   libantsUtilities.so                                       itk::LinearInterpolateImageFunction::EvaluateOptimized  474.175811  59.068021
3   libantsUtilities.so                                              itk::DisplacementFieldTransform::TransformPoint  264.736469   2.855136
4   libantsUtilities.so  itk::MattesMutualInformationImageToImageMetricv4GetValueAndDerivativeThreader::ProcessPoint   97.941218  15.809979
5   libantsUtilities.so                                                                       itk::Matrix::operator*   95.026164  42.317363
6   libantsUtilities.so             itk::CompositeTransform::ComputeJacobianWithRespectToParametersCachedTemporaries   78.394146  13.741931
7   libantsUtilities

           clean_module                                                                                                       clean_func     mean_cpu     std_cpu
1   libantsUtilities.so                                             itk::VectorLinearInterpolateImageFunction::EvaluateAtContinuousIndex  1391.437158   87.456594
2   libantsUtilities.so                                                           itk::LinearInterpolateImageFunction::EvaluateOptimized   536.807891   40.161192
3   libantsUtilities.so                                                                  itk::DisplacementFieldTransform::TransformPoint   458.770323   27.181268
4   libantsUtilities.so                                                        itk::ImageToImageMetricv4::TransformAndEvaluateFixedPoint   221.366608   14.589860
5   libantsUtilities.so  itk::ANTSNeighborhoodCorrelationImageToImageMetricv4GetValueAndDerivativeThreader::UpdateQueuesToNextScanWindow   166.567417   11.507689
6   libantsUtilities.so     

           clean_module                                                                                                       clean_func     mean_cpu     std_cpu
1   libantsUtilities.so                                             itk::VectorLinearInterpolateImageFunction::EvaluateAtContinuousIndex  1820.659235   63.393224
2   libantsUtilities.so                                                                  itk::DisplacementFieldTransform::TransformPoint   814.748431   28.314613
3   libantsUtilities.so                                                           itk::LinearInterpolateImageFunction::EvaluateOptimized   492.745342   15.677649
4   libantsUtilities.so  itk::ANTSNeighborhoodCorrelationImageToImageMetricv4GetValueAndDerivativeThreader::UpdateQueuesToNextScanWindow   238.371289    6.591397
5   libantsUtilities.so                                                       itk::ImageToImageMetricv4::TransformAndEvaluateMovingPoint   150.089840    5.043404
6   libantsUtilities.so     

           clean_module                            clean_func   mean_cpu    std_cpu
1                  fast                    NEWIMAGE::convolve  68.168779   0.721302
2                  fast     ZMRISegmentation::MRFWeightsInner  18.580994   1.830906
3                  fast       ZMRISegmentation::UpdateMembers   3.885647   0.403928
4                  fast          NEWIMAGE::volume::operator()   3.650481   0.426501
5  [Outside any module]            [Outside any known module]  13.046739  33.366878
6          libm-2.31.so                                   exp   6.670110   0.952921
7          libm-2.31.so                          func@0x80fa4   6.316254   0.615154
8          libc-2.31.so                         func@0x18b644   6.106342   0.181953
9    libfsl-newimage.so  NEWIMAGE::maskedIterator::operator++   4.793416   0.068310


# Multi-threaded (32 threads)

In [73]:
# Root directory of the profiling results for the "32-threads" runs.
# NOTE: this rebinds the profiling_dir used by the single-threaded cells.
profiling_dir = Path(
    "/", "mnt", "lustre", "mathdugre", "mri-bottleneck", "vtune_output", "32-threads"
)

# The summed CPU time of each run is divided by the 32 threads.
plot_makespan(profiling_dir, nthreads=32)

In [74]:
# Toolkit -> (pipeline, y-axis limit) pairs. Only ants/brainExtraction is
# enabled here; the remaining entries are commented out.
experiments: dict[str, tuple[tuple[str, float | None], ...]] = {
    "ants": (
        ("brainExtraction", 1000),
    #     ("brainExtraction-fp", 700),
    #     ("registrationSyN", 1600),
    #     ("registrationSyN-fp", 1600),
    ),
    # "fsl": (("fast", None),),
}

# For each pipeline: load its runs, keep the hotspots covering up to 80% of
# the mean CPU time, plot them colored by memory-bound percentage, and print
# the first ten rows as plain text.
for toolkit, pipelines in experiments.items():
    for pipeline, y_limit in pipelines:
        profiling_data = read_profiling_data(profiling_dir / toolkit / pipeline)
        data = data_preparation(profiling_data, keep_ratio=0.8)
        plot_hotspots(
            data,
            pipeline=f"{toolkit}.{pipeline}",
            y_limit=y_limit,
            colorbar_on="mean_mem_bound",
            colorbar_title="% of memory bound",
            colorbar_color="Plasma256",
        )
        print(
            data[["clean_module", "clean_func", "mean_cpu", "std_cpu"]]
            .head(10)
            .to_string()
        )

           clean_module                                                                                   clean_func    mean_cpu     std_cpu
1   libantsUtilities.so                                       itk::LinearInterpolateImageFunction::EvaluateOptimized  885.319640  147.757082
2   libantsUtilities.so                         itk::VectorLinearInterpolateImageFunction::EvaluateAtContinuousIndex  520.432260    4.887326
3   libantsUtilities.so  itk::MattesMutualInformationImageToImageMetricv4GetValueAndDerivativeThreader::ProcessPoint  204.974632   36.247125
4   libantsUtilities.so                                              itk::DisplacementFieldTransform::TransformPoint  175.867398    3.197330
5   libantsUtilities.so                                                      itk::CompositeTransform::TransformPoint  138.551295   17.625178
6   libantsUtilities.so             itk::CompositeTransform::ComputeJacobianWithRespectToParametersCachedTemporaries  130.255974   20.641471
7   libantsUt

# Test

In [None]:
# Hotspots of the single-threaded ants/registrationSyN runs, this time
# covering up to 99% of the mean CPU time instead of 80%.
data = data_preparation(
    read_profiling_data(
        Path(
            "/",
            "mnt",
            "lustre",
            "mathdugre",
            "mri-bottleneck",
            "vtune_output",
            "1-threads",
        )
        / "ants"
        / "registrationSyN"
    ),
    keep_ratio=0.99,
)

In [None]:
# Temporarily lift the column-width and row-count display limits so that
# every row and the full function names are printed.
with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
    print(data[["clean_func", "mean_cpu"]])

                                                                                                              clean_func  \
1                                                   itk::VectorLinearInterpolateImageFunction::EvaluateAtContinuousIndex   
2                                                                 itk::LinearInterpolateImageFunction::EvaluateOptimized   
3                                                                        itk::DisplacementFieldTransform::TransformPoint   
4                                                              itk::ImageToImageMetricv4::TransformAndEvaluateFixedPoint   
5        itk::ANTSNeighborhoodCorrelationImageToImageMetricv4GetValueAndDerivativeThreader::UpdateQueuesToNextScanWindow   
6                                                                                                         itk::ImageBase   
7                                                                                                   itk::Image::GetPixel   
8       