From 1ff321c91bde73afef2ddfe64bedde7139a95a1f Mon Sep 17 00:00:00 2001
From: Piotr Balcer
Date: Fri, 26 Jul 2024 14:15:34 +0200
Subject: [PATCH] improve benchmarks automation

This patch:
- adds an option to run each benchmark several times and report the median value
- adds a timeout for benchmarks, set at 10 minutes by default
- adds an option to filter benchmarks by name (regex)
- adds an option to pick a specific compiler commit to test with
- adds more compute benchmarks
- fixes cudaSift
- uses upstream Velocity Bench
- adds a simple summary table with results
---
 .github/workflows/benchmarks_compute.yml   |  21 +-
 scripts/benchmarks/benches/SobelFilter.py  |   3 +
 scripts/benchmarks/benches/api_overhead.py |  82 --------
 scripts/benchmarks/benches/base.py         |  15 +-
 scripts/benchmarks/benches/compute.py      | 212 +++++++++++++++++++++
 scripts/benchmarks/benches/cudaSift.py     |   7 +
 scripts/benchmarks/benches/easywave.py     |   2 +
 scripts/benchmarks/benches/hashtable.py    |   3 +
 scripts/benchmarks/benches/options.py      |   3 +
 scripts/benchmarks/benches/quicksilver.py  |   7 +-
 scripts/benchmarks/benches/result.py       |   1 +
 scripts/benchmarks/benches/velocity.py     |  16 +-
 scripts/benchmarks/main.py                 |  72 +++++--
 scripts/benchmarks/output.py               |  76 +++++---
 scripts/benchmarks/utils/utils.py          |  22 ++-
 15 files changed, 384 insertions(+), 158 deletions(-)
 delete mode 100644 scripts/benchmarks/benches/api_overhead.py
 create mode 100644 scripts/benchmarks/benches/compute.py

diff --git a/.github/workflows/benchmarks_compute.yml b/.github/workflows/benchmarks_compute.yml
index 619784b263..86fbb1ddc8 100644
--- a/.github/workflows/benchmarks_compute.yml
+++ b/.github/workflows/benchmarks_compute.yml
@@ -34,6 +34,16 @@ on:
         type: string
         required: false
         default: ''
+      sycl_repo:
+        description: 'Compiler repo'
+        type: string
+        required: true
+        default: 'intel/llvm'
+      sycl_commit:
+        description: 'Compiler commit'
+        type: string
+        required: false
+        default: ''
 
 permissions:
   contents: read
@@ -41,8 +51,6 @@ permissions:
 
 jobs:
   e2e-build-hw:
-    # Run only on upstream; forks will not have the HW
-    # if: github.repository == 'oneapi-src/unified-runtime'
     name: Build SYCL, UR, run Compute Benchmarks
     strategy:
       matrix:
@@ -105,12 +113,19 @@ jobs:
       - name: Checkout SYCL
         uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
         with:
-          repository: intel/llvm
+          repository: ${{inputs.sycl_repo}}
           ref: refs/heads/sycl
           path: sycl-repo
           fetch-depth: 1
           fetch-tags: false
 
+      - name: Fetch specific SYCL commit
+        if: inputs.sycl_commit != ''
+        working-directory: ./sycl-repo
+        run: |
+          git fetch --depth=1 origin ${{ inputs.sycl_commit }}
+          git checkout ${{ inputs.sycl_commit }}
+
       - name: Set CUDA env vars
         if: matrix.adapter.str_name == 'cuda'
         run: |
diff --git a/scripts/benchmarks/benches/SobelFilter.py b/scripts/benchmarks/benches/SobelFilter.py
index e976bfaee8..b28681c2ee 100644
--- a/scripts/benchmarks/benches/SobelFilter.py
+++ b/scripts/benchmarks/benches/SobelFilter.py
@@ -12,7 +12,10 @@ class SobelFilter(VelocityBase):
     def __init__(self, vb: VelocityBench):
         super().__init__("sobel_filter", "sobel_filter", vb)
+
+    def download_deps(self):
         self.download_untar("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz")
+        return
 
     def name(self):
         return "Velocity-Bench Sobel Filter"
 
diff --git a/scripts/benchmarks/benches/api_overhead.py b/scripts/benchmarks/benches/api_overhead.py
deleted file mode 100644
index d34f4c4ee8..0000000000
--- a/scripts/benchmarks/benches/api_overhead.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
-# See LICENSE.TXT
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-import os
-import csv
-import io
-from utils.utils import run, git_clone
-from .base import Benchmark
-from .result import Result
-from .options import options
-
-class APIOverheadSYCL(Benchmark):
-    def __init__(self, directory):
-        super().__init__(directory)
-
-    def name(self):
-        return "api_overhead_benchmark_sycl, mean execution time per 10 kernels"
-
-    def unit(self):
-        return "μs"
-
-    def setup(self):
-        repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "0f758021dce9ba32341a503739b69db057433c59")
-        build_path = self.create_build_path('compute-benchmarks-build')
-
-        configure_command = [
-            "cmake",
-            f"-B {build_path}",
-            f"-S {repo_path}",
-            f"-DCMAKE_BUILD_TYPE=Release",
-            f"-DBUILD_SYCL=ON",
-            f"-DSYCL_COMPILER_ROOT={options.sycl}",
-            f"-DALLOW_WARNINGS=ON"
-        ]
-        run(configure_command, add_sycl=True)
-
-        run(f"cmake --build {build_path} -j", add_sycl=True)
-        self.benchmark_bin = f"{build_path}/bin/api_overhead_benchmark_sycl"
-
-    def run_internal(self, ioq, env_vars):
-        command = [
-            f"{self.benchmark_bin}",
-            "--test=SubmitKernel",
-            f"--Ioq={ioq}",
-            "--DiscardEvents=0",
-            "--MeasureCompletion=0",
-            "--iterations=100000",
-            "--Profiling=0",
-            "--NumKernels=10",
-            "--KernelExecTime=1",
-            "--csv",
-            "--noHeaders"
-        ]
-        result = self.run_bench(command, env_vars)
-        (label, mean) = self.parse_output(result)
-        return Result(label=label, value=mean, command=command, env=env_vars, stdout=result)
-
-    def run(self, env_vars) -> list[Result]:
-        results = []
-        for ioq in [0, 1]:
-            results.append(self.run_internal(ioq, env_vars))
-
-        return results
-
-    def parse_output(self, output):
-        csv_file = io.StringIO(output)
-        reader = csv.reader(csv_file)
-        next(reader, None)
-        data_row = next(reader, None)
-        if data_row is None:
-            raise ValueError("Benchmark output does not contain data.")
-        try:
-            label = data_row[0]
-            mean = float(data_row[1])
-            return (label, mean)
-        except (ValueError, IndexError) as e:
-            raise ValueError(f"Error parsing output: {e}")
-
-    def teardown(self):
-        return
diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py
index 25b5d2619f..c7f263c253 100644
--- a/scripts/benchmarks/benches/base.py
+++ b/scripts/benchmarks/benches/base.py
@@ -20,16 +20,6 @@ def __init__(self, directory):
     def run_bench(self, command, env_vars):
         return run(command=command, env_vars=env_vars, add_sycl=True, cwd=options.benchmark_cwd).stdout.decode()
 
-    def create_build_path(self, name):
-        build_path = os.path.join(self.directory, name)
-
-        if options.rebuild and Path(build_path).exists():
-            shutil.rmtree(build_path)
-
-        Path(build_path).mkdir(parents=True, exist_ok=True)
-
-        return build_path
-
     def create_data_path(self, name):
         data_path = os.path.join(self.directory, "data", name)
 
@@ -58,10 +48,13 @@ def name(self):
     def unit(self):
         raise NotImplementedError()
 
+    def lower_is_better(self):
+        return True
+
     def setup(self):
         raise NotImplementedError()
 
-    def run(self, env_vars):
+    def run(self, env_vars) -> Result:
         raise NotImplementedError()
 
     def teardown(self):
diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py
new file mode 100644
index 0000000000..19bc0b7fd0
--- /dev/null
+++ b/scripts/benchmarks/benches/compute.py
@@ -0,0 +1,212 @@
+# Copyright (C) 2024 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import os
+import csv
+import io
+from utils.utils import run, git_clone, create_build_path
+from .base import Benchmark
+from .result import Result
+from .options import options
+
+class ComputeBench:
+    def __init__(self, directory):
+        self.directory = directory
+        self.built = False
+        return
+
+    def setup(self):
+        if self.built:
+            return
+
+        repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "0f758021dce9ba32341a503739b69db057433c59")
+        build_path = create_build_path(self.directory, 'compute-benchmarks-build')
+
+        configure_command = [
+            "cmake",
+            f"-B {build_path}",
+            f"-S {repo_path}",
+            f"-DCMAKE_BUILD_TYPE=Release",
+            f"-DBUILD_SYCL=ON",
+            f"-DSYCL_COMPILER_ROOT={options.sycl}",
+            f"-DALLOW_WARNINGS=ON"
+        ]
+        run(configure_command, add_sycl=True)
+
+        run(f"cmake --build {build_path} -j", add_sycl=True)
+
+        self.built = True
+        self.bins = os.path.join(build_path, 'bin')
+
+class ComputeBenchmark(Benchmark):
+    def __init__(self, bench, name, test):
+        self.bench = bench
+        self.bench_name = name
+        self.test = test
+        super().__init__(bench.directory)
+
+    def bin_args(self) -> list[str]:
+        return []
+
+    def extra_env_vars(self) -> dict:
+        return {}
+
+    def unit(self):
+        return "μs"
+
+    def setup(self):
+        self.bench.setup()
+        self.benchmark_bin = os.path.join(self.bench.bins, self.bench_name)
+
+    def run(self, env_vars) -> Result:
+        command = [
+            f"{self.benchmark_bin}",
+            f"--test={self.test}",
+            "--csv",
+            "--noHeaders"
+        ]
+
+        command += self.bin_args()
+        env_vars.update(self.extra_env_vars())
+
+        result = self.run_bench(command, env_vars)
+        (label, mean) = self.parse_output(result)
+        return Result(label=label, value=mean, command=command, env=env_vars, stdout=result)
+
+    def parse_output(self, output):
+        csv_file = io.StringIO(output)
+        reader = csv.reader(csv_file)
+        next(reader, None)
+        data_row = next(reader, None)
+        if data_row is None:
+            raise ValueError("Benchmark output does not contain data.")
+        try:
+            label = data_row[0]
+            mean = float(data_row[1])
+            return (label, mean)
+        except (ValueError, IndexError) as e:
+            raise ValueError(f"Error parsing output: {e}")
+
+    def teardown(self):
+        return
+
+class SubmitKernelSYCL(ComputeBenchmark):
+    def __init__(self, bench, ioq):
+        self.ioq = ioq
+        super().__init__(bench, "api_overhead_benchmark_sycl", "SubmitKernel")
+
+    def name(self):
+        order = "in order" if self.ioq else "out of order"
+        return f"api_overhead_benchmark_sycl SubmitKernel {order}"
+
+    def bin_args(self) -> list[str]:
+        return [
+            f"--Ioq={self.ioq}",
+            "--DiscardEvents=0",
+            "--MeasureCompletion=0",
+            "--iterations=100000",
+            "--Profiling=0",
+            "--NumKernels=10",
+            "--KernelExecTime=1"
+        ]
+
+class ExecImmediateCopyQueue(ComputeBenchmark):
+    def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
+        self.ioq = ioq
+        self.isCopyOnly = isCopyOnly
+        self.source = source
+        self.destination = destination
+        self.size = size
+        super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue")
+
+    def name(self):
+        order = "in order" if self.ioq else "out of order"
+        return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
+
+    def bin_args(self) -> list[str]:
+        return [
+            "--iterations=100000",
+            f"--ioq={self.ioq}",
+            f"--IsCopyOnly={self.isCopyOnly}",
+            "--MeasureCompletionTime=0",
+            f"--src={self.source}",
+            f"--dst={self.destination}",
+            f"--size={self.size}"
+        ]
+
+class QueueInOrderMemcpy(ComputeBenchmark):
+    def __init__(self, bench, isCopyOnly, source, destination, size):
+        self.isCopyOnly = isCopyOnly
+        self.source = source
+        self.destination = destination
+        self.size = size
+        super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy")
+
+    def name(self):
+        return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
+
+    def bin_args(self) -> list[str]:
+        return [
+            "--iterations=10000",
+            f"--IsCopyOnly={self.isCopyOnly}",
+            f"--sourcePlacement={self.source}",
+            f"--destinationPlacement={self.destination}",
+            f"--size={self.size}",
+            "--count=100"
+        ]
+
+class QueueMemcpy(ComputeBenchmark):
+    def __init__(self, bench, source, destination, size):
+        self.source = source
+        self.destination = destination
+        self.size = size
+        super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy")
+
+    def name(self):
+        return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
+
+    def bin_args(self) -> list[str]:
+        return [
+            "--iterations=10000",
+            f"--sourcePlacement={self.source}",
+            f"--destinationPlacement={self.destination}",
+            f"--size={self.size}",
+        ]
+
+class StreamMemory(ComputeBenchmark):
+    def __init__(self, bench, type, size, placement):
+        self.type = type
+        self.size = size
+        self.placement = placement
+        super().__init__(bench, "memory_benchmark_sycl", "StreamMemory")
+
+    def name(self):
+        return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}"
+
+    def bin_args(self) -> list[str]:
+        return [
+            "--iterations=10000",
+            f"--type={self.type}",
+            f"--size={self.size}",
+            f"--memoryPlacement={self.placement}",
+            "--useEvents=0",
+            "--contents=Zeros",
+        ]
+
+class VectorSum(ComputeBenchmark):
+    def __init__(self, bench):
+        super().__init__(bench, "miscellaneous_benchmark_sycl", "VectorSum")
+
+    def name(self):
+        return f"miscellaneous_benchmark_sycl VectorSum"
+
+    def bin_args(self) -> list[str]:
+        return [
+            "--iterations=1000",
+            "--numberOfElementsX=512",
+            "--numberOfElementsY=256",
+            "--numberOfElementsZ=256",
+        ]
+
diff --git a/scripts/benchmarks/benches/cudaSift.py b/scripts/benchmarks/benches/cudaSift.py
index 6f9c19040e..482d258052 100644
--- a/scripts/benchmarks/benches/cudaSift.py
+++ b/scripts/benchmarks/benches/cudaSift.py
@@ -9,11 +9,18 @@
 from utils.utils import run
 import os
 import re
+import shutil
 
 class CudaSift(VelocityBase):
     def __init__(self, vb: VelocityBench):
         super().__init__("cudaSift", "cudaSift", vb)
 
+    def download_deps(self):
+        images = os.path.join(self.vb.repo_path, self.bench_name, 'inputData')
+        dest = os.path.join(self.directory, 'inputData')
+        if not os.path.exists(dest):
+            shutil.copytree(images, dest)
+
     def name(self):
         return "Velocity-Bench CudaSift"
 
diff --git a/scripts/benchmarks/benches/easywave.py b/scripts/benchmarks/benches/easywave.py
index 2fa4d95685..2f89482329 100644
--- a/scripts/benchmarks/benches/easywave.py
+++ b/scripts/benchmarks/benches/easywave.py
@@ -14,6 +14,8 @@ class Easywave(VelocityBase):
     def __init__(self, vb: VelocityBench):
         super().__init__("easywave", "easyWave_sycl", vb)
+
+    def download_deps(self):
         self.download_untar("easywave", "https://git.gfz-potsdam.de/id2/geoperil/easyWave/-/raw/master/data/examples.tar.gz", "examples.tar.gz")
 
     def name(self):
diff --git a/scripts/benchmarks/benches/hashtable.py b/scripts/benchmarks/benches/hashtable.py
index c8cb0bdb03..7558183bf0 100644
--- a/scripts/benchmarks/benches/hashtable.py
+++ b/scripts/benchmarks/benches/hashtable.py
@@ -23,6 +23,9 @@ def unit(self):
     def bin_args(self) -> list[str]:
         return ["--no-verify"]
 
+    def lower_is_better(self):
+        return False
+
     def parse_output(self, stdout: str) -> float:
         match = re.search(r'(\d+\.\d+) million keys/second', stdout)
         if match:
diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py
index c990a44d5f..c035ce6800 100644
--- a/scripts/benchmarks/benches/options.py
+++ b/scripts/benchmarks/benches/options.py
@@ -5,6 +5,9 @@ class Options:
     sycl: str = ""
     rebuild: bool = True
     benchmark_cwd: str = "INVALID"
+    timeout: float = 600
+    iterations: int = 5
+    verbose: bool = False
 
 options = Options()
 
diff --git a/scripts/benchmarks/benches/quicksilver.py b/scripts/benchmarks/benches/quicksilver.py
index 383c8dd5be..7e1f65ee1d 100644
--- a/scripts/benchmarks/benches/quicksilver.py
+++ b/scripts/benchmarks/benches/quicksilver.py
@@ -15,10 +15,10 @@ def __init__(self, vb: VelocityBench):
         super().__init__("QuickSilver", "qs", vb)
         self.data_path = os.path.join(vb.repo_path, "QuickSilver", "Examples", "AllScattering")
 
-    def run(self, env_vars) -> list[Result]:
+    def run(self, env_vars) -> Result:
         # TODO: fix the crash in QuickSilver when UR_L0_USE_IMMEDIATE_COMMANDLISTS=0
         if 'UR_L0_USE_IMMEDIATE_COMMANDLISTS' in env_vars and env_vars['UR_L0_USE_IMMEDIATE_COMMANDLISTS'] == '0':
-            return []
+            return None
 
         return super().run(env_vars)
 
@@ -28,6 +28,9 @@ def name(self):
     def unit(self):
         return "MMS/CTT"
 
+    def lower_is_better(self):
+        return False
+
     def bin_args(self) -> list[str]:
         return ["-i", f"{self.data_path}/scatteringOnly.inp"]
 
diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py
index 8dd2f4ba9c..896ff4da98 100644
--- a/scripts/benchmarks/benches/result.py
+++ b/scripts/benchmarks/benches/result.py
@@ -16,3 +16,4 @@ class Result:
     stdout: str
     unit: str = ""
     name: str = ""
+    lower_is_better: bool = True
diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py
index fec3abb842..e5601c6563 100644
--- a/scripts/benchmarks/benches/velocity.py
+++ b/scripts/benchmarks/benches/velocity.py
@@ -6,15 +6,14 @@
 from utils.utils import git_clone
 from .base import Benchmark
 from .result import Result
-from utils.utils import run
+from utils.utils import run, create_build_path
 import os
 import re
 
 class VelocityBench:
     def __init__(self, directory):
         self.directory = directory
-        # TODO: replace with https://github.com/oneapi-src/Velocity-Bench once all fixes land upstream
-        self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/pbalcer/Velocity-Bench.git", "ae0ae05c7fd1469779ecea4f36e4741b1d956eb4")
+        self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/oneapi-src/Velocity-Bench", "34ee4ebe18d91dfdd38b7d798fd986b41874fcbc")
 
 class VelocityBase(Benchmark):
     def __init__(self, name: str, bin_name: str, vb: VelocityBench):
@@ -24,8 +23,13 @@ def __init__(self, name: str, bin_name: str, vb: VelocityBench):
         self.bin_name = bin_name
         self.code_path = os.path.join(self.vb.repo_path, self.bench_name, 'SYCL')
 
+    def download_deps(self):
+        return
+
     def setup(self):
-        build_path = self.create_build_path(self.bench_name)
+        self.download_deps()
+
+        build_path = create_build_path(self.directory, self.bench_name)
 
         configure_command = [
             "cmake",
@@ -47,7 +51,7 @@ def extra_env_vars(self) -> dict:
     def parse_output(self, stdout: str) -> float:
         raise NotImplementedError()
 
-    def run(self, env_vars) -> list[Result]:
+    def run(self, env_vars) -> Result:
         env_vars.update(self.extra_env_vars())
 
         command = [
@@ -57,7 +61,7 @@ def run(self, env_vars) -> list[Result]:
 
         result = self.run_bench(command, env_vars)
 
-        return [Result(label=self.bench_name, value=self.parse_output(result), command=command, env=env_vars, stdout=result)]
+        return Result(label=self.bench_name, value=self.parse_output(result), command=command, env=env_vars, stdout=result)
 
     def teardown(self):
         return
diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py
index 5dad40c7fe..34238f773c 100755
--- a/scripts/benchmarks/main.py
+++ b/scripts/benchmarks/main.py
@@ -5,9 +5,8 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import os
 from utils.utils import prepare_workdir, load_benchmark_results, save_benchmark_results;
-from benches.api_overhead import APIOverheadSYCL
+from benches.compute import *
 from benches.hashtable import Hashtable
 from benches.bitcracker import Bitcracker
 from benches.cudaSift import CudaSift
@@ -18,46 +17,72 @@
 from benches.options import options
 from output import generate_markdown
 import argparse
+import re
 
 # Update this if you are changing the layout of the results files
-INTERNAL_WORKDIR_VERSION = '1.0'
-
-def main(directory, additional_env_vars, save_name, compare_names):
-    variants = [
-        ({'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"),
-        ({'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""),
-    ]
+INTERNAL_WORKDIR_VERSION = '1.6'
 
+def main(directory, additional_env_vars, save_name, compare_names, filter):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)
 
     vb = VelocityBench(directory)
+    cb = ComputeBench(directory)
 
     benchmarks = [
-        APIOverheadSYCL(directory),
+        SubmitKernelSYCL(cb, 0),
+        SubmitKernelSYCL(cb, 1),
+        QueueInOrderMemcpy(cb, 0, 'Device', 'Device', 1024),
+        QueueInOrderMemcpy(cb, 0, 'Host', 'Device', 1024),
+        QueueMemcpy(cb, 'Device', 'Device', 1024),
+        StreamMemory(cb, 'Triad', 10 * 1024, 'Device'),
+        ExecImmediateCopyQueue(cb, 0, 1, 'Device', 'Device', 1024),
+        ExecImmediateCopyQueue(cb, 1, 1, 'Device', 'Host', 1024),
+        VectorSum(cb),
         Hashtable(vb),
         Bitcracker(vb),
-        #CudaSift(vb), TODO: the benchmark is passing, but is outputting "Failed to allocate device data"
+        CudaSift(vb),
         Easywave(vb),
         QuickSilver(vb),
         SobelFilter(vb)
     ]
 
+    if filter:
+        benchmarks = [benchmark for benchmark in benchmarks if filter.search(benchmark.name())]
+
     for benchmark in benchmarks:
+        print(f"setting up {benchmark.name()}... ", end='', flush=True)
         benchmark.setup()
+        print("complete.")
 
     results = []
     for benchmark in benchmarks:
-        for env_vars, extra_label in variants:
-            merged_env_vars = {**env_vars, **additional_env_vars}
+        merged_env_vars = {**additional_env_vars}
+        iteration_results = []
+        for iter in range(options.iterations):
+            print(f"running {benchmark.name()}, iteration {iter}... ", end='', flush=True)
             bench_results = benchmark.run(merged_env_vars)
-            for res in bench_results:
-                res.unit = benchmark.unit()
-                res.name = benchmark.name()
-                res.label += f" {extra_label}"
-                results.append(res)
+            if bench_results is not None:
+                print(f"complete ({bench_results.value} {benchmark.unit()}).")
+                iteration_results.append(bench_results)
+            else:
+                print(f"did not finish.")
+
+        if len(iteration_results) == 0:
+            continue
+
+        iteration_results.sort(key=lambda res: res.value)
+        median_index = len(iteration_results) // 2
+        median_result = iteration_results[median_index]
+
+        median_result.unit = benchmark.unit()
+        median_result.name = benchmark.name()
+
+        results.append(median_result)
 
     for benchmark in benchmarks:
+        print(f"tearing down {benchmark.name()}... ", end='', flush=True)
         benchmark.teardown()
+        print("complete.")
 
     chart_data = {"This PR" : results}
 
@@ -93,11 +118,20 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
     parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
     parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
+    parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=5)
+    parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=600)
+    parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', default=None)
+    parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true")
 
     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
 
+    options.verbose = args.verbose
     options.rebuild = not args.no_rebuild
     options.sycl = args.sycl
+    options.iterations = args.iterations
+    options.timeout = args.timeout
+
+    benchmark_filter = re.compile(args.filter) if args.filter else None
 
-    main(args.benchmark_directory, additional_env_vars, args.save, args.compare)
+    main(args.benchmark_directory, additional_env_vars, args.save, args.compare, benchmark_filter)
diff --git a/scripts/benchmarks/output.py b/scripts/benchmarks/output.py
index 9cfee303b1..26deabe099 100644
--- a/scripts/benchmarks/output.py
+++ b/scripts/benchmarks/output.py
@@ -5,6 +5,7 @@
 
 import collections
 from benches.base import Result
+import math
 
 # Function to generate the mermaid bar chart script
 def generate_mermaid_script(chart_data: dict[str, list[Result]]):
@@ -19,6 +20,9 @@ def generate_mermaid_script(chart_data: dict[str, list[Result]]):
         # remove duplicates
         labels = list(dict.fromkeys(labels))
         mermaid_script += f"""
+<details>
+<summary>{bname}</summary>
+
 ```mermaid
 ---
 config:
@@ -57,6 +61,8 @@ def generate_mermaid_script(chart_data: dict[str, list[Result]]):
         """
     mermaid_script += f"""
 ```
+</details>
+
 """
     return mermaid_script
 
@@ -83,44 +89,52 @@ def generate_markdown_details(results: list[Result]):
         """)
     return "\n".join(markdown_sections)
 
-def generate_summary(chart_data: dict[str, list[Result]]) -> str:
-    # Calculate the mean value of "This PR" for each benchmark
-    this_pr_means = {}
-    for res in chart_data["This PR"]:
-        if res.name not in this_pr_means:
-            this_pr_means[res.name] = []
-        this_pr_means[res.name].append(res.value)
-    for bname in this_pr_means:
-        this_pr_means[bname] = sum(this_pr_means[bname]) / len(this_pr_means[bname])
-
-    # Calculate the percentage for each entry relative to "This PR"
-    summary_data = {"This PR": 100}
-    for entry_name, results in chart_data.items():
-        if entry_name == "This PR":
-            continue
-        entry_sum = 0
-        for res in results:
-            if res.name in this_pr_means:
-                percentage = (res.value / this_pr_means[res.name]) * 100
-                entry_sum += percentage
-
-        entry_average = entry_sum / len(results) if results else 0
-        summary_data[entry_name] = entry_average
+def generate_summary_table(chart_data: dict[str, list[Result]]):
+    summary_table = "| Benchmark | " + " | ".join(chart_data.keys()) + " |\n"
+    summary_table += "|---" * (len(chart_data) + 1) + "|\n"
 
-    markdown_table = "| Name | Result % |\n| --- | --- |\n"
-    for entry_name, percentage in summary_data.items():
-        markdown_table += f"| {entry_name} | {percentage:.2f}% |\n"
-
-    return markdown_table
+    # Collect all benchmarks and their results
+    benchmark_results = collections.defaultdict(dict)
+    for key, results in chart_data.items():
+        for res in results:
+            benchmark_results[res.name][key] = res
+
+    # Generate the table rows
+    for bname, results in benchmark_results.items():
+        row = f"| {bname} |"
+        best_value = None
+        best_key = None
+
+        # Determine the best value
+        for key, res in results.items():
+            if best_value is None or (res.lower_is_better and res.value < best_value) or (not res.lower_is_better and res.value > best_value):
+                best_value = res.value
+                best_key = key
+
+        # Generate the row with the best value highlighted
+        for key in chart_data.keys():
+            if key in results:
+                value = results[key].value
+                if key == best_key:
+                    row += f" `**{value}**` |" # Highlight the best value
+                else:
+                    row += f" {value} |"
+            else:
+                row += " - |"
+
+        summary_table += row + "\n"
+
+    return summary_table
 
 def generate_markdown(chart_data: dict[str, list[Result]]):
     mermaid_script = generate_mermaid_script(chart_data)
+    summary_table = generate_summary_table(chart_data)
 
     return f"""
 # Summary
-{generate_summary(chart_data)}
-# Benchmark Results
+{summary_table}
+# Charts
 {mermaid_script}
-## Details
+# Details
 {generate_markdown_details(chart_data["This PR"])}
 """
diff --git a/scripts/benchmarks/utils/utils.py b/scripts/benchmarks/utils/utils.py
index 9dc3f23a9b..5c7beb95d0 100644
--- a/scripts/benchmarks/utils/utils.py
+++ b/scripts/benchmarks/utils/utils.py
@@ -28,9 +28,12 @@ def run(command, env_vars={}, cwd=None, add_sycl=False):
             env['LD_LIBRARY_PATH'] = sycl_lib_path + os.pathsep + env.get('LD_LIBRARY_PATH', '')
 
         env.update(env_vars)
-        result = subprocess.run(command, cwd=cwd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) # nosec B603
-        print(result.stdout.decode())
-        print(result.stderr.decode())
+        result = subprocess.run(command, cwd=cwd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, timeout=options.timeout) # nosec B603
+
+        if options.verbose:
+            print(result.stdout.decode())
+            print(result.stderr.decode())
+
         return result
     except subprocess.CalledProcessError as e:
         print(e.stdout.decode())
@@ -70,7 +73,8 @@ def load_benchmark_results(dir, compare_name) -> list[Result]:
         return None
 
 def prepare_bench_cwd(dir):
-    options.benchmark_cwd = os.path.join(dir, 'bcwd')
+    # we need to go two levels deep to work around a fixed relative path in cudaSift
+    options.benchmark_cwd = os.path.join(dir, 'bcwd', 'bcwd')
     if os.path.exists(options.benchmark_cwd):
         shutil.rmtree(options.benchmark_cwd)
     os.makedirs(options.benchmark_cwd)
@@ -97,3 +101,13 @@ def prepare_workdir(dir, version):
 
     with open(version_file_path, 'w') as version_file:
         version_file.write(version)
+
+def create_build_path(directory, name):
+    build_path = os.path.join(directory, name)
+
+    if options.rebuild and Path(build_path).exists():
+        shutil.rmtree(build_path)
+
+    Path(build_path).mkdir(parents=True, exist_ok=True)
+
+    return build_path
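
--
Example invocation (a sketch, not part of the patch): the new --iterations, --timeout,
--filter, and --verbose options added above can be combined with the existing
benchmark directory, --sycl, --save, and --compare arguments as shown below. The
workdir and SYCL build paths are placeholders, not taken from this patch.

    # Run only benchmarks whose names match the regex, taking the median of 5 runs,
    # with a 10-minute per-command timeout; paths below are hypothetical.
    ./scripts/benchmarks/main.py ~/benchmarks-workdir \
        --sycl ~/llvm/build \
        --iterations 5 \
        --timeout 600 \
        --filter "SubmitKernel|QueueMemcpy" \
        --verbose \
        --save this-patch --compare baseline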