# Compare kernel statistics

This notebook compares kernel statistics between two Nsight Systems SQLite report files.

We'll specifically compare the `set_prognostic_edmf_precomputed_quantities_precipitation` kernel (the variant whose name ends in `L649`, selected by `kernel_pattern` in the configuration cell) between:
- `baseline.sqlite` - baseline run
- `mod.sqlite` - modified run

## Imports and setup

In [1]:
import sqlite3
from pathlib import Path

import pandas as pd
from IPython.display import display

## Helper functions

In [2]:
def get_kernel_stats(db_path, kernel_name_pattern):
    """
    Extract kernel statistics from an nsys SQLite database.

    Duration statistics aggregate over every invocation whose demangled name
    matches the pattern. Launch configuration and resource fields (kernel name,
    grid/block dimensions, registers, memory sizes, IDs) are taken from the
    first matching invocation in start-time order, so a pattern that matches
    several distinct kernels mixes their timings under the first kernel's name.

    Args:
        db_path: Path to the SQLite database file
        kernel_name_pattern: SQL LIKE pattern to match kernel names

    Returns:
        Tuple ``(stats, rows)``: a dictionary with aggregated statistics and the
        list of all matching kernel invocation rows, or ``(None, None)`` when
        no kernel matches the pattern.
    """
    # Query to get all kernel invocations with the specified name
    query = """
    SELECT
        s.value as kernelName,
        k.start,
        k.end,
        (k.end - k.start) as duration_ns,
        k.gridX,
        k.gridY,
        k.gridZ,
        k.blockX,
        k.blockY,
        k.blockZ,
        k.registersPerThread,
        k.staticSharedMemory,
        k.dynamicSharedMemory,
        k.localMemoryPerThread,
        k.localMemoryTotal,
        k.deviceId,
        k.streamId,
        k.launchType,
        k.sharedMemoryExecuted,
        k.correlationId,
        k.globalPid
    FROM CUPTI_ACTIVITY_KIND_KERNEL k
    JOIN StringIds s ON k.demangledName = s.id
    WHERE s.value LIKE ?
    ORDER BY k.start
    """

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(query, (kernel_name_pattern,)).fetchall()
    finally:
        # Release the database handle even when the query fails
        # (e.g. the file is not an nsys export and the tables are missing).
        conn.close()

    if not rows:
        return None, None

    first = rows[0]

    # Duration statistics over all matching invocations
    durations = sorted(row[3] for row in rows)
    n_durations = len(durations)
    total_duration = sum(durations)
    mid = n_durations // 2
    if n_durations % 2:
        median_duration = durations[mid]
    else:
        # Even count: the median is the mean of the two middle values,
        # not simply the upper one.
        median_duration = (durations[mid - 1] + durations[mid]) / 2

    stats = {
        "kernel_name": first[0],
        "invocation_count": n_durations,
        "duration_ns": {
            "total": total_duration,
            "mean": total_duration / n_durations,
            "min": durations[0],
            "max": durations[-1],
            "median": median_duration,
        },
        "grid_dims": {
            "x": first[4],
            "y": first[5],
            "z": first[6],
        },
        "block_dims": {
            "x": first[7],
            "y": first[8],
            "z": first[9],
        },
        "registers_per_thread": first[10],
        "static_shared_memory": first[11],
        "dynamic_shared_memory": first[12],
        "local_memory_per_thread": first[13],
        "local_memory_total": first[14],
        "device_id": first[15],
        "stream_id": first[16],
        "launch_type": first[17],
        "shared_memory_executed": first[18],
        "correlation_id": first[19],
        "global_pid": first[20],
    }

    # Calculate theoretical occupancy
    # Occupancy = (active warps per SM) / (max warps per SM) * 100%
    # NOTE(review): the per-SM limits below are hard-coded and presumably
    # describe the GPU these profiles were captured on - confirm before
    # reusing this notebook on other hardware.
    block_size = first[7] * first[8] * first[9]
    warps_per_block = (block_size + 31) // 32  # round up to whole 32-thread warps
    max_warps_per_sm = 48

    # Register limit: number of registers (not bytes) assumed available per SM
    max_registers_per_sm = 49152
    registers_per_block = first[10] * block_size
    blocks_limited_by_registers = (
        max(1, max_registers_per_sm // registers_per_block)
        if registers_per_block > 0
        else 8
    )

    # Shared memory limit (static + dynamic bytes requested per block)
    max_shared_mem_per_sm = 96000
    shared_mem_per_block = first[11] + first[12]
    blocks_limited_by_shared_mem = (
        max(1, max_shared_mem_per_sm // shared_mem_per_block)
        if shared_mem_per_block > 0
        else 8
    )

    # Warp limit: resident blocks cannot hold more warps than the SM supports.
    # Without this bound the computed occupancy could exceed 100%.
    blocks_limited_by_warps = (
        max_warps_per_sm // warps_per_block if warps_per_block > 0 else 8
    )

    # Effective blocks per SM is the tightest of the three constraints
    blocks_per_sm = min(
        blocks_limited_by_registers,
        blocks_limited_by_shared_mem,
        blocks_limited_by_warps,
    )

    active_warps = warps_per_block * blocks_per_sm
    stats["theoretical_occupancy"] = (active_warps / max_warps_per_sm) * 100
    return stats, rows


def get_launch_type_name(launch_type_id):
    """
    Map a numeric launch type ID to a display label.

    IDs 0 and 1 map to "Regular" and "Cooperative"; any other value is
    rendered as "Unknown (<id>)".
    """
    known_names = {0: "Regular", 1: "Cooperative"}
    if launch_type_id in known_names:
        return known_names[launch_type_id]
    return f"Unknown ({launch_type_id})"


In [3]:
def format_duration(ns):
    """
    Render a duration given in nanoseconds with the largest fitting unit.

    Values below one thousand stay in ns; larger values are scaled to µs, ms
    or s. The number is always printed with two decimal places.
    """
    ns_per_us = 1000
    ns_per_ms = 1_000_000
    ns_per_s = 1_000_000_000

    if ns < ns_per_us:
        return f"{ns:.2f} ns"
    if ns < ns_per_ms:
        return f"{ns / 1000:.2f} µs"
    if ns < ns_per_s:
        return f"{ns / 1_000_000:.2f} ms"
    return f"{ns / 1_000_000_000:.2f} s"

In [4]:
def _format_pct_change(baseline_val, mod_val, dash_if_equal=False):
    """
    Format the relative change from baseline_val to mod_val as a signed percentage.

    Args:
        baseline_val: Reference value (denominator of the relative change)
        mod_val: Value being compared against the reference
        dash_if_equal: When True, return "—" instead of "0.00%" for equal values

    Returns:
        A string such as "+12.50%" or "-36.18%", or "—" when the change is
        undefined (baseline is zero) or suppressed via dash_if_equal.
    """
    if baseline_val == 0 or (dash_if_equal and baseline_val == mod_val):
        return "—"
    pct = (mod_val - baseline_val) / baseline_val * 100
    sign = "+" if pct > 0 else ""
    return f"{sign}{pct:.2f}%"


def _format_dims(dims):
    """Render an x/y/z dimension dictionary as "<x, y, z>"."""
    return f"<{dims['x']}, {dims['y']}, {dims['z']}>"


def _dims_product(dims):
    """Return x * y * z for an x/y/z dimension dictionary."""
    return dims["x"] * dims["y"] * dims["z"]


def compare_stats(baseline_stats, mod_stats):
    """
    Compare two sets of kernel statistics and display results in a single table.

    Prints a header with the baseline kernel name, displays one DataFrame with
    the columns Metric / Baseline / Modified / % Change, and prints a one-line
    verdict based on the mean duration. A "% Change" that is undefined because
    the baseline value is zero is shown as "—" rather than raising
    ZeroDivisionError.

    Args:
        baseline_stats: Statistics from baseline run
        mod_stats: Statistics from modified run

    Returns:
        None. If either argument is None an error message is printed and
        nothing else is shown.
    """
    if baseline_stats is None or mod_stats is None:
        print(
            "Error: Could not retrieve statistics from one or both databases"
        )
        return

    print("\n" + "=" * 140)
    print(f"KERNEL: {baseline_stats['kernel_name']}")
    print("=" * 140 + "\n")

    # Each entry is one table row: (metric, baseline, modified, % change)
    table_rows = []

    # Timing statistics: the change is always shown numerically ("0.00%" when equal)
    table_rows.append(
        (
            "Invocations",
            f"{baseline_stats['invocation_count']:,}",
            f"{mod_stats['invocation_count']:,}",
            _format_pct_change(
                baseline_stats["invocation_count"],
                mod_stats["invocation_count"],
            ),
        )
    )
    duration_metrics = (
        ("Total Duration", "total"),
        ("Mean Duration", "mean"),
        ("Median Duration", "median"),
        ("Min Duration", "min"),
        ("Max Duration", "max"),
    )
    for metric_name, key in duration_metrics:
        baseline_val = baseline_stats["duration_ns"][key]
        mod_val = mod_stats["duration_ns"][key]
        table_rows.append(
            (
                metric_name,
                format_duration(baseline_val),
                format_duration(mod_val),
                _format_pct_change(baseline_val, mod_val),
            )
        )

    # Kernel dimensions & launch: shown side by side, no percentage
    table_rows.append(
        (
            "Grid Dimensions",
            _format_dims(baseline_stats["grid_dims"]),
            _format_dims(mod_stats["grid_dims"]),
            "—",
        )
    )
    table_rows.append(
        (
            "Block Dimensions",
            _format_dims(baseline_stats["block_dims"]),
            _format_dims(mod_stats["block_dims"]),
            "—",
        )
    )

    block_size_baseline = _dims_product(baseline_stats["block_dims"])
    block_size_mod = _dims_product(mod_stats["block_dims"])
    grid_size_baseline = _dims_product(baseline_stats["grid_dims"])
    grid_size_mod = _dims_product(mod_stats["grid_dims"])
    table_rows.append(
        (
            "Block Size (threads)",
            f"{block_size_baseline:,}",
            f"{block_size_mod:,}",
            "—",
        )
    )
    table_rows.append(
        (
            "Grid Size (blocks)",
            f"{grid_size_baseline:,}",
            f"{grid_size_mod:,}",
            "—",
        )
    )
    table_rows.append(
        (
            "Total Threads",
            f"{grid_size_baseline * block_size_baseline:,}",
            f"{grid_size_mod * block_size_mod:,}",
            "—",
        )
    )
    table_rows.append(
        (
            "Launch Type",
            get_launch_type_name(baseline_stats["launch_type"]),
            get_launch_type_name(mod_stats["launch_type"]),
            "—",
        )
    )

    # Register usage: a percentage is shown only when the value actually changed
    table_rows.append(
        (
            "Registers Per Thread",
            f"{baseline_stats['registers_per_thread']}",
            f"{mod_stats['registers_per_thread']}",
            _format_pct_change(
                baseline_stats["registers_per_thread"],
                mod_stats["registers_per_thread"],
                dash_if_equal=True,
            ),
        )
    )

    # Memory usage: same convention as registers
    memory_metrics = (
        ("Static Shared Memory (bytes)", "static_shared_memory"),
        ("Dynamic Shared Memory (bytes)", "dynamic_shared_memory"),
        ("Shared Memory Executed (bytes)", "shared_memory_executed"),
        ("Local Memory Per Thread (bytes)", "local_memory_per_thread"),
        ("Local Memory Total (bytes)", "local_memory_total"),
    )
    for metric_name, key in memory_metrics:
        baseline_val = baseline_stats[key]
        mod_val = mod_stats[key]
        table_rows.append(
            (
                metric_name,
                f"{baseline_val:,}",
                f"{mod_val:,}",
                _format_pct_change(baseline_val, mod_val, dash_if_equal=True),
            )
        )

    # Theoretical occupancy
    table_rows.append(
        (
            "Theoretical Occupancy (%)",
            f"{baseline_stats['theoretical_occupancy']:.1f}%",
            f"{mod_stats['theoretical_occupancy']:.1f}%",
            "—",
        )
    )

    # Create and display the combined dataframe
    df = pd.DataFrame(
        table_rows, columns=["Metric", "Baseline", "Modified", "% Change"]
    )
    display(df)

    # Performance summary
    print("\n" + "=" * 140)
    baseline_mean = baseline_stats["duration_ns"]["mean"]
    mod_mean = mod_stats["duration_ns"]["mean"]

    if mod_mean < baseline_mean:
        improvement = ((baseline_mean - mod_mean) / baseline_mean) * 100
        print(
            f"✅ Modified version is {improvement:.2f}% FASTER (mean duration)"
        )
    elif mod_mean > baseline_mean:
        if baseline_mean == 0:
            # A relative slowdown against a zero baseline is undefined
            print(
                "⚠️  Modified version is SLOWER (baseline mean duration is zero)"
            )
        else:
            regression = ((mod_mean - baseline_mean) / baseline_mean) * 100
            print(
                f"⚠️  Modified version is {regression:.2f}% SLOWER (mean duration)"
            )
    else:
        print("➡️  No change in mean duration")

    print("=" * 140)


## Configuration

In [5]:
# Locations of the two Nsight Systems SQLite exports being compared
baseline_db = Path("../results/nsys/baseline.sqlite")
mod_db = Path("../results/nsys/mod.sqlite")

# SQL LIKE pattern selecting the kernel to compare: only the variant whose
# name contains "L649" rather than every precipitation kernel
kernel_pattern = "%L649%"

# Report whether each database is present (a missing file is reported, not raised)
print(
    f"✓ Found baseline database: {baseline_db}"
    if baseline_db.exists()
    else f"Error: Baseline database not found: {baseline_db}"
)
print(
    f"✓ Found modified database: {mod_db}"
    if mod_db.exists()
    else f"Error: Modified database not found: {mod_db}"
)

✓ Found baseline database: ../results/nsys/baseline.sqlite
✓ Found modified database: ../results/nsys/mod.sqlite


## Compare kernel statistics

In [6]:
def list_available_kernels(db_path, pattern="%set_prognostic_edmf%"):
    """
    List all kernels matching a pattern in the database.

    Args:
        db_path: Path to SQLite database
        pattern: SQL LIKE pattern for kernel names

    Returns:
        List of ``(kernel_name, invocation_count)`` tuples, one per distinct
        kernel name, sorted by kernel name. Empty when nothing matches.
    """
    # GROUP BY already yields one row per kernel name, so no DISTINCT is needed
    query = """
    SELECT s.value as kernelName, COUNT(*) as count
    FROM CUPTI_ACTIVITY_KIND_KERNEL k
    JOIN StringIds s ON k.demangledName = s.id
    WHERE s.value LIKE ?
    GROUP BY s.value
    ORDER BY s.value
    """

    conn = sqlite3.connect(db_path)
    try:
        return conn.execute(query, (pattern,)).fetchall()
    finally:
        # Release the database handle even when the query fails
        conn.close()


# Inventory of matching kernels in the baseline profile, with invocation counts
print("Available kernels in baseline database:")
baseline_kernels = list_available_kernels(
    baseline_db, pattern="%set_prognostic_edmf%"
)
for kernel_name, count in baseline_kernels:
    print(f"  [{count:4d} invocations] {kernel_name}")

# Same inventory for the modified profile
print("\nAvailable kernels in modified database:")
mod_kernels = list_available_kernels(mod_db, pattern="%set_prognostic_edmf%")
for kernel_name, count in mod_kernels:
    print(f"  [{count:4d} invocations] {kernel_name}")

Available kernels in baseline database:
  [ 490 invocations] set_prognostic_edmf_precomputed_quantities_draft__NVTX
  [ 280 invocations] set_prognostic_edmf_precomputed_quantities_environment__NVTX
  [ 450 invocations] set_prognostic_edmf_precomputed_quantities_explicit_closures__NVTX
  [  70 invocations] set_prognostic_edmf_precomputed_quantities_implicit_closures__NVTX
  [  25 invocations] set_prognostic_edmf_precomputed_quantities_precipitation__FILE_ClimaAtmos_jl_src_cache_prognostic_edmf_precomputed_quantities_jl_L583
  [  25 invocations] set_prognostic_edmf_precomputed_quantities_precipitation__FILE_ClimaAtmos_jl_src_cache_prognostic_edmf_precomputed_quantities_jl_L589
  [  25 invocations] set_prognostic_edmf_precomputed_quantities_precipitation__FILE_ClimaAtmos_jl_src_cache_prognostic_edmf_precomputed_quantities_jl_L596
  [  25 invocations] set_prognostic_edmf_precomputed_quantities_precipitation__FILE_ClimaAtmos_jl_src_cache_prognostic_edmf_precomputed_quantities_jl_L602
  [  2

In [7]:
# Pull the statistics for the selected kernel out of each profile
print("Loading kernel statistics...\n")
baseline_stats, baseline_rows = get_kernel_stats(
    db_path=baseline_db, kernel_name_pattern=kernel_pattern
)
mod_stats, mod_rows = get_kernel_stats(
    db_path=mod_db, kernel_name_pattern=kernel_pattern
)

# Render the side-by-side table and the mean-duration verdict
compare_stats(baseline_stats=baseline_stats, mod_stats=mod_stats)

Loading kernel statistics...


KERNEL: set_prognostic_edmf_precomputed_quantities_precipitation__FILE_ClimaAtmos_jl_src_cache_prognostic_edmf_precomputed_quantities_jl_L649



Unnamed: 0,Metric,Baseline,Modified,% Change
0,Invocations,25,25,0.00%
1,Total Duration,463.84 ms,296.03 ms,-36.18%
2,Mean Duration,18.55 ms,11.84 ms,-36.18%
3,Median Duration,18.54 ms,11.84 ms,-36.13%
4,Min Duration,18.48 ms,11.81 ms,-36.11%
5,Max Duration,18.64 ms,11.87 ms,-36.34%
6,Grid Dimensions,"<4, 4, 1536>","<4, 4, 1536>",—
7,Block Dimensions,"<64, 1, 1>","<64, 1, 1>",—
8,Block Size (threads),64,64,—
9,Grid Size (blocks),24576,24576,—



✅ Modified version is 36.18% FASTER (mean duration)
