# ResNet18: сравнение nn.Conv2d и Baseline TritonConv2d + sparsity

Здесь строятся две модели ResNet18: референс на `nn.Conv2d` и baseline на `TritonConv2d` (img2col→GEMM→col2img). Бенчмарки включают: (1) тренировочную петлю с подсчётом времени forward/backward и памяти; (2) пер-слойные замеры свёрток; (3) эксперименты с разрежением (channel/block/input) в baseline-модели. Все замеры на CUDA с прогревом. Режимы sparsity влияют и на forward, и на backward (маски отдельных каналов/блоков/входных каналов).

**Метрики, которые считаются и выводятся:**
- В тренировочной петле (`run_benchmark`):
  - `avg_forward_ms`, `avg_backward_ms`, `avg_step_ms` — средние времена (мс) после warmup.
  - `samples_per_s` — пропускная способность (batch_size / step_time).
  - `max_mem_alloc_mb`, `max_mem_reserved_mb` — пиковое выделение и резервирование CUDA-памяти (MB).
- В сравнении вариантов (`sparsity_compare_df`, `ranking_df`):
  - `speedup_forward_vs_torch`, `speedup_backward_vs_torch`, `speedup_step_vs_torch` — отношение метрик к nn.Conv2d; >1 — быстрее Torch, <1 — медленнее.
  - `throughput_ratio_vs_torch`, `mem_*_ratio_vs_torch` — отношение пропускной способности и памяти к Torch.
- Пер-слойный бенч (`benchmark_conv_layers`):
  - Для каждого слоя: `avg_forward_ms`, `avg_backward_ms`, `avg_step_ms`, `throughput_sps`, `max_mem_alloc_mb`, `max_mem_reserved_mb`.
  - Метаданные слоя: тип, параметры ядра/stride/padding, для TritonConv2d — `channel_keep_ratio`, `input_keep_ratio`, `block_size`, `grad_block_size` (когда применена sparsity).

**Как читать результаты:**
- Для тренировки: сравнивайте `avg_step_ms` и `samples_per_s` между вариантами; смотрите память, чтобы оценить влияние sparsity на аллокацию.
- Для sparsity: строки с `mode` и `keep_ratio` показывают, сколько каналов осталось; скорость может расти, но учитывайте точность (forward/backward маски совпадают).
- Для пер-слойных бенчей: `variant` (`nn.Conv2d` или `Baseline TritonConv2d`) и `batch_size` помогают понять, где Triton выигрывает/проигрывает в зависимости от размера входа.


In [6]:
import sys, pathlib
sys.path.insert(0, str(pathlib.Path().resolve().parent))


In [7]:
import copy
import json
import math
import random
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision
from torchvision import transforms

from conv_gemm.baseline_layers.triton_conv2d import TritonConv2d


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type != "cuda":
    raise RuntimeError("CUDA GPU is required for this benchmark")

seed = 42
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = True

data_root = Path("../data").resolve()
data_root.mkdir(parents=True, exist_ok=True)

config = {
    "data_root": str(data_root),
    "num_classes": 10,
    "batch_sizes": [32, 64, 96, 128, 160, 192, 256],
    "num_workers": 4,
    "train_subset": 8192,
    "lr": 1e-3,
    "momentum": 0.9,
    "weight_decay": 5e-4,
    "warmup_steps": 5,
    "model_warmup_steps": 3,
    "benchmark_steps": 40,
    "baseline_conv": {
        "BLOCK_M": 64,
        "BLOCK_N": 64,
        "BLOCK_K": 64,
        "NUM_WARPS": 4,
        "NUM_STAGES": 2,
    },
    "sparsity_bench": {
        "modes": ["channel", "block", "input"],
        "keep_ratios": [0.75, 0.6, 0.5, 0.25],
        "block_size": 4,
        "batch_size": 128,
    },
    "conv_layer_bench": {
        "warmup_steps": 5,
        "bench_steps": 20,
    },
}
print(json.dumps(config, indent=2))


{
  "data_root": "/home/manzhura/ITMO/EDLM/conv2d-img2col-gemm/data",
  "num_classes": 10,
  "batch_sizes": [
    32,
    64,
    96,
    128,
    160,
    192,
    256
  ],
  "num_workers": 4,
  "train_subset": 8192,
  "lr": 0.001,
  "momentum": 0.9,
  "weight_decay": 0.0005,
  "warmup_steps": 5,
  "model_warmup_steps": 3,
  "benchmark_steps": 40,
  "baseline_conv": {
    "BLOCK_M": 64,
    "BLOCK_N": 64,
    "BLOCK_K": 64,
    "NUM_WARPS": 4,
    "NUM_STAGES": 2
  },
  "sparsity_bench": {
    "modes": [
      "channel",
      "block",
      "input"
    ],
    "keep_ratios": [
      0.75,
      0.6,
      0.5,
      0.25
    ],
    "block_size": 4,
    "batch_size": 128
  },
  "conv_layer_bench": {
    "warmup_steps": 5,
    "bench_steps": 20
  }
}


In [10]:
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

full_train = torchvision.datasets.CIFAR10(
    root=config["data_root"], train=True, download=True, transform=transform_train
)
if config["train_subset"] is not None and config["train_subset"] < len(full_train):
    g = torch.Generator().manual_seed(seed)
    subset_idx = torch.randperm(len(full_train), generator=g)[: config["train_subset"]]
    train_dataset = torch.utils.data.Subset(full_train, subset_idx)
else:
    train_dataset = full_train


def make_loader(batch_size: int) -> DataLoader:
    return DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=config["num_workers"],
        pin_memory=True,
    )

train_loaders: Dict[int, DataLoader] = {}
for bs in config["batch_sizes"]:
    train_loaders[bs] = make_loader(bs)

print({bs: len(loader) for bs, loader in train_loaders.items()})


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 170M/170M [00:12<00:00, 13.2MB/s]


{32: 256, 64: 128, 96: 85, 128: 64, 160: 51, 192: 42, 256: 32}


In [12]:
def make_triton_conv(src: nn.Conv2d, cfg: dict) -> TritonConv2d:
    if src.groups != 1:
        raise ValueError("Baseline TritonConv2d currently supports groups=1 only")
    layer = TritonConv2d(
        in_channels=src.in_channels,
        out_channels=src.out_channels,
        kernel_size=src.kernel_size,
        stride=src.stride,
        padding=src.padding,
        dilation=src.dilation,
        bias=(src.bias is not None),
        # **cfg,
    ).to(src.weight.device)
    with torch.no_grad():
        layer.weight.copy_(src.weight.detach().to(layer.weight.dtype))
        if layer.bias is not None and src.bias is not None:
            layer.bias.copy_(src.bias.detach().to(layer.bias.dtype))
    return layer


def replace_convs_with_baseline(module: nn.Module, cfg: dict):
    for name, child in module.named_children():
        if isinstance(child, nn.Conv2d):
            setattr(module, name, make_triton_conv(child, cfg))
        else:
            replace_convs_with_baseline(child, cfg)


def build_model_pair(config: dict):
    reference = torchvision.models.resnet18(num_classes=config["num_classes"])
    baseline = copy.deepcopy(reference)
    replace_convs_with_baseline(baseline, config["baseline_conv"])
    return reference.half(), baseline.half()


def apply_sparsity_to_model(model: nn.Module, mode: str, keep_ratio: float, block_size: int = 4):
    for layer in model.modules():
        if isinstance(layer, TritonConv2d):
            layer.clear_sparsity()
            if keep_ratio >= 1.0:
                continue
            if mode == "channel":
                layer.set_channel_sparsity(keep_ratio)
                layer.set_backward_channel_sparsity(keep_ratio)
            elif mode == "block":
                layer.set_block_sparsity(keep_ratio, block_size=block_size)
                layer.set_backward_block_sparsity(keep_ratio, block_size=block_size)
            elif mode == "input":
                layer.set_input_channel_sparsity(keep_ratio)
                layer.set_backward_input_channel_sparsity(keep_ratio)
            else:
                raise ValueError(f"Unknown sparsity mode: {mode}")


In [13]:
def run_benchmark(model: nn.Module, label: str, loader: DataLoader, config: dict):
    model = model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config["lr"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
    )

    warmup = config["warmup_steps"]
    total_steps = config["benchmark_steps"]
    model_warmup = config.get("model_warmup_steps", 0)
    records = []

    if model_warmup > 0:
        warmup_iter = iter(loader)
        for _ in range(model_warmup):
            try:
                images, targets = next(warmup_iter)
            except StopIteration:
                warmup_iter = iter(loader)
                images, targets = next(warmup_iter)

            images = images.half().to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)
            outputs = model(images)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.zero_grad(set_to_none=True)
            torch.cuda.synchronize()

        # Extra GPU warmup to drop JIT/cudnn noise from timed iterations
        torch.cuda.reset_peak_memory_stats(device)

    data_iter = iter(loader)

    for step in range(total_steps):
        try:
            images, targets = next(data_iter)
        except StopIteration:
            data_iter = iter(loader)
            images, targets = next(data_iter)

        images = images.half().to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        torch.cuda.reset_peak_memory_stats(device)
        torch.cuda.synchronize()

        fwd_start = torch.cuda.Event(enable_timing=True)
        fwd_end = torch.cuda.Event(enable_timing=True)
        bwd_start = torch.cuda.Event(enable_timing=True)
        bwd_end = torch.cuda.Event(enable_timing=True)

        fwd_start.record()
        outputs = model(images)
        fwd_end.record()
        loss = criterion(outputs, targets)

        bwd_start.record()
        loss.backward()
        bwd_end.record()
        optimizer.step()

        torch.cuda.synchronize()

        fwd_ms = fwd_start.elapsed_time(fwd_end)
        bwd_ms = bwd_start.elapsed_time(bwd_end)
        step_ms = fwd_ms + bwd_ms
        mem_alloc = torch.cuda.max_memory_allocated(device) / 1024 ** 2
        mem_reserved = torch.cuda.max_memory_reserved(device) / 1024 ** 2

        if step >= warmup:
            records.append({
                "label": label,
                "step": step,
                "loss": float(loss.item()),
                "fwd_ms": fwd_ms,
                "bwd_ms": bwd_ms,
                "step_ms": step_ms,
                "throughput_sps": images.size(0) / (step_ms / 1000.0),
                "max_mem_alloc_mb": mem_alloc,
                "max_mem_reserved_mb": mem_reserved,
            })

    if not records:
        raise RuntimeError("No data recorded for benchmark")

    df = pd.DataFrame(records)
    summary = {
        "label": label,
        "avg_forward_ms": df["fwd_ms"].mean(),
        "avg_backward_ms": df["bwd_ms"].mean(),
        "avg_step_ms": df["step_ms"].mean(),
        "samples_per_s": df["throughput_sps"].mean(),
        "max_mem_alloc_mb": df["max_mem_alloc_mb"].max(),
        "max_mem_reserved_mb": df["max_mem_reserved_mb"].max(),
    }
    return df, summary


In [14]:
def is_conv_module(module: nn.Module) -> bool:
    return isinstance(module, (nn.Conv2d, TritonConv2d))


def collect_conv_input_shapes(model: nn.Module, sample: torch.Tensor) -> Dict[str, torch.Size]:
    shapes: Dict[str, torch.Size] = {}
    handles = []

    def make_hook(layer_name: str):
        def _hook(mod, inp):
            shapes.setdefault(layer_name, inp[0].shape)
            return None  # do not override inputs
        return _hook

    for name, module in model.named_modules():
        if is_conv_module(module):
            handles.append(module.register_forward_pre_hook(make_hook(name)))
    with torch.no_grad():
        model(sample)
    for h in handles:
        h.remove()
    return shapes


def conv_metadata(name: str, module: nn.Module) -> Dict[str, object]:
    meta = {
        "layer": name,
        "layer_type": type(module).__name__,
        "in_channels": getattr(module, "in_channels", None),
        "out_channels": getattr(module, "out_channels", None),
        "kernel_size": tuple(getattr(module, "kernel_size", [])) if hasattr(module, "kernel_size") else None,
        "stride": tuple(getattr(module, "stride", [])) if hasattr(module, "stride") else None,
        "padding": tuple(getattr(module, "padding", [])) if hasattr(module, "padding") else None,
        "dilation": tuple(getattr(module, "dilation", [])) if hasattr(module, "dilation") else None,
    }
    if isinstance(module, TritonConv2d):
        keep_out = float(module.channel_mask.float().mean().item()) if hasattr(module, "channel_mask") else 1.0
        keep_in = float(module.input_channel_mask.float().mean().item()) if hasattr(module, "input_channel_mask") else 1.0
        meta.update({
            "channel_keep_ratio": keep_out,
            "input_keep_ratio": keep_in,
            "block_size": getattr(module, "block_size", None),
            "grad_block_size": getattr(module, "grad_block_size", None),
        })
    return meta


def benchmark_single_conv(module: nn.Module, input_shape: torch.Size, device: torch.device, warmup: int, steps: int) -> Dict[str, float]:
    x = torch.randn(input_shape, device=device, dtype=torch.float16, requires_grad=True)
    layer = copy.deepcopy(module).to(device)
    layer.train()
    torch.cuda.synchronize()

    for _ in range(warmup):
        layer.zero_grad(set_to_none=True)
        out = layer(x)
        loss = out.float().sum()
        loss.backward()
        torch.cuda.synchronize()

    torch.cuda.reset_peak_memory_stats(device)
    records: List[Dict[str, float]] = []

    for _ in range(steps):
        layer.zero_grad(set_to_none=True)
        torch.cuda.reset_peak_memory_stats(device)
        torch.cuda.synchronize()

        fwd_start = torch.cuda.Event(enable_timing=True)
        fwd_end = torch.cuda.Event(enable_timing=True)
        bwd_start = torch.cuda.Event(enable_timing=True)
        bwd_end = torch.cuda.Event(enable_timing=True)

        fwd_start.record()
        out = layer(x)
        fwd_end.record()

        loss = out.float().sum()

        bwd_start.record()
        loss.backward()
        bwd_end.record()

        torch.cuda.synchronize()

        fwd_ms = fwd_start.elapsed_time(fwd_end)
        bwd_ms = bwd_start.elapsed_time(bwd_end)
        step_ms = fwd_ms + bwd_ms
        records.append({
            "avg_forward_ms": fwd_ms,
            "avg_backward_ms": bwd_ms,
            "avg_step_ms": step_ms,
            "throughput_sps": input_shape[0] / (step_ms / 1000.0),
            "max_mem_alloc_mb": torch.cuda.max_memory_allocated(device) / 1024 ** 2,
            "max_mem_reserved_mb": torch.cuda.max_memory_reserved(device) / 1024 ** 2,
        })

    if not records:
        raise RuntimeError("No data recorded for conv benchmark")

    df = pd.DataFrame(records)
    return {
        "avg_forward_ms": df["avg_forward_ms"].mean(),
        "avg_backward_ms": df["avg_backward_ms"].mean(),
        "avg_step_ms": df["avg_step_ms"].mean(),
        "throughput_sps": df["throughput_sps"].mean(),
        "max_mem_alloc_mb": df["max_mem_alloc_mb"].max(),
        "max_mem_reserved_mb": df["max_mem_reserved_mb"].max(),
    }


def benchmark_conv_layers(torch_model: nn.Module, baseline_model: nn.Module, batch_size: int, config: dict):
    bench_cfg = config.get("conv_layer_bench", {"warmup_steps": 3, "bench_steps": 10})
    warmup = bench_cfg.get("warmup_steps", 3)
    steps = bench_cfg.get("bench_steps", 10)

    sample = torch.randn(batch_size, 3, 32, 32, device=device, dtype=torch.float16)
    torch_model = torch_model.to(device).eval()
    baseline_model = baseline_model.to(device).eval()

    input_shapes = collect_conv_input_shapes(torch_model, sample)
    torch_conv_map = dict(torch_model.named_modules())
    baseline_conv_map = dict(baseline_model.named_modules())

    rows: List[Dict[str, object]] = []
    for name, inp_shape in input_shapes.items():
        torch_layer = torch_conv_map.get(name)
        baseline_layer = baseline_conv_map.get(name)
        if not (is_conv_module(torch_layer) and is_conv_module(baseline_layer)):
            continue

        for variant, layer in [("nn.Conv2d", torch_layer), ("Baseline TritonConv2d", baseline_layer)]:
            summary = benchmark_single_conv(layer, inp_shape, device, warmup, steps)
            meta = conv_metadata(name, layer)
            meta.update({
                "variant": variant,
                "batch_size": batch_size,
            })
            meta.update(summary)
            rows.append(meta)

    torch.cuda.empty_cache()
    return rows


### Тренировочный бенч (whole-model)

Ниже — таблица с усреднёнными метриками для `nn.Conv2d` и Baseline `TritonConv2d` на полном ResNet18 при заданном `batch_size`:
- Время: `avg_forward_ms`, `avg_backward_ms`, `avg_step_ms`.
- Скорость: `samples_per_s`.
- Память: `max_mem_alloc_mb`, `max_mem_reserved_mb`.
Используйте эти значения как базовый ориентир перед включением sparsity.


In [15]:
batch_summaries = []
batch_details = []
conv_layer_rows = []

for bs, loader in train_loaders.items():
    print(f"=== Batch size {bs} ===")
    torch_model, baseline_model = build_model_pair(config)

    # per-layer bench (forward FP16, backward FP32)
    conv_layer_rows.extend(benchmark_conv_layers(torch_model, baseline_model, bs, config))

    torch_df, torch_summary = run_benchmark(torch_model, f"nn.Conv2d (bs={bs})", loader, config)
    torch_summary.update({"variant": "nn.Conv2d", "batch_size": bs})
    batch_summaries.append(torch_summary)
    batch_details.append(torch_df.assign(variant="nn.Conv2d", batch_size=bs))

    baseline_df, baseline_summary = run_benchmark(baseline_model, f"Baseline TritonConv2d (bs={bs})", loader, config)
    baseline_summary.update({"variant": "Baseline TritonConv2d", "batch_size": bs})
    batch_summaries.append(baseline_summary)
    batch_details.append(baseline_df.assign(variant="Baseline TritonConv2d", batch_size=bs))

summary_df = pd.DataFrame(batch_summaries).set_index(["variant", "batch_size"])
summary_df


=== Batch size 32 ===
=== Batch size 64 ===
=== Batch size 96 ===
=== Batch size 128 ===
=== Batch size 160 ===
=== Batch size 192 ===
=== Batch size 256 ===


Unnamed: 0_level_0,Unnamed: 1_level_0,label,avg_forward_ms,avg_backward_ms,avg_step_ms,samples_per_s,max_mem_alloc_mb,max_mem_reserved_mb
variant,batch_size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
nn.Conv2d,32,nn.Conv2d (bs=32),3.129189,2.526213,5.655401,5713.794252,127.072266,138.0
Baseline TritonConv2d,32,Baseline TritonConv2d (bs=32),16.541553,11.801686,28.343239,1190.182559,154.655273,182.0
nn.Conv2d,64,nn.Conv2d (bs=64),3.018501,2.57705,5.59555,11605.944033,127.260254,138.0
Baseline TritonConv2d,64,Baseline TritonConv2d (bs=64),19.343052,14.796444,34.139496,1961.792757,169.953613,204.0
nn.Conv2d,96,nn.Conv2d (bs=96),2.984024,2.630002,5.614026,17245.318782,127.57373,144.0
Baseline TritonConv2d,96,Baseline TritonConv2d (bs=96),19.321949,14.905766,34.227716,2875.52738,195.14209,240.0
nn.Conv2d,128,nn.Conv2d (bs=128),2.827016,2.964769,5.791785,22337.548931,132.817871,168.0
Baseline TritonConv2d,128,Baseline TritonConv2d (bs=128),19.31607,16.518956,35.835026,3619.624595,213.830078,312.0
nn.Conv2d,160,nn.Conv2d (bs=160),3.68212,3.065725,6.747845,23804.115582,140.381836,162.0
Baseline TritonConv2d,160,Baseline TritonConv2d (bs=160),11.315238,14.260141,25.575379,6273.461936,235.769043,284.0


Вывод `detail_df.groupby(...).describe()` содержит count/mean/std/min/25%/50%/75%/max для метрик `step_ms`, `fwd_ms`, `bwd_ms`, `max_mem_alloc_mb` отдельно по каждому `(variant, batch_size)`.


In [16]:
detail_df = pd.concat(batch_details, ignore_index=True)
metrics = ["step_ms", "fwd_ms", "bwd_ms", "max_mem_alloc_mb"]
detail_df.groupby(["variant", "batch_size"])[metrics].describe()


Unnamed: 0_level_0,Unnamed: 1_level_0,step_ms,step_ms,step_ms,step_ms,step_ms,step_ms,step_ms,step_ms,fwd_ms,fwd_ms,...,bwd_ms,bwd_ms,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
variant,batch_size,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Baseline TritonConv2d,32,35.0,28.343239,6.449088,17.763648,21.686512,28.838496,34.879056,37.69344,35.0,16.541553,...,14.311424,15.296512,35.0,154.655273,0.0,154.655273,154.655273,154.655273,154.655273,154.655273
Baseline TritonConv2d,64,35.0,34.139496,6.79522,18.694144,30.803328,35.318081,36.399632,48.715775,35.0,19.343052,...,15.80544,16.402336,35.0,169.953613,0.0,169.953613,169.953613,169.953613,169.953613,169.953613
Baseline TritonConv2d,96,35.0,34.227716,5.084941,22.205953,31.768832,36.004065,36.549759,49.398912,35.0,19.321949,...,15.500816,16.351233,35.0,195.14209,0.0,195.14209,195.14209,195.14209,195.14209,195.14209
Baseline TritonConv2d,128,35.0,35.835026,3.752803,24.877025,34.715152,37.468832,38.083183,39.139265,35.0,19.31607,...,17.124767,17.670143,35.0,213.830078,0.0,213.830078,213.830078,213.830078,213.830078,213.830078
Baseline TritonConv2d,160,35.0,25.575379,1.444005,24.161984,24.683088,25.313408,25.982368,31.222687,35.0,11.315238,...,14.216176,17.912832,35.0,235.769043,0.0,235.769043,235.769043,235.769043,235.769043,235.769043
Baseline TritonConv2d,192,35.0,31.971872,15.369424,26.870751,27.101264,27.370049,28.143104,108.83741,35.0,15.010317,...,17.03168,23.214081,35.0,261.332031,0.0,261.332031,261.332031,261.332031,261.332031,261.332031
Baseline TritonConv2d,256,35.0,33.276546,4.550427,30.807584,31.060784,31.539968,33.866511,56.799232,35.0,11.928773,...,22.415872,26.157824,35.0,298.083496,0.0,298.083496,298.083496,298.083496,298.083496,298.083496
nn.Conv2d,32,35.0,5.655401,0.598762,5.043584,5.161664,5.606784,6.11592,7.86432,35.0,3.129189,...,2.737728,4.16256,35.0,127.072266,0.0,127.072266,127.072266,127.072266,127.072266,127.072266
nn.Conv2d,64,35.0,5.59555,0.704996,4.46544,5.076976,5.43776,5.908992,7.309792,35.0,3.018501,...,2.702272,3.891072,35.0,127.260254,0.0,127.260254,127.260254,127.260254,127.260254,127.260254
nn.Conv2d,96,35.0,5.614026,0.529583,5.01184,5.115904,5.392352,6.089136,6.763648,35.0,2.984024,...,2.896784,3.62496,35.0,127.57373,0.0,127.57373,127.57373,127.57373,127.57373,127.57373


In [17]:
forward_bs_top = (
    summary_df.reset_index()
    .sort_values("avg_forward_ms")
    .groupby("variant")
    .head(3)
    .reset_index(drop=True)
)

backward_bs_top = (
    summary_df.reset_index()
    .sort_values("avg_backward_ms")
    .groupby("variant")
    .head(3)
    .reset_index(drop=True)
)

forward_bs_top, backward_bs_top


(                 variant  batch_size                           label  \
 0              nn.Conv2d         128              nn.Conv2d (bs=128)   
 1              nn.Conv2d          96               nn.Conv2d (bs=96)   
 2              nn.Conv2d          64               nn.Conv2d (bs=64)   
 3  Baseline TritonConv2d         160  Baseline TritonConv2d (bs=160)   
 4  Baseline TritonConv2d         256  Baseline TritonConv2d (bs=256)   
 5  Baseline TritonConv2d         192  Baseline TritonConv2d (bs=192)   
 
    avg_forward_ms  avg_backward_ms  avg_step_ms  samples_per_s  \
 0        2.827016         2.964769     5.791785   22337.548931   
 1        2.984024         2.630002     5.614026   17245.318782   
 2        3.018501         2.577050     5.595550   11605.944033   
 3       11.315238        14.260141    25.575379    6273.461936   
 4       11.928773        21.347773    33.276546    7787.248553   
 5       15.010317        16.961555    31.971872    6554.576549   
 
    max_mem_allo

Per-layer metrics: forward/backward time and memory for each batch size and variant.

In [18]:
conv_layer_df = pd.DataFrame(conv_layer_rows)
conv_layer_df


Unnamed: 0,layer,layer_type,in_channels,out_channels,kernel_size,stride,padding,dilation,variant,batch_size,avg_forward_ms,avg_backward_ms,avg_step_ms,throughput_sps,max_mem_alloc_mb,max_mem_reserved_mb,channel_keep_ratio,input_keep_ratio,block_size,grad_block_size
0,conv1,Conv2d,3,64,"(7, 7)","(2, 2)","(3, 3)","(1, 1)",nn.Conv2d,32,0.083046,0.160826,0.243872,132337.776140,57.015137,68.0,,,,
1,conv1,TritonConv2d,3,64,"(7, 7)","(2, 2)","(3, 3)","(1, 1)",Baseline TritonConv2d,32,0.442019,0.776563,1.218582,26268.444360,74.058594,90.0,1.0,1.0,,
2,layer1.0.conv1,Conv2d,64,64,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,32,0.089254,0.152520,0.241774,136234.527725,55.202637,68.0,,,,
3,layer1.0.conv1,TritonConv2d,64,64,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",Baseline TritonConv2d,32,0.811946,0.679099,1.491045,23210.801667,70.132812,90.0,1.0,1.0,,
4,layer1.0.conv2,Conv2d,64,64,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,32,0.134598,0.311456,0.446054,72197.255429,55.202637,90.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,layer4.0.downsample.0,TritonConv2d,256,512,"(1, 1)","(2, 2)","(0, 0)","(1, 1)",Baseline TritonConv2d,256,0.916042,0.860070,1.776112,144183.714106,69.757812,70.0,1.0,1.0,,
276,layer4.1.conv1,Conv2d,512,512,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,256,0.140611,0.309704,0.450315,568691.095826,77.257324,90.0,,,,
277,layer4.1.conv1,TritonConv2d,512,512,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",Baseline TritonConv2d,256,0.973586,0.865216,1.838802,140062.220837,114.757812,130.0,1.0,1.0,,
278,layer4.1.conv2,Conv2d,512,512,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,256,0.130096,0.312629,0.442725,581715.011749,77.257324,130.0,,,,


In [19]:
torch_conv_df = conv_layer_df[conv_layer_df["variant"] == "nn.Conv2d"]
baseline_conv_df = conv_layer_df[conv_layer_df["variant"] == "Baseline TritonConv2d"]

conv_layer_compare_df = torch_conv_df.merge(
    baseline_conv_df,
    on=["layer", "batch_size"],
    suffixes=("_torch", "_baseline"),
)

conv_layer_compare_df["speedup_forward"] = conv_layer_compare_df["avg_forward_ms_torch"] / conv_layer_compare_df["avg_forward_ms_baseline"]
conv_layer_compare_df["speedup_backward"] = conv_layer_compare_df["avg_backward_ms_torch"] / conv_layer_compare_df["avg_backward_ms_baseline"]
conv_layer_compare_df["speedup_step"] = conv_layer_compare_df["avg_step_ms_torch"] / conv_layer_compare_df["avg_step_ms_baseline"]
conv_layer_compare_df["throughput_ratio"] = conv_layer_compare_df["throughput_sps_baseline"] / conv_layer_compare_df["throughput_sps_torch"]
conv_layer_compare_df["mem_alloc_ratio"] = conv_layer_compare_df["max_mem_alloc_mb_baseline"] / conv_layer_compare_df["max_mem_alloc_mb_torch"]
conv_layer_compare_df["mem_reserved_ratio"] = conv_layer_compare_df["max_mem_reserved_mb_baseline"] / conv_layer_compare_df["max_mem_reserved_mb_torch"]
conv_layer_compare_df


Unnamed: 0,layer,layer_type_torch,in_channels_torch,out_channels_torch,kernel_size_torch,stride_torch,padding_torch,dilation_torch,variant_torch,batch_size,...,channel_keep_ratio_baseline,input_keep_ratio_baseline,block_size_baseline,grad_block_size_baseline,speedup_forward,speedup_backward,speedup_step,throughput_ratio,mem_alloc_ratio,mem_reserved_ratio
0,conv1,Conv2d,3,64,"(7, 7)","(2, 2)","(3, 3)","(1, 1)",nn.Conv2d,32,...,1.0,1.0,,,0.187880,0.207099,0.200128,0.198495,1.298929,1.323529
1,layer1.0.conv1,Conv2d,64,64,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,32,...,1.0,1.0,,,0.109927,0.224592,0.162151,0.170374,1.270461,1.323529
2,layer1.0.conv2,Conv2d,64,64,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,32,...,1.0,1.0,,,0.151958,0.370563,0.258394,0.257067,1.270461,1.000000
3,layer1.1.conv1,Conv2d,64,64,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,32,...,1.0,1.0,,,0.141861,0.400772,0.263423,0.262489,1.270461,1.000000
4,layer1.1.conv2,Conv2d,64,64,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,32,...,1.0,1.0,,,0.142536,0.384875,0.260990,0.259953,1.270461,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,layer4.0.conv1,Conv2d,256,512,"(3, 3)","(2, 2)","(1, 1)","(1, 1)",nn.Conv2d,256,...,1.0,1.0,,,0.126829,0.348895,0.225280,0.231554,1.278724,1.255814
136,layer4.0.conv2,Conv2d,512,512,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,256,...,1.0,1.0,,,0.156501,0.362069,0.250072,0.250627,1.485397,1.465116
137,layer4.0.downsample.0,Conv2d,256,512,"(1, 1)","(2, 2)","(0, 0)","(1, 1)",nn.Conv2d,256,...,1.0,1.0,,,0.137157,0.354888,0.242592,0.242499,1.064886,1.060606
138,layer4.1.conv1,Conv2d,512,512,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",nn.Conv2d,256,...,1.0,1.0,,,0.144426,0.357950,0.244896,0.246289,1.485397,1.444444


In [20]:
conv_layer_ranking_df = conv_layer_compare_df[[
    "layer",
    "batch_size",
    "kernel_size_torch",
    "stride_torch",
    "padding_torch",
    "dilation_torch",
    "channel_keep_ratio_baseline",
    "input_keep_ratio_baseline",
    "block_size_baseline",
    "grad_block_size_baseline",
    "avg_forward_ms_torch",
    "avg_forward_ms_baseline",
    "avg_backward_ms_torch",
    "avg_backward_ms_baseline",
    "avg_step_ms_torch",
    "avg_step_ms_baseline",
    "throughput_ratio",
    "speedup_forward",
    "speedup_backward",
    "speedup_step",
    "mem_alloc_ratio",
    "mem_reserved_ratio",
]].sort_values("speedup_step", ascending=False).reset_index(drop=True)
conv_layer_ranking_df.head(15)


Unnamed: 0,layer,batch_size,kernel_size_torch,stride_torch,padding_torch,dilation_torch,channel_keep_ratio_baseline,input_keep_ratio_baseline,block_size_baseline,grad_block_size_baseline,...,avg_backward_ms_torch,avg_backward_ms_baseline,avg_step_ms_torch,avg_step_ms_baseline,throughput_ratio,speedup_forward,speedup_backward,speedup_step,mem_alloc_ratio,mem_reserved_ratio
0,layer4.0.conv1,128,"(3, 3)","(2, 2)","(1, 1)","(1, 1)",1.0,1.0,,,...,0.193026,0.392453,0.278341,0.91307,0.30346,0.163873,0.491844,0.30484,1.233255,1.0
1,layer4.0.conv1,96,"(3, 3)","(2, 2)","(1, 1)","(1, 1)",1.0,1.0,,,...,0.245619,0.608869,0.381936,1.337021,0.30538,0.187209,0.403403,0.285662,1.212119,1.046512
2,layer3.1.conv2,64,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,,,...,0.181112,0.449296,0.259686,0.915749,0.283652,0.168451,0.403102,0.283578,1.204192,1.0
3,layer4.0.conv2,128,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,,,...,0.215501,0.511493,0.320051,1.173712,0.278067,0.157879,0.421317,0.272683,1.398639,1.444444
4,layer4.0.downsample.0,32,"(1, 1)","(2, 2)","(0, 0)","(1, 1)",1.0,1.0,,,...,0.283782,0.666102,0.397226,1.460392,0.29812,0.142823,0.426034,0.271999,1.034387,1.030303
5,layer2.0.conv1,32,"(3, 3)","(2, 2)","(1, 1)","(1, 1)",1.0,1.0,,,...,0.301246,0.745971,0.433376,1.60324,0.286691,0.154129,0.403831,0.270313,1.095275,1.029412
6,layer2.0.downsample.0,32,"(1, 1)","(2, 2)","(0, 0)","(1, 1)",1.0,1.0,,,...,0.290182,0.65987,0.395958,1.466955,0.295756,0.131059,0.439757,0.269919,1.0294,1.0
7,layer4.0.conv1,160,"(3, 3)","(2, 2)","(1, 1)","(1, 1)",1.0,1.0,,,...,0.184387,0.442437,0.271282,1.011886,0.275864,0.152594,0.416754,0.268095,1.236068,1.022222
8,layer4.0.conv2,64,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,,,...,0.19968,0.463666,0.29399,1.110941,0.27258,0.145704,0.430655,0.264632,1.36437,1.232558
9,layer4.0.conv2,96,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,,,...,0.208946,0.507544,0.313651,1.187634,0.272012,0.153959,0.41168,0.264098,1.391945,1.255814


`baseline_vs_torch_df` сравнивает nn.Conv2d и Baseline TritonConv2d: пары столбцов с абсолютными значениями (forward/backward/step время, throughput, память) и коэффициенты ускорения (`speedup_*`, `throughput_ratio`, `mem_*_ratio`).


In [28]:
baseline_compare_rows = []
for bs in config["batch_sizes"]:
    torch_row = summary_df.loc[("nn.Conv2d", bs)]
    baseline_row = summary_df.loc[("Baseline TritonConv2d", bs)]
    comparison = {
        "batch_size": bs,
        "torch_forward_ms": torch_row["avg_forward_ms"],
        "baseline_forward_ms": baseline_row["avg_forward_ms"],
        "torch_backward_ms": torch_row["avg_backward_ms"],
        "baseline_backward_ms": baseline_row["avg_backward_ms"],
        "torch_step_ms": torch_row["avg_step_ms"],
        "baseline_step_ms": baseline_row["avg_step_ms"],
        "torch_samples_per_s": torch_row["samples_per_s"],
        "baseline_samples_per_s": baseline_row["samples_per_s"],
        "speedup_forward": torch_row["avg_forward_ms"] / baseline_row["avg_forward_ms"],
        "speedup_backward": torch_row["avg_backward_ms"] / baseline_row["avg_backward_ms"],
        "speedup_step": torch_row["avg_step_ms"] / baseline_row["avg_step_ms"],
        "throughput_ratio": baseline_row["samples_per_s"] / torch_row["samples_per_s"],
        "torch_mem_alloc_mb": torch_row["max_mem_alloc_mb"],
        "baseline_mem_alloc_mb": baseline_row["max_mem_alloc_mb"],
        "torch_mem_reserved_mb": torch_row["max_mem_reserved_mb"],
        "baseline_mem_reserved_mb": baseline_row["max_mem_reserved_mb"],
        "mem_alloc_ratio": baseline_row["max_mem_alloc_mb"] / torch_row["max_mem_alloc_mb"],
        "mem_reserved_ratio": baseline_row["max_mem_reserved_mb"] / torch_row["max_mem_reserved_mb"],
    }
    baseline_compare_rows.append(comparison)

baseline_vs_torch_df = pd.DataFrame(baseline_compare_rows).set_index("batch_size")
baseline_vs_torch_df


Unnamed: 0_level_0,torch_forward_ms,baseline_forward_ms,torch_backward_ms,baseline_backward_ms,torch_step_ms,baseline_step_ms,torch_samples_per_s,baseline_samples_per_s,speedup_forward,speedup_backward,speedup_step,throughput_ratio,torch_mem_alloc_mb,baseline_mem_alloc_mb,torch_mem_reserved_mb,baseline_mem_reserved_mb,mem_alloc_ratio,mem_reserved_ratio
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
32,3.129189,16.541553,2.526213,11.801686,5.655401,28.343239,5713.794252,1190.182559,0.189171,0.214055,0.199533,0.2083,127.072266,154.655273,138.0,182.0,1.217066,1.318841
64,3.018501,19.343052,2.57705,14.796444,5.59555,34.139496,11605.944033,1961.792757,0.156051,0.174167,0.163903,0.169033,127.260254,169.953613,138.0,204.0,1.335481,1.478261
96,2.984024,19.321949,2.630002,14.905766,5.614026,34.227716,17245.318782,2875.52738,0.154437,0.176442,0.16402,0.166742,127.57373,195.14209,144.0,240.0,1.529642,1.666667
128,2.827016,19.31607,2.964769,16.518956,5.791785,35.835026,22337.548931,3619.624595,0.146356,0.179477,0.161624,0.162042,132.817871,213.830078,168.0,312.0,1.60995,1.857143
160,3.68212,11.315238,3.065725,14.260141,6.747845,25.575379,23804.115582,6273.461936,0.325413,0.214986,0.263841,0.263545,140.381836,235.769043,162.0,284.0,1.679484,1.753086
192,3.461543,15.010317,3.442064,16.961555,6.903607,31.971872,28129.045602,6554.576549,0.230611,0.202933,0.215928,0.233018,148.069824,261.332031,180.0,330.0,1.764924,1.833333
256,3.392876,11.928773,3.734885,21.347773,7.127761,33.276546,36395.475281,7787.248553,0.284428,0.174954,0.214198,0.213962,160.321289,298.083496,184.0,488.0,1.859288,2.652174


### Эксперимент со sparsity (channel/block/input)

В этой секции к baseline-модели применяются маски:
- `mode="channel"` — обнуляем выходные каналы (Cout).
- `mode="block"` — обнуляем фильтры блоками по `block_size`.
- `mode="input"` — обнуляем входные каналы (Cin).
`keep_ratio` задаёт долю оставленных каналов. Маски действуют и на forward, и на backward. Метрики:
- Время/скорость: `avg_forward_ms`, `avg_backward_ms`, `avg_step_ms`, `samples_per_s`.
- Память: `max_mem_alloc_mb`, `max_mem_reserved_mb`.
- Сравнение с Torch: `speedup_*_vs_torch`, `throughput_ratio_vs_torch`, `mem_*_ratio_vs_torch`.
Сравнивайте сценарии: где `speedup_*_vs_torch > 1` при приемлемой памяти — наиболее выигрышные комбинации.


In [29]:
sparsity_cfg = config["sparsity_bench"]
sparsity_bs = sparsity_cfg["batch_size"]
if sparsity_bs not in train_loaders:
    train_loaders[sparsity_bs] = make_loader(sparsity_bs)
sparsity_loader = train_loaders[sparsity_bs]

sparsity_summaries = []
sparsity_details = []

for mode in sparsity_cfg["modes"]:
    for ratio in sparsity_cfg["keep_ratios"]:
        _, baseline_model = build_model_pair(config)
        apply_sparsity_to_model(
            baseline_model,
            mode,
            keep_ratio=ratio,
            block_size=sparsity_cfg.get("block_size", 4),
        )
        label = f"{mode.capitalize()} sparsity (keep={ratio:.2f}, bs={sparsity_bs})"
        bench_df, bench_summary = run_benchmark(baseline_model, label, sparsity_loader, config)
        bench_summary.update({
            "variant": f"Sparsity::{mode}",
            "mode": mode,
            "keep_ratio": ratio,
            "batch_size": sparsity_bs,
        })
        sparsity_summaries.append(bench_summary)
        sparsity_details.append(
            bench_df.assign(variant=f"Sparsity::{mode}", mode=mode, keep_ratio=ratio, batch_size=sparsity_bs)
        )

sparsity_summary_df = pd.DataFrame(sparsity_summaries).sort_values("samples_per_s", ascending=False).reset_index(drop=True)
sparsity_summary_df


Unnamed: 0,label,avg_forward_ms,avg_backward_ms,avg_step_ms,samples_per_s,max_mem_alloc_mb,max_mem_reserved_mb,variant,mode,keep_ratio,batch_size
0,"Input sparsity (keep=0.50, bs=128)",14.020874,14.151008,28.171882,4548.125423,189.021484,494.0,Sparsity::input,input,0.5,128
1,"Channel sparsity (keep=0.25, bs=128)",14.740871,14.232346,28.973216,4423.123514,210.30127,494.0,Sparsity::channel,channel,0.25,128
2,"Channel sparsity (keep=0.50, bs=128)",14.634723,14.544662,29.179385,4389.509238,212.101562,494.0,Sparsity::channel,channel,0.5,128
3,"Block sparsity (keep=0.50, bs=128)",14.732156,14.499875,29.232031,4384.17195,212.101562,494.0,Sparsity::block,block,0.5,128
4,"Block sparsity (keep=0.75, bs=128)",14.709338,14.571549,29.280887,4373.672844,215.029297,494.0,Sparsity::block,block,0.75,128
5,"Block sparsity (keep=0.60, bs=128)",14.493759,15.14244,29.636199,4326.114462,214.270996,494.0,Sparsity::block,block,0.6,128
6,"Channel sparsity (keep=0.60, bs=128)",14.915532,15.244357,30.159888,4252.90442,213.857422,494.0,Sparsity::channel,channel,0.6,128
7,"Input sparsity (keep=0.25, bs=128)",16.516674,14.454682,30.971356,4219.935188,172.362793,494.0,Sparsity::input,input,0.25,128
8,"Input sparsity (keep=0.60, bs=128)",16.310909,14.570102,30.881011,4205.270766,191.394531,494.0,Sparsity::input,input,0.6,128
9,"Channel sparsity (keep=0.75, bs=128)",15.800607,15.110925,30.911532,4184.812538,215.029297,494.0,Sparsity::channel,channel,0.75,128


### Как интерпретировать таблицу сравнения sparsity

`sparsity_compare_df` содержит все сценарии sparsity, отсортированные по `samples_per_s` (или speedup):
- `speedup_forward_vs_torch`, `speedup_backward_vs_torch`, `speedup_step_vs_torch` показывают ускорение относительно nn.Conv2d.
- `throughput_ratio_vs_torch` — прирост/просадка пропускной способности.
- `mem_*_ratio_vs_torch` — отношение пиков памяти; <1 означает экономию памяти.
Ищите строки с `speedup_step_vs_torch > 1` и приемлемыми `mem_*_ratio_vs_torch`, чтобы выбрать конфигурации для использования.


In [30]:
sparsity_reference = summary_df.loc[("nn.Conv2d", sparsity_bs)]

sparsity_compare_df = sparsity_summary_df.copy()
sparsity_compare_df["speedup_forward_vs_torch"] = sparsity_reference["avg_forward_ms"] / sparsity_compare_df["avg_forward_ms"]
sparsity_compare_df["speedup_backward_vs_torch"] = sparsity_reference["avg_backward_ms"] / sparsity_compare_df["avg_backward_ms"]
sparsity_compare_df["speedup_step_vs_torch"] = sparsity_reference["avg_step_ms"] / sparsity_compare_df["avg_step_ms"]
sparsity_compare_df["throughput_ratio_vs_torch"] = sparsity_compare_df["samples_per_s"] / sparsity_reference["samples_per_s"]
sparsity_compare_df["mem_alloc_ratio_vs_torch"] = sparsity_compare_df["max_mem_alloc_mb"] / sparsity_reference["max_mem_alloc_mb"]
sparsity_compare_df["mem_reserved_ratio_vs_torch"] = sparsity_compare_df["max_mem_reserved_mb"] / sparsity_reference["max_mem_reserved_mb"]
sparsity_compare_df = sparsity_compare_df.sort_values("samples_per_s", ascending=False).reset_index(drop=True)
sparsity_compare_df


Unnamed: 0,label,avg_forward_ms,avg_backward_ms,avg_step_ms,samples_per_s,max_mem_alloc_mb,max_mem_reserved_mb,variant,mode,keep_ratio,batch_size,speedup_forward_vs_torch,speedup_backward_vs_torch,speedup_step_vs_torch,throughput_ratio_vs_torch,mem_alloc_ratio_vs_torch,mem_reserved_ratio_vs_torch
0,"Input sparsity (keep=0.50, bs=128)",14.020874,14.151008,28.171882,4548.125423,189.021484,494.0,Sparsity::input,input,0.5,128,0.201629,0.209509,0.205587,0.203609,1.423163,2.940476
1,"Channel sparsity (keep=0.25, bs=128)",14.740871,14.232346,28.973216,4423.123514,210.30127,494.0,Sparsity::channel,channel,0.25,128,0.191781,0.208312,0.199901,0.198013,1.583381,2.940476
2,"Channel sparsity (keep=0.50, bs=128)",14.634723,14.544662,29.179385,4389.509238,212.101562,494.0,Sparsity::channel,channel,0.5,128,0.193172,0.203839,0.198489,0.196508,1.596935,2.940476
3,"Block sparsity (keep=0.50, bs=128)",14.732156,14.499875,29.232031,4384.17195,212.101562,494.0,Sparsity::block,block,0.5,128,0.191894,0.204469,0.198131,0.196269,1.596935,2.940476
4,"Block sparsity (keep=0.75, bs=128)",14.709338,14.571549,29.280887,4373.672844,215.029297,494.0,Sparsity::block,block,0.75,128,0.192192,0.203463,0.197801,0.195799,1.618979,2.940476
5,"Block sparsity (keep=0.60, bs=128)",14.493759,15.14244,29.636199,4326.114462,214.270996,494.0,Sparsity::block,block,0.6,128,0.195051,0.195792,0.195429,0.19367,1.613269,2.940476
6,"Channel sparsity (keep=0.60, bs=128)",14.915532,15.244357,30.159888,4252.90442,213.857422,494.0,Sparsity::channel,channel,0.6,128,0.189535,0.194483,0.192036,0.190393,1.610155,2.940476
7,"Input sparsity (keep=0.25, bs=128)",16.516674,14.454682,30.971356,4219.935188,172.362793,494.0,Sparsity::input,input,0.25,128,0.171161,0.205108,0.187005,0.188917,1.297738,2.940476
8,"Input sparsity (keep=0.60, bs=128)",16.310909,14.570102,30.881011,4205.270766,191.394531,494.0,Sparsity::input,input,0.6,128,0.173321,0.203483,0.187552,0.18826,1.44103,2.940476
9,"Channel sparsity (keep=0.75, bs=128)",15.800607,15.110925,30.911532,4184.812538,215.029297,494.0,Sparsity::channel,channel,0.75,128,0.178918,0.1962,0.187366,0.187344,1.618979,2.940476


### Ренкинг лучших сценариев

В этой таблице собраны лучшие сценарии по различным критериям (быстрейший шаг, максимальный throughput, наименьшая память). Смотрите на:
- `label` / `variant` — какой режим (sparsity или baseline) и какой batch.
- `speedup_*_vs_torch` или абсолютные времена — что именно оптимизируем.
- Память — если целитесь в ограничение по GPU, учитывайте `mem_*`.
Используйте ренкинг как быструю шпаргалку для выбора режима на конкретном GPU/батче.


In [31]:
ranking_df = sparsity_compare_df[[
    "variant",
    "mode",
    "keep_ratio",
    "samples_per_s",
    "throughput_ratio_vs_torch",
    "speedup_forward_vs_torch",
    "speedup_backward_vs_torch",
    "speedup_step_vs_torch",
    "mem_alloc_ratio_vs_torch",
    "mem_reserved_ratio_vs_torch",
]].copy()
ranking_df = ranking_df.sort_values("throughput_ratio_vs_torch", ascending=False).reset_index(drop=True)
ranking_df


Unnamed: 0,variant,mode,keep_ratio,samples_per_s,throughput_ratio_vs_torch,speedup_forward_vs_torch,speedup_backward_vs_torch,speedup_step_vs_torch,mem_alloc_ratio_vs_torch,mem_reserved_ratio_vs_torch
0,Sparsity::input,input,0.5,4548.125423,0.203609,0.201629,0.209509,0.205587,1.423163,2.940476
1,Sparsity::channel,channel,0.25,4423.123514,0.198013,0.191781,0.208312,0.199901,1.583381,2.940476
2,Sparsity::channel,channel,0.5,4389.509238,0.196508,0.193172,0.203839,0.198489,1.596935,2.940476
3,Sparsity::block,block,0.5,4384.17195,0.196269,0.191894,0.204469,0.198131,1.596935,2.940476
4,Sparsity::block,block,0.75,4373.672844,0.195799,0.192192,0.203463,0.197801,1.618979,2.940476
5,Sparsity::block,block,0.6,4326.114462,0.19367,0.195051,0.195792,0.195429,1.613269,2.940476
6,Sparsity::channel,channel,0.6,4252.90442,0.190393,0.189535,0.194483,0.192036,1.610155,2.940476
7,Sparsity::input,input,0.25,4219.935188,0.188917,0.171161,0.205108,0.187005,1.297738,2.940476
8,Sparsity::input,input,0.6,4205.270766,0.18826,0.173321,0.203483,0.187552,1.44103,2.940476
9,Sparsity::channel,channel,0.75,4184.812538,0.187344,0.178918,0.1962,0.187366,1.618979,2.940476


Final rankings for model batch sizes and per-layer convs.

Model batch-size rankings (step/throughput/memory).

In [25]:
model_step_top = (
    summary_df.reset_index()
    .sort_values("avg_step_ms")
    .groupby("variant")
    .head(3)
    .reset_index(drop=True)
)

model_throughput_top = (
    summary_df.reset_index()
    .sort_values("samples_per_s", ascending=False)
    .groupby("variant")
    .head(3)
    .reset_index(drop=True)
)

model_memory_top = (
    summary_df.reset_index()
    .sort_values("max_mem_alloc_mb")
    .groupby("variant")
    .head(3)
    .reset_index(drop=True)
)


In [26]:
model_rankings_df = pd.concat(
    [
        model_step_top.assign(metric="fastest_step"),
        model_throughput_top.assign(metric="highest_throughput"),
        model_memory_top.assign(metric="lowest_mem_alloc"),
    ],
    ignore_index=True,
)
model_rankings_df = model_rankings_df[[
    "metric",
    "variant",
    "batch_size",
    "avg_forward_ms",
    "avg_backward_ms",
    "avg_step_ms",
    "samples_per_s",
    "max_mem_alloc_mb",
    "max_mem_reserved_mb",
]]
model_rankings_df


Unnamed: 0,metric,variant,batch_size,avg_forward_ms,avg_backward_ms,avg_step_ms,samples_per_s,max_mem_alloc_mb,max_mem_reserved_mb
0,fastest_step,nn.Conv2d,64,3.018501,2.57705,5.59555,11605.944033,127.260254,138.0
1,fastest_step,nn.Conv2d,96,2.984024,2.630002,5.614026,17245.318782,127.57373,144.0
2,fastest_step,nn.Conv2d,32,3.129189,2.526213,5.655401,5713.794252,127.072266,138.0
3,fastest_step,Baseline TritonConv2d,160,11.315238,14.260141,25.575379,6273.461936,235.769043,284.0
4,fastest_step,Baseline TritonConv2d,32,16.541553,11.801686,28.343239,1190.182559,154.655273,182.0
5,fastest_step,Baseline TritonConv2d,192,15.010317,16.961555,31.971872,6554.576549,261.332031,330.0
6,highest_throughput,nn.Conv2d,256,3.392876,3.734885,7.127761,36395.475281,160.321289,184.0
7,highest_throughput,nn.Conv2d,192,3.461543,3.442064,6.903607,28129.045602,148.069824,180.0
8,highest_throughput,nn.Conv2d,160,3.68212,3.065725,6.747845,23804.115582,140.381836,162.0
9,highest_throughput,Baseline TritonConv2d,256,11.928773,21.347773,33.276546,7787.248553,298.083496,488.0


In [27]:
conv_forward_top = conv_layer_compare_df.sort_values("avg_forward_ms_baseline").head(10).assign(metric="forward_time")
conv_backward_top = conv_layer_compare_df.sort_values("avg_backward_ms_baseline").head(10).assign(metric="backward_time")
conv_speedup_top = conv_layer_compare_df.sort_values("speedup_step", ascending=False).head(15).assign(metric="speedup_step")

conv_layer_best_df = pd.concat(
    [conv_forward_top, conv_backward_top, conv_speedup_top],
    ignore_index=True,
)

conv_layer_best_df = conv_layer_best_df[[
    "metric",
    "layer",
    "batch_size",
    "layer_type_baseline",
    "kernel_size_baseline",
    "stride_baseline",
    "padding_baseline",
    "dilation_baseline",
    "channel_keep_ratio_baseline",
    "input_keep_ratio_baseline",
    "block_size_baseline",
    "grad_block_size_baseline",
    "avg_forward_ms_baseline",
    "avg_backward_ms_baseline",
    "avg_step_ms_baseline",
    "speedup_forward",
    "speedup_backward",
    "speedup_step",
    "throughput_ratio",
    "mem_alloc_ratio",
    "mem_reserved_ratio",
]]
conv_layer_best_df


Unnamed: 0,metric,layer,batch_size,layer_type_baseline,kernel_size_baseline,stride_baseline,padding_baseline,dilation_baseline,channel_keep_ratio_baseline,input_keep_ratio_baseline,...,grad_block_size_baseline,avg_forward_ms_baseline,avg_backward_ms_baseline,avg_step_ms_baseline,speedup_forward,speedup_backward,speedup_step,throughput_ratio,mem_alloc_ratio,mem_reserved_ratio
0,forward_time,conv1,32,TritonConv2d,"(7, 7)","(2, 2)","(3, 3)","(1, 1)",1.0,1.0,...,,0.442019,0.776563,1.218582,0.18788,0.207099,0.200128,0.198495,1.298929,1.323529
1,forward_time,layer1.1.conv1,128,TritonConv2d,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,...,,0.460432,0.989592,1.450024,0.175373,0.177064,0.176527,0.175538,1.869446,1.0
2,forward_time,layer1.1.conv2,128,TritonConv2d,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,...,,0.460854,0.991088,1.451942,0.144063,0.172061,0.163175,0.163142,1.869446,1.0
3,forward_time,layer3.1.conv2,64,TritonConv2d,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,...,,0.466453,0.449296,0.915749,0.168451,0.403102,0.283578,0.283652,1.204192,1.0
4,forward_time,layer2.0.conv2,192,TritonConv2d,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,...,,0.476136,0.615542,1.091678,0.145632,0.273928,0.217972,0.218212,1.69105,1.5
5,forward_time,layer4.0.conv1,128,TritonConv2d,"(3, 3)","(2, 2)","(1, 1)","(1, 1)",1.0,1.0,...,,0.520618,0.392453,0.91307,0.163873,0.491844,0.30484,0.30346,1.233255,1.0
6,forward_time,layer1.0.conv2,128,TritonConv2d,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,...,,0.541578,1.024398,1.565976,0.184247,0.242309,0.222229,0.220613,1.869446,1.0
7,forward_time,layer3.1.conv1,64,TritonConv2d,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,...,,0.562176,0.472083,1.034259,0.135138,0.346153,0.231455,0.232097,1.204192,1.285714
8,forward_time,layer1.0.conv1,128,TritonConv2d,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,...,,0.56672,1.013517,1.580237,0.148735,0.185033,0.172016,0.169039,1.869446,2.027027
9,forward_time,layer3.0.conv2,64,TritonConv2d,"(3, 3)","(1, 1)","(1, 1)","(1, 1)",1.0,1.0,...,,0.568173,0.43604,1.004213,0.138006,0.379804,0.242997,0.242941,1.204192,1.333333
