# ResNet18 Baseline Conv2d Benchmark

Сравнение nn.Conv2d и кастомной img2col→GEMM свёртки (Baseline TritonConv2d) на ResNet18 с разными batch size и сценариями спарсификации.


In [1]:
import sys, pathlib
sys.path.insert(0, str(pathlib.Path().resolve().parent))


In [2]:
import copy
import json
import math
import random
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision
from torchvision import transforms

from conv_gemm.baseline_layers.triton_conv2d import TritonConv2d


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type != "cuda":
    raise RuntimeError("CUDA GPU is required for this benchmark")

seed = 42
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = True

data_root = Path("../data").resolve()
data_root.mkdir(parents=True, exist_ok=True)

config = {
    "data_root": str(data_root),
    "num_classes": 10,
    "batch_sizes": [32, 64, 96, 128, 160, 192, 256],
    "num_workers": 4,
    "train_subset": 8192,
    "lr": 1e-3,
    "momentum": 0.9,
    "weight_decay": 5e-4,
    "warmup_steps": 5,
    "benchmark_steps": 40,
    "baseline_conv": {
        "BLOCK_M": 64,
        "BLOCK_N": 64,
        "BLOCK_K": 64,
        "NUM_WARPS": 4,
        "NUM_STAGES": 2,
    },
    "sparsity_bench": {
        "modes": ["channel", "block", "input"],
        "keep_ratios": [0.75, 0.6, 0.5, 0.25],
        "block_size": 4,
        "batch_size": 128,
    },
}
print(json.dumps(config, indent=2))


{
  "data_root": "/home/manzhura/ITMO/EDLM/conv2d-img2col-gemm/data",
  "num_classes": 10,
  "batch_sizes": [
    32,
    64,
    96,
    128,
    160,
    192,
    256
  ],
  "num_workers": 4,
  "train_subset": 8192,
  "lr": 0.001,
  "momentum": 0.9,
  "weight_decay": 0.0005,
  "warmup_steps": 5,
  "benchmark_steps": 40,
  "baseline_conv": {
    "BLOCK_M": 64,
    "BLOCK_N": 64,
    "BLOCK_K": 64,
    "NUM_WARPS": 4,
    "NUM_STAGES": 2
  },
  "sparsity_bench": {
    "modes": [
      "channel",
      "block",
      "input"
    ],
    "keep_ratios": [
      0.75,
      0.6,
      0.5,
      0.25
    ],
    "block_size": 4,
    "batch_size": 128
  }
}


In [4]:
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

full_train = torchvision.datasets.CIFAR10(
    root=config["data_root"], train=True, download=True, transform=transform_train
)
if config["train_subset"] is not None and config["train_subset"] < len(full_train):
    g = torch.Generator().manual_seed(seed)
    subset_idx = torch.randperm(len(full_train), generator=g)[: config["train_subset"]]
    train_dataset = torch.utils.data.Subset(full_train, subset_idx)
else:
    train_dataset = full_train


def make_loader(batch_size: int) -> DataLoader:
    return DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=config["num_workers"],
        pin_memory=True,
    )

train_loaders: Dict[int, DataLoader] = {}
for bs in config["batch_sizes"]:
    train_loaders[bs] = make_loader(bs)

print({bs: len(loader) for bs, loader in train_loaders.items()})


{32: 256, 64: 128, 96: 85, 128: 64, 160: 51, 192: 42, 256: 32}


In [8]:
def make_triton_conv(src: nn.Conv2d, cfg: dict) -> TritonConv2d:
    if src.groups != 1:
        raise ValueError("Baseline TritonConv2d currently supports groups=1 only")
    layer = TritonConv2d(
        in_channels=src.in_channels,
        out_channels=src.out_channels,
        kernel_size=src.kernel_size,
        stride=src.stride,
        padding=src.padding,
        dilation=src.dilation,
        bias=(src.bias is not None),
        # **cfg,
    ).to(src.weight.device)
    with torch.no_grad():
        layer.weight.copy_(src.weight.detach().to(layer.weight.dtype))
        if layer.bias is not None and src.bias is not None:
            layer.bias.copy_(src.bias.detach().to(layer.bias.dtype))
    return layer


def replace_convs_with_baseline(module: nn.Module, cfg: dict):
    for name, child in module.named_children():
        if isinstance(child, nn.Conv2d):
            setattr(module, name, make_triton_conv(child, cfg))
        else:
            replace_convs_with_baseline(child, cfg)


def build_model_pair(config: dict):
    reference = torchvision.models.resnet18(num_classes=config["num_classes"])
    baseline = copy.deepcopy(reference)
    replace_convs_with_baseline(baseline, config["baseline_conv"])
    return reference.half(), baseline.half()


def apply_sparsity_to_model(model: nn.Module, mode: str, keep_ratio: float, block_size: int = 4):
    for layer in model.modules():
        if isinstance(layer, TritonConv2d):
            layer.clear_sparsity()
            if keep_ratio >= 1.0:
                continue
            if mode == "channel":
                layer.set_channel_sparsity(keep_ratio)
                layer.set_backward_channel_sparsity(keep_ratio)
            elif mode == "block":
                layer.set_block_sparsity(keep_ratio, block_size=block_size)
                layer.set_backward_block_sparsity(keep_ratio, block_size=block_size)
            elif mode == "input":
                layer.set_input_channel_sparsity(keep_ratio)
                layer.set_backward_input_channel_sparsity(keep_ratio)
            else:
                raise ValueError(f"Unknown sparsity mode: {mode}")


In [9]:
def run_benchmark(model: nn.Module, label: str, loader: DataLoader, config: dict):
    model = model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config["lr"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
    )

    warmup = config["warmup_steps"]
    total_steps = config["benchmark_steps"]
    records = []
    data_iter = iter(loader)

    for step in range(total_steps):
        try:
            images, targets = next(data_iter)
        except StopIteration:
            data_iter = iter(loader)
            images, targets = next(data_iter)

        images = images.half().to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        torch.cuda.reset_peak_memory_stats(device)
        torch.cuda.synchronize()

        fwd_start = torch.cuda.Event(enable_timing=True)
        fwd_end = torch.cuda.Event(enable_timing=True)
        bwd_start = torch.cuda.Event(enable_timing=True)
        bwd_end = torch.cuda.Event(enable_timing=True)

        fwd_start.record()
        outputs = model(images)
        fwd_end.record()
        loss = criterion(outputs, targets)

        bwd_start.record()
        loss.backward()
        bwd_end.record()
        optimizer.step()

        torch.cuda.synchronize()

        fwd_ms = fwd_start.elapsed_time(fwd_end)
        bwd_ms = bwd_start.elapsed_time(bwd_end)
        step_ms = fwd_ms + bwd_ms
        mem_alloc = torch.cuda.max_memory_allocated(device) / 1024 ** 2
        mem_reserved = torch.cuda.max_memory_reserved(device) / 1024 ** 2

        if step >= warmup:
            records.append({
                "label": label,
                "step": step,
                "loss": float(loss.item()),
                "fwd_ms": fwd_ms,
                "bwd_ms": bwd_ms,
                "step_ms": step_ms,
                "throughput_sps": images.size(0) / (step_ms / 1000.0),
                "max_mem_alloc_mb": mem_alloc,
                "max_mem_reserved_mb": mem_reserved,
            })

    if not records:
        raise RuntimeError("No data recorded for benchmark")

    df = pd.DataFrame(records)
    summary = {
        "label": label,
        "avg_forward_ms": df["fwd_ms"].mean(),
        "avg_backward_ms": df["bwd_ms"].mean(),
        "avg_step_ms": df["step_ms"].mean(),
        "samples_per_s": df["throughput_sps"].mean(),
        "max_mem_alloc_mb": df["max_mem_alloc_mb"].max(),
        "max_mem_reserved_mb": df["max_mem_reserved_mb"].max(),
    }
    return df, summary


Таблица `summary_df` показывает средние метрики по каждому batch size: `avg_forward_ms`, `avg_backward_ms`, `avg_step_ms`, `samples_per_s`, а также пики памяти (`max_mem_alloc_mb`, `max_mem_reserved_mb`).


In [10]:
batch_summaries = []
batch_details = []

for bs, loader in train_loaders.items():
    print(f"=== Batch size {bs} ===")
    torch_model, baseline_model = build_model_pair(config)

    torch_df, torch_summary = run_benchmark(torch_model, f"nn.Conv2d (bs={bs})", loader, config)
    torch_summary.update({"variant": "nn.Conv2d", "batch_size": bs})
    batch_summaries.append(torch_summary)
    batch_details.append(torch_df.assign(variant="nn.Conv2d", batch_size=bs))

    baseline_df, baseline_summary = run_benchmark(baseline_model, f"Baseline TritonConv2d (bs={bs})", loader, config)
    baseline_summary.update({"variant": "Baseline TritonConv2d", "batch_size": bs})
    batch_summaries.append(baseline_summary)
    batch_details.append(baseline_df.assign(variant="Baseline TritonConv2d", batch_size=bs))

summary_df = pd.DataFrame(batch_summaries).set_index(["variant", "batch_size"])
summary_df


=== Batch size 32 ===
=== Batch size 64 ===
=== Batch size 96 ===
=== Batch size 128 ===
=== Batch size 160 ===
=== Batch size 192 ===
=== Batch size 256 ===


Unnamed: 0_level_0,Unnamed: 1_level_0,label,avg_forward_ms,avg_backward_ms,avg_step_ms,samples_per_s,max_mem_alloc_mb,max_mem_reserved_mb
variant,batch_size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
nn.Conv2d,32,nn.Conv2d (bs=32),3.352386,3.272921,6.625306,4944.97642,134.588379,144.0
Baseline TritonConv2d,32,Baseline TritonConv2d (bs=32),14.967819,10.237487,25.205306,1315.25864,184.434082,212.0
nn.Conv2d,64,nn.Conv2d (bs=64),3.449143,3.612469,7.061612,9263.492765,133.901367,146.0
Baseline TritonConv2d,64,Baseline TritonConv2d (bs=64),14.682221,12.818234,27.500455,2421.983281,198.357422,234.0
nn.Conv2d,96,nn.Conv2d (bs=96),4.871213,4.244751,9.115963,12463.983149,135.214844,170.0
Baseline TritonConv2d,96,Baseline TritonConv2d (bs=96),14.985442,13.275414,28.260856,3437.792421,222.795898,276.0
nn.Conv2d,128,nn.Conv2d (bs=128),3.484797,4.517356,8.002153,17313.273701,143.02002,162.0
Baseline TritonConv2d,128,Baseline TritonConv2d (bs=128),18.745069,14.676995,33.422063,4092.673452,245.483887,308.0
nn.Conv2d,160,nn.Conv2d (bs=160),3.765251,3.878726,7.643977,21641.398212,148.897949,162.0
Baseline TritonConv2d,160,Baseline TritonConv2d (bs=160),14.19638,18.071107,32.267487,5117.24665,267.797852,314.0


Вывод `detail_df.groupby(...).describe()` содержит count/mean/std/min/25%/50%/75%/max для метрик `step_ms`, `fwd_ms`, `bwd_ms`, `max_mem_alloc_mb` отдельно по каждому `(variant, batch_size)`.


In [11]:
detail_df = pd.concat(batch_details, ignore_index=True)
metrics = ["step_ms", "fwd_ms", "bwd_ms", "max_mem_alloc_mb"]
detail_df.groupby(["variant", "batch_size"])[metrics].describe()


Unnamed: 0_level_0,Unnamed: 1_level_0,step_ms,step_ms,step_ms,step_ms,step_ms,step_ms,step_ms,step_ms,fwd_ms,fwd_ms,...,bwd_ms,bwd_ms,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb,max_mem_alloc_mb
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
variant,batch_size,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Baseline TritonConv2d,32,35.0,25.205306,5.661807,20.064992,22.152496,23.545792,25.194736,45.035456,35.0,14.967819,...,10.262912,31.393408,35.0,184.434082,0.0,184.434082,184.434082,184.434082,184.434082,184.434082
Baseline TritonConv2d,64,35.0,27.500455,6.705584,22.549472,23.527472,24.793759,27.669617,50.465823,35.0,14.682221,...,13.02016,29.878271,35.0,198.357422,0.0,198.357422,198.357422,198.357422,198.357422,198.357422
Baseline TritonConv2d,96,35.0,28.260856,3.328288,23.014912,26.144688,27.162752,29.711168,38.442881,35.0,14.985442,...,13.830144,18.161663,35.0,222.795898,0.0,222.795898,222.795898,222.795898,222.795898,222.795898
Baseline TritonConv2d,128,35.0,33.422063,9.799308,23.451936,26.837024,28.814336,38.268896,58.330786,35.0,18.745069,...,14.815232,36.040703,35.0,245.483887,0.0,245.483887,245.483887,245.483887,245.483887,245.483887
Baseline TritonConv2d,160,35.0,32.267487,6.840601,26.865376,27.99992,29.643712,33.469808,60.41888,35.0,14.19638,...,17.794559,45.735935,35.0,267.797852,0.0,267.797852,267.797852,267.797852,267.797852,267.797852
Baseline TritonConv2d,192,35.0,34.034393,6.863784,27.454624,30.557808,31.530016,34.828513,58.785151,35.0,14.511505,...,18.081792,46.098431,35.0,290.48584,0.0,290.48584,290.48584,290.48584,290.48584,290.48584
Baseline TritonConv2d,256,35.0,38.468748,6.607062,31.87424,34.618976,37.166624,39.910048,62.222399,35.0,14.575157,...,24.30504,41.905025,35.0,330.862305,0.0,330.862305,330.862305,330.862305,330.862305,330.862305
nn.Conv2d,32,35.0,6.625306,1.094632,5.305344,5.846768,6.147808,7.2128,9.580544,35.0,3.352386,...,3.766032,6.1696,35.0,134.588379,0.0,134.588379,134.588379,134.588379,134.588379,134.588379
nn.Conv2d,64,35.0,7.061612,1.164192,5.438464,6.523296,6.748544,7.613648,11.556416,35.0,3.449143,...,4.11904,6.786048,35.0,133.901367,0.0,133.901367,133.901367,133.901367,133.901367,133.901367
nn.Conv2d,96,35.0,9.115963,6.323146,5.613088,6.458848,7.282304,9.238448,38.306112,35.0,4.871213,...,4.218368,17.64352,35.0,135.214844,0.0,135.214844,135.214844,135.214844,135.214844,135.214844


`baseline_vs_torch_df` сравнивает nn.Conv2d и Baseline TritonConv2d: пары столбцов с абсолютными значениями (forward/backward/step время, throughput, память) и коэффициенты ускорения (`speedup_*`, `throughput_ratio`, `mem_*_ratio`).


In [12]:
baseline_compare_rows = []
for bs in config["batch_sizes"]:
    torch_row = summary_df.loc[("nn.Conv2d", bs)]
    baseline_row = summary_df.loc[("Baseline TritonConv2d", bs)]
    comparison = {
        "batch_size": bs,
        "torch_forward_ms": torch_row["avg_forward_ms"],
        "baseline_forward_ms": baseline_row["avg_forward_ms"],
        "torch_backward_ms": torch_row["avg_backward_ms"],
        "baseline_backward_ms": baseline_row["avg_backward_ms"],
        "torch_step_ms": torch_row["avg_step_ms"],
        "baseline_step_ms": baseline_row["avg_step_ms"],
        "torch_samples_per_s": torch_row["samples_per_s"],
        "baseline_samples_per_s": baseline_row["samples_per_s"],
        "speedup_forward": torch_row["avg_forward_ms"] / baseline_row["avg_forward_ms"],
        "speedup_backward": torch_row["avg_backward_ms"] / baseline_row["avg_backward_ms"],
        "speedup_step": torch_row["avg_step_ms"] / baseline_row["avg_step_ms"],
        "throughput_ratio": baseline_row["samples_per_s"] / torch_row["samples_per_s"],
        "torch_mem_alloc_mb": torch_row["max_mem_alloc_mb"],
        "baseline_mem_alloc_mb": baseline_row["max_mem_alloc_mb"],
        "torch_mem_reserved_mb": torch_row["max_mem_reserved_mb"],
        "baseline_mem_reserved_mb": baseline_row["max_mem_reserved_mb"],
        "mem_alloc_ratio": baseline_row["max_mem_alloc_mb"] / torch_row["max_mem_alloc_mb"],
        "mem_reserved_ratio": baseline_row["max_mem_reserved_mb"] / torch_row["max_mem_reserved_mb"],
    }
    baseline_compare_rows.append(comparison)

baseline_vs_torch_df = pd.DataFrame(baseline_compare_rows).set_index("batch_size")
baseline_vs_torch_df


Unnamed: 0_level_0,torch_forward_ms,baseline_forward_ms,torch_backward_ms,baseline_backward_ms,torch_step_ms,baseline_step_ms,torch_samples_per_s,baseline_samples_per_s,speedup_forward,speedup_backward,speedup_step,throughput_ratio,torch_mem_alloc_mb,baseline_mem_alloc_mb,torch_mem_reserved_mb,baseline_mem_reserved_mb,mem_alloc_ratio,mem_reserved_ratio
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
32,3.352386,14.967819,3.272921,10.237487,6.625306,25.205306,4944.97642,1315.25864,0.223973,0.3197,0.262854,0.265979,134.588379,184.434082,144.0,212.0,1.370357,1.472222
64,3.449143,14.682221,3.612469,12.818234,7.061612,27.500455,9263.492765,2421.983281,0.23492,0.281823,0.256782,0.261455,133.901367,198.357422,146.0,234.0,1.48137,1.60274
96,4.871213,14.985442,4.244751,13.275414,9.115963,28.260856,12463.983149,3437.792421,0.325063,0.319745,0.322565,0.275818,135.214844,222.795898,170.0,276.0,1.647718,1.623529
128,3.484797,18.745069,4.517356,14.676995,8.002153,33.422063,17313.273701,4092.673452,0.185905,0.307785,0.239427,0.236389,143.02002,245.483887,162.0,308.0,1.71643,1.901235
160,3.765251,14.19638,3.878726,18.071107,7.643977,32.267487,21641.398212,5117.24665,0.265226,0.214637,0.236894,0.236456,148.897949,267.797852,162.0,314.0,1.798533,1.938272
192,3.79072,14.511505,5.501792,19.522888,9.292512,34.034393,23614.536015,5805.30109,0.261222,0.281812,0.273033,0.245836,155.960938,290.48584,196.0,340.0,1.862555,1.734694
256,3.992804,14.575157,4.867109,23.893591,8.859913,38.468748,30712.047041,6802.338198,0.273946,0.203699,0.230315,0.221488,170.462402,330.862305,200.0,524.0,1.940969,2.62


Таблица `summary_df` показывает средние метрики по каждому batch size: `avg_forward_ms`, `avg_backward_ms`, `avg_step_ms`, `samples_per_s`, а также пики памяти (`max_mem_alloc_mb`, `max_mem_reserved_mb`).


In [13]:
sparsity_cfg = config["sparsity_bench"]
sparsity_bs = sparsity_cfg["batch_size"]
if sparsity_bs not in train_loaders:
    train_loaders[sparsity_bs] = make_loader(sparsity_bs)
sparsity_loader = train_loaders[sparsity_bs]

sparsity_summaries = []
sparsity_details = []

for mode in sparsity_cfg["modes"]:
    for ratio in sparsity_cfg["keep_ratios"]:
        _, baseline_model = build_model_pair(config)
        apply_sparsity_to_model(
            baseline_model,
            mode,
            keep_ratio=ratio,
            block_size=sparsity_cfg.get("block_size", 4),
        )
        label = f"{mode.capitalize()} sparsity (keep={ratio:.2f}, bs={sparsity_bs})"
        bench_df, bench_summary = run_benchmark(baseline_model, label, sparsity_loader, config)
        bench_summary.update({
            "variant": f"Sparsity::{mode}",
            "mode": mode,
            "keep_ratio": ratio,
            "batch_size": sparsity_bs,
        })
        sparsity_summaries.append(bench_summary)
        sparsity_details.append(
            bench_df.assign(variant=f"Sparsity::{mode}", mode=mode, keep_ratio=ratio, batch_size=sparsity_bs)
        )

sparsity_summary_df = pd.DataFrame(sparsity_summaries).sort_values("samples_per_s", ascending=False).reset_index(drop=True)
sparsity_summary_df


Unnamed: 0,label,avg_forward_ms,avg_backward_ms,avg_step_ms,samples_per_s,max_mem_alloc_mb,max_mem_reserved_mb,variant,mode,keep_ratio,batch_size
0,"Input sparsity (keep=0.25, bs=128)",22.774992,15.296472,38.071464,3527.153975,201.454102,530.0,Sparsity::input,input,0.25,128
1,"Input sparsity (keep=0.75, bs=128)",22.180967,15.33512,37.516087,3511.374428,233.928223,530.0,Sparsity::input,input,0.75,128
2,"Block sparsity (keep=0.50, bs=128)",22.696155,15.941438,38.637594,3432.325143,240.817871,530.0,Sparsity::block,block,0.5,128
3,"Channel sparsity (keep=0.25, bs=128)",24.155531,15.187616,39.343147,3373.784959,242.205078,530.0,Sparsity::channel,channel,0.25,128
4,"Input sparsity (keep=0.50, bs=128)",23.975558,15.610528,39.586086,3353.336249,219.550293,530.0,Sparsity::input,input,0.5,128
5,"Channel sparsity (keep=0.75, bs=128)",23.896154,16.314183,40.210337,3348.705365,242.558105,528.0,Sparsity::channel,channel,0.75,128
6,"Block sparsity (keep=0.25, bs=128)",24.241746,15.560325,39.802071,3339.391017,242.205078,530.0,Sparsity::block,block,0.25,128
7,"Channel sparsity (keep=0.50, bs=128)",23.52389,17.282415,40.806305,3251.883801,240.817871,530.0,Sparsity::channel,channel,0.5,128
8,"Block sparsity (keep=0.75, bs=128)",24.818793,16.722576,41.541369,3222.426748,242.558105,530.0,Sparsity::block,block,0.75,128
9,"Channel sparsity (keep=0.60, bs=128)",24.400073,17.726504,42.126578,3170.031484,243.114746,530.0,Sparsity::channel,channel,0.6,128


`sparsity_compare_df` добавляет к тем же сценариям относительные значения относительно эталонного nn.Conv2d (`speedup_*_vs_torch`, `throughput_ratio_vs_torch`, `mem_*_ratio_vs_torch`).


In [14]:
sparsity_reference = summary_df.loc[("nn.Conv2d", sparsity_bs)]

sparsity_compare_df = sparsity_summary_df.copy()
sparsity_compare_df["speedup_forward_vs_torch"] = sparsity_reference["avg_forward_ms"] / sparsity_compare_df["avg_forward_ms"]
sparsity_compare_df["speedup_backward_vs_torch"] = sparsity_reference["avg_backward_ms"] / sparsity_compare_df["avg_backward_ms"]
sparsity_compare_df["speedup_step_vs_torch"] = sparsity_reference["avg_step_ms"] / sparsity_compare_df["avg_step_ms"]
sparsity_compare_df["throughput_ratio_vs_torch"] = sparsity_compare_df["samples_per_s"] / sparsity_reference["samples_per_s"]
sparsity_compare_df["mem_alloc_ratio_vs_torch"] = sparsity_compare_df["max_mem_alloc_mb"] / sparsity_reference["max_mem_alloc_mb"]
sparsity_compare_df["mem_reserved_ratio_vs_torch"] = sparsity_compare_df["max_mem_reserved_mb"] / sparsity_reference["max_mem_reserved_mb"]
sparsity_compare_df = sparsity_compare_df.sort_values("samples_per_s", ascending=False).reset_index(drop=True)
sparsity_compare_df


Unnamed: 0,label,avg_forward_ms,avg_backward_ms,avg_step_ms,samples_per_s,max_mem_alloc_mb,max_mem_reserved_mb,variant,mode,keep_ratio,batch_size,speedup_forward_vs_torch,speedup_backward_vs_torch,speedup_step_vs_torch,throughput_ratio_vs_torch,mem_alloc_ratio_vs_torch,mem_reserved_ratio_vs_torch
0,"Input sparsity (keep=0.25, bs=128)",22.774992,15.296472,38.071464,3527.153975,201.454102,530.0,Sparsity::input,input,0.25,128,0.15301,0.29532,0.210188,0.203725,1.408573,3.271605
1,"Input sparsity (keep=0.75, bs=128)",22.180967,15.33512,37.516087,3511.374428,233.928223,530.0,Sparsity::input,input,0.75,128,0.157108,0.294576,0.213299,0.202814,1.635633,3.271605
2,"Block sparsity (keep=0.50, bs=128)",22.696155,15.941438,38.637594,3432.325143,240.817871,530.0,Sparsity::block,block,0.5,128,0.153541,0.283372,0.207108,0.198248,1.683805,3.271605
3,"Channel sparsity (keep=0.25, bs=128)",24.155531,15.187616,39.343147,3373.784959,242.205078,530.0,Sparsity::channel,channel,0.25,128,0.144265,0.297437,0.203394,0.194867,1.693505,3.271605
4,"Input sparsity (keep=0.50, bs=128)",23.975558,15.610528,39.586086,3353.336249,219.550293,530.0,Sparsity::input,input,0.5,128,0.145348,0.289379,0.202146,0.193686,1.535102,3.271605
5,"Channel sparsity (keep=0.75, bs=128)",23.896154,16.314183,40.210337,3348.705365,242.558105,528.0,Sparsity::channel,channel,0.75,128,0.145831,0.276897,0.199007,0.193418,1.695973,3.259259
6,"Block sparsity (keep=0.25, bs=128)",24.241746,15.560325,39.802071,3339.391017,242.205078,530.0,Sparsity::block,block,0.25,128,0.143752,0.290312,0.201049,0.19288,1.693505,3.271605
7,"Channel sparsity (keep=0.50, bs=128)",23.52389,17.282415,40.806305,3251.883801,240.817871,530.0,Sparsity::channel,channel,0.5,128,0.148139,0.261385,0.196101,0.187826,1.683805,3.271605
8,"Block sparsity (keep=0.75, bs=128)",24.818793,16.722576,41.541369,3222.426748,242.558105,530.0,Sparsity::block,block,0.75,128,0.14041,0.270135,0.192631,0.186125,1.695973,3.271605
9,"Channel sparsity (keep=0.60, bs=128)",24.400073,17.726504,42.126578,3170.031484,243.114746,530.0,Sparsity::channel,channel,0.6,128,0.142819,0.254836,0.189955,0.183098,1.699865,3.271605


`ranking_df` — упорядоченный рейтинг сценариев спарсификации: показывает `mode`, `keep_ratio`, абсолютный throughput и его отношение к торчу, а также ускорения forward/backward/step и изменение памяти.


In [15]:
ranking_df = sparsity_compare_df[[
    "variant",
    "mode",
    "keep_ratio",
    "samples_per_s",
    "throughput_ratio_vs_torch",
    "speedup_forward_vs_torch",
    "speedup_backward_vs_torch",
    "speedup_step_vs_torch",
    "mem_alloc_ratio_vs_torch",
    "mem_reserved_ratio_vs_torch",
]].copy()
ranking_df = ranking_df.sort_values("throughput_ratio_vs_torch", ascending=False).reset_index(drop=True)
ranking_df


Unnamed: 0,variant,mode,keep_ratio,samples_per_s,throughput_ratio_vs_torch,speedup_forward_vs_torch,speedup_backward_vs_torch,speedup_step_vs_torch,mem_alloc_ratio_vs_torch,mem_reserved_ratio_vs_torch
0,Sparsity::input,input,0.25,3527.153975,0.203725,0.15301,0.29532,0.210188,1.408573,3.271605
1,Sparsity::input,input,0.75,3511.374428,0.202814,0.157108,0.294576,0.213299,1.635633,3.271605
2,Sparsity::block,block,0.5,3432.325143,0.198248,0.153541,0.283372,0.207108,1.683805,3.271605
3,Sparsity::channel,channel,0.25,3373.784959,0.194867,0.144265,0.297437,0.203394,1.693505,3.271605
4,Sparsity::input,input,0.5,3353.336249,0.193686,0.145348,0.289379,0.202146,1.535102,3.271605
5,Sparsity::channel,channel,0.75,3348.705365,0.193418,0.145831,0.276897,0.199007,1.695973,3.259259
6,Sparsity::block,block,0.25,3339.391017,0.19288,0.143752,0.290312,0.201049,1.693505,3.271605
7,Sparsity::channel,channel,0.5,3251.883801,0.187826,0.148139,0.261385,0.196101,1.683805,3.271605
8,Sparsity::block,block,0.75,3222.426748,0.186125,0.14041,0.270135,0.192631,1.695973,3.271605
9,Sparsity::channel,channel,0.6,3170.031484,0.183098,0.142819,0.254836,0.189955,1.699865,3.271605
