# Поиск быстрых конфигураций (shape × блоки Triton × sparsity)

Этот ноутбук перебирает сетку параметров свёртки (Kh/Kw, H/W, batch B), тюнинг блоков Triton (BLOCK_M/N/K, NUM_WARPS/NUM_STAGES) и три режима разрежения (`channel`, `block`, `input` с `keep_ratio`). Цель — найти комбинации, где `TritonConv2d` быстрее `nn.Conv2d` при приемлемой точности. Все замеры на CUDA, с прогревом.

**Метрики, которые считаются и выводятся:**
- `time_ms` — среднее время forward TritonConv2d (мс) для конкретной конфигурации.
- `torch_time_ms` — среднее время `nn.Conv2d` на той же задаче.
- `speedup_vs_torch` — отношение `torch_time_ms / time_ms`; >1 — Triton быстрее, <1 — медленнее.
- Ошибки (Triton vs torch): `mae`, `max`, `rel_l2`.
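- Описание задачи: `kernel` (имя вида `KhxKw@H`, например `3x3@56`; для вариантов со stride/dilation добавляется суффикс, например `3x3_s2@112`; размеры `B`, `H`, `W`, `k` вынесены в отдельные столбцы) и `block` (компактная строка `BLOCK_M-BLOCK_N-BLOCK_K/NUM_WARPSxNUM_STAGES`).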
- Sparsity: `mode` (`dense`, `channel`, `block`, `input`), `keep_ratio` (доля оставленных каналов), `block_size` (для block sparsity; в текущем коде используется значение 4, отдельным столбцом оно не сохраняется), а также явные поля BLOCK_* для удобства фильтрации.

**Как читать результаты:**
- Сравнивайте `speedup_vs_torch` и ошибки: строки с `speedup_vs_torch > 1` и малыми `mae`/`rel_l2` — кандидаты на “лучшие”.
- При малом `keep_ratio` ожидается ускорение, но эффективность зависит от паддинга K (img2col) и выбранных BLOCK_*; иногда `speedup_vs_torch` падает.
- Для block sparsity оценивайте сочетание `keep_ratio` и `block_size` — группировка может улучшать или ухудшать тайлинг.


In [1]:
import sys, pathlib
# Put the parent of the current working directory first on sys.path.
# NOTE(review): presumably this is the repository root that contains the
# `conv_gemm` package imported by later cells -- confirm the notebook's cwd.
sys.path.insert(0, str(pathlib.Path().resolve().parent))
print('sys.path[0]=', sys.path[0])

sys.path[0]= /home/manzhura/ITMO/EDLM/conv2d-img2col-gemm


In [None]:
import time
import torch
import pandas as pd
from pathlib import Path
import importlib
from conv_gemm.baseline_layers.triton_conv2d import TritonConv2d
import conv_gemm.baseline_operators.triton_conv2d_fp16_fn as tri_fn
from conv_gemm.configs import kernel_config as kc

# Turn on cuDNN benchmark mode for the whole session.
torch.backends.cudnn.benchmark = True
# Run on CUDA when it is available; the CPU fallback uses float32 instead of
# float16.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.float16 if device == 'cuda' else torch.float32
print(f'device={device}, dtype={dtype}')


device=cuda, dtype=torch.float16


In [None]:
def sync_device():
    """Block until queued CUDA work has finished; do nothing on other devices."""
    if device != 'cuda':
        return
    torch.cuda.synchronize()

@torch.no_grad()
def clone_weights(dst: torch.nn.Module, src: torch.nn.Module):
    """Copy ``src``'s weight (and bias, when both modules have one) into ``dst``.

    The copy is done in place with gradient tracking disabled.
    """
    dst.weight.copy_(src.weight)
    # The bias is copied only when it exists on both sides.
    if dst.bias is None or src.bias is None:
        return
    dst.bias.copy_(src.bias)

def benchmark_layer(layer: torch.nn.Module, x: torch.Tensor, warmup: int = 10, iters: int = 50) -> float:
    """Return the average forward time of ``layer`` on ``x`` in milliseconds.

    The layer is switched to eval mode and run under ``torch.no_grad()``.
    ``warmup`` untimed forwards are executed first, then ``iters`` timed ones;
    the total elapsed time is divided by ``iters``.
    """
    def _repeat_forward(count: int) -> None:
        for _ in range(count):
            layer(x)

    layer.eval()
    with torch.no_grad():
        _repeat_forward(warmup)
        sync_device()
        if device != 'cuda':
            # Non-CUDA path: wall-clock timing converted to milliseconds.
            began = time.perf_counter()
            _repeat_forward(iters)
            total_ms = (time.perf_counter() - began) * 1e3
        else:
            # CUDA path: time with events recorded around the timed loop.
            start_evt = torch.cuda.Event(enable_timing=True)
            stop_evt = torch.cuda.Event(enable_timing=True)
            start_evt.record()
            _repeat_forward(iters)
            stop_evt.record()
            # Wait for the device before reading the elapsed time.
            sync_device()
            total_ms = start_evt.elapsed_time(stop_evt)
    return total_ms / iters

def build_torch_conv(cfg):
    """Create the reference ``torch.nn.Conv2d`` (with bias) for a shape config.

    ``cfg`` must provide ``in_channels``, ``out_channels``, ``kernel_size``,
    ``stride``, ``padding`` and ``dilation``. The module is moved to the
    notebook-level ``device`` and ``dtype``.
    """
    conv_keys = ('in_channels', 'out_channels', 'kernel_size', 'stride', 'padding', 'dilation')
    conv_kwargs = {key: cfg[key] for key in conv_keys}
    conv = torch.nn.Conv2d(bias=True, **conv_kwargs)
    return conv.to(device=device, dtype=dtype)

def calc_diff(ref: torch.Tensor, test: torch.Tensor):
    """Compare ``test`` against ``ref`` and return error statistics.

    Both tensors are converted to float32 *before* any arithmetic, so
    half-precision inputs neither lose precision in the subtraction nor
    overflow when the reference norm is computed.

    Returns a dict with:
      * ``mae``    - mean absolute error,
      * ``max``    - maximum absolute error,
      * ``rel_l2`` - ``||ref - test|| / ||ref||`` (not finite when ``ref`` is
        all zeros).
    """
    ref32 = ref.float()
    diff = ref32 - test.float()
    abs_diff = diff.abs()
    return {
        'mae': abs_diff.mean().item(),
        'max': abs_diff.max().item(),
        'rel_l2': (torch.norm(diff) / torch.norm(ref32)).item(),
    }

def apply_sparsity(layer: "TritonConv2d", mode: str, ratio: float, block_size: int = 4):
    """Reset ``layer``'s sparsity and apply the requested mode.

    Args:
        layer: object exposing ``clear_sparsity``, ``set_channel_sparsity``,
            ``set_block_sparsity`` and ``set_input_channel_sparsity``.
        mode: one of ``'dense'``, ``'channel'``, ``'block'``, ``'input'``.
            ``'dense'`` only clears any existing sparsity.
        ratio: keep ratio forwarded to the selected ``set_*`` method.
        block_size: forwarded to ``set_block_sparsity``; used only when
            ``mode == 'block'``. Defaults to 4.

    Raises:
        ValueError: if ``mode`` is not one of the supported values. The check
            runs before the layer is touched, so a bad mode leaves it as is.
    """
    if mode not in ('dense', 'channel', 'block', 'input'):
        raise ValueError(f'unknown mode {mode}')
    layer.clear_sparsity()
    if mode == 'channel':
        layer.set_channel_sparsity(ratio)
    elif mode == 'block':
        layer.set_block_sparsity(ratio, block_size=block_size)
    elif mode == 'input':
        layer.set_input_channel_sparsity(ratio)

def apply_block_cfg(cfg):
    """Install the Triton GEMM tile configuration described by ``cfg``.

    ``cfg`` must provide the keys ``BLOCK_M``, ``BLOCK_N``, ``BLOCK_K``,
    ``NUM_WARPS`` and ``NUM_STAGES``; other keys (such as ``name``) are not read.
    """
    # Build the new config and bind it on the operator module.
    tri_fn.FP16_GEMM_CFG = kc.KernelConfig(
        BLOCK_M=cfg['BLOCK_M'],
        BLOCK_N=cfg['BLOCK_N'],
        BLOCK_K=cfg['BLOCK_K'],
        NUM_WARPS=cfg['NUM_WARPS'],
        NUM_STAGES=cfg['NUM_STAGES'],
    )
    # Bind the very same object on the config module as well.
    kc.FP16_GEMM_CFG = tri_fn.FP16_GEMM_CFG
    # Re-execute the operator module after both bindings are in place.
    # NOTE(review): presumably this makes the module's top-level code pick up the
    # new config from `kc`; if `tri_fn` defines FP16_GEMM_CFG itself, the reload
    # would reset it, and objects imported from `tri_fn` earlier keep pointing at
    # the pre-reload definitions -- confirm against the module source.
    importlib.reload(tri_fn)

# Fix the RNG seed so the random inputs and layer initialisations created in
# later cells are reproducible between runs.
torch.manual_seed(0)
if device == 'cuda':
    # Seed the CUDA generator explicitly as well.
    torch.cuda.manual_seed(0)

In [4]:
def _conv_case(name, cin, cout, k, batch, size, stride=1, dilation=1):
    """Build one convolution shape entry for a square input and square kernel.

    Padding is derived as ``dilation * (k - 1) // 2``.
    """
    return dict(
        name=name,
        in_channels=cin,
        out_channels=cout,
        kernel_size=k,
        stride=stride,
        padding=dilation * (k - 1) // 2,
        dilation=dilation,
        B=batch,
        H=size,
        W=size,
    )


def _block_case(block_m, block_n, block_k, warps, stages):
    """Build one Triton tile entry; ``name`` is ``M-N-K/WARPSxSTAGES``."""
    return dict(
        name=f'{block_m}-{block_n}-{block_k}/{warps}x{stages}',
        BLOCK_M=block_m,
        BLOCK_N=block_n,
        BLOCK_K=block_k,
        NUM_WARPS=warps,
        NUM_STAGES=stages,
    )


# Convolution shapes: (name, in_channels, out_channels, kernel, batch, H == W).
kernel_grid = [
    _conv_case('1x1@56', 64, 64, 1, 32, 56),
    _conv_case('3x3@56', 64, 128, 3, 16, 56),
    _conv_case('5x5@56', 64, 128, 5, 16, 56),
    _conv_case('7x7@56', 64, 128, 7, 16, 56),
    _conv_case('11x11@56', 64, 128, 11, 8, 56),
    _conv_case('3x3@112', 64, 128, 3, 16, 112),
    _conv_case('5x5@112', 64, 128, 5, 16, 112),
    _conv_case('7x7@112', 64, 128, 7, 12, 112),
    _conv_case('11x11@112', 64, 128, 11, 8, 112),
    _conv_case('3x3@224', 64, 128, 3, 8, 224),
    _conv_case('7x7@224', 64, 128, 7, 6, 224),
    _conv_case('11x11@224', 64, 128, 11, 4, 224),
    _conv_case('13x13@224', 64, 128, 13, 2, 224),
    _conv_case('3x3@512', 64, 128, 3, 4, 512),
    _conv_case('7x7@512', 64, 128, 7, 2, 512),
    _conv_case('3x3@32', 128, 256, 3, 32, 32),
    _conv_case('5x5@32', 128, 256, 5, 32, 32),
    _conv_case('7x7@32', 128, 256, 7, 32, 32),
    _conv_case('3x3@16', 256, 512, 3, 64, 16),
    _conv_case('3x3_s2@112', 64, 128, 3, 16, 112, stride=2),
    _conv_case('3x3_dil2@56', 64, 128, 3, 16, 56, dilation=2),
]
print('kernel configs:', len(kernel_grid))

# Triton tile settings: (BLOCK_M, BLOCK_N, BLOCK_K, NUM_WARPS, NUM_STAGES).
block_grid = [
    _block_case(*tile)
    for tile in (
        (64, 64, 16, 4, 2),
        (64, 64, 32, 4, 2),
        (64, 64, 64, 4, 2),
        (64, 128, 32, 4, 2),
        (128, 64, 32, 8, 2),
        (128, 128, 32, 8, 2),
        (128, 128, 64, 8, 2),
        (128, 64, 32, 8, 3),
        (64, 128, 32, 4, 3),
        (64, 64, 32, 2, 2),
    )
]
print('block configs:', len(block_grid))

# Keep ratios tried for every sparse mode, and the modes themselves.
keep_ratios = [1.0, 0.85, 0.75, 0.65, 0.5, 0.35, 0.25]
modes = ['dense', 'channel', 'block', 'input']
print('keep_ratios:', keep_ratios)
print('modes:', modes)

kernel configs: 21
block configs: 10
keep_ratios: [1.0, 0.85, 0.75, 0.65, 0.5, 0.35, 0.25]
modes: ['dense', 'channel', 'block', 'input']


### Что будет выведено в `results_df`

`results_df` содержит строку на каждую комбинацию (shape × блоки Triton × sparsity режим/keep_ratio). Столбцы:
- Время: `time_ms` (Triton), `torch_time_ms`, `speedup_vs_torch`.
- Ошибки: `mae`, `max`, `rel_l2`.
- Параметры: `kernel`, `B`, `H`, `W`, `k`, `mode`, `keep_ratio`, `block` и явные BLOCK_M/N/K, NUM_WARPS, NUM_STAGES (`block_size` в таблицу не записывается: для block sparsity в текущем коде используется значение 4).
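Файл сохраняется по пути `notebooks/sparsity_kernel_search_results.csv` относительно текущей рабочей директории ноутбука (при запуске из каталога `notebooks/` это `notebooks/notebooks/sparsity_kernel_search_results.csv` от корня репозитория) для дальнейшей фильтрации.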


In [None]:

# Full sweep: every conv shape x Triton tile config x (mode, keep_ratio).
# One result row is appended to `rows` per combination.
rows = []
if device != 'cuda':
    print('CUDA not available.')
else:
    # 'dense' runs once per tile config; every other mode runs once per keep
    # ratio. Deriving the count from `modes` keeps the progress total correct
    # when the list of modes changes.
    runs_per_block = sum(1 if mode == 'dense' else len(keep_ratios) for mode in modes)
    total = len(kernel_grid) * len(block_grid) * runs_per_block
    done = 0
    print(f'total combinations: {total}')
    for cfg in kernel_grid:
        # The torch baseline is built, timed and evaluated once per shape and
        # reused for every tile config / sparsity setting.
        torch_conv = build_torch_conv(cfg)
        x = torch.randn(cfg['B'], cfg['in_channels'], cfg['H'], cfg['W'], device=device, dtype=dtype)
        torch_time = benchmark_layer(torch_conv, x)
        # No autograd graph is needed for the reference output.
        with torch.no_grad():
            torch_out = torch_conv(x).float()
        for block in block_grid:
            apply_block_cfg(block)
            for mode in modes:
                ratios = keep_ratios if mode != 'dense' else [1.0]
                for ratio in ratios:
                    # Fresh layer per run, with the baseline's weights copied
                    # in before the sparsity setting is applied.
                    layer = TritonConv2d(
                        in_channels=cfg['in_channels'],
                        out_channels=cfg['out_channels'],
                        kernel_size=cfg['kernel_size'],
                        stride=cfg['stride'],
                        padding=cfg['padding'],
                        dilation=cfg['dilation'],
                        bias=True,
                    ).to(device)
                    clone_weights(layer, torch_conv)
                    apply_sparsity(layer, mode, ratio)
                    time_ms = benchmark_layer(layer, x)
                    with torch.no_grad():
                        y = layer(x).float()
                    diff_stats = calc_diff(torch_out, y)
                    # Computed once and reused for both the log line and the row.
                    speedup = torch_time / time_ms if time_ms > 0 else float('nan')
                    done += 1
                    print(f"[{done}/{total}] kernel={cfg['name']}, block={block['name']}, mode={mode}, keep_ratio={ratio:.2f}, time_ms={time_ms:.3f}, speedup_vs_torch={speedup:.3f}", flush=True)
                    rows.append({
                        'kernel': cfg['name'],
                        'B': cfg['B'], 'H': cfg['H'], 'W': cfg['W'], 'k': cfg['kernel_size'],
                        'block': block['name'],
                        'BLOCK_M': block['BLOCK_M'], 'BLOCK_N': block['BLOCK_N'], 'BLOCK_K': block['BLOCK_K'],
                        'NUM_WARPS': block['NUM_WARPS'], 'NUM_STAGES': block['NUM_STAGES'],
                        'mode': mode,
                        'keep_ratio': float(ratio),
                        'time_ms': time_ms,
                        'torch_time_ms': torch_time,
                        'speedup_vs_torch': speedup,
                        'mae': diff_stats['mae'],
                        'max': diff_stats['max'],
                        'rel_l2': diff_stats['rel_l2'],
                    })
results_df = pd.DataFrame(rows)
results_df


total combinations: 4620
[1/4620] kernel=1x1@56, block=64-64-16/4x2, mode=dense, keep_ratio=1.00, time_ms=0.690, speedup_vs_torch=0.169
[2/4620] kernel=1x1@56, block=64-64-16/4x2, mode=channel, keep_ratio=1.00, time_ms=0.684, speedup_vs_torch=0.171
[3/4620] kernel=1x1@56, block=64-64-16/4x2, mode=channel, keep_ratio=0.85, time_ms=0.863, speedup_vs_torch=0.136
[4/4620] kernel=1x1@56, block=64-64-16/4x2, mode=channel, keep_ratio=0.75, time_ms=0.940, speedup_vs_torch=0.124
[5/4620] kernel=1x1@56, block=64-64-16/4x2, mode=channel, keep_ratio=0.65, time_ms=0.815, speedup_vs_torch=0.143
[6/4620] kernel=1x1@56, block=64-64-16/4x2, mode=channel, keep_ratio=0.50, time_ms=0.741, speedup_vs_torch=0.158
[7/4620] kernel=1x1@56, block=64-64-16/4x2, mode=channel, keep_ratio=0.35, time_ms=0.705, speedup_vs_torch=0.166
[8/4620] kernel=1x1@56, block=64-64-16/4x2, mode=channel, keep_ratio=0.25, time_ms=0.633, speedup_vs_torch=0.185
[9/4620] kernel=1x1@56, block=64-64-16/4x2, mode=block, keep_ratio=1.00, 

Unnamed: 0,kernel,B,H,W,k,block,BLOCK_M,BLOCK_N,BLOCK_K,NUM_WARPS,NUM_STAGES,mode,keep_ratio,time_ms,torch_time_ms,speedup_vs_torch,mae,max,rel_l2
0,1x1@56,32,56,56,1,64-64-16/4x2,64,64,16,4,2,dense,1.00,0.690237,0.116961,0.169451,0.000081,0.001953,0.000349
1,1x1@56,32,56,56,1,64-64-16/4x2,64,64,16,4,2,channel,1.00,0.684175,0.116961,0.170952,0.000081,0.001953,0.000349
2,1x1@56,32,56,56,1,64-64-16/4x2,64,64,16,4,2,channel,0.85,0.862966,0.116961,0.135534,0.066459,3.164062,0.363663
3,1x1@56,32,56,56,1,64-64-16/4x2,64,64,16,4,2,channel,0.75,0.940052,0.116961,0.124420,0.107343,3.164062,0.464506
4,1x1@56,32,56,56,1,64-64-16/4x2,64,64,16,4,2,channel,0.65,0.815452,0.116961,0.143431,0.149413,3.164062,0.551657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4615,3x3_dil2@56,16,56,56,3,64-64-32/2x2,64,64,32,2,2,input,0.75,1.192079,0.309840,0.259916,0.220624,1.561493,0.491858
4616,3x3_dil2@56,16,56,56,3,64-64-32/2x2,64,64,32,2,2,input,0.65,1.037353,0.309840,0.298683,0.259121,1.747925,0.577678
4617,3x3_dil2@56,16,56,56,3,64-64-32/2x2,64,64,32,2,2,input,0.50,0.962518,0.309840,0.321906,0.313619,2.298828,0.698951
4618,3x3_dil2@56,16,56,56,3,64-64-32/2x2,64,64,32,2,2,input,0.35,0.898109,0.309840,0.344991,0.360284,2.861572,0.802752


### Отбор лучших конфигураций

Здесь результаты сортируются и сохраняются:
- `sparsity_kernel_search_results.csv` — полный протокол (все комбинации).
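- `sparsity_kernel_best.csv` — по одной лучшей строке на каждый `kernel`, отобранной по минимальному `time_ms` (внутри одного `kernel` это эквивалентно максимальному `speedup_vs_torch`, так как `torch_time_ms` для него общий). Фильтр по ошибке в коде не применяется — приемлемость `mae`/`rel_l2` нужно проверять отдельно.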
Смотрите в `sparsity_kernel_best.csv` поля `block`, BLOCK_*, `mode`, `keep_ratio` — их можно использовать как готовые настройки для конкретных размерностей задач.


In [None]:
# Show the fastest configuration (highest speedup_vs_torch) for every kernel.
if not results_df.empty:
    # Whole rows are selected here. groupby(...).first() would instead take the
    # first non-null value of each column separately and could combine values
    # from different rows when a column contains NaN.
    best_df = (
        results_df
        .sort_values('speedup_vs_torch', ascending=False)
        .drop_duplicates(subset='kernel', keep='first')
        .sort_values('kernel')
        .reset_index(drop=True)
    )
    display(best_df[['kernel', 'block', 'mode', 'keep_ratio', 'time_ms', 'speedup_vs_torch']])
else:
    print('CUDA not available.')

Unnamed: 0,kernel,block,mode,keep_ratio,time_ms,speedup_vs_torch
0,11x11@112,64-128-32/4x3,input,0.25,4.055122,1.156814
1,11x11@224,64-128-32/4x2,input,0.25,9.451843,0.972255
2,11x11@56,64-128-32/4x2,input,0.25,1.31357,1.029903
3,13x13@224,128-128-32/8x2,input,0.25,6.089298,1.051223
4,1x1@56,64-64-16/4x2,channel,0.25,0.633467,0.184637
5,3x3@112,64-128-32/4x2,input,0.25,2.235843,0.548419
6,3x3@16,64-128-32/4x3,input,0.25,1.035939,1.016548
7,3x3@224,64-64-64/4x2,block,0.25,5.116416,0.4578
8,3x3@32,64-128-32/4x3,input,0.25,0.950318,0.65876
9,3x3@512,64-64-64/4x2,block,0.25,12.938813,0.466389


In [None]:
# Persist the full sweep and the per-kernel fastest rows as CSV files.
if not results_df.empty:
    out_all = Path('notebooks/sparsity_kernel_search_results.csv')
    out_best = Path('notebooks/sparsity_kernel_best.csv')
    # to_csv does not create missing directories; make sure the target exists.
    out_all.parent.mkdir(parents=True, exist_ok=True)
    out_best.parent.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(out_all, index=False)
    # Pick whole rows with the lowest time_ms per kernel. groupby(...).first()
    # would take the first non-null value per column and could mix rows.
    best_by_time = (
        results_df
        .sort_values('time_ms', ascending=True)
        .drop_duplicates(subset='kernel', keep='first')
        .sort_values('kernel')
    )
    best_by_time.to_csv(out_best, index=False)
    print('saved:', out_all)
    print('saved:', out_best)
else:
    print('result dataframe is empty.')

saved: notebooks/sparsity_kernel_search_results.csv
saved: notebooks/sparsity_kernel_best.csv


### Как интерпретировать “best” выборку

Показаны лучшие конфигурации по каждой задаче `kernel`. Обращайте внимание на:
- `speedup_vs_torch`: >1 — ускорение, <1 — проигрыш.
- Ошибки (`mae`, `rel_l2`): должны оставаться низкими (обычно 1e-4–1e-3 для fp16).
- `mode`/`keep_ratio`: какие режимы разрежения реально помогают на данном размере.
- `block`: совпадение BLOCK_M/N/K с формой задачи (не всегда крупные блоки дают выигрыш при малом K).
Используйте эти строки как шпаргалку для дальнейших бенчей или фиксации параметров в коде.


In [2]:
import pandas as pd
from pathlib import Path

csv_path = Path("notebooks/sparsity_kernel_search_results.csv")

# Fail early with an explicit message when the sweep results are missing.
if not csv_path.exists():
    raise FileNotFoundError(f"CSV not found: {csv_path}")
results_df = pd.read_csv(csv_path)
print(f"Loaded: {csv_path}")
df = results_df.copy()


def _top_by_speedup(frame, mode_name, count):
    """Return the ``count`` rows of ``mode_name`` with the highest speedup."""
    only_mode = frame[frame["mode"] == mode_name]
    ranked = only_mode.sort_values("speedup_vs_torch", ascending=False)
    return ranked.head(count)


# 1) top-3 rows for input sparsity
top3_input = _top_by_speedup(df, "input", 3)
# 2) top-1 row for block sparsity
top1_block = _top_by_speedup(df, "block", 1)
# 3) top-1 row for channel sparsity
top1_channel = _top_by_speedup(df, "channel", 1)

# Stack the picks, keep only the presentation columns and index the table by
# the configuration fields.
index_cols = ["kernel", "block", "mode", "keep_ratio"]
value_cols = ["time_ms", "speedup_vs_torch"]
slide_df = pd.concat([top3_input, top1_block, top1_channel], ignore_index=True)
slide_df = slide_df[index_cols + value_cols]
slide_df = slide_df.set_index(index_cols)

slide_df


Loaded: notebooks/sparsity_kernel_search_results.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,time_ms,speedup_vs_torch
kernel,block,mode,keep_ratio,Unnamed: 4_level_1,Unnamed: 5_level_1
7x7@32,64-128-32/4x3,input,0.25,1.860874,1.353994
7x7@32,64-128-32/4x2,input,0.25,1.899889,1.326189
7x7@32,128-128-32/8x2,input,0.25,1.902182,1.32459
3x3@16,64-128-32/4x3,block,0.25,1.156465,0.910604
3x3@16,64-128-32/4x3,channel,0.25,1.161277,0.90683
