# Channel / Block / Input Sparsity Bench (FP16)
Simplified scenario modelled on `channel_sparsity_bench.ipynb`: a single block configuration, a sweep over keep_ratio for channel/block/input sparsity, a comparison against torch Conv2d, and a top-10 ranking by time.

In [1]:
import sys, pathlib
# Put the parent of the current working directory first on sys.path so the
# `conv_gemm` package imported in the next cell can be resolved.
# NOTE(review): this assumes the notebook is launched from a direct
# subdirectory of the project root -- confirm.
sys.path.insert(0, str(pathlib.Path().resolve().parent))
print('sys.path[0]=', sys.path[0])

sys.path[0]= /mnt/d/VSCode-Projects/conv2d-img2col-gemm


In [2]:
import time
import torch
import pandas as pd
from pathlib import Path
import importlib

from conv_gemm.baseline_layers.triton_conv2d import TritonConv2d
from conv_gemm.configs import kernel_config as kc
import conv_gemm.baseline_operators.triton_conv2d_fp16_fn as tri_fn

# Enable cuDNN's autotuner for the torch reference layer.
torch.backends.cudnn.benchmark = True
# Run on CUDA in half precision when a GPU is present; otherwise fall back to
# CPU with float32.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.float16 if device == 'cuda' else torch.float32
print(f'device={device}, dtype={dtype}')
# Warn when no GPU is available: the sweep functions below raise in that case
# because `tri_dense` is only built on CUDA.
if device != 'cuda':
    print('?? CUDA ?????????? ? ?????????? ????? ????????? ? ?? ???????????????.')


device=cuda, dtype=torch.float16


In [3]:
# Benchmark settings: number of untimed warm-up forwards and of timed forwards.
# They are bound as the default arguments of `benchmark_layer` at definition
# time, so changing them later requires re-running the cell that defines it.
eval_warmup = 10
eval_iters = 50


def sync_device():
    """Wait for pending CUDA work to finish; do nothing on any other device.

    Reads the module-level ``device`` string.
    """
    if device != 'cuda':
        return
    torch.cuda.synchronize()

def benchmark_layer(module: torch.nn.Module, x: torch.Tensor, warmup: int = eval_warmup, iters: int = eval_iters):
    """Return the mean wall-clock time of one ``module(x)`` call in milliseconds.

    The module is switched to eval mode (and left in it). ``warmup`` untimed
    forward passes run first, then ``iters`` timed passes. The device is
    synchronised before the clock starts and before it stops, so queued
    asynchronous GPU work is included in the measurement.

    Args:
        module: Layer to benchmark.
        x: Input passed unchanged to every forward call.
        warmup: Number of untimed forward passes.
        iters: Number of timed forward passes; ``0`` raises ``ZeroDivisionError``.

    Returns:
        Elapsed milliseconds divided by ``iters``.
    """

    def _forward_repeatedly(count):
        # Gradients are never needed for timing; outputs are discarded.
        with torch.no_grad():
            for _ in range(count):
                module(x)

    module.eval()

    _forward_repeatedly(warmup)
    sync_device()

    started = time.perf_counter()
    _forward_repeatedly(iters)
    sync_device()
    elapsed_s = time.perf_counter() - started

    return elapsed_s * 1e3 / iters

def calc_diff(ref: torch.Tensor, test: torch.Tensor):
    """Compute error metrics of ``test`` relative to ``ref``.

    Both tensors are upcast to float32 *before* any arithmetic. Subtracting or
    taking the norm in a narrow dtype such as float16 can overflow to ``inf``
    (float16 tops out at 65504), which would corrupt every metric.

    Args:
        ref: Reference tensor.
        test: Tensor to compare; must be broadcast-compatible with ``ref``.

    Returns:
        A dict of Python floats with keys:
        ``'mae'`` (mean absolute error), ``'max'`` (largest absolute error)
        and ``'rel_l2'`` (L2 norm of the error divided by the L2 norm of
        ``ref``). ``'rel_l2'`` is ``inf`` or ``nan`` when ``ref`` has zero norm.
    """
    ref32 = ref.float()
    diff = ref32 - test.float()
    abs_diff = diff.abs()
    return {
        'mae': abs_diff.mean().item(),
        'max': abs_diff.max().item(),
        'rel_l2': (torch.norm(diff) / torch.norm(ref32)).item(),
    }

def clone_weights(dst: torch.nn.Module, src: torch.nn.Module):
    """Copy ``src``'s parameters into ``dst`` in place, without autograd tracking.

    The weight is always copied. The bias is copied only when both modules
    have one; otherwise ``dst``'s bias (if any) is left untouched.
    """
    with torch.no_grad():
        dst.weight.copy_(src.weight)

        target_bias = dst.bias
        if target_bias is None:
            return
        source_bias = src.bias
        if source_bias is None:
            return
        target_bias.copy_(source_bias)

def apply_block_cfg(cfg):
    """Install ``cfg`` as the FP16 GEMM kernel configuration, then reload ``tri_fn``.

    Args:
        cfg: Mapping with keys ``'BLOCK_M'``, ``'BLOCK_N'``, ``'BLOCK_K'``,
            ``'NUM_WARPS'`` and ``'NUM_STAGES'``; a missing key raises
            ``KeyError``.
    """
    # Build the config object and publish it on the operator module ...
    tri_fn.FP16_GEMM_CFG = kc.KernelConfig(
        BLOCK_M=cfg['BLOCK_M'], BLOCK_N=cfg['BLOCK_N'], BLOCK_K=cfg['BLOCK_K'],
        NUM_WARPS=cfg['NUM_WARPS'], NUM_STAGES=cfg['NUM_STAGES'],
    )
    # ... and mirror the same object onto the config module.
    kc.FP16_GEMM_CFG = tri_fn.FP16_GEMM_CFG
    # NOTE(review): reload re-executes tri_fn's module code, so any top-level
    # binding of FP16_GEMM_CFG inside that module replaces the value assigned
    # above. Presumably tri_fn re-reads kc.FP16_GEMM_CFG on import -- confirm.
    # Objects imported from tri_fn elsewhere before this call keep pointing at
    # the pre-reload definitions.
    importlib.reload(tri_fn)

def build_triton(block_cfg):
    """Create a ``TritonConv2d`` that shares the reference layer's parameters.

    Applies ``block_cfg`` via ``apply_block_cfg``, constructs the layer from
    the module-level ``params``, moves it to ``device`` (no dtype cast is
    requested here) and copies weight/bias from the module-level ``torch_conv``.

    Returns:
        The newly built layer.
    """
    apply_block_cfg(block_cfg)
    triton_layer = TritonConv2d(**params)
    triton_layer = triton_layer.to(device=device)
    clone_weights(triton_layer, torch_conv)
    return triton_layer


In [4]:
# Base layer configuration: `params` is passed to both torch.nn.Conv2d and
# TritonConv2d; B, H, W give the batch size and spatial size of the random
# input generated in every sweep step.
params = dict(in_channels=1, out_channels=3, kernel_size=11, stride=1, padding=1, bias=True)
B, H, W = 16, 1024, 1024

# Fix the RNG seeds before the reference layer is created so that its
# initial parameters are reproducible.
torch.manual_seed(0)
if device == 'cuda':
    torch.cuda.manual_seed(0)

# Reference torch Conv2d; Triton layers copy their parameters from it.
torch_conv = torch.nn.Conv2d(**params).to(device=device, dtype=dtype)

# The single kernel block configuration used for every Triton layer in this notebook.
baseline_block_cfg = dict(BLOCK_M=64, BLOCK_N=64, BLOCK_K=32, NUM_WARPS=4, NUM_STAGES=2)

# Keep ratios swept by each sparsity mode.
keep_ratios = [1.0, 0.85, 0.75, 0.65, 0.5, 0.35, 0.25]
print('keep_ratios:', keep_ratios)


keep_ratios: [1.0, 0.85, 0.75, 0.65, 0.5, 0.35, 0.25]


In [5]:
# Estimate how many forward calls the three sweeps will issue in total.
calls_per_ratio = 2 + eval_warmup + eval_iters  # torch_out + triton_out + warmup+iters inside benchmark_layer
calls_per_sweep = len(keep_ratios) * calls_per_ratio
total_calls = 3 * calls_per_sweep  # channel, block, input
print(f'?????? ????? forward-??????? ?? ???? ???????: {total_calls} (?? {calls_per_sweep} ?? ???? ????)')


?????? ????? forward-??????? ?? ???? ???????: 1302 (?? 434 ?? ???? ????)


In [None]:
# Dense baseline Triton layer, built only when CUDA is available.
# The sweep functions below use it solely as an "is Triton usable" guard
# (they raise when it is None); it is never run or timed itself.
tri_dense = build_triton(baseline_block_cfg) if device == 'cuda' else None


def _run_sparsity_sweep(keep_ratios, tag, mode, configure):
    """Shared driver behind the channel, block and input sparsity sweeps.

    For every keep ratio a fresh Triton layer is built with
    ``baseline_block_cfg``, sparsified through ``configure``, compared against
    the dense ``torch_conv`` output on a new random input, and then timed with
    ``benchmark_layer``. One progress line is printed per ratio.

    Args:
        keep_ratios: Sized iterable of keep ratios to evaluate.
        tag: Label used in the progress line (``'[tag i/total] ...'``).
        mode: Value stored in the ``'mode'`` column of every result row.
        configure: Callable ``(layer, ratio)`` that applies the sparsity
            setting to the freshly built layer.

    Returns:
        ``pandas.DataFrame`` with one row per ratio and the columns ``mode``,
        ``keep_ratio``, ``mae``, ``max``, ``rel_l2``, ``time_ms``, ``BLOCK_M``,
        ``BLOCK_N``, ``BLOCK_K``, ``NUM_WARPS``, ``NUM_STAGES``.

    Raises:
        RuntimeError: If the dense Triton baseline was not built (no GPU).
    """
    if tri_dense is None:
        raise RuntimeError('Triton ?????????? (??? GPU)')
    rows = []
    total = len(keep_ratios)
    for idx, ratio in enumerate(keep_ratios, start=1):
        tri = build_triton(baseline_block_cfg)
        configure(tri, ratio)
        x = torch.randn(B, params['in_channels'], H, W, device=device, dtype=dtype)
        # The reference is the dense torch layer, so the error metrics include
        # whatever the sparsification itself changes in the output.
        with torch.no_grad():
            ref = torch_conv(x).float()
            out = tri(x).float()
        stats = calc_diff(ref, out)
        t_ms = benchmark_layer(tri, x.clone().detach())
        print(f"[{tag} {idx}/{total}] keep={ratio:.2f}, time_ms={t_ms:.3f}, mae={stats['mae']:.3e}, max={stats['max']:.3e}, rel_l2={stats['rel_l2']:.3e}", flush=True)
        rows.append({
            'mode': mode,
            'keep_ratio': ratio,
            'mae': stats['mae'],
            'max': stats['max'],
            'rel_l2': stats['rel_l2'],
            'time_ms': t_ms,
            'BLOCK_M': baseline_block_cfg['BLOCK_M'],
            'BLOCK_N': baseline_block_cfg['BLOCK_N'],
            'BLOCK_K': baseline_block_cfg['BLOCK_K'],
            'NUM_WARPS': baseline_block_cfg['NUM_WARPS'],
            'NUM_STAGES': baseline_block_cfg['NUM_STAGES'],
        })
    return pd.DataFrame(rows)

def run_channel_sweep(keep_ratios):
    """Sweep ``keep_ratios`` using ``set_channel_sparsity(ratio)`` on each layer.

    Rows are labelled ``mode='channel'``. Raises ``RuntimeError`` without a GPU.
    """
    return _run_sparsity_sweep(
        keep_ratios,
        tag='channel',
        mode='channel',
        configure=lambda layer, ratio: layer.set_channel_sparsity(ratio),
    )

def run_block_sweep(keep_ratios, block_size=4):
    """Sweep ``keep_ratios`` using ``set_block_sparsity(ratio, block_size=...)``.

    Rows are labelled ``mode=f'block-{block_size}'``. Raises ``RuntimeError``
    without a GPU.
    """
    return _run_sparsity_sweep(
        keep_ratios,
        tag='block',
        mode=f'block-{block_size}',
        configure=lambda layer, ratio: layer.set_block_sparsity(ratio, block_size=block_size),
    )

def run_input_sweep(keep_ratios):
    """Sweep ``keep_ratios`` using ``set_input_channel_sparsity(ratio)``.

    Rows are labelled ``mode='input'``. Raises ``RuntimeError`` without a GPU.
    """
    return _run_sparsity_sweep(
        keep_ratios,
        tag='input',
        mode='input',
        configure=lambda layer, ratio: layer.set_input_channel_sparsity(ratio),
    )

# Run the three sweeps over the same keep ratios; each returns a DataFrame
# with one row per ratio.
channel_sweep_df = run_channel_sweep(keep_ratios)
block_sweep_df = run_block_sweep(keep_ratios, block_size=4)
input_sweep_df = run_input_sweep(keep_ratios)

# Last expression of the cell: the notebook renders the three tables.
channel_sweep_df, block_sweep_df, input_sweep_df


[channel 1/7] keep=1.00, time_ms=701.986, mae=8.535e-05, max=1.953e-03, rel_l2=3.846e-04
[channel 2/7] keep=0.85, time_ms=701.884, mae=8.524e-05, max=1.953e-03, rel_l2=3.844e-04
[channel 3/7] keep=0.75, time_ms=637.952, mae=1.502e-01, max=3.105e+00, rel_l2=5.696e-01
[channel 4/7] keep=0.65, time_ms=691.378, mae=1.502e-01, max=3.219e+00, rel_l2=5.694e-01


In [None]:
# Write each sweep table to its own CSV file (without the index column).
# NOTE(review): the path is relative to the current working directory. The
# first cell treats the CWD's parent as the project root, so this probably
# creates `<cwd>/notebooks/` (e.g. notebooks/notebooks) rather than the
# project-level `notebooks/` directory -- confirm the intended location.
out_dir = Path('notebooks')
out_dir.mkdir(parents=True, exist_ok=True)
channel_sweep_df.to_csv(out_dir / 'channel_sparsity_bench_results.csv', index=False)
block_sweep_df.to_csv(out_dir / 'block_sparsity_bench_results.csv', index=False)
input_sweep_df.to_csv(out_dir / 'input_sparsity_bench_results.csv', index=False)
print('saved results to notebooks/*_sparsity_bench_results.csv')


In [None]:
# Stack the three sweep tables, tagging every row with the sweep it came from.
all_df = pd.concat(
    [
        frame.assign(mode_group=group)
        for group, frame in (
            ('channel', channel_sweep_df),
            ('block', block_sweep_df),
            ('input', input_sweep_df),
        )
    ],
    ignore_index=True,
)
# Fastest configurations first (ascending mean forward time).
all_df_sorted = all_df.sort_values(by='time_ms')
print('Top-10 fastest configs:')
summary_columns = ['mode', 'keep_ratio', 'time_ms', 'mae', 'max', 'rel_l2']
display(all_df_sorted[summary_columns].head(10))
