-
Notifications
You must be signed in to change notification settings - Fork 52
Closed
Labels
bug — Something isn't working
Description
When I used bitblas to tune a matmul operator on the A100, I encountered the following error.

Environment Configuration:
Ubuntu 22.04.2 LTS
bitblas: 2bd1dee
cuda12.1
The specific test script is as follows:
from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags
from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
from bitblas.base.arch import CUDA
from bitblas.base.utils import apply_and_build
import tvm
from tvm.script import tir as T
import bitblas
@tvm.script.ir_module
class FusedSingleOp:
    """TVM IRModule holding a single fp16 1024x1024x1024 NT matmul (nn.dense) prim_func.

    Indentation reconstructed: the pasted snippet lost all leading whitespace,
    which made it invalid Python/TVMScript. Structure follows standard
    TVMScript printer output for a dense (A @ B^T) kernel.
    """

    @T.prim_func(private=True)
    def dense1(lv11: T.Buffer((T.int64(1024), T.int64(1024)), "float16"), B: T.Buffer((T.int64(1024), T.int64(1024)), "float16"), T_matmul_NT: T.Buffer((T.int64(1024), T.int64(1024)), "float16")):
        # Buffer 1 (B, the weights) is layout-free so schedulers may repack it;
        # op_pattern 4 marks this as an out-fusible (OutEWiseFusable) op.
        T.func_attr({"layout_free_buffers": [1], "op_attrs": {"op_name": "nn.dense", "out_dtype": "float16", "units": None}, "op_pattern": 4, "tir.noalias": T.bool(True)})
        # with T.block("root"):
        for i0, i1, k in T.grid(T.int64(1024), T.int64(1024), T.int64(1024)):
            with T.block("T_matmul_NT"):
                # S = spatial axis, R = reduction axis (k is reduced).
                v_i0, v_i1, v_k = T.axis.remap("SSR", [i0, i1, k])
                T.reads(lv11[v_i0, v_k], B[v_i1, v_k])
                T.writes(T_matmul_NT[v_i0, v_i1])
                with T.init():
                    T_matmul_NT[v_i0, v_i1] = T.float16(0)
                # NT layout: both operands indexed by v_k on their second axis.
                T_matmul_NT[v_i0, v_i1] = T_matmul_NT[v_i0, v_i1] + lv11[v_i0, v_k] * B[v_i1, v_k]
# Driver: try to tensorize the matmul for Tensor Cores, then tune and build.
# Indentation reconstructed (the paste flattened the try/except and if bodies);
# control flow follows the standard bitblas quickstart tuning loop.
ir_module = FusedSingleOp
func = ir_module["dense1"]
target = tvm.target.Target("cuda")
arch = CUDA(target)
policy = DefaultPolicy(func=func, arch=arch)
try:
    tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target)
except Exception:
    # Best-effort: if tensorization analysis fails, fall back to DefaultPolicy.
    tags = None
# Tune with Tensor Core if possible
if tags:
    policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags)
configs = policy.emit_config(topk=20)
cpresults, best = apply_and_build(func, configs, arch, parallel_build=False)
print(best.code)
When I tested the same script on the V100 with the same environment configuration, the error `InternalError: Check failed: func->buffer_map.size() == 0 (3 vs. 0): This pass must be called after MakePackedAPI` did not occur. Instead, it resulted in:

The specific test script is as follows:
from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags
from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
from bitblas.base.arch import CUDA
from bitblas.base.utils import apply_and_build
import tvm
from tvm.script import tir as T
import bitblas
@tvm.script.ir_module
class FusedSingleOp:
    """TVM IRModule holding a single fp16 1024x1024x1024 NT matmul (nn.dense) prim_func.

    Indentation reconstructed: the pasted snippet lost all leading whitespace,
    which made it invalid Python/TVMScript. Structure follows standard
    TVMScript printer output for a dense (A @ B^T) kernel.
    """

    @T.prim_func(private=True)
    def dense1(lv11: T.Buffer((T.int64(1024), T.int64(1024)), "float16"), B: T.Buffer((T.int64(1024), T.int64(1024)), "float16"), T_matmul_NT: T.Buffer((T.int64(1024), T.int64(1024)), "float16")):
        # Buffer 1 (B, the weights) is layout-free so schedulers may repack it;
        # op_pattern 4 marks this as an out-fusible (OutEWiseFusable) op.
        T.func_attr({"layout_free_buffers": [1], "op_attrs": {"op_name": "nn.dense", "out_dtype": "float16", "units": None}, "op_pattern": 4, "tir.noalias": T.bool(True)})
        # with T.block("root"):
        for i0, i1, k in T.grid(T.int64(1024), T.int64(1024), T.int64(1024)):
            with T.block("T_matmul_NT"):
                # S = spatial axis, R = reduction axis (k is reduced).
                v_i0, v_i1, v_k = T.axis.remap("SSR", [i0, i1, k])
                T.reads(lv11[v_i0, v_k], B[v_i1, v_k])
                T.writes(T_matmul_NT[v_i0, v_i1])
                with T.init():
                    T_matmul_NT[v_i0, v_i1] = T.float16(0)
                # NT layout: both operands indexed by v_k on their second axis.
                T_matmul_NT[v_i0, v_i1] = T_matmul_NT[v_i0, v_i1] + lv11[v_i0, v_k] * B[v_i1, v_k]
# Driver: try to tensorize the matmul for Tensor Cores, then tune and build.
# Indentation reconstructed (the paste flattened the try/except and if bodies);
# control flow follows the standard bitblas quickstart tuning loop.
ir_module = FusedSingleOp
func = ir_module["dense1"]
target = tvm.target.Target("cuda")
arch = CUDA(target)
policy = DefaultPolicy(func=func, arch=arch)
try:
    tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target)
except Exception:
    # Best-effort: if tensorization analysis fails, fall back to DefaultPolicy.
    tags = None
# Tune with Tensor Core if possible
if tags:
    policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags)
configs = policy.emit_config(topk=20)
cpresults, best = apply_and_build(func, configs, arch, parallel_build=False)
print(best.code)
Metadata
Metadata
Assignees
Labels
bug — Something isn't working