如何在不依赖模板的情况下找到最佳的schedule：与基于模板的autotvm不同，auto-scheduler不需要任何模板，用户只需要编写计算声明，无需任何schedule命令或模板，auto-scheduler可以自动生成一个大的搜索空间，并在空间中找到最优的schedule

In [2]:
import os

import numpy as np
import tvm
from tvm import te, auto_scheduler

定义矩阵乘法

In [3]:
@auto_scheduler.register_workload
def matmul_add(N, L, M, dtype):
    """Declare the computation ``out = A x B + C`` with tensor expressions.

    Args:
        N: Number of rows of ``A``, ``C`` and ``out``.
        L: Number of columns of ``A`` and rows of ``B`` (the reduction extent).
        M: Number of columns of ``B``, ``C`` and ``out``.
        dtype: Element dtype used for all three placeholders.

    Returns:
        The list ``[A, B, C, out]`` of input placeholders followed by the
        output tensor.
    """
    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)
    C = te.placeholder((N, M), name="C", dtype=dtype)

    k = te.reduce_axis((0, L), name="k")
    matmul = te.compute(
        (N, M),
        lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
        name="matmul",
        # Enable automatic layout transform for tensor B.
        # The attribute key must be spelled "layout_free_placeholders";
        # a misspelled key is not recognized and the option has no effect.
        attrs={"layout_free_placeholders": [B]},
    )
    out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")

    return [A, B, C, out]

创建搜索任务

在这里为这个矩阵乘法指定一些特定的参数，如下面是两个大小为1024*1024的矩阵的乘法

In [4]:
# Compilation target for the search task.
target = tvm.target.Target("llvm")
# All three matrix extents use the same value.
N, L, M = 1024, 1024, 1024
task = auto_scheduler.SearchTask(
    func=matmul_add,
    args=(N, L, M, "float32"),
    target=target,
)

# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)

Computational DAG:
A = PLACEHOLDER [1024, 1024]
B = PLACEHOLDER [1024, 1024]
matmul(i, j) += (A[i, k]*B[k, j])
C = PLACEHOLDER [1024, 1024]
out(i, j) = (matmul[i, j] + C[i, j])



下面来为auto-scheduler设置参数

+ `num_measure_trials`表示搜索过程中可用的测试试验次数
+ 用`RecordToFile`将测试记录记录到文件`matmul.json`中

In [5]:
log_file = "matmul.json"
# Measurement callback that records results to ``log_file``.
record_to_file = auto_scheduler.RecordToFile(log_file)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=10,
    verbose=2,
    measure_callbacks=[record_to_file],
)

开始搜索

In [6]:
# Run auto-tuning (search) with the options configured above
task.tune(tune_option)
# Apply the best schedule found in the log file
sch, args = task.apply_best(log_file)

----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 3
Sample Initial Population	#s: 2019	fail_ct: 0	Time elapsed: 1.28
GA Iter: 0	Max score: 0.9985	Min score: 0.9350	#Pop: 128	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9850	#Pop: 128	#M+: 1380	#M-: 71
EvolutionarySearch		#s: 128	Time elapsed: 5.68
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 10 programs to measure:
..........**********
No: 1	GFLOPS: 30.84 / 30.84	results: MeasureResult(cost:[0.0697], error_no:0, all_cost:0.46, Tstamp:1677585142.24)
Placeholder: A, B, C
matmul auto_unroll: 16
parallel i.0@j.0@i.1@j.1@ (0,8)
  for k.0 (0,512)
    for i.2 (0,2)
      for j.2 (0,64)
        for k.1 (0,2)
          for i.3 (0,64)
  

检查优化的schedule

In [7]:
# Lower the tuned schedule and print the resulting TIR for inspection
print("Lowered TIR:")
print(tvm.lower(sch, args, simple_mode=True))

Lowered TIR:
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((1024, 1024), "float32"), B: T.Buffer((1024, 1024), "float32"), C: T.Buffer((1024, 1024), "float32"), out: T.Buffer((1024, 1024), "float32")):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        for i_outer_outer_j_outer_outer_fused_i_outer_inner_fused in T.parallel(128):
            matmul = T.allocate([512], "float32", "global")
            for j_outer_inner in range(16):
                matmul_1 = T.Buffer((512,), data=matmul)
                matmul_1[0] = T.float32(0)
                matmul_1[16] = T.float32(0)
                matmul_1[32] = T.float32(0)
                matmul_1[48] = T.float32(0)
                matmul_1[64] = T.float32(0)
                matmul_1[80] = T.float32(0)
                matmul_1[96] = T.float32(0)
                matmul_1[112] = T.float32(0)
       

检查正确性并评估

In [8]:
func = tvm.build(sch, args, target)

# Random inputs plus the NumPy reference result to compare against.
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = np.random.uniform(size=(N, M)).astype(np.float32)
out_np = np.dot(a_np, b_np) + c_np

dev = tvm.cpu()
a_tvm, b_tvm, c_tvm = (
    tvm.nd.array(host_array, device=dev) for host_array in (a_np, b_np, c_np)
)
# Pre-allocate the output buffer to avoid repeated memory allocation
out_tvm = tvm.nd.empty(out_np.shape, device=dev)
func(a_tvm, b_tvm, c_tvm, out_tvm)

# Check results
np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3)

# Evaluate execution time.
evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
median_seconds = np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results)
print("Execution time of this operator: %.3f ms" % (median_seconds * 1000))

Execution time of this operator: 25.661 ms


使用记录文件

In [9]:
# Print the best schedule recorded in the log file as equivalent Python schedule code
print("Equivalent python schedule:")
print(task.print_best(log_file))

Equivalent python schedule:
matmul_i, matmul_j, matmul_k = tuple(matmul.op.axis) + tuple(matmul.op.reduce_axis)
out_i, out_j = tuple(out.op.axis) + tuple(out.op.reduce_axis)
matmul_i_o_i, matmul_i_i = s[matmul].split(matmul_i, factor=16)
matmul_i_o_o_i, matmul_i_o_i = s[matmul].split(matmul_i_o_i, factor=2)
matmul_i_o_o_o, matmul_i_o_o_i = s[matmul].split(matmul_i_o_o_i, factor=16)
matmul_j_o_i, matmul_j_i = s[matmul].split(matmul_j, factor=1)
matmul_j_o_o_i, matmul_j_o_i = s[matmul].split(matmul_j_o_i, factor=16)
matmul_j_o_o_o, matmul_j_o_o_i = s[matmul].split(matmul_j_o_o_i, factor=16)
matmul_k_o, matmul_k_i = s[matmul].split(matmul_k, factor=8)
s[matmul].reorder(matmul_i_o_o_o, matmul_j_o_o_o, matmul_i_o_o_i, matmul_j_o_o_i, matmul_k_o, matmul_i_o_i, matmul_j_o_i, matmul_k_i, matmul_i_i, matmul_j_i)
out_i_o_i, out_i_i = s[out].split(out_i, factor=32)
out_i_o_o, out_i_o_i = s[out].split(out_i_o_i, factor=16)
out_j_o_i, out_j_i = s[out].split(out_j, factor=16)
out_j_o_o, out_j_o_i = 

恢复搜索比较复杂，需要自己创建搜索策略和cost model，并通过日志文件恢复搜索策略和cost model的状态。下面的示例在恢复它们的状态后，又进行了5次试验

In [10]:
def resume_search(task, log_file):
    """Continue tuning ``task`` using the records already in ``log_file``.

    A new XGBoost cost model is updated from the log file, and a sketch
    search policy is created with a callback that preloads the measured
    states from the same file. The task is then tuned for 5 more
    measurement trials, with new records written back to ``log_file``.

    Args:
        task: The search task to keep tuning.
        log_file: Path of the measurement record file to read and extend.
    """
    print("Resume search:")

    # Restore the cost model from the existing measurement records.
    model = auto_scheduler.XGBModel()
    model.update_from_file(log_file)

    # Restore the search policy state through an init-search callback.
    preload_states = auto_scheduler.PreloadMeasuredStates(log_file)
    policy = auto_scheduler.SketchPolicy(
        task,
        model,
        init_search_callbacks=[preload_states],
    )

    options = auto_scheduler.TuningOptions(
        num_measure_trials=5,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    task.tune(options, search_policy=policy)


resume_search(task, log_file)

Resume search:
----------------------------------------------------------------------
------------------------------  [ Call init-search callbacks ]
----------------------------------------------------------------------
SearchPolicy: Loaded 10 measurement records from matmul.json for ["matmul_add", 1024, 1024, 1024, "float32"]
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 3
Sample Initial Population	#s: 2013	fail_ct: 0	Time elapsed: 1.62
GA Iter: 0	Max score: 0.9972	Min score: 0.9298	#Pop: 128	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9996	Min score: 0.9883	#Pop: 128	#M+: 1374	#M-: 82
EvolutionarySearch		#s: 128	Time elapsed: 6.73
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 5 programs to