In [21]:
import tvm
import tvm.testing
from tvm import te
import numpy as np

In [22]:
# Build target: LLVM is named both as the target and as the host.
tgt = tvm.target.Target(target="llvm", host="llvm")

下面的语句只描述如何进行计算，并不会执行实际的计算。

In [23]:
# Symbolic length shared by both inputs and the output.
n = te.var("n")
# One-dimensional placeholders of length n that stand in for the inputs.
A = te.placeholder((n,), name="A")
B = te.placeholder((n,), name="B")
# Add the i-th elements of A and B: C[i] = A[i] + B[i].
C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")

TVM 要求用户提供 schedule，即对如何执行计算的描述。TE 中的调度操作可以更改循环顺序、跨不同线程拆分计算、将数据块组合在一起，等等。调度背后的一个重要概念是：它们只描述如何执行计算，因此同一 TE 的不同调度会产生相同的结果。

In [24]:
# Create a schedule for the operation that computes C; no scheduling primitives are applied yet.
s = te.create_schedule(C.op)

In [25]:
# Compile the schedule for tgt under the name "myadd"; [A, B, C] is the argument list passed to tvm.build.
fadd = tvm.build(s, [A, B, C], tgt, name="myadd")

[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.InjectPrefetch
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.TextureFlatten
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.StorageFlatten
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferShapeLegalize
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferStrideLegalize
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.ThreadScopePropagate
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferBindUnwrapper
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.ApplyLayoutTransforms
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.StorageFlattener
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.AssertSimplifier
[14:31:12] /home/patrick/Code/tvm/src/ir/transform.cc:440

In [26]:
# Device handle for the compilation target, device index 0.
dev = tvm.device(tgt.kind.name, 0)

# Concrete length for this run. It gets its own name so that the symbolic
# te.var `n`, which defines the shapes of A, B and C, is not overwritten by
# an int (re-running the placeholder lines afterwards would otherwise
# silently build fixed-size tensors).
num_elements = 1024
a = tvm.nd.array(np.random.uniform(size=num_elements).astype(A.dtype), dev)
b = tvm.nd.array(np.random.uniform(size=num_elements).astype(B.dtype), dev)
c = tvm.nd.array(np.zeros(num_elements, dtype=C.dtype), dev)

# Run the compiled function; c receives the result, which is then checked
# against the same addition done in NumPy.
fadd(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

In [27]:
import timeit

# How many times timeit executes the NumPy statement.
np_repeat = 100

# Timer.timeit returns the total time for all `number` executions, so the
# per-run average printed below is the total divided by np_repeat.
np_running_time = timeit.Timer(
    stmt="answer = a + b",
    setup=(
        "import numpy\n"
        "n = 32768\n"
        'dtype = "float32"\n'
        "a = numpy.random.rand(n, 1).astype(dtype)\n"
        "b = numpy.random.rand(n, 1).astype(dtype)\n"
    ),
).timeit(number=np_repeat)
print("Numpy running time: %f" % (np_running_time / np_repeat))

def evaluate_addition(func, target, optimization, log):
    """Time a compiled addition function and record its mean run time.

    Parameters
    ----------
    func
        Compiled module; its ``time_evaluator`` method and ``entry_name``
        attribute are used to build the timer.
    target
        Target object; ``target.kind.name`` selects the device (index 0).
    optimization : str
        Label printed next to the timing and stored in ``log``.
    log : list
        Receives one ``(optimization, mean_time)`` tuple.

    Notes
    -----
    The dtypes of the test arrays are read from the module-level tensors
    ``A``, ``B`` and ``C``. Nothing is returned.
    """
    device = tvm.device(target.kind.name, 0)
    length = 32768

    def random_input(dtype):
        # Uniform random vector of `length` elements, placed on the device.
        return tvm.nd.array(np.random.uniform(size=length).astype(dtype), device)

    lhs = random_input(A.dtype)
    rhs = random_input(B.dtype)
    out = tvm.nd.array(np.zeros(length, dtype=C.dtype), device)

    timer = func.time_evaluator(func.entry_name, device, number=10)
    mean_time = timer(lhs, rhs, out).mean
    print("%s: %f" % (optimization, mean_time))

    log.append((optimization, mean_time))

# Start the results log with the NumPy per-run average, then time the build that uses the default schedule.
log = [("numpy", np_running_time / np_repeat)]
evaluate_addition(fadd, tgt, "naive", log=log)

Numpy running time: 0.000005
naive: 0.000004


更新 schedule 以使用并行性。当调度应用于 TE 中的表达式时，输入和输出保持不变，但在编译时，表达式的实现会发生变化。

在默认调度下，这个张量加法是串行运行的，不过很容易将它并行化到所有处理器线程上：只需对我们的计算应用并行调度操作。

In [28]:
# Mark the first axis of C (its only axis, since C is one-dimensional) as parallel. This modifies the schedule s in place.
s[C].parallel(C.op.axis[0])

In [29]:
print(tvm.lower(s, [A, B, C], simple_mode=True)) # simple_mode=True returns a readable C-style representation

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.var("int32")
        stride = T.var("int32")
        A_1 = T.match_buffer(A, (n,), strides=(stride,), type="auto")
        stride_1 = T.var("int32")
        B_1 = T.match_buffer(B, (n,), strides=(stride_1,), type="auto")
        stride_2 = T.var("int32")
        C_1 = T.match_buffer(C, (n,), strides=(stride_2,), type="auto")
        for i in T.parallel(n):
            C_2 = T.Buffer((stride_2 * n,), data=C_1.data, type="auto")
            A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
            B_2 = T.Buffer((stride_1 * n,), data=B_1.data, type="auto")
            C_2[i * stride_2] = A_2[i * stride] + B_2[i * stride_1]


[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.InjectPrefetch
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.TextureFlatten
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.StorageFlatten
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferShapeLegalize
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferStrideLegalize
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.ThreadScopePropagate
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferBindUnwrapper
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.ApplyLayoutTransforms
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.StorageFlattener
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.AssertSimplifier
[14:31:16] /home/patrick/Code/tvm/src/ir/transform.cc:440

In [30]:
# Rebuild from the schedule after parallel() was applied, then run it on the existing arrays a, b, c.
fadd_parallel = tvm.build(s, [A, B, C], tgt, name="myadd_parallel")
fadd_parallel(a, b, c)

# The parallel build must give the same result as NumPy.
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

# Time it and append the result to the shared log under the label "parallel".
evaluate_addition(fadd_parallel, tgt, "parallel", log=log)

parallel: 0.000005


[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.InjectPrefetch
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.TextureFlatten
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.StorageFlatten
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferShapeLegalize
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferStrideLegalize
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.ThreadScopePropagate
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferBindUnwrapper
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.ApplyLayoutTransforms
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.StorageFlattener
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.AssertSimplifier
[14:34:51] /home/patrick/Code/tvm/src/ir/transform.cc:440

现代 CPU 可以对浮点值执行 SIMD 操作，利用这一优势，可将另一个调度应用于计算表达式中。实现这一点需要多个步骤：首先，用拆分（split）调度原语将循环拆分为内部循环和外部循环；然后，用向量化（vectorize）调度原语让内部循环使用 SIMD 指令；最后，用并行（parallel）调度原语对外部循环进行并行化。这里将拆分因子选为 CPU 上的线程数量。

In [31]:
# 重新创建 schedule, 因为前面的例子在并行操作中修改了它
# Re-create the computation and its schedule from scratch, because the
# previous example modified the old schedule when it applied parallel().
n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.placeholder((n,), name="B")
C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")

s = te.create_schedule(C.op)

# Split factor. The guidance followed here is to match it to the number of
# threads appropriate for the CPU; this varies by architecture, but a good
# rule of thumb is the number of available CPU cores.
# NOTE(review): `inner` (extent `factor`) is the axis vectorized below, so
# this value also sets the vector length. Presumably the SIMD lane count is
# the more relevant quantity; confirm.
factor = 4

# Split the single axis, run the outer loop in parallel, and vectorize the inner loop.
outer, inner = s[C].split(C.op.axis[0], factor=factor)
s[C].parallel(outer)
s[C].vectorize(inner)

# Build under its own symbol name. It was previously "myadd_parallel", a
# copy-paste of the name already used for the parallel-only build.
fadd_vector = tvm.build(s, [A, B, C], tgt, name="myadd_vector")

# Time it and append the result to the shared log under the label "vector".
evaluate_addition(fadd_vector, tgt, "vector", log=log)

print(tvm.lower(s, [A, B, C], simple_mode=True))

vector: 0.000010
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.var("int32")
        stride = T.var("int32")
        A_1 = T.match_buffer(A, (n,), strides=(stride,), type="auto")
        stride_1 = T.var("int32")
        B_1 = T.match_buffer(B, (n,), strides=(stride_1,), type="auto")
        stride_2 = T.var("int32")
        C_1 = T.match_buffer(C, (n,), strides=(stride_2,), type="auto")
        for i_outer in T.parallel((n + 3) // 4):
            for i_inner_s in range(4):
                if T.likely(i_outer * 4 + i_inner_s < n):
                    C_2 = T.Buffer((stride_2 * n,), data=C_1.data, type="auto")
                    A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
                    B_2 = T.Buffer((stride_1 * n,), data=B_1.data, type=

[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.InjectPrefetch
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.TextureFlatten
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.StorageFlatten
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferShapeLegalize
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferStrideLegalize
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.ThreadScopePropagate
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.BufferBindUnwrapper
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.ApplyLayoutTransforms
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.StorageFlattener
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440: Running pass tir.AssertSimplifier
[14:35:58] /home/patrick/Code/tvm/src/ir/transform.cc:440

In [32]:
# Relative performance is each timing divided by the timing of the first log entry.
baseline = log[0][1]

row_format = "%s\t%s\t%s"
column_width = 20

# Header row, then one right-justified row per (label, timing) entry.
header = ("Operator", "Timing", "Performance")
print(row_format % tuple(title.rjust(column_width) for title in header))
for label, timing in log:
    cells = (label, str(timing), str(timing / baseline))
    print(row_format % tuple(cell.rjust(column_width) for cell in cells))


            Operator	              Timing	         Performance
               numpy	5.080509999970673e-06	                 1.0
               naive	3.5270999999999997e-06	  0.6942413261700813
            parallel	4.5199000000000006e-06	  0.8896547787576624
              vector	          9.7442e-06	   1.917957055503532


保存和加载已经编译的模块

In [33]:
from tvm.contrib import cc
from tvm.contrib import utils

# Scratch directory that receives every artifact written below.
temp = utils.tempdir()

# Save the compiled module as an object file.
fadd.save(temp.relpath("myadd.o"))

# For cuda / rocm / opencl targets, also save the first imported module
# under a file name specific to that target kind.
target_kind = tgt.kind.name
if target_kind == "cuda":
    fadd.imported_modules[0].save(temp.relpath("myadd.ptx"))
elif target_kind == "rocm":
    fadd.imported_modules[0].save(temp.relpath("myadd.hsaco"))
elif target_kind.startswith("opencl"):
    fadd.imported_modules[0].save(temp.relpath("myadd.cl"))

# Turn the object file into a shared library, then list what was produced.
library_path = temp.relpath("myadd.so")
object_path = temp.relpath("myadd.o")
cc.create_shared(library_path, [object_path])
print(temp.listdir())

['myadd.o', 'myadd.so']


In [34]:
# Load the shared library written to the temp directory.
fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so"))

# Pick the separately saved device file, if the target kind has one.
target_kind = tgt.kind.name
if target_kind == "cuda":
    device_file = "myadd.ptx"
elif target_kind == "rocm":
    device_file = "myadd.hsaco"
elif target_kind.startswith("opencl"):
    device_file = "myadd.cl"
else:
    device_file = None

# Load that device module and import it into the loaded library.
if device_file is not None:
    fadd1_dev = tvm.runtime.load_module(temp.relpath(device_file))
    fadd1.import_module(fadd1_dev)

# The reloaded function must still match the NumPy result.
fadd1(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

In [35]:
# Export the module to myadd_pack.so, load it back, and check the loaded function against the NumPy result.
fadd.export_library(temp.relpath("myadd_pack.so"))
fadd2 = tvm.runtime.load_module(temp.relpath("myadd_pack.so"))
fadd2(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())