In [1]:
import tvm
import numpy as np
from tvm import autotvm, te, tir
from functools import partial, reduce
N, H, W, CI = 1, 28, 28, 64
CO = 64
Y, X, K = N*H*W, CO, 9*CI
#sprate = 0.9

In [2]:
from IPython.core.magic import register_cell_magic
import subprocess
import argparse
import sys
import os
@register_cell_magic
def withsave(line, cell):
    parser = argparse.ArgumentParser()
    parser.add_argument('fname')
    parser.add_argument('-f', '--force', action='store_true')
    parser.add_argument('-a', '--append', action='store_true')
    parser.add_argument('--subp')
    parser.add_argument('--no-capout', action='store_false', dest='capout')
    args = parser.parse_args(line.split())
    if not args.fname.endswith('.py'):
        args.fname += '.py'
    assert not (args.force and args.append)
    if os.path.exists(args.fname) and not (args.force or args.append):
        raise FileExistsError(args.fname)
    with open(args.fname, 'a' if args.append else 'w') as f:
        f.write(cell)
    if not args.subp:
        get_ipython().run_cell(cell)
    else:
        get_ipython().user_ns[args.subp] = subprocess.run(
            [sys.executable, args.fname],
            capture_output=args.capout
        )

In [15]:

dtype = 'float32'

cfg = autotvm.get_config()
cfg.define_split("tile_y", Y, num_outputs=3)
cfg.define_split("tile_x", X, num_outputs=3)
cfg.define_split("tile_k", K, num_outputs=2)
if cfg.is_fallback:
    pass

print("cfg[tile_y]", cfg["tile_y"])
print("cfg[tile_x]", cfg["tile_x"])
print("cfg[tile_k]", cfg["tile_k"])

data = te.placeholder((N, H, W, CI), dtype=dtype)
weight = te.placeholder((X, K), dtype=dtype)
idxsplit = lambda x,y: reduce(lambda a,b: a[:-1]+[a[-1]%b,a[-1]//b], y, [x])

@partial(te.compute, (Y, K), name='im2col')
def im2col(row, col):
    jw, jh, jn = idxsplit(row, [W, H])
    jc, kw, kh = idxsplit(col, [CI, 3])
    ih, iw = jh + kh - 1, jw + kw - 1
    return tir.if_then_else(
        tir.all(0 <= ih, ih < H, 0 <= iw, iw < W),
        data[jn, ih, iw, jc], 0)

#packw_bn = cfg["tile_x"].size[-1]
packw_bn = 3
packw = te.compute((X//packw_bn, K, packw_bn),
    lambda xo, k, xi: weight[xo * packw_bn + xi, k],
    name="packed_weight")

k = te.reduce_axis((0, K), name="k")
C = te.compute((Y, X),
    lambda y, x: te.sum(im2col[y, k] * packw[x//packw_bn, k, x%packw_bn], axis=k),
    name="dense_pack")

s = te.create_schedule(C.op)
"""
CC = s.cache_write(C, "global")
y, x = s[C].op.axis
yt, yo, yi = cfg["tile_y"].apply(s, C, y)
xt, xo, xi = cfg["tile_x"].apply(s, C, x)
s[C].reorder(yt, xt, yo, xo, yi, xi)
#xyt = s[C].fuse(yt, xt)
#s[C].parallel(xyt)
#xyo = s[C].fuse(yo, xo)
s[C].unroll(yi)
s[C].vectorize(xi)

s[CC].compute_at(s[C], xo)
yi, xi = s[CC].op.axis
(k,) = s[CC].op.reduce_axis
ko, ki = cfg["tile_k"].apply(s, CC, k)
s[CC].reorder(ko, ki, yi, xi)
s[CC].vectorize(xi)
s[CC].unroll(yi)
s[CC].unroll(ki)

s[im2col].compute_at(s[C], yo)
yi, k = s[im2col].op.axis
ko, ki = s[im2col].split(k, factor=CI)
s[im2col].vectorize(ki)
#s[im2col].unroll(yi)

xo, k, xi = s[packw].op.axis
s[packw].reorder(xo, xi, k)
#s[packw].parallel(xo)"""


cfg[tile_y] [-1, 1, 1]
cfg[tile_x] [-1, 1, 1]
cfg[tile_k] [-1, 1]


'\nCC = s.cache_write(C, "global")\ny, x = s[C].op.axis\nyt, yo, yi = cfg["tile_y"].apply(s, C, y)\nxt, xo, xi = cfg["tile_x"].apply(s, C, x)\ns[C].reorder(yt, xt, yo, xo, yi, xi)\n#xyt = s[C].fuse(yt, xt)\n#s[C].parallel(xyt)\n#xyo = s[C].fuse(yo, xo)\ns[C].unroll(yi)\ns[C].vectorize(xi)\n\ns[CC].compute_at(s[C], xo)\nyi, xi = s[CC].op.axis\n(k,) = s[CC].op.reduce_axis\nko, ki = cfg["tile_k"].apply(s, CC, k)\ns[CC].reorder(ko, ki, yi, xi)\ns[CC].vectorize(xi)\ns[CC].unroll(yi)\ns[CC].unroll(ki)\n\ns[im2col].compute_at(s[C], yo)\nyi, k = s[im2col].op.axis\nko, ki = s[im2col].split(k, factor=CI)\ns[im2col].vectorize(ki)\n#s[im2col].unroll(yi)\n\nxo, k, xi = s[packw].op.axis\ns[packw].reorder(xo, xi, k)\n#s[packw].parallel(xo)'

In [16]:

#data = tvm.nd.array(np.random.rand(N, H, W, CI).astype('float32'))
#weight = tvm.nd.array(np.random.rand(X, K).astype('float32'))

#data_placeholder = te.placeholder(data.shape)
#weight_placeholder = te.placeholder(weight.shape)
#output_placeholder = te.placeholder((CO,Y))


In [17]:

print(tvm.lower(s, [data, weight, C], simple_mode=True))

@main = primfn(placeholder_2: handle, placeholder_3: handle, dense_pack_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {dense_pack: Buffer(dense_pack_2: Pointer(float32), float32, [784, 64], []),
             placeholder_1: Buffer(placeholder_4: Pointer(float32), float32, [64, 576], []),
             placeholder: Buffer(placeholder_5: Pointer(float32), float32, [1, 28, 28, 64], [])}
  buffer_map = {placeholder_2: placeholder, placeholder_3: placeholder_1, dense_pack_1: dense_pack} {
  allocate(im2col: Pointer(global float32), float32, [451584]), storage_scope = global;
  allocate(packed_weight: Pointer(global float32), float32, [36288]), storage_scope = global {
    for (row: int32, 0, 784) {
      for (col: int32, 0, 576) {
        im2col[((row*576) + col)] = @tir.if_then_else(((((1 <= (floordiv(col, 192) + floordiv(row, 28))) && ((floordiv(col, 192) + floordiv(row, 28)) < 29)) && (1 <= (floordiv(floormod(col, 192)

In [18]:
func = tvm.build(s, [data, weight, C])

In [19]:
data = tvm.nd.array(np.random.rand(N, H, W, CI).astype('float32'))
weight = tvm.nd.array(np.random.rand(X, K).astype('float32'))
output_placeholder = tvm.nd.array(np.zeros((Y,CO)).astype('float32'))  #te.placeholder((CO,Y))


In [20]:
#print(output_placeholder)
args = (data, weight, output_placeholder)

In [21]:
func(*args)

In [22]:
print(output_placeholder)
print(output_placeholder.shape)

[[   66.64377     64.6672      63.355003 ...    63.823936    66.084526
   6850.0044  ]
 [   96.17441     93.44349     95.512024 ...    91.52598    100.0269
  10621.691   ]
 [  103.258026    94.276474    98.07382  ...    98.906296   101.88141
  11890.803   ]
 ...
 [   90.22754     86.93985     93.22631  ...    88.73153     94.04543
   6047.695   ]
 [   92.263985    84.32659     94.734375 ...    88.18103     91.66894
   4771.3765  ]
 [   56.017586    56.140312    61.848854 ...    56.439342    57.972046
   2970.314   ]]
(784, 64)


In [24]:
tgtstr = "llvm -mcpu=skylake"
dev = tvm.device(tgtstr, 0)
evt = func.time_evaluator(func.entry_name, dev, number=10)
print(evt(*args).mean)

0.040976574599999996


In [25]:
@autotvm.template('conv2d_3x3_gemm')
def conv2d_3x3_gemm(N, H, W, CI, CO, dtype='float32'):
    '''# My Conv2d_3x3_gemm
                
        yt, xt, yo =>
            yi, k9, ci:vec =>
                @im2col = {yt, yo, yi}/y, {k9, ci}/k
            xo =>
                ko, ki:unroll, yi:unroll, xi:vec =>
                    @ccache = {yt, yo, yi}/y, {xt, xo, xi}/x  // {ko, ki}k
                yi:unroll, xi:vec =>
                    @cout = {yt, yo, yi}/y, {xt, xo, xi}/x
    '''
    Y, X, K = N*H*W, CO, 9*CI
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", Y, num_outputs=3)
    cfg.define_split("tile_x", X, num_outputs=3)
    cfg.define_split("tile_k", K, num_outputs=2)
    if cfg.is_fallback:
        pass

    data = te.placeholder((N, H, W, CI), dtype=dtype)
    weight = te.placeholder((X, K), dtype=dtype)
    idxsplit = lambda x,y: reduce(lambda a,b: a[:-1]+[a[-1]%b,a[-1]//b], y, [x])

    @partial(te.compute, (Y, K), name='im2col')
    def im2col(row, col):
        jw, jh, jn = idxsplit(row, [W, H])
        jc, kw, kh = idxsplit(col, [CI, 3])
        ih, iw = jh + kh - 1, jw + kw - 1
        return tir.if_then_else(
            tir.all(0 <= ih, ih < H, 0 <= iw, iw < W),
            data[jn, ih, iw, jc], 0)
    
    packw_bn = cfg["tile_x"].size[-1]
    packw = te.compute((X//packw_bn, K, packw_bn),
        lambda xo, k, xi: weight[xo * packw_bn + xi, k],
        name="packed_weight")
    
    k = te.reduce_axis((0, K), name="k")
    C = te.compute((Y, X),
        lambda y, x: te.sum(im2col[y, k] * packw[x//packw_bn, k, x%packw_bn], axis=k),
        name="dense_pack")
    
    s = te.create_schedule(C.op)
    CC = s.cache_write(C, "global")
    
    y, x = s[C].op.axis
    yt, yo, yi = cfg["tile_y"].apply(s, C, y)
    xt, xo, xi = cfg["tile_x"].apply(s, C, x)
    s[C].reorder(yt, xt, yo, xo, yi, xi)
    #xyt = s[C].fuse(yt, xt)
    #s[C].parallel(xyt)
    #xyo = s[C].fuse(yo, xo)
    s[C].unroll(yi)
    s[C].vectorize(xi)

    s[CC].compute_at(s[C], xo)
    yi, xi = s[CC].op.axis
    (k,) = s[CC].op.reduce_axis
    ko, ki = cfg["tile_k"].apply(s, CC, k)
    s[CC].reorder(ko, ki, yi, xi)
    s[CC].vectorize(xi)
    s[CC].unroll(yi)
    s[CC].unroll(ki)
    
    s[im2col].compute_at(s[C], yo)
    yi, k = s[im2col].op.axis
    ko, ki = s[im2col].split(k, factor=CI)
    s[im2col].vectorize(ki)
    #s[im2col].unroll(yi)

    xo, k, xi = s[packw].op.axis
    s[packw].reorder(xo, xi, k)
    #s[packw].parallel(xo)
    return s, [data, weight, C]


ValueError: Customized func is already registered in autoTVM task conv2d_3x3_gemm

In [26]:
import logging

N, H, W, C = 1, 28, 28, 64

#with open('conv2d_3x3_gemm.dbg', 'w') as logfile:
    
logging.getLogger("autotvm").setLevel(logging.DEBUG)
logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))

task = autotvm.task.create('conv2d_3x3_gemm',
                       args=(N, H, W, C, C, 'float32'),
                       target="llvm -mcpu=skylake")
print(task.config_space)

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=5))
tuner = autotvm.tuner.GATuner(task)
tuner.tune(
    n_trial=5,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("conv2d_3x3_gemm.log")],
)

print("done")
    

ConfigSpace (len=52920, space_map=
   0 tile_y: Split(policy=factors, product=784, num_outputs=3) len=90
   1 tile_x: Split(policy=factors, product=64, num_outputs=3) len=28
   2 tile_k: Split(policy=factors, product=576, num_outputs=2) len=21
)
Get devices for measurement successfully!
Get devices for measurement successfully!
No: 1	GFLOPS: 31.52/31.52	result: MeasureResult(costs=(0.0022493258,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4011459350585938, timestamp=1646736575.5814383)	[('tile_y', [-1, 14, 14]), ('tile_x', [-1, 1, 64]), ('tile_k', [-1, 32])],None,30207
No: 1	GFLOPS: 31.52/31.52	result: MeasureResult(costs=(0.0022493258,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4011459350585938, timestamp=1646736575.5814383)	[('tile_y', [-1, 14, 14]), ('tile_x', [-1, 1, 64]), ('tile_k', [-1, 32])],None,30207
No: 2	GFLOPS: 49.16/49.16	result: MeasureResult(costs=(0.0014421867999999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5314652919769287, timestamp=1646736575.8333473)

In [27]:
import numpy as np
tgtstr = "llvm -mcpu=skylake"
dev = tvm.device(tgtstr, 0)
with autotvm.apply_history_best("conv2d_3x3_gemm.log"):
    with tvm.target.Target(tgtstr):
        print("load log ok")
        
        N, C, H, W = 1,64,28,28
        s, arg_bufs = conv2d_3x3_gemm(N, H, W, C, C,"float32")
        func = tvm.build(s, arg_bufs)
        
        args = [
        tvm.nd.array(
            np.random.rand(*[a.value for a in p.shape]).astype('float32'))
        for p in arg_bufs]
        
        func(*args)
        
        evt = func.time_evaluator(func.entry_name, dev, number=3)
        
        
        print(evt(*args).mean)
        

Finish loading 37 records
Finish loading 37 records
load log ok
0.0011130223333333333
