In [None]:
import tvm
import numpy as np
from tvm import autotvm, te, tir
from functools import partial, reduce
N, H, W, CI = 1, 28, 28, 64
CO = 64
Y, X, K = N*H*W, CO, 9*CI
sprate = 0.9


In [None]:

dtype = 'float32'

cfg = autotvm.get_config()
cfg.define_split("tile_y", Y, num_outputs=3)
cfg.define_split("tile_x", X, num_outputs=3)
cfg.define_split("tile_k", K, num_outputs=2)
if cfg.is_fallback:
    pass

print("cfg[tile_y]", cfg["tile_y"])
print("cfg[tile_x]", cfg["tile_x"])
print("cfg[tile_k]", cfg["tile_k"])

data = te.placeholder((N, H, W, CI), dtype=dtype)
weight = te.placeholder((X, K), dtype=dtype)
idxsplit = lambda x,y: reduce(lambda a,b: a[:-1]+[a[-1]%b,a[-1]//b], y, [x])

@partial(te.compute, (Y, K), name='im2col')
def im2col(row, col):
    jw, jh, jn = idxsplit(row, [W, H])
    jc, kw, kh = idxsplit(col, [CI, 3])
    ih, iw = jh + kh - 1, jw + kw - 1
    return tir.if_then_else(
        tir.all(0 <= ih, ih < H, 0 <= iw, iw < W),
        data[jn, ih, iw, jc], 0)

#packw_bn = cfg["tile_x"].size[-1]
packw_bn = 3
packw = te.compute((X//packw_bn, K, packw_bn),
    lambda xo, k, xi: weight[xo * packw_bn + xi, k],
    name="packed_weight")

k = te.reduce_axis((0, K), name="k")
C = te.compute((Y, X),
    lambda y, x: te.sum(im2col[y, k] * packw[x//packw_bn, k, x%packw_bn], axis=k),
    name="dense_pack")

s = te.create_schedule(C.op)
CC = s.cache_write(C, "global")
y, x = s[C].op.axis
yt, yo, yi = cfg["tile_y"].apply(s, C, y)
xt, xo, xi = cfg["tile_x"].apply(s, C, x)
s[C].reorder(yt, xt, yo, xo, yi, xi)
#xyt = s[C].fuse(yt, xt)
#s[C].parallel(xyt)
#xyo = s[C].fuse(yo, xo)
s[C].unroll(yi)
s[C].vectorize(xi)

s[CC].compute_at(s[C], xo)
yi, xi = s[CC].op.axis
(k,) = s[CC].op.reduce_axis
ko, ki = cfg["tile_k"].apply(s, CC, k)
s[CC].reorder(ko, ki, yi, xi)
s[CC].vectorize(xi)
s[CC].unroll(yi)
s[CC].unroll(ki)

s[im2col].compute_at(s[C], yo)
yi, k = s[im2col].op.axis
ko, ki = s[im2col].split(k, factor=CI)
s[im2col].vectorize(ki)
#s[im2col].unroll(yi)

xo, k, xi = s[packw].op.axis
s[packw].reorder(xo, xi, k)
#s[packw].parallel(xo)


In [None]:

#data = tvm.nd.array(np.random.rand(N, H, W, CI).astype('float32'))
#weight = tvm.nd.array(np.random.rand(X, K).astype('float32'))

#data_placeholder = te.placeholder(data.shape)
#weight_placeholder = te.placeholder(weight.shape)
#output_placeholder = te.placeholder((CO,Y))


In [None]:

print(tvm.lower(s, [data, weight, C], simple_mode=True))

In [5]:
func = tvm.build(s, [data, weight, C])

In [6]:
data = tvm.nd.array(np.random.rand(N, H, W, CI).astype('float32'))
weight = tvm.nd.array(np.random.rand(X, K).astype('float32'))
output_placeholder = tvm.nd.array(np.zeros((Y,CO)).astype('float32'))  #te.placeholder((CO,Y))


In [7]:
#print(output_placeholder)
args = (data, weight, output_placeholder)

In [8]:
func(*args)

In [9]:
print(output_placeholder)

[[68.4626   64.65777  65.45325  ... 65.20065  62.64582        nan]
 [98.150085 96.98111  96.75384  ... 95.45488  93.135994       nan]
 [98.339935 95.95021  98.749306 ... 94.30134  91.12514        nan]
 ...
 [92.85127  91.81401  90.84366  ... 87.612404 88.13063        nan]
 [91.09684  94.07343  96.49847  ... 89.5788   87.29936        nan]
 [63.668903 64.37031  67.6136   ... 63.435528 62.409897       nan]]


In [10]:
tgtstr = "llvm -mcpu=skylake"
dev = tvm.device(tgtstr, 0)
evt = func.time_evaluator(func.entry_name, dev, number=3)
print(evt(*args).mean)

0.04146727666666667


In [11]:
@autotvm.template('conv_topi_native')
def conv_topi_native(N, H, W, CI, CO, dtype='float32'):
    
    Y, X, K = N*H*W, CO, 9*CI
    
    
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", Y, num_outputs=3)
    cfg.define_split("tile_x", X, num_outputs=3)
    cfg.define_split("tile_k", K, num_outputs=2)
    if cfg.is_fallback:
        pass

    data = te.placeholder((N, H, W, CI), dtype=dtype)
    weight = te.placeholder((X, K), dtype=dtype)
    idxsplit = lambda x,y: reduce(lambda a,b: a[:-1]+[a[-1]%b,a[-1]//b], y, [x])

    @partial(te.compute, (Y, K), name='im2col')
    def im2col(row, col):
        jw, jh, jn = idxsplit(row, [W, H])
        jc, kw, kh = idxsplit(col, [CI, 3])
        ih, iw = jh + kh - 1, jw + kw - 1
        return tir.if_then_else(
            tir.all(0 <= ih, ih < H, 0 <= iw, iw < W),
            data[jn, ih, iw, jc], 0)

    #packw_bn = cfg["tile_x"].size[-1]
    packw_bn = 3
    packw = te.compute((X//packw_bn, K, packw_bn),
        lambda xo, k, xi: weight[xo * packw_bn + xi, k],
        name="packed_weight")

    k = te.reduce_axis((0, K), name="k")
    C = te.compute((Y, X),
        lambda y, x: te.sum(im2col[y, k] * packw[x//packw_bn, k, x%packw_bn], axis=k),
        name="dense_pack")

    s = te.create_schedule(C.op)
    
    CC = s.cache_write(C, "global")
    
    y, x = s[C].op.axis
    yt, yo, yi = cfg["tile_y"].apply(s, C, y)
    xt, xo, xi = cfg["tile_x"].apply(s, C, x)
    s[C].reorder(yt, xt, yo, xo, yi, xi)
    #xyt = s[C].fuse(yt, xt)
    #s[C].parallel(xyt)
    #xyo = s[C].fuse(yo, xo)
    s[C].unroll(yi)
    s[C].vectorize(xi)

    s[CC].compute_at(s[C], xo)
    yi, xi = s[CC].op.axis
    (k,) = s[CC].op.reduce_axis
    ko, ki = cfg["tile_k"].apply(s, CC, k)
    s[CC].reorder(ko, ki, yi, xi)
    s[CC].vectorize(xi)
    s[CC].unroll(yi)
    s[CC].unroll(ki)
    
    s[im2col].compute_at(s[C], yo)
    yi, k = s[im2col].op.axis
    ko, ki = s[im2col].split(k, factor=CI)
    s[im2col].vectorize(ki)
    #s[im2col].unroll(yi)

    xo, k, xi = s[packw].op.axis
    s[packw].reorder(xo, xi, k)
    #s[packw].parallel(xo)
    
    
    return s, [data, weight, C]




In [None]:
import logging

N, H, W = 10, 512, 512

convshape = [
    (N, C, H * 32 // C, W * 32 // C)
    for C in [64]#, 128, 256, 512]
]

with open('conv_topi_native.dbg', 'w') as logfile:
    logger = logging.getLogger("autotvm")
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.StreamHandler(logfile))
    
    N, C, H, W = convshape[0] 
    
    task = autotvm.task.create('conv_topi_native',
                           args=(N, H, W, C, C, 'float32'),
                           target="llvm -mcpu=skylake")
    print(task.config_space, file=logfile)

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=4, repeat=3, timeout=20))
    tuner = autotvm.tuner.GATuner(task)
    tuner.tune(
        n_trial=3,
        measure_option=measure_option,
        callbacks=[autotvm.callback.log_to_file("conv_topi_native.log")],
    )
    

In [13]:
import numpy as np
with autotvm.apply_history_best("conv_topi_native.log"):
    with tvm.target.Target("llvm -mcpu=skylake"):
        dev = tvm.device("llvm -mcpu=skylake", 0)
        N, C, H, W = convshape[0] 
        s, arg_bufs = conv_topi_native(N, H, W, C, C,"float32")
        func = tvm.build(s, arg_bufs)
        
        args = [
        tvm.nd.array(
            np.random.rand(*[a.value for a in p.shape]).astype('float32'),
            dev)
        for p in arg_bufs]
        
        #func(*args)
        
        #evt = func.time_evaluator(func.entry_name, dev, number=3)
        
        
        #print(evt(*args).mean)
        

--- Logging error ---
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/nc/lib/python3.7/logging/__init__.py", line 1028, in emit
    stream.write(msg + self.terminator)
ValueError: I/O operation on closed file.
Call stack:
  File "/home/ubuntu/anaconda3/envs/nc/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ubuntu/anaconda3/envs/nc/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/anaconda3/envs/nc/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/anaconda3/envs/nc/lib/python3.7/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "/home/ubuntu/anaconda3/envs/nc/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 677, in start
    self.io_loop.start()
  File "/home/ubuntu/.local/lib/python3.7/site-packages/tornado-6.1-py3.7-linux-x86_64.egg/tor