In [None]:
%matplotlib inline


# MicroTVM Host-Driven Example

**Author**: [Andrew Reusch](mailto:areusch@octoml.ai)



TVMConf 2020

This tutorial walks you through the process of deploying a model on-device using microTVM.
We'll use a model adapted from ARM's pre-quantized [CIFAR10-CNN tutorial](https://github.com/ARM-software/ML-examples/tree/master/cmsisnn-cifar10/models) and run it on an ARM M-class microcontroller.

Importing the Model
-------------------

We'll use the utilities in the microtvm blogpost repo to import and load the model:



In [2]:
import os

import onnx
import tvm
import tvm.relay

import micro_eval
from micro_eval import model

# Locate the root of the microtvm blogpost checkout: three directory levels
# above the file that the micro_eval package was imported from.
microtvm_blogpost_path = os.path.realpath(
  os.path.join(micro_eval.__file__, '..', '..', '..'))
config_path = os.path.join(
  microtvm_blogpost_path, 'data', 'cifar10-config-validate.json')

# The spec string combines the model name, the 'micro_dev' setting and the
# path of the JSON config file.
micro_model_spec = f'cifar10_cnn:micro_dev:{config_path}'
model_inst, _ = model.instantiate_from_spec(micro_model_spec)

compiled_model = model_inst.build_model()

# Pull out the two pieces used by the rest of the notebook.
relay_model = compiled_model.ir_mod
params = compiled_model.params

Great! We now have a Relay model and accompanying parameters. Let's take a look:



In [3]:
# Print the textual form of the model taken from `compiled_model.ir_mod`.
print(relay_model)

def @main(%data: Tensor[(1, 32, 32, 4), uint8], %mean_data: Tensor[(1, 32, 32, 4), uint8], %conv0_weight: Tensor[(5, 5, 32, 4), int8], %conv0_bias: Tensor[(32), int8], %conv1_weight: Tensor[(5, 5, 32, 32), int8], %conv1_bias: Tensor[(32), int8], %conv2_weight: Tensor[(5, 5, 64, 32), int8], %conv2_bias: Tensor[(64), int8], %dense0_weight: Tensor[(10, 1024), int8], %dense0_bias: Tensor[(10), int8]) -> Tensor[(1, 10), int8] {
  %0 = cast(%data, dtype="int16") /* from_string */ /* ty=Tensor[(1, 32, 32, 4), int16] */;
  %1 = cast(%mean_data, dtype="int16") /* from_string */ /* ty=Tensor[(1, 32, 32, 4), int16] */;
  %2 = subtract(%0, %1) /* from_string */ /* ty=Tensor[(1, 32, 32, 4), int16] */;
  %3 = cast(%2, dtype="int8") /* from_string */ /* ty=Tensor[(1, 32, 32, 4), int8] */;
  %4 = nn.conv2d(%3, %conv0_weight, padding=[2, 2], channels=32, kernel_size=[5, 5], data_layout="NHWC", kernel_layout="HWOI", out_dtype="int32") /* from_string */ /* ty=Tensor[(1, 32, 32, 32), int32] */;
  %5 = cas

And we now have some parameters:



In [4]:
# List every model parameter by name together with its tensor shape.
print('Parameters')
for param_name, param_value in params.items():
  print(f' * {param_name}: {param_value.shape}')

Parameters
 * mean_data: (1, 32, 32, 4)
 * conv0_weight: (5, 5, 32, 4)
 * conv0_bias: (32,)
 * conv1_weight: (5, 5, 32, 32)
 * conv1_bias: (32,)
 * conv2_weight: (5, 5, 64, 32)
 * conv2_bias: (64,)
 * dense0_weight: (10, 1024)
 * dense0_bias: (10,)


Compiling the Model
-------------------

Let's now run the TVM compiler. This step will lower the model all the way through the TIR to C.
First, we need to define the target we will use with TVM:



In [5]:
# Compilation target for the model. The target string is unchanged; it is
# parsed with the `tvm.target.Target` constructor instead of the deprecated
# `tvm.target.target.create()` helper.
TARGET = tvm.target.Target(
  'c -keys=arm_cpu -mcpu=cortex-m7 -link-params -model=stm32f746xx -runtime=c -system-lib=1')

This target has a few parts:

 - ```-keys=arm_cpu```: Enables operator schedules used on ARM cpus
 - ```-mcpu=cortex-m7```: Specifies the CPU we will use with this model.
 - ```-link-params```: Link supplied model parameters as constants in the generated code.
 - ```-model=stm32f746xx```: Hint to the compiler of the CPU model. Mostly unused at this time.
 - ```-runtime=c```: Build code for the TVM C runtime (i.e. the bare-metal compatible one).
 - ```-system-lib```: Build a "system library." In deployments, the system library is pre-loaded into
   the runtime, rather than a library that needs to be loaded e.g. from a file. This is the simplest
   configuration for a bare-metal microcontroller, so we use it here.

Now we can run the compiler:



In [6]:
# Build at optimization level 3 with the 'tir.disable_vectorize' option set.
build_pass_ctx = tvm.transform.PassContext(
  opt_level=3,
  config={'tir.disable_vectorize': True},
)

# NOTE: `compiled_model` is rebound here. From this cell on it refers to the
# module returned by tvm.relay.build, not the earlier build_model() result.
with build_pass_ctx:
  graph_json, compiled_model, simplified_params = tvm.relay.build(
    relay_model,
    target=TARGET,
    params=params,
  )

Cannot find config for target=c -keys=arm_cpu,cpu -link-params=1 -mcpu=cortex-m7 -model=stm32f746xx -runtime=c -system-lib=1, workload=('conv2d_direct_simd.arm_cpu', ('TENSOR', (1, 32, 32, 4), 'int8'), ('TENSOR', (5, 5, 32, 4), 'int8'), (1, 1), (2, 2), (1, 1), 'int32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=c -keys=arm_cpu,cpu -link-params=1 -mcpu=cortex-m7 -model=stm32f746xx -runtime=c -system-lib=1, workload=('conv2d_direct_simd.arm_cpu', ('TENSOR', (1, 16, 16, 32), 'int8'), ('TENSOR', (5, 5, 32, 32), 'int8'), (1, 1), (2, 2), (1, 1), 'int32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=c -keys=arm_cpu,cpu -link-params=1 -mcpu=cortex-m7 -model=stm32f746xx -runtime=c -system-lib=1, workload=('conv2d_direct_simd.arm_cpu', ('TENSOR', (1, 8, 8, 32), 'int8'), ('TENSOR', (5, 5, 64, 32), 'int8'), (1, 1), (2, 2), (1, 1), 'int32'). A fallback configuration 

Now we've lowered our model into C. Let's look at a fragment:



In [7]:
# Show only the first 800 lines of the generated source to keep the output
# manageable.
generated_source_lines = compiled_model.get_source().split('\n')
print('\n'.join(generated_source_lines[:800]))

// tvm target: c -keys=arm_cpu,cpu -link-params=1 -mcpu=cortex-m7 -model=stm32f746xx -runtime=c -system-lib=1
#include "tvm/runtime/c_runtime_api.h"
#include "tvm/runtime/c_backend_api.h"
#include <math.h>
void* __tvm_module_ctx = NULL;

#ifdef __cplusplus
extern "C"
#endif
#include <arm_math.h>
#include <arm_nnsupportfunctions.h>

__STATIC_FORCEINLINE int32_t gemm_1x4x1_body_TCRDKLKK(
    int8_t *aa, int8_t *bb, int32_t *cc,
    int A_stride, int B_stride, int C_stride) {
  int16_t aa_pad[4];
  int16_t bb_pad[4];

  for (int i = 0; i < 1; i++) {
    for (int j = 0; j < 4 / 4; j++) {
      read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*4 + j*4], (int32_t*) &aa_pad[i*4 + j*4 + 2]);
    }
  }

  for (int i = 0; i < 1; i++) {
    for (int j = 0; j < 4 / 4; j++) {
      read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*4 + j*4], (int32_t*) &bb_pad[i*4 + j*4 + 2]);
    }
  }

  for (int i = 0; i < 1; i++) {
    for (int j = 0; j < 1; j++) {
      int32_t sum = 0;
      for 

Let's also look at the simplified parameters:



In [8]:
# List the parameters returned by tvm.relay.build, one "name: shape" per line.
print('Simplified Parameters')
for param_name, param_value in simplified_params.items():
  print(f' * {param_name}: {param_value.shape}')

# Then dump the graph JSON produced by the same build.
print(graph_json)

Simplified Parameters
 * p7: (10, 1024)
 * p0: (1, 32, 32, 4)
 * p8: (10,)
 * p1: (5, 5, 32, 4)
 * p4: (1, 1, 1, 32)
 * p2: (1, 1, 1, 32)
 * p3: (5, 5, 32, 32)
 * p5: (5, 5, 64, 32)
 * p6: (1, 1, 1, 64)
{
  "nodes": [
    {
      "op": "null", 
      "name": "data", 
      "inputs": []
    }, 
    {
      "op": "null", 
      "name": "p0", 
      "inputs": []
    }, 
    {
      "op": "tvm_op", 
      "name": "fused_cast_subtract_cast", 
      "attrs": {
        "num_outputs": "1", 
        "num_inputs": "2", 
        "func_name": "fused_cast_subtract_cast", 
        "flatten_data": "0"
      }, 
      "inputs": [
        [
          0, 
          0, 
          0
        ], 
        [
          1, 
          0, 
          0
        ]
      ]
    }, 
    {
      "op": "null", 
      "name": "p1", 
      "inputs": []
    }, 
    {
      "op": "null", 
      "name": "p2", 
      "inputs": []
    }, 
    {
      "op": "tvm_op", 
      "name": "fused_nn_conv2d_add_right_shift_cast", 
      

Building a host-driven binary
-----------------------------

First we'll build a firmware binary that can be controlled using an attached host computer over
UART. This is a great way to try out the network while writing minimal firmware, and it's also
how autotuning is accomplished. We'll re-use the compilation flow we use with autotuning:



In [9]:
from tvm.micro.contrib import zephyr

# Start from the compiler options supplied by the model instance and set the
# library build's CMake arguments to request verbose makefiles.
opts = model_inst.get_micro_compiler_opts()
opts['lib_opts']['cmake_args'] = ['-DCMAKE_VERBOSE_MAKEFILE=1']

# Instantiate the compiler, pointing it at the Zephyr runtime directory inside
# the blogpost repo.
zephyr_runtime_dir = os.path.join(microtvm_blogpost_path, 'runtimes', 'zephyr')
compiler = zephyr.ZephyrCompiler(
  zephyr_runtime_dir,
  board='nucleo_f746zg',
  zephyr_toolchain_variant='zephyr',
)

# A Workspace is a directory that holds compiled libraries.
workspace = tvm.micro.Workspace(debug=True)

# Build the micro-binary, which represents the final firmware image.
micro_bin = tvm.micro.build_static_runtime(
  workspace, compiler, compiled_model, **opts)

In [10]:
print(os.path.join(micro_bin.base_dir, micro_bin.binary_file))
!~/zephyr-sdk/arm-zephyr-eabi/bin/arm-zephyr-eabi-size {os.path.join(micro_bin.base_dir, micro_bin.binary_file)}

/tmp/tvm-debug-mode-tempdirs/2020-12-01T13-34-55___g5zt24cg/00000/build/runtime/zephyr/zephyr.elf
   text	   data	    bss	    dec	    hex	filename
 132288	   1052	 256676	 390016	  5f380	/tmp/tvm-debug-mode-tempdirs/2020-12-01T13-34-55___g5zt24cg/00000/build/runtime/zephyr/zephyr.elf


Generating test data
--------------------

Now we'll generate some test data to be used to demonstrate inference. We use the dataset
generator from the microtvm blog post:



In [11]:
from micro_eval import dataset

# Create a 'cifar10' dataset generator with shuffling turned off, then request
# samples from it with generate(1).
dataset_gen = dataset.DatasetGenerator.instantiate('cifar10', {'shuffle': False})
samples = dataset_gen.generate(1)

# Adapt the first sample's inputs as needed to accommodate the modified input
# shape.
first_sample = samples[0]
inputs = model_inst.adapt_sample_inputs(first_sample.inputs)

In [12]:
# Show the contents of the adapted 'data' input.
print(inputs['data'].data)

[[[[158 112  49   0]
   [159 111  47   0]
   [165 116  51   0]
   ...
   [137  95  36   0]
   [126  91  36   0]
   [116  85  33   0]]

  [[152 112  51   0]
   [151 110  40   0]
   [159 114  45   0]
   ...
   [136  95  31   0]
   [125  91  32   0]
   [119  88  34   0]]

  [[151 110  47   0]
   [151 109  33   0]
   [158 111  36   0]
   ...
   [139  98  34   0]
   [130  95  34   0]
   [120  89  33   0]]

  ...

  [[ 68 124 177   0]
   [ 42 100 148   0]
   [ 31  88 137   0]
   ...
   [ 38  97 146   0]
   [ 13  64 108   0]
   [ 40  85 127   0]]

  [[ 61 116 168   0]
   [ 49 102 148   0]
   [ 35  85 132   0]
   ...
   [ 26  82 130   0]
   [ 29  82 126   0]
   [ 20  64 107   0]]

  [[ 54 107 160   0]
   [ 56 105 149   0]
   [ 45  89 132   0]
   ...
   [ 24  77 124   0]
   [ 34  84 129   0]
   [ 21  67 110   0]]]]


Flashing and Running
--------------------

Now we'll flash the binary onto an attached development board and establish communication.



In [13]:
# Open a session for the firmware binary, using the flasher provided by the
# compiler. Everything that talks to the device happens inside this `with`.
with tvm.micro.Session(binary=micro_bin, flasher=compiler.flasher()) as sess:
  # Create a graph runtime from the graph JSON, the session's system library
  # and the session's context.
  mod = tvm.micro.create_local_graph_runtime(graph_json, sess.get_system_lib(), sess.context)
  mod.set_input('data', inputs['data'].data)  # NOTE: the simplified params are set from flash.
  mod.run()

  # Convert the first output to a NumPy array before the session closes, so it
  # can still be used after the `with` block.
  micro_output = mod.get_output(0).asnumpy()

print('micro:', micro_output)

  doc = yaml.load(f)


micro: [[  4 -21  17  22  18   4   7 -13  -8 -13]]


Checking our work
-----------------

We can check the output from executing on-device against output from the host.



In [14]:
# Build and run the same network on the host CPU, then compare its output with
# the on-device result.
#
# Everything created here uses a `cpu_` prefix so that this cell does not
# overwrite `model_inst` / `compiled_model` from the microTVM flow above.
# Re-running an earlier cell after this one therefore still sees the
# micro_dev objects.
cpu_model_inst, _ = model.instantiate_from_spec(f'cifar10_cnn:cpu:{config_path}')

cpu_compiled_model = cpu_model_inst.build_model()

cpu_relay_model, cpu_params = cpu_compiled_model.ir_mod, cpu_compiled_model.params
cpu_inputs = cpu_model_inst.adapt_sample_inputs(samples[0].inputs)

# Build for the host with the "llvm" target; the AlterOpLayout pass is disabled.
with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
  cpu_graph_json, cpu_mod, cpu_simplified_params = tvm.relay.build(
    cpu_relay_model, target="llvm", params=cpu_params)

# Unlike the on-device run, the parameters are passed in explicitly here.
graph_mod = tvm.contrib.graph_runtime.create(cpu_graph_json, cpu_mod, tvm.cpu(0))
graph_mod.set_input('data', cpu_inputs['data'].data, **cpu_simplified_params)
graph_mod.run()
cpu_output = graph_mod.get_output(0)

print('cpu:', cpu_output)

# Actually check our work: the host result must match the on-device result
# element for element.
cpu_output_np = cpu_output.asnumpy()
assert micro_output.shape == cpu_output_np.shape and (micro_output == cpu_output_np).all(), (
  f'on-device output {micro_output} does not match host output {cpu_output_np}')

Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_nopack.x86', ('TENSOR', (1, 1024), 'int16'), ('TENSOR', (10, 1024), 'int8'), None, 'int16'). A fallback configuration is used, which may bring great performance regression.
For x86 target, NCHW layout is recommended for conv2d.
For x86 target, NCHW layout is recommended for conv2d.
For x86 target, NCHW layout is recommended for conv2d.
  # Remove the CWD from sys.path while we load stuff.


cpu: [[  4 -21  17  22  18   4   7 -13  -8 -13]]
