In [1]:
# mre 2020-11-25

import numpy as np

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 

import tensorflow as tf

# Helper for running interpreter

In [2]:
# run interpreter on random input
def test(interpreter):
    interpreter.allocate_tensors()

    input_details  = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    input_shape = input_details[0]['shape']
    input_data  = np.array(np.random.random_sample(input_shape), dtype=np.float32)
    interpreter.set_tensor(input_details[0]['index'], input_data)

    interpreter.invoke()
    
    output_data = interpreter.get_tensor(output_details[0]['index'])
    return output_data


# run interpreter on real dataset data
def run(interpreter, data):
    """Feed every sample of ``data`` through the interpreter, one at a time.

    Each sample is turned into a tensor of shape (1, 1), written to the first
    input tensor, and the element ``[0][0]`` of the first output tensor is
    collected. The collected values are returned as a NumPy array.
    """
    interpreter.allocate_tensors()

    in_idx  = interpreter.get_input_details()[0]["index"]
    out_idx = interpreter.get_output_details()[0]["index"]

    def infer_one(sample):
        # The sample is reshaped to (1, 1) before it is handed to the interpreter.
        interpreter.set_tensor(in_idx, tf.constant(sample, shape=(1,1)))
        interpreter.invoke()
        read_output = interpreter.tensor(out_idx)
        return read_output()[0][0]

    return np.array([infer_one(sample) for sample in data])

# Helper uint8

In [3]:
# convert float to uint8
def toUint8(interpreter, xx):
    """Quantize float samples to the uint8 representation of the model input.

    Parameters
    ----------
    interpreter
        Object whose ``get_input_details()[0]['quantization']`` is the
        ``(scale, zero_point)`` pair of the first input tensor.
    xx
        Iterable of float samples (scalars or arrays) to quantize.

    Returns
    -------
    list
        One ``np.uint8`` value/array per sample, computed as
        ``round(x / scale + zero_point)`` and saturated to ``[0, 255]``.
    """
    scale, zero_point = interpreter.get_input_details()[0]['quantization']
    # Quantize the samples passed in as ``xx`` (not a notebook-global list) and
    # saturate at the uint8 limits so out-of-range inputs do not wrap around.
    return [np.uint8(np.clip(np.round(x / scale + zero_point), 0, 255)) for x in xx]


# run uint8-Model with hand-made IO-conversion
def runUint8(interpreter, x):
    """
    Run a uint8 model on float input with manual IO conversion.

    float x -> toUint8 -> run (uint8 in, uint8 out) -> dequantized float,
    where dequantization uses the (scale, zero_point) pair of the first
    output tensor: y = scale * (y8 - zero_point).
    """
    quantized_in  = toUint8(interpreter, x)
    quantized_out = np.array(run(interpreter, quantized_in))

    scale, zero_point = interpreter.get_output_details()[0]['quantization']

    # Convert to float32 first so the zero-point subtraction cannot wrap in uint8.
    return scale * (np.float32(quantized_out) - zero_point)

# Helper for TPU execution

In [4]:
import platform
import tflite_runtime.interpreter as tflite

# File name of the Edge TPU shared library for the operating system reported
# by platform.system(); any other system name raises KeyError.
EDGETPU_SHARED_LIB = dict(
    Linux='libedgetpu.so.1',
    Darwin='libedgetpu.1.dylib',
    Windows='edgetpu.dll',
)[platform.system()]


def make_interpreter(model_file):
  """Create a TFLite interpreter that loads the Edge TPU delegate.

  ``model_file`` is either a plain model path or ``"<path>@<device>"``. When an
  ``@`` is present, the segment following the first ``@`` is passed to the
  delegate as its ``'device'`` option; otherwise no options are passed.
  """
  path, *device_parts = model_file.split('@')
  delegate_options = {'device': device_parts[0]} if device_parts else {}
  edgetpu_delegate = tflite.load_delegate(EDGETPU_SHARED_LIB, delegate_options)
  return tflite.Interpreter(model_path=path,
                            experimental_delegates=[edgetpu_delegate])

# Definition of (concrete) function and data

In [5]:
@tf.function
def cf(a):
  """Return a*a*a + 1, computed with multiplications and one addition."""
  cube = a * a * a
  return cube + 1

cf(tf.ones([2, 2]))

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[2., 2.],
       [2., 2.]], dtype=float32)>

In [6]:
nx = 10  # number of samples in the dataset

## dataset for debugging (evenly spaced values centred on zero); kept for reference
#xx = np.arange(nx) - nx/2
#data = [np.array([x], dtype=np.float32) for x in xx]

# dataset chosen so that quantization errors are obvious:
# nx standard-normal draws scaled by 10, each stored as a one-element float32 array.
# The fixed seed makes the samples reproducible.
np.random.seed(17)
data = [np.array([10*x], dtype=np.float32) for x in np.random.randn(nx)]

# display the generated samples
data

[array([2.7626588], dtype=float32),
 array([-18.54628], dtype=float32),
 array([6.2390113], dtype=float32),
 array([11.453113], dtype=float32),
 array([10.371904], dtype=float32),
 array([18.86639], dtype=float32),
 array([-1.1169829], dtype=float32),
 array([-3.6210134], dtype=float32),
 array([1.4867505], dtype=float32),
 array([-4.3778315], dtype=float32)]

In [7]:
list(map(cf, data))

[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([22.085396], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-6378.262], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([243.85515], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1503.3481], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1116.7721], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([6716.3154], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.39360476], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-46.47778], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.2863536], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-82.90293], dtype=float32)>]

# Conversion to TFLite

## Convert concrete function to TFLite

In [8]:
# Trace cf for an input built from tf.ones([1,1]) and hand the resulting
# concrete function to the TFLite converter.
converter = tf.lite.TFLiteConverter.from_concrete_functions([cf.get_concrete_function(tf.ones([1,1]))])

# No quantization options are set here; the converted model is later loaded
# via tf.lite.Interpreter(model_content=model_lite).
model_lite = converter.convert()

## Execution on CPU with random input

In [9]:
interpreter = tf.lite.Interpreter(model_content=model_lite)

test(interpreter)

array([[1.1679695]], dtype=float32)

## Execution on CPU with own input

In [10]:
run(interpreter, data)

array([ 2.2085396e+01, -6.3782622e+03,  2.4385515e+02,  1.5033481e+03,
        1.1167721e+03,  6.7163154e+03, -3.9360476e-01, -4.6477779e+01,
        4.2863536e+00, -8.2902931e+01], dtype=float32)

# Conversion to TFLite-uint8 with float32-IO-layers

## Representative data for quantization

In [11]:
def representative_data_gen():
    """Yield each sample of the global ``data`` as a one-element list holding a float32 tensor."""
    yield from ([tf.cast(sample, tf.float32)] for sample in data)
        
list(representative_data_gen())

[[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.7626588], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-18.54628], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([6.2390113], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([11.453113], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([10.371904], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([18.86639], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-1.1169829], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-3.6210134], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.4867505], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-4.3778315], dtype=float32)>]]

## Convert concrete function to TFLite-uint8 with float32-IO-layers

In [12]:
# Trace cf for an input built from tf.ones([1,1]) and hand the resulting
# concrete function to the TFLite converter.
converter = tf.lite.TFLiteConverter.from_concrete_functions([cf.get_concrete_function(tf.ones([1,1]))])

converter.experimental_new_converter = True
# Enable post-training quantization explicitly. The official integer
# quantization recipe sets `optimizations` together with
# `representative_dataset`; converter releases newer than the one used for the
# recorded outputs may otherwise skip quantization.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Restrict the model to int8 builtin ops (no float fallback inside the graph).
converter.target_spec.supported_ops  = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]

# Keep float32 at the model boundary.
converter.inference_input_type  = tf.float32
converter.inference_output_type = tf.float32

# Calibration samples used to pick the quantization ranges.
converter.representative_dataset = representative_data_gen

model_int_fio_lite = converter.convert()

## Run on CPU

In [13]:
interpreter = tf.lite.Interpreter(model_content=model_int_fio_lite)

run(interpreter, data)

array([    0.      , -6316.2085  ,   256.75644 ,  1489.1874  ,
        1129.7283  ,  6727.0186  ,     0.      ,   -51.351288,
           0.      ,  -102.702576], dtype=float32)

## Write model to file and compile it for TPU

In [14]:
with open('model_int_fio.tflite', 'wb') as f:
    f.write(model_int_fio_lite)

In [15]:
! edgetpu_compiler model_int_fio.tflite

Edge TPU Compiler version 15.0.340273435

Model compiled successfully in 11 ms.

Input model: model_int_fio.tflite
Input size: 1.41KiB
Output model: model_int_fio_edgetpu.tflite
Output size: 24.74KiB
On-chip memory used for caching model parameters: 0.00B
On-chip memory remaining for caching model parameters: 8.09MiB
Off-chip memory used for streaming uncached model parameters: 0.00B
Number of Edge TPU subgraphs: 1
Total number of operations: 5
Operation log: model_int_fio_edgetpu.log

Model successfully compiled but not all operations are supported by the Edge TPU. A percentage of the model will instead run on the CPU, which is slower. If possible, consider updating your model to use only operations supported by the Edge TPU. For details, visit g.co/coral/model-reqs.
Number of operations that will run on Edge TPU: 3
Number of operations that will run on CPU: 2
See the operation log file for individual operation details.


In [16]:
! ls -l model_int_fio*.tflite

-rw-rw-r-- 1 mre mre 25336 Nov 25 11:26 model_int_fio_edgetpu.tflite
-rw-rw-r-- 1 mre mre  1440 Nov 25 11:26 model_int_fio.tflite


## Run on TPU

In [17]:
interpreter = make_interpreter("model_int_fio_edgetpu.tflite")

run(interpreter, data)

array([    0.      , -6316.2085  ,   256.75644 ,  1489.1874  ,
        1129.7283  ,  6727.0186  ,     0.      ,   -51.351288,
           0.      ,  -102.702576], dtype=float32)

# Conversion to pure TFLite-uint8

## Representative data for quantization

In [18]:
def representative_data_gen():
    """Yield each sample of the global ``data`` as a one-element list holding a float32 tensor.

    NOTE(review): this repeats, unchanged, the definition given earlier in the
    notebook for the float32-IO conversion; presumably it is duplicated so the
    pure-uint8 section can be read and run on its own.
    """
    for x in data:
        yield [tf.cast(x, tf.float32)]
        
list(representative_data_gen())

[[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.7626588], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-18.54628], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([6.2390113], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([11.453113], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([10.371904], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([18.86639], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-1.1169829], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-3.6210134], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.4867505], dtype=float32)>],
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-4.3778315], dtype=float32)>]]

## Convert concrete function to pure TFLite-uint8

In [19]:
# Trace cf for an input built from tf.ones([1,1]) and hand the resulting
# concrete function to the TFLite converter.
converter = tf.lite.TFLiteConverter.from_concrete_functions([cf.get_concrete_function(tf.ones([1,1]))])

converter.experimental_new_converter = True
# Enable post-training quantization explicitly. The official integer
# quantization recipe sets `optimizations` together with
# `representative_dataset`; converter releases newer than the one used for the
# recorded outputs may reject uint8 input/output types when no optimization
# is enabled.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Restrict the model to int8 builtin ops (no float fallback inside the graph).
converter.target_spec.supported_ops  = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]

# Pure integer model: uint8 at the model boundary as well.
converter.inference_input_type  = tf.uint8
converter.inference_output_type = tf.uint8

# Calibration samples used to pick the quantization ranges.
converter.representative_dataset = representative_data_gen

model_int_lite = converter.convert()

## Run pure uint8-model on CPU with uint8-IO

In [20]:
interpreter = tf.lite.Interpreter(model_content=model_int_lite)

x8 = toUint8(interpreter, data)
x8

[array([145], dtype=uint8),
 array([0], dtype=uint8),
 array([169], dtype=uint8),
 array([204], dtype=uint8),
 array([197], dtype=uint8),
 array([255], dtype=uint8),
 array([118], dtype=uint8),
 array([101], dtype=uint8),
 array([136], dtype=uint8),
 array([96], dtype=uint8)]

In [21]:
y8 = run(interpreter, x8)
y8

array([124,   1, 129, 153, 146, 255, 124, 123, 124, 122], dtype=uint8)

## Run pure uint8-model on CPU with hand-made float-IO

In [22]:
runUint8(interpreter, data)

array([    0.      , -6316.2085  ,   256.75644 ,  1489.1874  ,
        1129.7283  ,  6727.0186  ,     0.      ,   -51.351288,
           0.      ,  -102.702576], dtype=float32)

## Write pure uint8-model to file and compile it for TPU

In [23]:
with open('model_int.tflite', 'wb') as f:
    f.write(model_int_lite)

In [24]:
! edgetpu_compiler model_int.tflite

Edge TPU Compiler version 15.0.340273435

Model compiled successfully in 12 ms.

Input model: model_int.tflite
Input size: 1.47KiB
Output model: model_int_edgetpu.tflite
Output size: 24.49KiB
On-chip memory used for caching model parameters: 0.00B
On-chip memory remaining for caching model parameters: 8.09MiB
Off-chip memory used for streaming uncached model parameters: 0.00B
Number of Edge TPU subgraphs: 1
Total number of operations: 5
Operation log: model_int_edgetpu.log
See the operation log file for individual operation details.


In [25]:
! ls -l model_int*.tflite

-rw-rw-r-- 1 mre mre 25080 Nov 25 11:26 model_int_edgetpu.tflite
-rw-rw-r-- 1 mre mre 25336 Nov 25 11:26 model_int_fio_edgetpu.tflite
-rw-rw-r-- 1 mre mre  1440 Nov 25 11:26 model_int_fio.tflite
-rw-rw-r-- 1 mre mre  1504 Nov 25 11:26 model_int.tflite


## Run pure uint8-model on TPU with uint8-IO

In [26]:
interpreter = make_interpreter("model_int_edgetpu.tflite")

run(interpreter, toUint8(interpreter, data))

array([124,   1, 129, 153, 146, 255, 124, 123, 124, 122], dtype=uint8)

## Run pure uint8-model on TPU with hand-made float-IO

In [27]:
runUint8(interpreter, data)

array([    0.      , -6316.2085  ,   256.75644 ,  1489.1874  ,
        1129.7283  ,  6727.0186  ,     0.      ,   -51.351288,
           0.      ,  -102.702576], dtype=float32)