# Demonstrate construction and quantization of a model for execution on the EdgeTPU

This MNIST example should be helpful:  
https://www.tensorflow.org/lite/performance/post_training_integer_quant  

This webpage, and links at the bottom of the page, may also be helpful:  
https://www.tensorflow.org/lite/performance/post_training_quantization

## See if some of these comments help too:  
https://github.com/google-coral/edgetpu/issues/13

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, SeparableConv2D, ReLU
import time

# %matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
## Print out some system information
# Every section prints a heading followed by one version string (or the raw
# output of a command) for a component of the software stack used below.
import subprocess


def _command_output(command):
    """Return the decoded stdout of `command`, or a note if it is missing.

    Args:
        command: Argument list handed to `subprocess.run` (no shell involved).

    Returns:
        The standard output of the command decoded as UTF-8 (undecodable
        bytes are replaced), or a short "command not found" note when the
        executable does not exist on this machine.
    """
    try:
        completed = subprocess.run(command, stdout=subprocess.PIPE)
    except FileNotFoundError:
        # Keep reporting the remaining components instead of aborting the cell.
        return '(command not found: %s)\n' % command[0]
    return completed.stdout.decode('utf-8', errors='replace')


print('Linux kernel version:')
print('$ uname -r')
print(_command_output(['uname', '-r']))

print('Linux release:')
print('$ lsb_release -a')
print(_command_output(['lsb_release', '-a']))

print('Tensorflow python module version')
print(tf.__version__)
print('')

print('Edge TPU python module version:')
import edgetpu
print(edgetpu.__version__)
print('')

print('Edge TPU compiler version:')
print(_command_output(['edgetpu_compiler', '--version']))
print('')

print('Edge TPU runtime file:')
# Filter the package list in Python rather than piping through `grep` in a
# shell: `grep` exits with status 1 when nothing matches, which made
# `check_output` raise CalledProcessError and abort the whole cell.
runtime_packages = [line for line in _command_output(['dpkg', '-l']).splitlines()
                    if 'libedgetpu' in line]
if runtime_packages:
    print('\n'.join(runtime_packages) + '\n')
else:
    print('(no libedgetpu package listed by dpkg)\n')

print('Edge TPU runtime version:')
import edgetpu.basic.edgetpu_utils
print(edgetpu.basic.edgetpu_utils.GetRuntimeVersion())

# import tflite_runtime.interpreter as tflite
import tflite_runtime
print('tflite_runtime version:')
print(tflite_runtime.__version__)
print('')

print('Paths of available Edge TPU devices, if any:')
devices = edgetpu.basic.edgetpu_utils.ListEdgeTpuPaths(edgetpu.basic.edgetpu_utils.EDGE_TPU_STATE_NONE)
print(devices)
# Set identity of edge device to use, if any.
# `target_device` is always bound, so that "no device present" (None) is
# reported here instead of surfacing later as a NameError when the delegate
# is loaded.
target_device = None
if len(devices) > 0:
    # Use the first device in the list.  A path starting with /dev/apex is
    # treated as a PCI device; anything else is assumed to be on the USB bus.
    target_device = 'pci' if devices[0].startswith('/dev/apex') else 'usb'
    print('Using delegate device: "%s"' % (target_device))
else:
    print('No Edge TPU device found; set use_edgetpu = False to run on the CPU.')

In [None]:
# Eager execution is switched on for TF 1.15 only: without it, converting the
# model with that version runs into errors.
running_tf_1_15 = tf.__version__.startswith('1.15')
if running_tf_1_15:
    tf.enable_eager_execution()

In [None]:
## Build a PixelLink model

import model_architectures as model_archs

# Resolution of the ground truth bitmaps: '2s' or '4s'.
gt_res = '2s'

# Shape of the model input.  Results noted for the settings tried so far:
#   (416, 416, 3): works for          n_channels = [32, 64, 128, 256, 256, 512, 512]
#   (448, 448, 3): output has artifact for n_channels = [32, 64, 128, 256, 256, 512, 512]
#   (512, 512, 3): freezes for        n_channels = [32, 64, 128, 256, 256, 512, 512]
#   (512, 512, 3): works for          n_channels = [16, 32, 64, 128, 128, 256, 256]
image_shape = (512, 512, 3)

# Handed to the model builder as its `n_gt_chans` argument.
n_bitmaps = 5

def representative_dataset_gen(num_calibration_images=100):
    """Yield random calibration inputs for post-training quantization.

    Every yielded item is a one-element list holding a single tensor drawn
    with `tf.random.uniform`, shaped as a batch of one image of the
    notebook-level `image_shape`.

    Args:
        num_calibration_images: Number of samples to yield.  Defaults to 100,
            the value that used to be hard-coded, so the generator can still
            be used as a zero-argument callable.

    Yields:
        A list `[image]` with `image` of shape `[1] + list(image_shape)`.
    """
    sample_shape = [1] + list(image_shape)  # loop-invariant, build it once
    for _ in range(num_calibration_images):
        yield [tf.random.uniform(sample_shape)]

# Per-stage layer configuration handed to the model builder.  Other
# n_channels settings that were noted:
#   [64, 128, 256, 512, 512, 1024, 1024]  PixelLink
#   [32, 64, 128, 256, 256, 512, 512]     Primary model, works for up to 416x416 input with sep convs
#   [8, 16, 32, 64, 64, 128, 128]
n_channels = [16, 32, 64, 128, 128, 256, 256]
kernel_sizes = [3, 3, 3, 3, 3, 1, 1]
n_convs = [2, 2, 3, 3, 3, 1, 1]
pool = [True, True, True, True, False, False, False]


def build_model():
    """Build the model from the notebook-level configuration.

    The configuration variables (`image_shape`, `n_bitmaps`, `gt_res`,
    `n_channels`, `kernel_sizes`, `n_convs`, `pool`) are read when the
    function is called, so editing them and calling again gives a model
    with the new settings.

    Returns:
        The object returned by `model_archs.build_modular`.
    """
    return model_archs.build_modular(image_shape, n_gt_chans=n_bitmaps,
                                     resolution=gt_res, separable_conv=True,
                                     batchnorm='batchnorm', n_channels=n_channels,
                                     kernel_sizes=kernel_sizes, n_convs=n_convs, pool=pool)


model = build_model()

# model.summary()

In [None]:
## Convert to tensorflow lite model and save...
# The converter is created differently depending on the TensorFlow version.
if tf.__version__.startswith('2.'):
    converter = tf.lite.TFLiteConverter.from_keras_model(model) # TF2.0
elif tf.__version__.startswith('1.15'):
    # TF 1.15: save the Keras model to disk and convert from that file.
    model.save('model_keras', include_optimizer=False) # TF1.15
    converter = tf.lite.TFLiteConverter.from_keras_model_file('model_keras') # TF1.15
else:
    raise ValueError('Unhandled TensorFlow version.')

# Quantize, calibrating with the samples from representative_dataset_gen.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = tf.lite.RepresentativeDataset(representative_dataset_gen)
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] # For EdgeTPU, no float ops allowed

tflite_model = converter.convert()
# The context manager closes (and therefore flushes) the file before any
# later cell reads 'model.tflite'; the bare open().write() left the handle
# to be closed whenever the file object happened to be collected.
with open('model.tflite', 'wb') as model_file:
    model_file.write(tflite_model)

In [None]:
##================================================================================================
## Choose where inference runs.  When the Edge TPU model and hardware are not
## used, set the variable below to False and comment out lines in the cell below.
##================================================================================================
# use_edgetpu = True:  use the EdgeTPU model and process on the Edge TPU
#                      (assumes one is available)
# use_edgetpu = False: use the TFLite model and process on the CPU
use_edgetpu = True

In [None]:
%%bash
## Compile the TFLite model 'model.tflite' for the Edge TPU.
# Note that the output file name has '_edgetpu' appended to the root filename
# of the input TFLite model, i.e. 'model_edgetpu.tflite', which is the file
# loaded later in this notebook when use_edgetpu is True.
edgetpu_compiler --min_runtime_version 13 --show_operations 'model.tflite'

In [None]:
## Load TFLite model and allocate tensors.
if not use_edgetpu:
    # CPU path: the plain TFLite model with the interpreter from TensorFlow.
    interpreter = tf.lite.Interpreter(model_path='model.tflite')
else:
    # Edge TPU path: the compiled model with the interpreter from the
    # tflite_runtime package, delegating to the device selected earlier.
    from tflite_runtime.interpreter import Interpreter
    from tflite_runtime.interpreter import load_delegate
    interpreter = Interpreter(
        model_path='model_edgetpu.tflite',
        model_content=None,
        experimental_delegates=[
            load_delegate('libedgetpu.so.1.0', {'device': target_device}),
        ],
    )

    # Alternative, if using the interpreter from the full TensorFlow package:
    #     from tensorflow.lite.python.interpreter import load_delegate
    #     interpreter = tf.lite.Interpreter(model_path='model_edgetpu.tflite',
    #                                       experimental_delegates=[load_delegate('libedgetpu.so.1.0',
    #                                                                             {'device': target_device})])

interpreter.allocate_tensors()

# Fetch the descriptions of the input and output tensors; later cells use
# them to look up tensor indices.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [None]:
## Put some random data through the model and show results
# Create a batch of images
batch_size = 10
# image = tf.random.normal([batch_size] + list(image_shape))
image = tf.random.uniform([batch_size] + list(image_shape))
# image = tf.random.uniform([batch_size] + list(image_shape))*256
# image = tf.cast(image, tf.dtypes.uint8)

# Process the images with the network model, one image per invocation.
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted while this runs.
input_index = input_details[0]['index']
output_index = output_details[0]['index']
model_outputs = []  # one entry per image, so each output stays matched to its input
t_all = time.perf_counter()
t_individual = np.zeros(batch_size)
for i_im in range(batch_size):
    t_one = time.perf_counter()
    # Set input tensor and invoke model
    interpreter.set_tensor(input_index, image[i_im:i_im+1])
    interpreter.invoke()   # Can be slow if running on CPU

    # The function `get_tensor()` returns a copy of the tensor data.
    # Use `tensor()` in order to get a pointer to the tensor.
    model_output = interpreter.get_tensor(output_index)
    t_individual[i_im] = time.perf_counter() - t_one
    model_outputs.append(model_output)
print('Model processing took %f seconds.' % (time.perf_counter() - t_all))
print('Individual image processing times:')
print(t_individual)

# Plot results for first channel of input and output, of the first
# image in the batch.  The output is taken from model_outputs[0] rather than
# from model_output, because after the loop model_output holds the result
# for the last image, not the first.
in_chan0 = image[0, :, :, 0]
out_chan0 = model_outputs[0][0, :, :, 0]
plt.figure(figsize=(16, 8))
# clim = [-4, 4]

plt.subplot(1,2,1)
plt.imshow(in_chan0, aspect='equal')
plt.title('Input min chan0 value: %f' % (tf.reduce_min(in_chan0)))
# plt.clim(clim)
plt.colorbar()

plt.subplot(1,2,2)
plt.imshow(out_chan0, aspect='equal')
plt.title('Output min chan0 value: %f' % (tf.reduce_min(out_chan0)))
# plt.clim(clim)
plt.colorbar()