# How to create a model for the Coral Edge TPU

Link to notebook on Google Colab:
https://colab.research.google.com/drive/1K8cm79ztfgNyt_ZnF22W83759J2lHBHh

In [1]:
# Register the Coral Edge TPU apt repository and import Google's package signing key.
!echo "deb https://packages.cloud.google.com/apt coral-edgetpu-stable main" | sudo tee /etc/apt/sources.list.d/coral-edgetpu.list
!sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 6A030B21BA07F4FB

# Use apt-get (the stable scripting interface) with -y: notebook cells have no
# interactive terminal, so a confirmation prompt could never be answered.
!sudo apt-get update > /dev/null
!sudo apt-get install -y edgetpu > /dev/null

deb https://packages.cloud.google.com/apt coral-edgetpu-stable main
Executing: /tmp/apt-key-gpghome.PlNFIbP3Ab/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 6A030B21BA07F4FB
gpg: key 6A030B21BA07F4FB: public key "Google Cloud Packages Automatic Signing Key <gc-team@google.com>" imported
gpg: Total number processed: 1
gpg:               imported: 1




debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 5.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 


In [2]:
import tensorflow as tf
import keras
import numpy as np
import cv2

from keras.datasets import mnist
from keras.utils import to_categorical

Using TensorFlow backend.


In [0]:
# Load the MNIST train/test splits (images plus integer class labels).
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Add a trailing channel axis and scale pixel values by 1/255.
# -1 lets NumPy infer the sample count instead of hard-coding 60000 / 10000.
train_images = train_images.reshape((-1, 28, 28, 1)).astype('float32') / 255.0
# One-hot encode the integer class labels.
train_labels = to_categorical(train_labels)

test_images = test_images.reshape((-1, 28, 28, 1)).astype('float32') / 255.0
test_labels = to_categorical(test_labels)

# Side length of one image (28); train_images has shape (num_samples, 28, 28, 1).
# Note: this is a plain int, not a tuple.
input_shape = train_images.shape[1]

In [0]:
#build the model

def build_keras_model():
    """Build the (uncompiled) convolutional classifier used in this notebook.

    The stack is three 3x3 ReLU convolutions (32, 64 and 64 filters) with 2x2
    max pooling after the first two, followed by Flatten, a 64-unit ReLU Dense
    layer and a 10-unit softmax Dense layer. The first layer declares an input
    shape of (28, 28, 1).

    Returns:
        A new keras.Sequential model; compiling it is left to the caller.
    """
    layers = keras.layers

    # Layers are instantiated in the same order they are stacked.
    feature_extractor = [
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
    ]
    classifier_head = [
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10, activation='softmax'),
    ]
    return keras.Sequential(feature_extractor + classifier_head)

## Graphs and Sessions

TensorFlow uses a **dataflow graph** to represent your computation in terms of the dependencies between individual operations. This leads to a low-level programming model in which you first define the dataflow graph, then create a TensorFlow **session** to run parts of the graph across a set of local and remote devices.

## Quantization in Deep Learning

* Enables efficient, high-performance deep learning computation on small devices.
* Can be defined as a method for reducing a neural network to a reasonable size while still achieving high accuracy.
* Therefore, it is the process of approximating a neural network that uses floating-point numbers by a neural network of low bit width numbers, reducing dramatically both the memory requirement and computational cost of using neural networks.
* The neural network **can** be quantized after training is finished. However, the most effective method for retaining high accuracy is to quantize during training.

More information can be found at: https://medium.com/@joel_34050/quantization-in-deep-learning-478417eab72b

In [17]:
# Quantization-aware training: build the model in its own graph/session, rewrite
# that graph for simulated quantization, train for one epoch and save a checkpoint.
train_graph = tf.Graph()
train_sess = tf.Session(graph=train_graph)

# Make Keras use train_sess (bound to train_graph) instead of its default session.
keras.backend.set_session(train_sess)
with train_graph.as_default():
    train_model = build_keras_model()
    
    '''quant_delay: Number of steps after which weights and activations are
    quantized during training.
    
    The default value of quant_delay is suitable for finetuning an already trained
    floating point model (recommended).
    If one wants to train a quantized model from scratch, quant_delay should be
    set to the number of steps it take the floating point model to converge.'''
    # Rewrites train_graph in place for simulated quantization (see the
    # tf.contrib.quantize documentation); quantization starts after 100 steps.
    tf.contrib.quantize.create_training_graph(input_graph=train_graph, quant_delay=100)
    # Initialize every variable that exists in the graph at this point.
    train_sess.run(tf.global_variables_initializer())
    
    # Compile the model only after the graph has been rewritten for quantization.
    train_model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    # Train for a single epoch on the preprocessed training set.
    train_model.fit(train_images, train_labels, epochs=1)
    
    # Save the session's variables under the path prefix 'checkpoints';
    # the evaluation cell restores from this same prefix.
    saver = tf.train.Saver()
    saver.save(train_sess, 'checkpoints')

Epoch 1/1


In [18]:
# Run a single test image through the trained model and show its class scores.
with train_graph.as_default():
    sample_batch = test_images[:1]  # slice (not index) to keep the batch axis
    print('Sample result of original model:')
    print(train_model.predict(sample_batch))

Sample result of original model:
[[6.7921526e-08 1.0675574e-07 4.4519779e-06 9.5344333e-08 1.6287174e-09
  3.8594742e-08 3.6363954e-12 9.9999487e-01 7.9284330e-09 3.7020737e-07]]


In [0]:
# Build an inference-only copy of the model in a fresh graph, restore the trained
# weights from the checkpoint and export a frozen GraphDef to 'frozen_model.pb'.
eval_graph = tf.Graph()
eval_sess = tf.Session(graph=eval_graph)

# Make Keras use eval_sess (bound to eval_graph) from here on.
keras.backend.set_session(eval_sess)

with eval_graph.as_default():
    # Learning phase 0 = inference mode; set before the model is built.
    keras.backend.set_learning_phase(0)
    eval_model = build_keras_model()
    # Rewrites eval_graph in place for simulated quantization at inference time
    # (see the tf.contrib.quantize documentation).
    tf.contrib.quantize.create_eval_graph(input_graph=eval_graph)
    # Snapshot the GraphDef before the Saver below adds its own ops to the graph.
    eval_graph_def = eval_graph.as_graph_def()
    saver = tf.train.Saver()
    # Load the variables written by the training cell (same 'checkpoints' prefix).
    saver.restore(eval_sess, 'checkpoints')
    
    # Freeze the graph: variables are converted to constants, keeping the
    # model's output op as the only requested output node.
    frozen_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
        eval_sess,
        eval_graph_def,
        [eval_model.output.op.name]
    )
    
    # 'wb' opens the file for writing in binary mode, truncating any existing content.
    with open('frozen_model.pb', 'wb') as f:
        f.write(frozen_graph_def.SerializeToString())

In [20]:
# Print a layer-by-layer summary (output shapes and parameter counts) of the training model.
train_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 3, 3, 64)          36928     
_________________________________________________________________
flatten_1 (Flatten)          (None, 576)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                36928     
__________

In [28]:
%%bash
# Convert the frozen graph to a fully quantized (uint8) TFLite model.
# The %%bash cell magic has to be the first line of the cell, so this
# description is a shell comment placed after it.
# mean_values=0 / std_dev_values=255 give real_value = (quantized_value - 0) / 255,
# which matches the 1/255 input scaling used when the model was trained.

tflite_convert \
    --output_file=test_model.tflite \
    --graph_def_file=frozen_model.pb \
    --inference_type=QUANTIZED_UINT8 \
    --input_arrays=conv2d_1_input \
    --output_arrays=dense_2/Softmax \
    --mean_values=0 \
    --std_dev_values=255

2019-07-11 14:40:30.299445: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcuda.so.1
2019-07-11 14:40:30.302261: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-07-11 14:40:30.302822: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 0000:00:04.0
2019-07-11 14:40:30.303124: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2019-07-11 14:40:30.304368: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-07-11 14:40:30.305756: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcufft.so.10

In [29]:
# Open the converted model with the TFLite interpreter and reserve its tensors.
interpreter = tf.lite.Interpreter(model_path='test_model.tflite')
interpreter.allocate_tensors()

# Fetch the metadata describing the model's input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Show both descriptions, one per line.
print(input_details, output_details, sep='\n')

[{'name': 'conv2d_1_input', 'index': 3, 'shape': array([ 1, 28, 28,  1], dtype=int32), 'dtype': <class 'numpy.uint8'>, 'quantization': (0.003921568859368563, 0)}]
[{'name': 'dense_2/Softmax', 'index': 15, 'shape': array([ 1, 10], dtype=int32), 'dtype': <class 'numpy.uint8'>, 'quantization': (0.00390625, 0)}]


In [30]:
# Compile the quantized TFLite model for the Edge TPU; per the compiler log
# the result is written to test_model_edgetpu.tflite.
!edgetpu_compiler 'test_model.tflite'

Edge TPU Compiler version 1.0.249710469
INFO: Initialized TensorFlow Lite runtime.

Model compiled successfully in 59 ms.

Input model: test_model.tflite
Input size: 95.28KiB
Output model: test_model_edgetpu.tflite
Output size: 172.56KiB
On-chip memory available for caching model parameters: 7.95MiB
On-chip memory used for caching model parameters: 97.50KiB
Off-chip memory used for streaming uncached model parameters: 0.00B
Number of Edge TPU subgraphs: 1
Total number of operations: 8
Operation log: test_model_edgetpu.log
See the operation log file for individual operation details.


In [0]:
# Download the compiled Edge TPU model through the Colab file helper.
# NOTE: the google.colab package is presumably only importable when running on Colab.
from google.colab import files

files.download('test_model_edgetpu.tflite')