Post-training quantization (float16 and full-integer) of a small MNIST CNN with TensorFlow Lite, based on the examples from:
https://www.tensorflow.org/lite/performance/post_training_integer_quant

In [1]:
import numpy as np
import tensorflow as tf

import keras
from keras.datasets import mnist
from keras.utils import np_utils

In [2]:
# Load the datasets for training and test.
# mnist.load_data() returns two (images, labels) pairs: the first pair is
# unpacked as the training split, the second as the test split.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()


In [3]:
# Prepare the input data: cast both image arrays to float32 and divide by 255.
# NOTE: this rebinds train_images/test_images from their own previous values,
# so re-running this cell without re-running the load cell divides the data
# by 255 a second time.
train_images, test_images = (
    images.astype(np.float32) / 255.0
    for images in (train_images, test_images)
)

In [4]:
# Create the model
from keras.models import Sequential
from keras import models, layers
from keras import regularizers

# Define the model architecture, one layer at a time.
model = tf.keras.Sequential()

# Inputs are 28x28 images; Reshape appends the single channel axis Conv2D needs.
model.add(tf.keras.layers.InputLayer(input_shape=(28, 28)))
model.add(tf.keras.layers.Reshape(target_shape=(28, 28, 1)))

# Feature extractor: twelve 3x3 ReLU filters followed by 2x2 max pooling.
model.add(tf.keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))

# Classifier head: flatten, then a 10-unit Dense layer with no activation,
# so the network outputs raw scores rather than probabilities.
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(10))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 28, 28, 1)         0         
                                                                 
 conv2d (Conv2D)             (None, 26, 26, 12)        120       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 12)       0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 2028)              0         
                                                                 
 dense (Dense)               (None, 10)                20290     
                                                                 
Total params: 20,410
Trainable params: 20,410
Non-trainable params: 0
____________________________________________________

In [5]:
# The final Dense(10) layer has no activation, so the model emits raw logits;
# the loss is therefore configured with from_logits=True.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
    optimizer='adam',
)

In [6]:
# Train for five epochs. The test split is passed as validation data, i.e.
# the same samples that are later used to score the TFLite models.
hist = model.fit(
    x=train_images,
    y=train_labels,
    epochs=5,
    verbose=1,
    validation_data=(test_images, test_labels),
)

Epoch 1/5
   1/1875 [..............................] - ETA: 4:24 - loss: 2.3360 - accuracy: 0.0312

2022-09-25 13:46:25.965924: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
def generate_tflite_models(num_quantize_images, representative_data_generator):
    """Convert the notebook's Keras ``model`` into three TFLite variants.

    The Keras model is read from the module-level name ``model``; it is not
    passed in as an argument.

    Args:
        num_quantize_images: Accepted but never read by this function; the
            number of calibration samples is decided entirely by
            ``representative_data_generator``.
        representative_data_generator: Callable assigned to the converter's
            ``representative_dataset`` for the int8 conversion.

    Returns:
        Tuple ``(fp32_model, fp16_model, int8_model)`` holding the result of
        ``converter.convert()`` for each configuration.
    """
    def _fresh_converter(optimize):
        # Each variant gets its own converter so settings never carry over.
        conv = tf.lite.TFLiteConverter.from_keras_model(model)
        if optimize:
            conv.optimizations = [tf.lite.Optimize.DEFAULT]
        return conv

    # Non-quantized model: plain conversion with no optimizations set.
    fp32_model = _fresh_converter(optimize=False).convert()

    # float16 quantized model.
    fp16_converter = _fresh_converter(optimize=True)
    fp16_converter.target_spec.supported_types = [tf.float16]
    fp16_model = fp16_converter.convert()

    # int8 quantized model: restricted to the int8 builtin op set, calibrated
    # with the representative dataset, and given uint8 input/output tensors.
    int8_converter = _fresh_converter(optimize=True)
    int8_converter.representative_dataset = representative_data_generator
    int8_converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    int8_converter.inference_input_type = tf.uint8
    int8_converter.inference_output_type = tf.uint8
    int8_model = int8_converter.convert()

    return fp32_model, fp16_model, int8_model

In [8]:
# Helper function to run inference on a TFLite model
def run_tflite_model(model, test_image_indices):
  """Run a TFLite model on selected test images and return predicted classes.

  Images are read from the module-level ``test_images`` array.

  Args:
    model: Serialized TFLite model, passed to ``tf.lite.Interpreter`` as
      ``model_content``.
    test_image_indices: Sized iterable of indices into ``test_images``.

  Returns:
    1-D integer NumPy array with one entry per index: the argmax of the
    model's first output tensor for that image.
  """
  # Initialize the interpreter
  interpreter = tf.lite.Interpreter(model_content=model)
  interpreter.allocate_tensors()

  input_details = interpreter.get_input_details()[0]
  output_details = interpreter.get_output_details()[0]
  input_dtype = input_details["dtype"]

  # The quantization parameters are fixed per model, so read them once
  # instead of on every loop iteration. Both 8-bit integer input types use
  # the same affine mapping: q = round(real / scale) + zero_point.
  is_quantized_input = input_dtype in (np.uint8, np.int8)
  if is_quantized_input:
    input_scale, input_zero_point = input_details["quantization"]
    dtype_range = np.iinfo(input_dtype)

  predictions = np.zeros((len(test_image_indices),), dtype=int)
  for i, test_image_index in enumerate(test_image_indices):
    test_image = test_images[test_image_index]

    if is_quantized_input:
      # Round to nearest before the integer cast: astype() truncates, which
      # would turn e.g. 254.99998 into 254. Clip so out-of-range values
      # saturate instead of wrapping around in the cast.
      test_image = np.round(test_image / input_scale) + input_zero_point
      test_image = np.clip(test_image, dtype_range.min, dtype_range.max)

    # Add the batch axis and cast to the dtype the input tensor expects.
    test_image = np.expand_dims(test_image, axis=0).astype(input_dtype)
    interpreter.set_tensor(input_details["index"], test_image)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details["index"])[0]

    # argmax does not depend on the output's scale/zero point, so the output
    # is used as-is without dequantizing.
    predictions[i] = output.argmax()

  return predictions

In [9]:
def evaluate_model(tflite_model, model_type):
  """Print the accuracy of a TFLite model over the whole test set.

  Runs ``run_tflite_model`` on every index of the module-level
  ``test_images``, compares the predictions with ``test_labels`` and prints
  the percentage of matches. Nothing is returned.

  Args:
    tflite_model: Model content forwarded unchanged to ``run_tflite_model``.
    model_type: Label placed at the start of the printed line.
  """
  every_index = range(test_images.shape[0])
  predicted = run_tflite_model(tflite_model, every_index)

  sample_count = len(test_images)
  # Percentage of predictions that equal the reference label.
  hits = np.sum(test_labels == predicted)
  accuracy = (hits * 100) / sample_count

  print('%s model accuracy is %.4f%% (Number of test samples=%d)' % (
      model_type, accuracy, sample_count))

In [10]:
# Calibration data 
def representative_data_gen_100():
  """Yield calibration samples taken from the start of ``train_images``.

  Slices the module-level ``train_images`` into batches of one and yields
  each of the first 100 batches wrapped in a single-element list.
  """
  calibration_batches = (
      tf.data.Dataset.from_tensor_slices(train_images).batch(1).take(100)
  )
  for single_image_batch in calibration_batches:
    yield [single_image_batch]

# Build the three TFLite variants. generate_tflite_models never reads
# num_quantize_images; the generator alone fixes the calibration sample count.
fp32_model, fp16_model, int8_model = generate_tflite_models(
    num_quantize_images=100,
    representative_data_generator=representative_data_gen_100,
)

INFO:tensorflow:Assets written to: /var/folders/y7/lgb6jsgs7y5_xp5pqnhc09k80000gn/T/tmpng5si6ur/assets


2022-09-25 13:46:51.348320: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-09-25 13:46:51.683100: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:357] Ignored output_format.
2022-09-25 13:46:51.683114: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:360] Ignored drop_control_dependency.


INFO:tensorflow:Assets written to: /var/folders/y7/lgb6jsgs7y5_xp5pqnhc09k80000gn/T/tmpia3ykhbq/assets


2022-09-25 13:46:51.683237: I tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /var/folders/y7/lgb6jsgs7y5_xp5pqnhc09k80000gn/T/tmpng5si6ur
2022-09-25 13:46:51.683882: I tensorflow/cc/saved_model/reader.cc:78] Reading meta graph with tags { serve }
2022-09-25 13:46:51.683887: I tensorflow/cc/saved_model/reader.cc:119] Reading SavedModel debug info (if present) from: /var/folders/y7/lgb6jsgs7y5_xp5pqnhc09k80000gn/T/tmpng5si6ur
2022-09-25 13:46:51.686369: I tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-09-25 13:46:51.712409: I tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /var/folders/y7/lgb6jsgs7y5_xp5pqnhc09k80000gn/T/tmpng5si6ur
2022-09-25 13:46:51.721221: I tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 37986 microseconds.
2022-09-25 13:46:51.731794: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:237] disabling MLIR cra

INFO:tensorflow:Assets written to: /var/folders/y7/lgb6jsgs7y5_xp5pqnhc09k80000gn/T/tmpkowul14q/assets


INFO:tensorflow:Assets written to: /var/folders/y7/lgb6jsgs7y5_xp5pqnhc09k80000gn/T/tmpkowul14q/assets
2022-09-25 13:46:52.481172: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:357] Ignored output_format.
2022-09-25 13:46:52.481191: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:360] Ignored drop_control_dependency.
2022-09-25 13:46:52.481280: I tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /var/folders/y7/lgb6jsgs7y5_xp5pqnhc09k80000gn/T/tmpkowul14q
2022-09-25 13:46:52.482001: I tensorflow/cc/saved_model/reader.cc:78] Reading meta graph with tags { serve }
2022-09-25 13:46:52.482010: I tensorflow/cc/saved_model/reader.cc:119] Reading SavedModel debug info (if present) from: /var/folders/y7/lgb6jsgs7y5_xp5pqnhc09k80000gn/T/tmpkowul14q
2022-09-25 13:46:52.484625: I tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-09-25 13:46:52.511390: I tensorflow/cc/saved_model/loader.cc:212] Running initialization

In [11]:
# Score each converted model on the full test set, in fp32 -> fp16 -> int8 order.
for _label, _tflite_model in (
    ("fp32", fp32_model),
    ("fp16", fp16_model),
    ("int8", int8_model),
):
  evaluate_model(_tflite_model, model_type=_label)

INFO: Initialized TensorFlow Lite runtime.
INFO: Applying 1 TensorFlow Lite delegate(s) lazily.


fp32 model accuracy is 97.7900% (Number of test samples=10000)


INFO: Applying 1 TensorFlow Lite delegate(s) lazily.


fp16 model accuracy is 97.7900% (Number of test samples=10000)
int8 model accuracy is 97.8300% (Number of test samples=10000)
