# Quantization: How to Maintain Model Accuracy While Reducing the Size of the Model

Below you will find code related to the paper.

In [1]:
import os
# NOTE(review): assigned before `import tensorflow` below -- presumably so the
# log-level setting is already in place when TensorFlow is loaded; confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
# NOTE(review): `tfmot` (and `plt` below) are not referenced in the cells shown here.
import tensorflow_model_optimization as tfmot
from tensorflow.keras.applications import mobilenet_v2, resnet_v2
from tensorflow.data import Dataset
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.keras as k
# NOTE(review): this comes from the standalone `keras` package while every other
# Keras name in this notebook comes from `tensorflow.keras`; confirm both
# resolve to the same Keras installation.
from keras.applications import imagenet_utils
import glob
import re
import logging
# Restrict the "tensorflow" Python logger to ERROR records and set AutoGraph
# verbosity to 0 to keep the cell outputs short.
logging.getLogger("tensorflow").setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)

The functions below load the images and labels into memory, build the network architectures, and measure the accuracy of the non-quantized models.

In [2]:
# Spatial size (two dimensions) every image is resized to when it is loaded.
IMG_SIZE = (224, 224)
# Model input shape: the spatial size followed by a trailing axis of size 3.
IMG_SHAPE = (*IMG_SIZE, 3)
# Number of samples per batch in the evaluation dataset.
BATCH_SIZE = 32

#load images and labels from directory
def load_imgs(path):
    """Load every ``*.JPEG`` file matching ``path`` together with its label.

    The label is parsed from the file's base name: it is the text between the
    first underscore and the last dot (``0001_sea_snake.JPEG`` -> ``sea_snake``).
    Only the base name is inspected, so underscores or dots in directory names
    cannot leak into the label.

    Args:
        path: Prefix that ``'*.JPEG'`` is appended to, e.g. ``'images/'``
            (include the trailing separator when passing a directory).

    Returns:
        ``(imgs, labels)``: two parallel lists holding, for each matched file
        in sorted path order, the array produced by ``img_to_array`` (image
        resized to ``IMG_SIZE``) and the label string. Both lists are empty
        when nothing matches.

    Raises:
        ValueError: If a matched file name does not contain ``_<label>.``.
    """
    label_pattern = re.compile(r'_(.+)\.')
    imgs = []
    labels = []
    # glob returns matches in arbitrary order; sorting keeps runs reproducible.
    for f in sorted(glob.glob(path + '*.JPEG')):
        match = label_pattern.search(os.path.basename(f))
        if match is None:
            raise ValueError(
                "cannot derive a label from %r: expected '<prefix>_<label>.JPEG'" % f
            )
        # Load as a PIL image resized to the network input size ...
        original = k.preprocessing.image.load_img(f, target_size=IMG_SIZE)
        # ... and convert the PIL image to a numpy array.
        imgs.append(k.preprocessing.image.img_to_array(original))
        labels.append(match.group(1))
    return imgs, labels

#generate architecture for pretrained model initialized with imagenet weights
def generate_model(modeltype):
    """Build an ImageNet-pretrained classifier and save it under ``models/``.

    The returned model applies ``preprocess_input(..., mode='tf')`` to its
    input before the backbone, so callers pass the arrays produced by
    ``load_imgs`` without preprocessing them first.

    Args:
        modeltype: ``'mobilenet'``, ``'resnet50'`` or ``'resnet101'``. Any
            other value (including ``'resnet152'``) selects ResNet152V2.
            The string is also used as the saved file's base name.

    Returns:
        The Keras model. As a side effect the model is written to
        ``models/<modeltype>.h5`` (used for the size analysis); the
        ``models`` directory is created if it does not exist.
    """
    backbones = {
        'mobilenet': mobilenet_v2.MobileNetV2,
        'resnet50': resnet_v2.ResNet50V2,
        'resnet101': resnet_v2.ResNet101V2,
    }
    # Unknown names fall back to ResNet152V2, as the original else-branch did.
    build_backbone = backbones.get(modeltype, resnet_v2.ResNet152V2)

    inputs = k.Input(shape=IMG_SHAPE)
    # Use the tf.keras copy of imagenet_utils, like the prediction helpers do.
    x = k.applications.imagenet_utils.preprocess_input(inputs, data_format=None, mode='tf')
    outputs = build_backbone(input_shape=IMG_SHAPE, weights='imagenet')(x)

    model = k.Model(inputs, outputs)
    # Save model for size analysis; create the target directory on first use.
    os.makedirs('models', exist_ok=True)
    model.save('models/' + modeltype + '.h5')
    return model

#predict classes for images
def predict_class(images,model):
    """Return the top-ranked decoded class name for every sample in ``images``.

    ``model.predict(images)`` is decoded with ``decode_predictions``; for each
    sample the second field of the first (highest-ranked) entry is kept.
    """
    decoded = k.applications.imagenet_utils.decode_predictions(model.predict(images))
    top_names = []
    for ranked in decoded:
        best_entry = ranked[0]
        top_names.append(best_entry[1])
    return top_names

#Prints accuracy
def accuracy(preds, actual):
    """Print the percentage of predictions that equal their ground-truth label.

    Args:
        preds: Sequence of predicted labels.
        actual: Sequence of ground-truth labels, aligned with ``preds``.

    Returns:
        None. The result is printed, not returned.

    Raises:
        ValueError: If ``preds`` is empty or the two sequences differ in
            length (``zip`` would otherwise silently ignore the surplus).
    """
    n_samples = len(preds)
    if n_samples == 0:
        raise ValueError('cannot compute accuracy: no predictions were given')
    if len(actual) != n_samples:
        raise ValueError(
            'cannot compute accuracy: %d predictions but %d labels'
            % (n_samples, len(actual))
        )
    n_correct = sum(p == a for p, a in zip(preds, actual))
    percent_correct = (n_correct / n_samples) * 100
    print('model accuracy is %.2f%% (Number of test samples=%d)' % (percent_correct, n_samples))
    

Now that all helper functions for the non-quantized models are in place, we set up the functions for running the quantized models. A quantized model is no longer a Keras model; it is a TFLite binary that is executed through `tf.lite.Interpreter`. These functions were built with the help of the TensorFlow tutorial on post-training integer quantization: https://www.tensorflow.org/lite/performance/post_training_integer_quant

In [3]:
# Helper function to run inference on a TFLite model
def run_tflite_model(tflite_quant_model, test_image_indices, images=None):
    """Run a TFLite model on selected images and return its raw outputs.

    Args:
        tflite_quant_model: Serialized TFLite model, passed to
            ``tf.lite.Interpreter(model_content=...)``.
        test_image_indices: Iterable of indices into ``images``.
        images: Optional sequence of image arrays. Defaults to the
            notebook-level ``imgs`` so existing two-argument calls keep
            working.

    Returns:
        A list with one output array per requested index, in order, with the
        leading batch axis removed. Outputs are returned as produced by the
        interpreter (they are not dequantized).
    """
    if images is None:
        images = imgs

    # Initialize the interpreter
    interpreter = tf.lite.Interpreter(model_content=tflite_quant_model)
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()[0]
    output_details = interpreter.get_output_details()[0]

    input_dtype = input_details["dtype"]
    input_scale, input_zero_point = input_details["quantization"]
    # Integer inputs (uint8 or int8) with a non-zero scale need the float
    # pixels mapped into the quantized domain first.
    quantize_input = bool(np.issubdtype(input_dtype, np.integer) and input_scale != 0)
    if quantize_input:
        dtype_limits = np.iinfo(input_dtype)

    predictions = []
    for test_image_index in test_image_indices:
        test_image = images[test_image_index]

        if quantize_input:
            test_image = test_image / input_scale + input_zero_point
            # Round to nearest and clip, so the cast below neither truncates
            # nor wraps around for out-of-range values.
            test_image = np.clip(np.round(test_image), dtype_limits.min, dtype_limits.max)

        test_image = np.expand_dims(test_image, axis=0).astype(input_dtype)
        interpreter.set_tensor(input_details["index"], test_image)
        interpreter.invoke()
        predictions.append(interpreter.get_tensor(output_details["index"])[0])
    return predictions

# Helper function to evaluate a TFLite model on all images
def evaluate_model(tflite_model):
    """Predict a class name for every image in the notebook-level ``imgs``.

    Runs ``run_tflite_model`` over all indices of ``imgs``, stacks the outputs
    into one array, decodes them with ``decode_predictions`` and keeps the
    second field of each sample's first-ranked entry.
    """
    every_index = range(len(imgs))
    stacked_outputs = np.array(run_tflite_model(tflite_model, every_index))
    decoded = k.applications.imagenet_utils.decode_predictions(stacked_outputs)

    top_names = []
    for ranked in decoded:
        top_names.append(ranked[0][1])
    return top_names

#Dynamic range quantization function
def dynamic_range_quant(model,modeltype):
    """Convert ``model`` to TFLite with the default optimizations and save it.

    Only ``tf.lite.Optimize.DEFAULT`` is set on the converter (no
    representative dataset, no type restrictions).

    Args:
        model: Keras model to convert.
        modeltype: Name appended to ``dynamic`` to form the output file name.

    Returns:
        The converted model as returned by ``converter.convert()``. The same
        content is written to ``quant_models/dynamic<modeltype>.tflite``; the
        ``quant_models`` directory is created if it does not exist.
    """
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_quant_model = converter.convert()

    # open(..., 'wb') does not create parent directories, so do it here.
    os.makedirs('quant_models', exist_ok=True)
    quant_file_path = 'quant_models/dynamic'+modeltype+'.tflite'
    with open(quant_file_path, 'wb') as f:
        f.write(tflite_quant_model)
    return tflite_quant_model

#Calibration dataset generator
def representative_dataset():
    """Yield calibration samples taken from the notebook-level ``imgs``.

    ``imgs`` is wrapped in a ``tf.data.Dataset`` batched one image at a time;
    at most the first 1000 batches are yielded, each as a one-element list.
    """
    calibration_batches = tf.data.Dataset.from_tensor_slices((imgs)).batch(1).take(1000)
    for single_image_batch in calibration_batches:
        yield [single_image_batch]
        
#Full integer quantization function
def full_int_quant(model,modeltype):
    """Convert ``model`` to a full-integer TFLite model and save it.

    The converter is calibrated with ``representative_dataset``, restricted to
    the ``TFLITE_BUILTINS_INT8`` op set, and configured with ``uint8`` input
    and output tensors.

    Args:
        model: Keras model to convert.
        modeltype: Name appended to ``fullint`` to form the output file name.

    Returns:
        The converted model as returned by ``converter.convert()``. The same
        content is written to ``quant_models/fullint<modeltype>.tflite``; the
        ``quant_models`` directory is created if it does not exist.
    """
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_dataset
    # Ensure that if any ops can't be quantized, the converter throws an error
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    # Set the input and output tensors to uint8 (APIs added in r2.3)
    converter.inference_input_type = tf.uint8
    converter.inference_output_type = tf.uint8

    tflite_quant_model = converter.convert()

    # open(..., 'wb') does not create parent directories, so do it here.
    os.makedirs('quant_models', exist_ok=True)
    quant_file_path = 'quant_models/fullint'+modeltype+'.tflite'
    with open(quant_file_path, 'wb') as f:
        f.write(tflite_quant_model)

    return tflite_quant_model
    
#Float16 quantization function
def float16_quant(model,modeltype):
    """Convert ``model`` to TFLite with ``float16`` as supported type and save it.

    Args:
        model: Keras model to convert.
        modeltype: Name appended to ``float16`` to form the output file name.

    Returns:
        The converted model as returned by ``converter.convert()``. The same
        content is written to ``quant_models/float16<modeltype>.tflite``; the
        ``quant_models`` directory is created if it does not exist.
    """
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.target_spec.supported_types = [tf.float16]
    tflite_quant_model = converter.convert()

    # open(..., 'wb') does not create parent directories, so do it here.
    os.makedirs('quant_models', exist_ok=True)
    quant_file_path = 'quant_models/float16'+modeltype+'.tflite'
    with open(quant_file_path, 'wb') as f:
        f.write(tflite_quant_model)

    return tflite_quant_model

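Next we load the images and their labels into memory and wrap them in a batched `tf.data.Dataset` (note: this is an in-memory dataset, not a TFRecord file). We use a `tf.data.Dataset` so that the Keras models can run prediction over batches of `BATCH_SIZE` images and take advantage of the resulting efficiency gains.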

In [4]:
# Read every image under images/ together with its file-name-derived label.
imgs, labels = load_imgs('images/')
# Fail fast here: with no images the notebook would only break much later,
# during evaluation, with a far less obvious error.
assert len(imgs) > 0, "no '*.JPEG' files were found under 'images/'"
# Batched (image, label) dataset used for prediction with the Keras models.
dataset = Dataset.from_tensor_slices((imgs, labels)).batch(BATCH_SIZE)

Generate models and run prediction

In [5]:
# Build the four pretrained float models, in the same order as before.
m, r50, r101, r152 = (
    generate_model(tag)
    for tag in ('mobilenet', 'resnet50', 'resnet101', 'resnet152')
)

# Baseline (non-quantized) accuracy of each model on the loaded dataset.
print('MobileNetV2 accuracy')
testm = predict_class(dataset, m)
accuracy(testm, labels)

print('ResNet50V2 accuracy')
testr50 = predict_class(dataset, r50)
accuracy(testr50, labels)

print('ResNet101V2 accuracy')
testr101 = predict_class(dataset, r101)
accuracy(testr101, labels)

print('ResNet152V2 accuracy')
testr152 = predict_class(dataset, r152)
accuracy(testr152, labels)

MobileNetV2 accuracy
model accuracy is 83.9000% (Number of test samples=1000)
ResNet50V2 accuracy
model accuracy is 73.1000% (Number of test samples=1000)
ResNet101V2 accuracy
model accuracy is 76.7000% (Number of test samples=1000)
ResNet152V2 accuracy
model accuracy is 77.5000% (Number of test samples=1000)


### Dynamic Range Quantization

Generate models

In [11]:
# Convert each float model with dynamic_range_quant, in the same order as before.
mobiledynamic, r50dynamic, r101dynamic, r152dynamic = (
    dynamic_range_quant(float_model, tag)
    for float_model, tag in (
        (m, 'mobilenet'),
        (r50, 'resnet50'),
        (r101, 'resnet101'),
        (r152, 'resnet152'),
    )
)



Run predictions

In [None]:
# Accuracy of each dynamic-range-quantized model over all loaded images.
print('Dynamic Quantized MobileNetV2 accuracy')
testm = evaluate_model(mobiledynamic)
accuracy(testm, labels)

print('Dynamic Quantized ResNet50V2 accuracy')
testr50 = evaluate_model(r50dynamic)
accuracy(testr50, labels)

print('Dynamic Quantized ResNet101V2 accuracy')
testr101 = evaluate_model(r101dynamic)
accuracy(testr101, labels)

print('Dynamic Quantized ResNet152V2 accuracy')
testr152 = evaluate_model(r152dynamic)
accuracy(testr152, labels)

### Full Integer Quantization

Generate models

In [8]:
# Convert each float model with full_int_quant, in the same order as before.
mobilefullint, r50fullint, r101fullint, r152fullint = (
    full_int_quant(float_model, tag)
    for float_model, tag in (
        (m, 'mobilenet'),
        (r50, 'resnet50'),
        (r101, 'resnet101'),
        (r152, 'resnet152'),
    )
)



Run predictions

In [None]:
# Accuracy of each full-integer-quantized model over all loaded images.
print('Full Integer Quantized MobileNetV2 accuracy')
testm = evaluate_model(mobilefullint)
accuracy(testm, labels)

print('Full Integer Quantized ResNet50V2 accuracy')
testr50 = evaluate_model(r50fullint)
accuracy(testr50, labels)

print('Full Integer Quantized ResNet101V2 accuracy')
testr101 = evaluate_model(r101fullint)
accuracy(testr101, labels)

print('Full Integer Quantized ResNet152V2 accuracy')
testr152 = evaluate_model(r152fullint)
accuracy(testr152, labels)

Full Integer Quantized MobileNetV2 accuracy
model accuracy is 82.0000% (Number of test samples=1000)
Full Integer Quantized ResNet50V2 accuracy
model accuracy is 72.1000% (Number of test samples=1000)
Full Integer Quantized ResNet101V2 accuracy
model accuracy is 76.1000% (Number of test samples=1000)
Full Integer Quantized ResNet152V2 accuracy


### Float16 quantization

Generate models

In [9]:
# Convert each float model with float16_quant, in the same order as before.
mobilefloat16, r50float16, r101float16, r152float16 = (
    float16_quant(float_model, tag)
    for float_model, tag in (
        (m, 'mobilenet'),
        (r50, 'resnet50'),
        (r101, 'resnet101'),
        (r152, 'resnet152'),
    )
)



Run predictions

In [13]:
# Accuracy of each float16-quantized model over all loaded images.
print('Float16 Quantized MobileNetV2 accuracy')
testm = evaluate_model(mobilefloat16)
accuracy(testm, labels)

print('Float16 Quantized ResNet50V2 accuracy')
testr50 = evaluate_model(r50float16)
accuracy(testr50, labels)

print('Float16 Quantized ResNet101V2 accuracy')
testr101 = evaluate_model(r101float16)
accuracy(testr101, labels)

print('Float16 Quantized ResNet152V2 accuracy')
testr152 = evaluate_model(r152float16)
accuracy(testr152, labels)

Float16 Quantized MobileNetV2 accuracy
model accuracy is 83.7000% (Number of test samples=1000)
Float16 Quantized ResNet50V2 accuracy
model accuracy is 73.0000% (Number of test samples=1000)
Float16 Quantized ResNet101V2 accuracy
model accuracy is 76.7000% (Number of test samples=1000)
Float16 Quantized ResNet152V2 accuracy
model accuracy is 77.5000% (Number of test samples=1000)
