In [1]:
import mmcv
import copy
import torch
import torch.nn as nn
from utils.weight_init import uniform_init, xavier_init
from mmdet.apis import init_detector


# --- Notebook-wide settings -------------------------------------------------
# Recognizer config file and the checkpoint that belongs to it.
recog_config = "crnn_academic_dataset.py"
recog_ckpt = "crnn_academic-a723a1c5.pth"
# Everything in this notebook runs on the CPU.
device = 'cpu'
# Maximum sequence length handed to the label converter and decoder configs.
max_seq_len = 40

# Parse the config once; only its `model` section is reused by later cells.
config = mmcv.Config.fromfile(recog_config)
model_config = config.model

# Build the float reference model from the config + checkpoint.
recog_model = init_detector(recog_config, recog_ckpt, device=device)
# If the returned object wraps the real model in `.module`, unwrap it;
# otherwise keep it unchanged.
recog_model = getattr(recog_model, 'module', recog_model)

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Use load_from_local loader


# 0. Prepare data

In [2]:
import math
import numpy as np

from PIL import Image

def preprocess_image(pil_im, dst_height=32, dst_min_width=32, width_downsample_ratio=1.0 / 16):
    """Resize an image to a fixed height and return it as a float32 array.

    The width is scaled by the same factor as the height (rounded up), then
    snapped to a multiple of ``int(1 / width_downsample_ratio)`` and finally
    clamped from below by ``dst_min_width``.

    Args:
        pil_im: Object exposing ``.size`` as ``(width, height)`` and a
            ``.resize(size=(width, height))`` method (e.g. a PIL image).
        dst_height (int): Height of the resized image.
        dst_min_width (int): Lower bound for the resized width.
        width_downsample_ratio (float): Reciprocal of the width divisor.

    Returns:
        numpy.ndarray: The resized image converted with ``dtype=np.float32``.
    """
    src_width, src_height = pil_im.size

    # Keep the aspect ratio; any fractional pixel is rounded up.
    scaled_width = math.ceil(float(dst_height) / src_height * src_width)

    # Snap to the nearest multiple of the divisor (Python's built-in `round`).
    divisor = int(1 / width_downsample_ratio)
    if scaled_width % divisor:
        scaled_width = round(scaled_width / divisor) * divisor

    # The minimum width is applied after snapping, so it always wins.
    target_width = max(dst_min_width, scaled_width)

    resized = pil_im.resize(size=(target_width, dst_height))
    return np.asarray(resized, dtype=np.float32)

In [3]:
image = "test_images/image_2.jpg"

# Open inside a context manager so the file handle is released as soon as the
# grayscale copy has been made.
with Image.open(image) as raw_im:
    pil_im = raw_im.convert('L')
im_array = preprocess_image(pil_im)

# Shift/scale the pixel values with mean 127 and std 127.
im_array = (im_array-127.0)/127.0

# (H, W) array -> (1, H, W) tensor on the target device.
image_tensor = torch.from_numpy(im_array)
image_tensor = image_tensor.unsqueeze(0).to(device)

# Model input: a (1, 1, H, W) batch plus per-image metadata. The metadata is
# derived from the image actually loaded above instead of being hard-coded, so
# the filename and resize shape always describe the tensor being passed in.
data = dict(
    img=image_tensor.unsqueeze(0),
    img_metas=[{
        'filename': image,
        'resize_shape': tuple(im_array.shape),
        'valid_ratio': 1.0,
    }],
)

# 1. Quantize backbone

## 1.1 Imports and model defs

In [4]:
# from utils import VeryDeepVgg

class VeryDeepVgg(nn.Module):
    """VGG-VeryDeep style backbone for text recognition with quantization stubs.

    Modified from `VGG-VeryDeep <https://arxiv.org/pdf/1409.1556.pdf>`_. The
    convolutional stack is wrapped between a ``QuantStub`` and a
    ``DeQuantStub`` so the module can go through eager-mode quantization.

    Args:
        leaky_relu (bool): If True every activation is ``nn.LeakyReLU(0.2)``,
            otherwise ``nn.ReLU``. Both are created in-place.
        input_channels (int): Number of channels of the input image tensor.
    """

    def __init__(self, leaky_relu=True, input_channels=3):
        super().__init__()

        # Per-convolution hyper-parameters, indexed by conv layer number.
        kernel_sizes = [3, 3, 3, 3, 3, 3, 2]
        paddings = [1, 1, 1, 1, 1, 1, 0]
        strides = [1, 1, 1, 1, 1, 1, 1]
        widths = [64, 128, 256, 256, 512, 512, 512]

        # Conv layers that are followed by a BatchNorm2d.
        batchnorm_after = (2, 4, 6)
        # conv index -> (pooling index, MaxPool2d positional arguments).
        pooling_after = {
            0: (0, (2, 2)),                      # 64x16x64
            1: (1, (2, 2)),                      # 128x8x32
            3: (2, ((2, 2), (2, 1), (0, 1))),    # 256x4x16
            5: (3, ((2, 2), (2, 1), (0, 1))),    # 512x2x16
        }

        self.channels = widths

        self.quant = torch.quantization.QuantStub()

        layers = nn.Sequential()
        for idx, n_out in enumerate(widths):
            n_in = widths[idx - 1] if idx else input_channels
            layers.add_module(
                f'conv{idx}',
                nn.Conv2d(n_in, n_out, kernel_sizes[idx], strides[idx],
                          paddings[idx]))
            if idx in batchnorm_after:
                layers.add_module(f'batchnorm{idx}', nn.BatchNorm2d(n_out))
            activation = (nn.LeakyReLU(0.2, inplace=True)
                          if leaky_relu else nn.ReLU(True))
            layers.add_module(f'relu{idx}', activation)
            if idx in pooling_after:
                pool_idx, pool_args = pooling_after[idx]
                layers.add_module(f'pooling{pool_idx}',
                                  nn.MaxPool2d(*pool_args))
        # The last conv block (index 6) yields 512x1x16 for the sizes above.

        self.cnn = layers

        self.dequant = torch.quantization.DeQuantStub()

    def init_weights(self, pretrained=None):
        """Initialise conv layers with ``xavier_init`` and batch-norm layers
        with ``uniform_init``. ``pretrained`` is accepted but not used."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                xavier_init(module)
                continue
            if isinstance(module, nn.BatchNorm2d):
                uniform_init(module)

    def out_channels(self):
        """Return the channel count of the last convolution."""
        return self.channels[-1]

    def forward(self, x):
        """Run ``quant -> cnn -> dequant`` on ``x`` and return the result."""
        quantized_in = self.quant(x)
        features = self.cnn(quantized_in)
        return self.dequant(features)
    

# Constructor kwargs for the backbone: the config section minus its 'type' key.
backbone_cfg = model_config['backbone'].copy()
backbone_cfg.pop('type')

# Module groups to fuse before quantization: every conv is fused with its
# activation, and convs 2, 4 and 6 additionally with their batch norm.
fuser_list = []
for layer_idx in range(7):
    group = ["cnn.conv{}".format(layer_idx)]
    if layer_idx in (2, 4, 6):
        group.append("cnn.batchnorm{}".format(layer_idx))
    group.append("cnn.relu{}".format(layer_idx))
    fuser_list.append(group)

## 1.2 load model initialized with pretrained weights 

In [5]:
# Grab the trained backbone weights from the reference recognizer first ...
pretrained_backbone_weights = recog_model.backbone.state_dict()

# ... then copy them into a fresh quantization-ready backbone. The load call
# stays the last expression so its key-matching report is shown as cell output.
backbone = VeryDeepVgg(**backbone_cfg)
backbone.load_state_dict(pretrained_backbone_weights)

<All keys matched successfully>

## 1.3 run static quantization process

In [7]:
# Static quantization is performed on the model in inference mode.
backbone.eval()

# attach a global qconfig, which contains information about what kind
# of observers to attach. Use 'fbgemm' for server inference and
# 'qnnpack' for mobile inference. Other quantization configurations such
# as selecting symmetric or asymmetric quantization and MinMax or L2Norm
# calibration techniques can be specified here.
backbone.qconfig = torch.quantization.get_default_qconfig('fbgemm')

# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
backbone_fused = torch.quantization.fuse_modules(backbone, fuser_list)

# Prepare the model for static quantization. This inserts observers in
# the model that will observe activation tensors during calibration.
backbone_prepared = torch.quantization.prepare(backbone_fused)

# calibrate the prepared model to determine quantization parameters for activations
# in a real world setting, the calibration would be done with a representative dataset
input_fp32 = data["img"]
# Calibration only needs the forward activations, so skip autograd bookkeeping.
with torch.no_grad():
    backbone_prepared(input_fp32)

# Convert the observed model to a quantized model. This does several things:
# quantizes the weights, computes and stores the scale and bias value to be
# used with each activation tensor, and replaces key operators with quantized
# implementations.
backbone_quantized = torch.quantization.convert(backbone_prepared)

# 2. Quantize Decoder

## 2.1 Imports and configs

In [8]:
from mmocr.models.textrecog import CRNNDecoder, CTCConvertor

# Label converter: config section without 'type', plus the notebook-wide
# maximum sequence length.
label_converter_cfg = model_config['label_convertor'].copy()
label_converter_cfg.pop('type')
label_converter_cfg.update(max_seq_len=max_seq_len)
label_converter = CTCConvertor(**label_converter_cfg)


# Decoder kwargs: config section without 'type', completed with the values
# that come from the label converter.
decoder_cfg = model_config['decoder'].copy()
decoder_cfg.pop('type')
decoder_cfg.update(
    num_classes=label_converter.num_classes(),
    start_idx=label_converter.start_idx,
    padding_idx=label_converter.padding_idx,
    max_seq_len=max_seq_len,
)

## 2.2 load model initialized with pretrained weights

In [9]:
# Trained decoder weights taken from the reference recognizer.
pretrained_decoder_weights = recog_model.decoder.state_dict()

# Fresh decoder built from the prepared kwargs; the load call stays the last
# expression so its key-matching report is shown as cell output.
decoder = CRNNDecoder(**decoder_cfg)
decoder.load_state_dict(pretrained_decoder_weights)

<All keys matched successfully>

## 2.3 run dynamic quantization process

In [10]:
# Layer types of the decoder that get dynamically quantized to int8.
dynamic_layer_types = {nn.LSTM, nn.Linear}
quantized_decoder = torch.quantization.quantize_dynamic(
    decoder,
    dynamic_layer_types,
    dtype=torch.qint8,
)

# 3. Reassemble CRNN

In [11]:
from mmocr.models.textrecog import CTCLoss
from utils import CRNNNet

# Build the loss from the model's *loss* section. The previous version copied
# the decoder section here, which handed decoder options to CTCLoss and
# ignored whatever the config specifies for the loss.
loss_cfg = model_config['loss'].copy()
loss_cfg.pop('type')
loss_cfg.update(ignore_index=label_converter.padding_idx)
loss = CTCLoss(**loss_cfg)

# Reassemble the recognizer from the statically quantized backbone and the
# dynamically quantized decoder.
model = CRNNNet(preprocessor=None,
            backbone=backbone_quantized,
            decoder=quantized_decoder,
            loss=loss,
            label_convertor=label_converter,
            pretrained=None,
            test_cfg=config.get('test_cfg'))

In [12]:
# Inference only: run the reassembled model on the prepared sample without
# tracking gradients.
with torch.no_grad():
    det_result = model(
        return_loss=False,
        rescale=True,
        img=data['img'],
        img_metas=data['img_metas'],
    )

In [13]:
# Show the recognition result of the quantized model as the cell output.
det_result

[{'text': '29', 'score': [0.7538787126541138, 0.5606470704078674]}]

# 4. Optimize for Mobile

In [104]:
from torch.utils.mobile_optimizer import optimize_for_mobile
from typing import List, Dict

class Wrapper(torch.nn.Module):
    """Adapter that gives the recognizer a tensors-in / tensors-out interface.

    ``forward`` takes the first element of ``inputs``, adds a leading
    dimension, calls the wrapped model with fixed image metadata and returns
    the recognised text as integer code points together with the scores.

    NOTE(review): the text/score conversion happens in plain Python
    (``ord`` and list building); when this module is exported with
    ``torch.jit.trace`` those steps are presumably frozen to the values seen
    for the example input -- confirm before shipping the traced artifact.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, inputs: List[torch.Tensor]):
        """Recognise the text in ``inputs[0]``.

        Returns:
            tuple: ``(torch.IntTensor of code points, torch.FloatTensor of
            scores)`` taken from the first result of the wrapped model.
        """
        batch = inputs[0].unsqueeze(0)
        # Fixed metadata; it does not depend on the actual input.
        meta = {'filename': 'image_1.jpg', 'resize_shape': (32, 48), 'valid_ratio': 1.0}

        first_result = self.model(
            return_loss=False, rescale=True, img=batch, img_metas=[meta])[0]

        # Encode each recognised character as its integer code point.
        code_points = [ord(char) for char in first_result['text']]

        return torch.IntTensor(code_points), torch.FloatTensor(first_result['score'])

In [105]:
# Random example input used to drive the trace.
example = torch.rand(1, 1, 32, 48)
wrapped_model = Wrapper(model)
# NOTE(review): `example` is passed as a single 4-D tensor although
# `Wrapper.forward` is annotated with List[torch.Tensor]; indexing it with [0]
# and unsqueezing again yields the same 4-D shape -- confirm this is intended.
# NOTE(review): tracing records tensor operations only; the Python-side text
# decoding inside the wrapper is presumably baked in as constants -- verify the
# exported module against a second image.
traced_script_module = torch.jit.trace(wrapped_model, example)
# Apply the mobile optimization passes and save the result for the lite interpreter.
traced_script_module_optimized = optimize_for_mobile(traced_script_module)
traced_script_module_optimized._save_for_lite_interpreter("crcnn.ptl")


In [106]:
# Run the eager (untraced) wrapper on the real test image prepared earlier.
text_ascii, text_scores = wrapped_model([image_tensor])

In [107]:
# Turn both result tensors into plain Python lists.
text_ascii, text_scores = text_ascii.tolist(), text_scores.tolist()

In [108]:
# Map the integer code points back to characters.
text = list(map(chr, text_ascii))

print(text)
print(text_scores)

['2', '9']
[0.7538787126541138, 0.5606470704078674]
