# Optical Character Recognition

This tutorial demonstrates optical character recognition (OCR). It is a continuation of the [004-hello-detection](../004-hello-detection) notebook.

In addition to the previously used [horizontal-text-detection-0001](https://docs.openvinotoolkit.org/latest/omz_models_model_horizontal_text_detection_0001.html) model, the [text-recognition-resnet](https://docs.openvinotoolkit.org/latest/omz_models_model_text_recognition_resnet_fc.html) model is used.

## Import the required modules

In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from openvino.inference_engine import IECore
from shutil import copy
from os import path, makedirs, listdir

## Settings

In [None]:
# Names of the two models used in this notebook
detection_model_name = "horizontal-text-detection-0001"
recognition_model_name = "text-recognition-resnet-fc"

# Precision of the models and the file extensions checked for each model
precision = "FP16"
model_extensions = ("bin", "xml")

# Folder that receives downloaded/converted files and folder the models are loaded from
download_folder = "output"
model_folder = "model"

# Inference Engine object used to read and load the networks
ie = IECore()

## Download models and convert public model

If this is your first run, the models are downloaded and converted here. This might take up to ten minutes.

In [None]:
makedirs(model_folder, exist_ok=True)
makedirs(download_folder, exist_ok=True)

# Check if models are already downloaded in download directory
try:
    for model_name, folder_name in ((detection_model_name, f'intel/{detection_model_name}'), (recognition_model_name, f'public/{recognition_model_name}')):
        for extension in model_extensions:
            if not path.isfile(f'{download_folder}/{folder_name}/{precision}/{model_name}.{extension}'):
                raise FileNotFoundError
except FileNotFoundError:
    download_command = f"omz_downloader --name {detection_model_name},{recognition_model_name} --output_dir {download_folder} --precision {precision}"
    convert_command = f"omz_converter --name {recognition_model_name} --precisions {precision} --download_dir {download_folder} --output_dir {download_folder}"
    # Run commands, first download model than convert it to inferable 
    ! $download_command
    # Models are downloaded straight to output folder, we will keep all not used files outside of models directory
    ! $convert_command

## Copy models to model folder

At this point both models are stored in `download_folder` (named 'output' by default). We only need the .bin and .xml files from there, which we copy to the model directory.

In [None]:
# Source directories of the detection model and the recognition model, in that order
model_source_dirs = (
    f"{download_folder}/intel/{detection_model_name}/{precision}",
    f"{download_folder}/public/{recognition_model_name}/{precision}",
)

# Copy every file found in each source directory into the model folder
for source_dir in model_source_dirs:
    for file_name in listdir(source_dir):
        copy(src=f"{source_dir}/{file_name}", dst=model_folder)

## Load the network

In [None]:
# Read the detection model from its .xml file in the model folder
detection_model_path = f"{model_folder}/{detection_model_name}.xml"
net = ie.read_network(model=detection_model_path)

# Load the network on the CPU device
exec_net = ie.load_network(net, "CPU")

# Name of the first entry in the executable network's input info
input_layer_ir = next(iter(exec_net.input_info))

## Load an Image

In [None]:
# The text detection model expects an image in BGR format, which is what cv2.imread returns
image_path = "data/intel_rnb.jpg"
image = cv2.imread(image_path)

# cv2.imread returns None instead of raising when the file cannot be read
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# N,C,H,W = batch size, number of channels, height, width
N, C, H, W = net.input_info[input_layer_ir].tensor_desc.dims

# Resize image to meet network expected input sizes
resized_image = cv2.resize(image, (W, H))

# Reshape to network input shape: move channels first, then add the batch axis
input_image = np.expand_dims(
    resized_image.transpose(2, 0, 1), 0
)

# matplotlib interprets the array as RGB, so convert the BGR image for display only
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

## Get boxes

The detection model finds text in the image and returns a blob of data with shape [100, 5]. Each detection is described in the format [x_min, y_min, x_max, y_max, conf].

In [None]:
# Run the detection network on the prepared input
result = exec_net.infer(inputs={input_layer_ir: input_image})

# Take the boxes from the results and keep only rows with at least one non-zero value
boxes = result['boxes']
has_nonzero_value = np.any(boxes != 0, axis=1)
boxes = boxes[has_nonzero_value]

In [None]:
def multiply_by_ratio(ratio_x, ratio_y, box):
    """Scale the values of a box, ignoring its last element.

    Values at even positions are multiplied by ``ratio_x``. Values at odd
    positions are multiplied by ``ratio_y`` and raised to at least 10.

    Returns:
        A list with one scaled value per element of ``box[:-1]``.
    """
    scaled = []
    for position, value in enumerate(box[:-1]):
        if position % 2:
            scaled.append(max(value * ratio_y, 10))
        else:
            scaled.append(value * ratio_x)
    return scaled

def run_preprocesing_on_crop(crop, net_shape):
    """Resize ``crop`` to ``net_shape`` and prepend two axes of length one.

    ``net_shape`` is handed to ``cv2.resize`` as the target size.
    """
    resized_crop = cv2.resize(crop, net_shape)
    # New shape is (1, 1) followed by the shape of the resized crop
    batched_shape = (1, 1, *resized_crop.shape)
    return resized_crop.reshape(batched_shape)


def convert_result_to_image(bgr_image, resized_image, boxes, threshold=0.3, conf_labels=True):
    """Draw boxes and their annotations on an RGB conversion of ``bgr_image``.

    Args:
        bgr_image: Image in BGR channel order; it is converted to RGB before drawing.
        resized_image: Image whose shape is used, together with the shape of
            ``bgr_image``, to compute the x and y scale applied to the boxes.
        boxes: Iterable of ``(box, annotation)`` pairs. The last element of
            ``box`` is the confidence; the preceding ones are the coordinates.
        threshold: Only boxes with confidence above this value are drawn.
        conf_labels: When true, the annotation text is drawn above each box.

    Returns:
        The RGB image with the boxes (and optionally annotations) drawn on it.
    """
    # Define colors for boxes and descriptions
    colors = {'red': (255, 0, 0), 'green': (0, 255, 0), 'white': (255, 255, 255)}

    # Use one font definition for both measuring and drawing the text,
    # so the background box matches the text that is actually rendered
    font, font_scale, font_thickness = cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1

    # Fetch image shapes to calculate ratio (from the argument, not a global image)
    (real_y, real_x), (resized_y, resized_x) = bgr_image.shape[:2], resized_image.shape[:2]
    ratio_x, ratio_y = real_x / resized_x, real_y / resized_y

    # Convert base image from bgr to rgb format
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    # Iterate through non-zero boxes
    for box, annotation in boxes:
        # Pick confidence factor from last place in array
        conf = box[-1]
        if conf <= threshold:
            continue

        # Convert float to int and multiply position of each box by x and y ratio
        (x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, box))

        # Draw box based on position, parameters in rectangle function are:
        # image, start_point, end_point, color, thickness
        cv2.rectangle(
            rgb_image,
            (x_min, y_min),
            (x_max, y_max),
            colors['green'],
            3
        )

        if not conf_labels:
            continue

        # Create background box based on the rendered size of the annotation
        (text_w, text_h), _ = cv2.getTextSize(f"{annotation}", font, font_scale, font_thickness)
        image_copy = rgb_image.copy()
        cv2.rectangle(
            image_copy,
            (x_min, y_min - text_h - 10),
            (x_min + text_w, y_min - 10),
            colors['white'],
            -1
        )
        # Add weighted image copy with white boxes under text
        cv2.addWeighted(image_copy, 0.4, rgb_image, 0.6, 0, rgb_image)

        # Add text to image based on position, parameters in putText function are:
        # image, text, bottomleft_corner_textfield, font, font_scale, color, thickness, line_type
        cv2.putText(
            rgb_image,
            f"{annotation}",
            (x_min, y_min - 10),
            font,
            font_scale,
            colors['red'],
            font_thickness,
            cv2.LINE_AA
        )

    return rgb_image

In [None]:
# Read the recognition model from its .xml file in the model folder
recognition_model_path = f"{model_folder}/{recognition_model_name}.xml"
recognition_net = ie.read_network(model=recognition_model_path)

# Load the network on the CPU device
exec_recognition_net = ie.load_network(recognition_net, "CPU")

# Names of the first input and the first output of the executable network
recognition_input_layer = next(iter(exec_recognition_net.input_info))
recognition_output_layer = next(iter(exec_recognition_net.outputs))


In [None]:
# Get height and width of input layer
_, _, H, W = recognition_net.input_info[recognition_input_layer].tensor_desc.dims

# Calculate scale for image resizing
(real_y, real_x), (resized_y, resized_x) = image.shape[:2], resized_image.shape[:2]
ratio_x, ratio_y = real_x/resized_x, real_y/resized_y

# Convert image to grayscale for text recognition model
grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Get dictionary to encode output, based on model documentation
letters = "~0123456789abcdefghijklmnopqrstuvwxyz"

# Prepare empty list for annotations
annotations = []

# For each crop, based on boxes given by detection model we want to get annotations
for crop in boxes:
    # Get coordinates on corners of crop
    (x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, crop))
    image_crop = run_preprocesing_on_crop(grayscale_image[y_min:y_max, x_min:x_max], (W, H))

    # Run inference with recognition model
    recognition_result = exec_recognition_net.infer(inputs={recognition_input_layer: image_crop})

    # Squeeze output to remove unnecessary dimension
    recognition_results_test = np.squeeze(recognition_result[recognition_output_layer])

    # Read annotation based on probabilities from output layer
    annotation = []
    for letter in recognition_results_test:
        parsed_letter = letters[letter.argmax()]

        # Index 0 returned from argmax signals the end of the string
        if parsed_letter == letters[0]:
            break
        annotation.append(parsed_letter)
    annotations.append(''.join(annotation))

# Store the pairs in a list: a bare zip object is exhausted after one pass,
# so re-running the drawing cell would otherwise find nothing to draw
boxes_with_annotations = list(zip(boxes, annotations))


In [None]:
# Draw the boxes and recognised text on the image, then display the result
annotated_image = convert_result_to_image(image, resized_image, boxes_with_annotations, conf_labels=True)
plt.imshow(annotated_image)
