# Printed orders digitization

## Requirements
- [tesseract-ocr](https://github.com/tesseract-ocr/tesseract)
- [Spanish trained data for tesseract](https://github.com/tesseract-ocr/tessdata_best/blob/master/spa.traineddata)
- [pytesseract](https://pypi.org/project/pytesseract/)
- [opencv-python](https://pypi.org/project/opencv-python/)
- [matplotlib](https://matplotlib.org/)
- [numpy](https://numpy.org/install/)
- [scipy](https://www.scipy.org/)
- [enchant](https://abiword.github.io/enchant/)

## Pipeline

### Setup

Load the required libraries

In [None]:
import copy
import json
import math
import os
import pathlib
import re
import string

import cv2
import enchant
import numpy as np
import pytesseract
from matplotlib import pyplot as plt
from scipy import ndimage

### Preprocessing

Transform the image

In [None]:
def get_image(filename):
    """Read an image file with OpenCV.

    Args:
        filename: Path of the image file, passed straight to ``cv2.imread``.

    Returns:
        Whatever ``cv2.imread`` returns for ``filename``.
    """
    loaded_image = cv2.imread(filename)
    return loaded_image


def save_image(folder, filename, image):
    """Write ``image`` into ``folder`` and print whether the write succeeded.

    Args:
        folder: Destination directory (``str`` or ``pathlib.Path``).
        filename: Name of the file to create inside ``folder``.
        image: Image array handed to ``cv2.imwrite``.
    """
    # Write to the folder that was passed in (the same path that is reported
    # in the message) instead of a hard-coded "output" directory.
    destination = pathlib.Path(folder) / filename
    was_written = cv2.imwrite(str(destination), image)
    print(f'Save image: {destination} | {was_written}')


def get_kernel(size):
    """Build a square kernel filled with ones.

    Args:
        size: Side length of the kernel.

    Returns:
        A ``(size, size)`` array of ``np.uint8`` ones.
    """
    side_lengths = (size, size)
    return np.ones(side_lengths, dtype=np.uint8)


def get_gray_image(image):
    """Convert ``image`` from BGR to grayscale with ``cv2.cvtColor``."""
    conversion_code = cv2.COLOR_BGR2GRAY
    return cv2.cvtColor(image, conversion_code)


def get_mask(image):
    """Create an all-zero array with the same shape and dtype as ``image``."""
    mask_shape, mask_dtype = image.shape, image.dtype
    return np.zeros(mask_shape, dtype=mask_dtype)


def get_image_edges(image):
    """Run Canny edge detection with both thresholds set to 175."""
    threshold = 175
    return cv2.Canny(image, threshold, threshold)


def get_angle(image):
    """Estimate the skew of ``image`` in degrees.

    Edges are extracted with ``get_image_edges``, line segments are detected
    with the probabilistic Hough transform, and the median of the segment
    angles is returned.

    Args:
        image: Image to analyse.

    Returns:
        The median segment angle in degrees, or ``0.0`` when no segment is
        detected (a falsy angle makes ``get_rotated_image`` skip rotation).
    """
    image_edges = get_image_edges(image)
    lines = cv2.HoughLinesP(image_edges, 1, math.pi / 180, 100, minLineLength=100, maxLineGap=5)
    # HoughLinesP yields None (not an empty array) when nothing is found;
    # iterating it would raise a TypeError.
    if lines is None or len(lines) == 0:
        return 0.0
    angles = [
        math.degrees(math.atan2(y2 - y1, x2 - x1))
        for ((x1, y1, x2, y2),) in lines
    ]
    return np.median(angles)


def get_rotated_image(image, angle):
    """Rotate ``image`` by ``angle`` degrees using ``scipy.ndimage.rotate``.

    Args:
        image: Array to rotate.
        angle: Rotation angle in degrees; a falsy value (``0``, ``None``)
            means no rotation.

    Returns:
        The rotated array, or ``image`` itself when ``angle`` is falsy.
    """
    if not angle:
        return image
    return ndimage.rotate(image, angle)


def get_thresholded_image(binary_image):
    """Binarize an image with Gaussian adaptive thresholding.

    Uses a maximum value of 255, a block size of 25 and a constant of 11.

    Args:
        binary_image: Image passed to ``cv2.adaptiveThreshold`` (the pipeline
            in this file calls it with a grayscale image).

    Returns:
        The thresholded image.
    """
    max_value = 255
    block_size = 25
    constant = 11
    return cv2.adaptiveThreshold(
        binary_image,
        max_value,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        constant,
    )


def get_blurred_image(image, kernel):
    """Blur ``image`` with ``cv2.blur``, using the shape of ``kernel`` as the window size."""
    window_size = kernel.shape
    return cv2.blur(image, window_size)


def get_denoised_image(image):
    """Denoise ``image`` with non-local means, filter strength ``h`` = 5.

    Args:
        image: Image passed to ``cv2.fastNlMeansDenoising``.

    Returns:
        The denoised image.
    """
    # The second positional parameter of the Python binding is ``dst``, not
    # ``h``. Pass ``None`` for ``dst`` so that 5 really is the filter strength.
    return cv2.fastNlMeansDenoising(image, None, 5)


def get_eroded_image(image, kernel):
    """Erode ``image`` with ``kernel`` using ``cv2.erode``."""
    eroded = cv2.erode(image, kernel)
    return eroded


def get_dilated_image(image, kernel):
    """Dilate ``image`` with ``kernel`` using ``cv2.dilate``."""
    dilated = cv2.dilate(image, kernel)
    return dilated


def get_biggest_contour(contours):
    """Return the contour with the largest ``cv2.contourArea``.

    Raises:
        ValueError: If ``contours`` is empty (raised by ``max``).
    """
    return max(contours, key=cv2.contourArea)


def draw_contours(image, contours, thickness=10):
    """Draw a single contour in white on ``image``.

    Args:
        image: Image handed to ``cv2.drawContours``.
        contours: One contour; it is wrapped in a list before drawing.
        thickness: Line thickness forwarded to ``cv2.drawContours``.

    Returns:
        The value returned by ``cv2.drawContours``.
    """
    white = (255, 255, 255)
    contour_list = [contours]
    return cv2.drawContours(image, contour_list, -1, white, thickness)


def get_print_contour(image_gray, kernel):
    """Find the contour of the printed sheet in a grayscale image.

    The image is blurred, thresholded at 75, dilated, and the external
    contour with the largest area is returned. This requires the print to
    be on a dark background.

    Args:
        image_gray: Grayscale image to search.
        kernel: Kernel used for both the blur and the dilation.

    Returns:
        The contour with the largest area.

    Raises:
        ValueError: If no contour is found.
    """
    blur = get_blurred_image(image_gray, kernel)
    # cv2.threshold takes (src, thresh, maxval, type[, dst]); the extra
    # positional value that used to follow the type landed in ``dst``.
    _, thresh = cv2.threshold(blur, 75, 255, cv2.THRESH_BINARY)
    dilation = get_dilated_image(thresh, kernel)
    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on 3.x; the contours are always the
    # second-to-last item.
    contours = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]
    if len(contours) == 0:
        raise ValueError('No contour found: the print must be on a dark background')
    return get_biggest_contour(contours)


def get_mask_contourned(mask, contours):
    """Draw ``contours`` on ``mask`` through ``draw_contours`` with thickness ``-1``.

    Args:
        mask: Image to draw on.
        contours: A single contour.

    Returns:
        The value returned by ``draw_contours``.
    """
    # A negative thickness asks OpenCV to fill the contour interior.
    return draw_contours(mask, contours, thickness=-1)


def get_masked_image(image, mask):
    """Apply ``mask`` over ``image`` by AND-ing the image with itself under the mask."""
    masked = cv2.bitwise_and(image, image, mask=mask)
    return masked


def get_kmeans_transformed_image(image):
    """Quantize the colors of ``image`` to 8 k-means cluster centers.

    The image is reshaped to rows of 3 values, clustered with ``cv2.kmeans``
    (8 clusters, 4 attempts, random initial centers), and each row is replaced
    by its cluster center.

    Args:
        image: Image whose total size is a multiple of 3.

    Returns:
        A ``np.uint8`` array with the same shape as ``image``.
    """
    pixels = np.float32(image.reshape((-1, 3)))
    cluster_count = 8
    attempts = 4
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 4, 1)
    _, labels, centers = cv2.kmeans(
        pixels, cluster_count, None, criteria, attempts, cv2.KMEANS_RANDOM_CENTERS
    )
    palette = np.uint8(centers)
    return palette[labels.flatten()].reshape((image.shape))


def get_recorted_image(image, contour):
    """Crop ``image`` to the bounding rectangle of ``contour``.

    Args:
        image: Image to crop.
        contour: Contour whose ``cv2.boundingRect`` defines the crop.

    Returns:
        The slice of ``image`` covered by the bounding rectangle.
    """
    left, top, width, height = cv2.boundingRect(contour)
    bottom = top + height
    right = left + width
    return image[top:bottom, left:right]

### Text Detection

Mark text detection boxes

In [None]:
def get_detected_text(image, tesseract_config):
    """Run ``pytesseract.image_to_data`` on ``image``.

    Args:
        image: Image to analyse.
        tesseract_config: Keyword arguments forwarded to pytesseract.

    Returns:
        The value returned by ``pytesseract.image_to_data``.
    """
    detection_data = pytesseract.image_to_data(image, **tesseract_config)
    return detection_data


def get_image_with_detected_text(image, data):
    """Return a copy of ``image`` with a red rectangle around every detected box.

    Args:
        image: Image to annotate; it is deep-copied, not modified.
        data: Mapping with the parallel sequences ``'level'``, ``'left'``,
            ``'top'``, ``'width'`` and ``'height'``.

    Returns:
        The annotated copy.
    """
    annotated = copy.deepcopy(image)
    red = (0, 0, 255)
    # 'level' is included only so that the number of boxes follows its length.
    boxes = zip(data['level'], data['left'], data['top'], data['width'], data['height'])
    for _, left, top, width, height in boxes:
        top_left = (left - 1, top - 1)
        bottom_right = (left + width + 2, top + height + 2)
        annotated = cv2.rectangle(annotated, top_left, bottom_right, red, 2)
    return annotated

### Text Recognition

In [None]:
def get_recognized_text(image, tesseract_config):
    """Recognize the text in ``image`` and drop blank lines.

    Args:
        image: Image passed to ``pytesseract.image_to_string``.
        tesseract_config: Keyword arguments forwarded to pytesseract.

    Returns:
        The recognized text with whitespace-only lines removed.
    """
    recognized = pytesseract.image_to_string(image, **tesseract_config)
    # With output_type=Output.DICT the text is stored under the 'text' key;
    # otherwise the result is used as the text itself, so the function no
    # longer depends on the caller's output_type.
    raw_text = recognized['text'] if isinstance(recognized, dict) else recognized
    return '\n'.join(line for line in raw_text.split('\n') if line.strip())


def get_allowed_characters(dictionary):
    """Collect the characters allowed in the recognized text.

    Args:
        dictionary: Path of a UTF-8 text file.

    Returns:
        A set holding the ASCII digits, ``'.'`` and every character that
        occurs in the file (including its newlines).
    """
    with open(dictionary, encoding='utf-8') as dictionary_file:
        dictionary_characters = set(dictionary_file.read())
    allowed = {'.'}
    allowed.update(string.digits)
    allowed.update(dictionary_characters)
    return allowed


def replace_characters(text, characters_tuple):
    """Apply a sequence of ``str.replace`` calls to ``text``.

    Args:
        text: String to transform.
        characters_tuple: Iterable of argument tuples; each one is unpacked
            into ``str.replace`` (for example ``(',', '.')``).

    Returns:
        The text after all replacements, applied in order.
    """
    replaced = text
    for replace_args in characters_tuple:
        replaced = replaced.replace(*replace_args)
    return replaced


def remove_not_allowed_characters(text, allowed_characters):
    """Replace every character that is not allowed with a space.

    Args:
        text: String to clean.
        allowed_characters: Iterable of single characters to keep.

    Returns:
        ``text`` with each character outside ``allowed_characters`` replaced
        by ``' '``.
    """
    # Escape each character so that '-', ']', '\\' and '^' are matched
    # literally instead of forming ranges, closing the class or starting an
    # escape sequence.
    escaped_characters = ''.join(re.escape(character) for character in allowed_characters)
    if not escaped_characters:
        # '[^]' is not a valid pattern; with nothing allowed, everything goes.
        return ' ' * len(text)
    not_allowed_characters = re.compile(f'[^{escaped_characters}]')
    return not_allowed_characters.sub(' ', text)


def remove_empty_lines(text):
    """Drop the lines of ``text`` that are empty or whitespace-only.

    Lines are delimited by ``'\\n'`` only; kept lines are not stripped.
    """
    lines = text.split('\n')
    # str.strip returns '' (falsy) exactly for blank lines.
    return '\n'.join(filter(str.strip, lines))


def get_spell_checker(dictionary):
    """Create an ``enchant.PyPWL`` spell checker fed with the dictionary words.

    Args:
        dictionary: Path of a UTF-8 text file with whitespace-separated words.

    Returns:
        The spell checker, with ``pwl`` and ``tag`` set to the absolute path
        of ``dictionary`` and every distinct word added to the session.
    """
    checker = enchant.PyPWL()
    dictionary_path = os.path.abspath(dictionary)
    checker.pwl = checker.tag = dictionary_path
    with open(dictionary, encoding='utf-8') as dictionary_file:
        # Splitting on any whitespace gives the same words as splitting each
        # line separately.
        distinct_words = set(dictionary_file.read().split())
    for known_word in distinct_words:
        checker.add_to_session(known_word)
    return checker


def get_converted_string(string):
    """Convert ``string`` to a number when possible.

    Args:
        string: Text to convert.

    Returns:
        An ``int`` if ``int(string)`` succeeds, otherwise a ``float`` if
        ``float(string)`` succeeds, otherwise ``string`` unchanged.
    """
    for converter in (int, float):
        try:
            return converter(string)
        except ValueError:
            continue
    return string


def get_corrected_text(spell_checker, text):
    """Upper-case ``text`` and replace each word with its best suggestion.

    Words that ``get_converted_string`` turns into a number are written back
    as ``str(number)``. Other words are replaced by the first entry of
    ``spell_checker.suggest(word)``, or kept in upper case when there is no
    suggestion. Words are re-joined with single spaces; lines are kept.

    Args:
        spell_checker: Object with a ``suggest(word)`` method returning a
            sequence of candidates.
        text: Text to correct.

    Returns:
        The corrected text.
    """
    def correct_word(word):
        converted = get_converted_string(word)
        if not isinstance(converted, str):
            return str(converted)
        suggestions = spell_checker.suggest(converted)
        return suggestions[0] if suggestions else converted.upper()

    return '\n'.join(
        ' '.join(correct_word(word) for word in line.split())
        for line in text.upper().split('\n')
    )

### Implementation

In [None]:
# Basic config
images = ['test_1.jpg', 'test_2.jpg', 'test_3.jpg']
tesseract_config = {
    'config': '--oem 3 --psm 12',
    'lang': 'eng+spa',
}
initial_angle = -90
output_folder_name = 'output'
dictionary = 'fields.txt'

# filename = images[0]

# Setup that does not depend on the image is done once, before the loop.
# The detection step indexes the pytesseract result by key ('left', 'top',
# ...), so a dict output is requested.
tesseract_config['output_type'] = pytesseract.Output.DICT
output_folder = pathlib.Path(output_folder_name)
output_folder.mkdir(exist_ok=True)
allowed_characters = get_allowed_characters(dictionary)
spell_checker = get_spell_checker(dictionary)

# Run the pipeline
for filename in images:
    print(f'{f" {filename} ":=^30}')

    # Preprocessing
    image = get_image(filename)
    if image is None:
        # cv2.imread returns None when the file cannot be read.
        print(f'Could not read image: {filename}')
        continue
    image = get_rotated_image(image, initial_angle)
    # - Keep only the ROI
    image_gray = get_gray_image(image)
    print_contour = get_print_contour(image_gray, get_kernel(5))
    mask = get_mask(image_gray)
    mask_contourned = get_mask_contourned(mask, print_contour)
    masked_image = get_masked_image(image, mask_contourned)
    save_image(output_folder, f'contourned.{filename}', masked_image)
    recorted_image = get_recorted_image(masked_image, print_contour)
    save_image(output_folder, f'recorted.{filename}', recorted_image)
    recorted_original_image = get_recorted_image(image, print_contour)
    save_image(output_folder, f'recorted.original.{filename}', recorted_original_image)
    # - Deskew
    angle = get_angle(recorted_image)
    # NOTE(review): the rotated full image is not used again in this cell.
    image = get_rotated_image(image, angle)
    image_rotated = get_rotated_image(recorted_image, angle)
    save_image(output_folder, f'rotated.{filename}', image_rotated)
    # - Threshold
    image_rotated_gray = get_gray_image(image_rotated)
    thresh = get_thresholded_image(image_rotated_gray)
    save_image(output_folder, f'thresh.{filename}', thresh)
    # - Denoise
    dilated = get_dilated_image(thresh, get_kernel(2))
    save_image(output_folder, f'dilated.{filename}', dilated)
    eroded = get_eroded_image(dilated, get_kernel(3))
    save_image(output_folder, f'eroded.{filename}', eroded)

    # Text detection
    detected_text = get_detected_text(eroded, tesseract_config)
    # NOTE(review): boxes are detected on the deskewed image but drawn on the
    # un-rotated crop; they only line up when the deskew angle is 0 - confirm.
    image_with_detected_text = get_image_with_detected_text(recorted_original_image, detected_text)
    save_image(output_folder, f'detected.{filename}', image_with_detected_text)

    # Text recognition
    recognized_text = get_recognized_text(eroded, tesseract_config)
    text = replace_characters(recognized_text, ((',', '.'),))
    text = remove_not_allowed_characters(text, allowed_characters)
    text = remove_empty_lines(text)
    corrected_text = get_corrected_text(spell_checker, text)
    print(corrected_text)