# Printed orders digitalization

## TODO
- Train tesseract for handwritten digits
    - https://stackoverflow.com/questions/10763017/training-tesseract-for-handwritten-text
    - https://tesseract-ocr.github.io/tessdoc/Training-Tesseract
- Finish reading:
    - https://tesseract-ocr.github.io/tessdoc/ImproveQuality
    - https://medium.com/better-programming/beginners-guide-to-tesseract-ocr-using-python-10ecbb426c3d

## Requirements
- [tesseract-ocr](https://github.com/tesseract-ocr/tesseract)
- [Spanish trained data for tesseract](https://github.com/tesseract-ocr/tessdata_best/blob/master/spa.traineddata)
- [pytesseract](https://pypi.org/project/pytesseract/)
- [opencv-python](https://pypi.org/project/opencv-python/)
- [matplotlib](https://matplotlib.org/)
- [numpy](https://numpy.org/install/)
- [scipy](https://www.scipy.org/)
- [enchant](https://abiword.github.io/enchant/)

## Resources

### Imports

Load the required libraries

In [None]:
import math
import os
import pathlib
import re
import string

import cv2
import enchant
import numpy as np
import pytesseract
from matplotlib import pyplot as plt
from scipy import ndimage

### Preprocessing

Transform the image

In [None]:
def get_image(filename):
    """Read the image file ``filename`` with ``cv2.imread`` and return the result."""
    loaded_image = cv2.imread(filename)
    return loaded_image


def save_image(folder, filename, image):
    """Write ``image`` to ``folder / filename`` and print the outcome.

    Args:
        folder: Destination directory (a ``pathlib.Path`` or a string).
        filename: Name of the file to create inside ``folder``.
        image: Image array handed to ``cv2.imwrite``.

    The printed line shows the destination path followed by the value
    returned by ``cv2.imwrite``.
    """
    # Write to the folder that was actually requested, so the reported path
    # and the written path can never disagree.
    destination = pathlib.Path(folder) / filename
    written = cv2.imwrite(str(destination), image)
    print(f'Save image: {destination} | {written}')


def get_kernel(size):
    """Return a ``size`` x ``size`` array of ones with dtype ``uint8``."""
    kernel_shape = (size, size)
    return np.ones(shape=kernel_shape, dtype=np.uint8)


def get_gray_image(image):
    """Convert ``image`` with ``cv2.COLOR_BGR2GRAY`` and return the result."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray


def get_mask(image):
    """Return an all-zero array with the same shape and dtype as ``image``."""
    shape, dtype = image.shape, image.dtype
    return np.zeros(shape, dtype=dtype)


def get_image_edges(image):
    """Run the Canny edge detector on ``image`` with both thresholds set to 175."""
    lower_threshold = upper_threshold = 175
    return cv2.Canny(image, lower_threshold, upper_threshold)


def get_angle(image):
    """Estimate the dominant line angle of ``image`` in degrees.

    Edges are extracted with ``get_image_edges``, line segments are detected
    with the probabilistic Hough transform, and the median of the segment
    angles (``atan2(dy, dx)`` converted to degrees) is returned.

    Returns:
        The median angle, or ``0.0`` when no line segment is detected, so the
        result can be passed straight to ``get_rotated_image`` as "no rotation".
    """
    image_edges = get_image_edges(image)
    lines = cv2.HoughLinesP(image_edges, 1, math.pi / 180, 100, minLineLength=100, maxLineGap=5)
    # HoughLinesP yields None (not an empty array) when nothing is found;
    # iterating it would raise TypeError, and the median of nothing is NaN.
    if lines is None or len(lines) == 0:
        return 0.0
    angles = [
        math.degrees(math.atan2(y2 - y1, x2 - x1))
        for [[x1, y1, x2, y2]] in lines
    ]
    return np.median(angles)


def get_rotated_image(image, angle):
    """Rotate ``image`` by ``angle`` with ``scipy.ndimage.rotate``.

    A falsy ``angle`` (for example ``0``) returns ``image`` itself, untouched.
    """
    if not angle:
        return image
    return ndimage.rotate(image, angle)


def get_thresholded_image(binary_image):
    """Binarize ``binary_image`` with OpenCV's Gaussian adaptive threshold.

    Despite the parameter name, the notebook passes a grayscale image here.
    Uses a maximum value of 255, ``THRESH_BINARY``, a block size of 25 and a
    constant of 11.
    """
    max_value = 255
    block_size = 25
    constant = 11
    return cv2.adaptiveThreshold(
        binary_image,
        max_value,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        constant,
    )


def get_blurred_image(image, kernel):
    """Blur ``image`` with ``cv2.blur``, using ``kernel.shape`` as the window size."""
    window_size = kernel.shape
    return cv2.blur(image, window_size)


def get_denoised_image(image):
    """Denoise ``image`` with OpenCV's non-local means filter, strength ``h=5``.

    The strength is passed by keyword because the second positional parameter
    of ``cv2.fastNlMeansDenoising`` is the optional output array ``dst``, not
    the filter strength ``h``.
    """
    return cv2.fastNlMeansDenoising(image, None, h=5)


def get_eroded_image(image, kernel):
    """Apply ``cv2.erode`` to ``image`` using ``kernel`` and return the result."""
    eroded = cv2.erode(image, kernel)
    return eroded


def get_dilated_image(image, kernel):
    """Apply ``cv2.dilate`` to ``image`` using ``kernel`` and return the result."""
    dilated = cv2.dilate(image, kernel)
    return dilated


def get_biggest_contour(contours):
    """Return the contour in ``contours`` with the largest ``cv2.contourArea``."""
    return max(contours, key=cv2.contourArea)


def draw_contours(image, contours, thickness=10):
    """Draw every contour in white on a copy of ``image`` and return the copy.

    The input ``image`` is not modified; ``thickness`` is forwarded to
    ``cv2.drawContours``.
    """
    canvas = np.copy(image)
    white = (255, 255, 255)
    every_contour = -1  # contour index -1 selects all contours
    return cv2.drawContours(canvas, contours, every_contour, white, thickness)


def set_color(gray_image, thresh, color):
    """Paint ``color`` wherever ``thresh`` is zero and keep ``gray_image`` elsewhere."""
    fill = np.full_like(gray_image, (color,))
    keep_original = thresh != 0
    return np.where(keep_original, gray_image, fill)


def get_print_contour(image_gray, kernel):
    """Return the largest external contour found in the inverted, dilated image.

    ``image_gray`` is dilated with ``kernel``, inverted with
    ``cv2.bitwise_not``, and searched for external contours; the first element
    of the ``cv2.findContours`` result is used as the contour list.
    """
    dilated = get_dilated_image(image_gray, kernel)
    inverted = cv2.bitwise_not(dilated)
    found = cv2.findContours(inverted, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    return get_biggest_contour(found[0])


def get_mask_contourned(mask, contours):
    """Return a copy of ``mask`` with ``contours`` drawn using thickness ``-1``.

    A negative thickness asks OpenCV to fill the contour interiors.
    """
    filled = -1
    return draw_contours(mask, contours, filled)


def get_masked_image(image, mask):
    """Apply ``mask`` to ``image`` by AND-ing the image with itself under the mask."""
    masked = cv2.bitwise_and(image, image, mask=mask)
    return masked


def get_kmeans_transformed_image(image):
    """Quantize ``image`` to 8 colors with ``cv2.kmeans``.

    The image is flattened to rows of 3 values, clustered into 8 groups with
    random initial centers (4 attempts), and every pixel is replaced by its
    cluster center cast to ``uint8``. The result has the shape of ``image``.
    """
    pixels = np.float32(image.reshape((-1, 3)))
    stop_criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 4, 1)
    _, labels, centers = cv2.kmeans(pixels, 8, None, stop_criteria, 4, cv2.KMEANS_RANDOM_CENTERS)
    palette = np.uint8(centers)
    quantized = palette[labels.flatten()]
    return quantized.reshape((image.shape))


def get_recorted_image(image, contour):
    """Crop ``image`` to the bounding rectangle of ``contour``."""
    left, top, width, height = cv2.boundingRect(contour)
    bottom = top + height
    right = left + width
    return image[top:bottom, left:right]


def get_horizontal_lines_structure(image):
    """Build a one-row rectangular structuring element 1/30 of the image width wide."""
    width = image.shape[1] // 30
    return cv2.getStructuringElement(cv2.MORPH_RECT, (width, 1))


def get_vertical_lines_structure(image):
    """Build a one-column rectangular structuring element 1/30 of the image height tall."""
    height = image.shape[0] // 30
    return cv2.getStructuringElement(cv2.MORPH_RECT, (1, height))


def get_structure_contours(negative_gray_image, structure):
    """Erode the image with ``structure`` and return the external contours left.

    The first element of the ``cv2.findContours`` result is returned.
    """
    survivors = get_eroded_image(negative_gray_image, structure)
    found = cv2.findContours(survivors, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    return found[0]


def remove_print_lines(image_gray):
    """Collect the contours of long horizontal and vertical lines (cell boxes).

    Despite its name, this function removes nothing: it inverts
    ``image_gray``, finds the contours that survive erosion with the
    horizontal and the vertical line structuring elements, and returns both
    groups concatenated as a single object array.
    """
    negative = cv2.bitwise_not(image_gray)
    structures = (
        get_horizontal_lines_structure(negative),
        get_vertical_lines_structure(negative),
    )
    contour_groups = [
        get_structure_contours(negative, structure)
        for structure in structures
    ]
    return np.concatenate([
        np.array(group, dtype="object")
        for group in contour_groups
    ])

### Text Detection

Mark the detected text

In [None]:
def get_detected_text(image, tesseract_config):
    """Run ``pytesseract.image_to_data`` on ``image``.

    ``tesseract_config`` is expanded into keyword arguments of the call.
    """
    detection = pytesseract.image_to_data(image, **tesseract_config)
    return detection


def get_image_with_detected_text(image, data):
    """Return a copy of ``image`` with a rectangle drawn around every detected box.

    ``data`` must expose parallel ``'level'``, ``'left'``, ``'top'``,
    ``'width'`` and ``'height'`` sequences. Each rectangle is drawn with color
    ``(0, 0, 255)`` and thickness 2, slightly larger than the reported box.
    """
    annotated = np.copy(image)
    boxes = zip(data['level'], data['left'], data['top'], data['width'], data['height'])
    for _, left, top, width, height in boxes:
        top_left = (left - 1, top - 1)
        bottom_right = (left + width + 2, top + height + 2)
        annotated = cv2.rectangle(annotated, top_left, bottom_right, (0, 0, 255), 2)
    return annotated

### Text Recognition

Recognize the text

In [None]:
def get_recognized_text(image, tesseract_config):
    """Run OCR on ``image`` and return the recognized text without blank lines.

    Args:
        image: Image handed to ``pytesseract.image_to_string``.
        tesseract_config: Keyword arguments forwarded to pytesseract.

    Returns:
        The recognized lines joined with ``'\\n'``; lines that are empty or
        whitespace-only are dropped.
    """
    recognized = pytesseract.image_to_string(image, **tesseract_config)
    # With output_type=Output.DICT pytesseract wraps the text as
    # {'text': ...}; with the default output type it returns the string
    # itself. Accept both so the function does not depend on how the caller
    # configured the output type.
    if isinstance(recognized, dict):
        recognized = recognized['text']
    return '\n'.join(line for line in recognized.split('\n') if line.strip())


def get_allowed_characters(dictionary):
    """Return the set of characters allowed in the recognized text.

    The set contains every character found in the UTF-8 file ``dictionary``
    plus the ASCII digits and the dot, so plain numbers are always allowed.
    """
    with open(dictionary, encoding='utf-8') as handle:
        dictionary_characters = set(handle.read())
    return dictionary_characters.union(string.digits, '.')


def replace_characters(text, characters_tuple):
    """Apply a sequence of ``str.replace`` calls to ``text``, in order.

    Each entry of ``characters_tuple`` is unpacked as the arguments of
    ``str.replace`` (for example ``(old, new)``).
    """
    result = text
    for replace_args in characters_tuple:
        result = result.replace(*replace_args)
    return result


def remove_not_allowed_characters(text, allowed_characters):
    """Replace every character of ``text`` not in ``allowed_characters`` with a space.

    Args:
        text: The string to clean.
        allowed_characters: Iterable of single characters to keep.

    Returns:
        ``text`` with each disallowed character replaced by ``' '``.
    """
    # Escape each character: an unescaped ']', '\\', '^' or '-' would close
    # the class, start an escape, negate it or create a range, and with a set
    # the result would depend on iteration order.
    escaped = ''.join(re.escape(character) for character in allowed_characters)
    if not escaped:
        # '[^]' is not a valid pattern; with nothing allowed, every character
        # is replaced.
        return ' ' * len(text)
    not_allowed_characters = re.compile(f'[^{escaped}]')
    return not_allowed_characters.sub(' ', text)


def remove_empty_lines(text):
    """Drop the lines of ``text`` that are empty or contain only whitespace."""
    kept_lines = filter(str.strip, text.split('\n'))
    return '\n'.join(kept_lines)


def get_spell_checker(dictionary):
    """Create an ``enchant.PyPWL`` spell checker from a dictionary file.

    The checker's ``pwl`` and ``tag`` attributes are set to the absolute path
    of ``dictionary``, and every whitespace-separated word of the file is
    added to the checker's session.
    """
    checker = enchant.PyPWL()
    dictionary_path = os.path.abspath(dictionary)
    checker.pwl = dictionary_path
    checker.tag = dictionary_path
    with open(dictionary, encoding='utf-8') as handle:
        vocabulary = set(handle.read().split())
    for entry in vocabulary:
        checker.add_to_session(entry)
    return checker


def get_converted_string(string):
    """Convert ``string`` to ``int`` if possible, else to ``float``, else return it.

    The value is returned unchanged when both conversions raise ``ValueError``.
    """
    for convert in (int, float):
        try:
            return convert(string)
        except ValueError:
            continue
    return string


def get_corrected_text(spell_checker, text):
    """Upper-case ``text`` and spell-correct its words line by line.

    Tokens that parse as ``int`` or ``float`` are kept as the string form of
    the parsed number. Every other token is replaced by the first suggestion
    of ``spell_checker.suggest``, or kept upper-cased when there is none.
    Words are re-joined with single spaces and lines with ``'\\n'``.
    """
    def correct_word(word):
        converted = get_converted_string(word)
        # Numbers need no spelling correction.
        if not isinstance(converted, str):
            return str(converted)
        suggestions = spell_checker.suggest(converted)
        if suggestions:
            return suggestions[0]
        return converted.upper()

    return '\n'.join(
        ' '.join(correct_word(word) for word in line.split())
        for line in text.upper().split('\n')
    )

## Implementation

In [None]:
# Define the basic config
# Image to digitalize.
filename = 'order.jpg'
# Keyword arguments expanded into every pytesseract call made below.
tesseract_config = {
    'config': '--oem 3 --psm 12',
    'lang': 'eng+spa',
}
# Manual rotation applied right after loading; a falsy value such as 0
# leaves the image untouched (see get_rotated_image).
initial_angle = 0
output_folder_name = 'output'
# File read by both get_allowed_characters and get_spell_checker.
dictionary = 'fields.txt'

# Run some basic setup
# Request dict results: get_image_with_detected_text and get_recognized_text
# index the pytesseract result by key.
tesseract_config['output_type'] = pytesseract.Output.DICT
output_folder = pathlib.Path(output_folder_name)
output_folder.mkdir(exist_ok=True)
image = get_image(filename)
image = get_rotated_image(image, initial_angle) 

<p align="center">order.jpg</p>
<p align="center"><img align="center" src="order.jpg" width="400px"></p>

In [None]:
# Threshold
# Convert to grayscale first, then binarize with the adaptive Gaussian
# threshold; the result is saved as 'thresh.<filename>'.
thresh = get_thresholded_image(get_gray_image(image))
save_image(output_folder, f'thresh.{filename}', thresh)

<p align="center">thresh.order.jpg</p>
<p align="center"><img src="output/thresh.order.jpg" width="400"></p>

In [None]:
# Keep only the ROI and remove the background
# All-zero canvas with the shape and dtype of the grayscale image.
mask = get_mask(get_gray_image(image))

# Largest external contour of the inverted threshold image (a 1x1 kernel is
# used for the dilation step), drawn filled in white on the blank mask.
print_contour = get_print_contour(thresh, get_kernel(1))
print_contourned = get_mask_contourned(mask, [print_contour])
save_image(output_folder, f'contourned.{filename}', print_contourned)

# Second pass over the inverted filled mask with a 20x20 kernel.
# NOTE(review): presumably this shrinks the region inwards to drop the
# border of the printed sheet; confirm the intent.
mask_contour = get_print_contour(cv2.bitwise_not(print_contourned), get_kernel(20))
mask_contourned = get_mask_contourned(mask, [mask_contour])
# Keep the thresholded pixels where the mask is non-zero, paint 255 elsewhere.
white_background = set_color(thresh, mask_contourned, 255)
save_image(output_folder, f'white-background.{filename}', white_background)

# Crop both the processed and the original image to the bounding rectangle
# of the first contour (print_contour), so both crops cover the same area.
recorted_image = get_recorted_image(white_background, print_contour)
save_image(output_folder, f'recorted.{filename}', recorted_image)
recorted_image_original = get_recorted_image(image, print_contour)
save_image(output_folder, f'recorted.original.{filename}', recorted_image_original)

<p align="center">contourned.order.jpg</p>
<p align="center"><img src="output/contourned.order.jpg" width="400"></p>
<br>
<p align="center">white-background.order.jpg</p>
<p align="center"><img src="output/white-background.order.jpg" width="400"></p>
<br>
<p align="center">recorted.order.jpg</p>
<p align="center"><img src="output/recorted.order.jpg" width="400"></p>
<br>
<p align="center">recorted.original.order.jpg</p>
<p align="center"><img src="output/recorted.original.order.jpg" width="400"></p>

In [None]:
# Deskew
# The angle is estimated on the cropped, processed image and then applied to
# the full image and to both crops, so they all stay aligned.
angle = get_angle(recorted_image)  # 0.0 for this case
image = get_rotated_image(image, angle)
image_rotated = get_rotated_image(recorted_image, angle)
save_image(output_folder, f'rotated.{filename}', image_rotated)
image_rotated_original = get_rotated_image(recorted_image_original, angle)
save_image(output_folder, f'rotated.original.{filename}', image_rotated_original)

<p align="center">rotated.order.jpg</p>
<p align="center"><i>(0 rotation for this case)</i></p>
<p align="center"><img src="output/rotated.order.jpg" width="400"></p>
<br>
<p align="center">rotated.original.order.jpg</p>
<p align="center"><i>(0 rotation for this case)</i></p>
<p align="center"><img src="output/rotated.original.order.jpg" width="400"></p>

In [None]:
# Dilate characters
# NOTE(review): cv2.dilate grows the bright regions; if the text is dark on a
# white background at this point, this thins the strokes instead of
# thickening them. Confirm that this is the intended effect.
dilated = get_dilated_image(image_rotated, get_kernel(2))
save_image(output_folder, f'dilated.{filename}', dilated)

<p align="center">dilated.order.jpg</p>
<p align="center"><img src="output/dilated.order.jpg" width="400"></p>

In [None]:
# Mark the detected text (this step isn't necessary).
# The text boxes are marked over the original recorted and rotated image.
# Detection runs on the dilated image; the boxes are drawn on the original
# crop, which went through the same crop and rotation.
detected_text = get_detected_text(dilated, tesseract_config)
image_with_detected_text = get_image_with_detected_text(image_rotated_original, detected_text)
save_image(output_folder, f'detected.{filename}', image_with_detected_text)

<p align="center">detected.order.jpg</p>
<p align="center"><img src="output/detected.order.jpg" width="400"></p>

In [None]:
# Recognize the text
recognized_text = get_recognized_text(dilated, tesseract_config)
allowed_characters = get_allowed_characters(dictionary)
# Turn commas into dots; the dot is always part of the allowed characters
# (see get_allowed_characters), so it survives the filtering below.
text = replace_characters(recognized_text, ((',', '.'),))
# Disallowed characters become spaces, which can leave whitespace-only lines;
# those are dropped right after.
text = remove_not_allowed_characters(text, allowed_characters)
text = remove_empty_lines(text)
# Spell-correct every non-numeric word against the same dictionary file.
spell_checker = get_spell_checker(dictionary)
corrected_text = get_corrected_text(spell_checker, text)
print(corrected_text)

```
DIA:
HORA:
PAN
PAN LACTAL
5
GALLETA
PAN LACTAL DE SALVADO
CORONITAS
PAQ. PAN DE MIGA
MIGA
C/ TORTA
G.
PAQ. MIGA SALVADO
PAN CHORIPAN
HAMB X 4 UNID.
HAMBURGUESASX 20
PAQ. PERNIL/CHIPS X 6 UNID.
TORPEDO
VIENAS X 6
VIENA LARGO/CORTOS
HAMBURGUESAS X 4
PAN DE LOMO CUADR BOLSA
LOMO X 2
PAN PERNIL
G.
ITALIANOS
G.
MEDIALUNAS SALADAS
30
G. COMUNES
MEDIALUNAS DULCES
G.
G. CHATOS SABORIZ.
FACTURAS GRANDES
GRISINES C/SEMILLA
FACTURAS C/DULCE
GRISINES SALVADO
FACTURAS HOJALDRE
GALLETITAS C/SEMILLA
TOSTADAS
VIGILANTES
SACRAMENTOS
BIZCOCHOS SALVADO
MINIFACTURITAS
PREPIZZA
DONAS
PREPIZZA CHICA
TARTELETAS
ENTERO SALVADO
BIZCOCHOS DULCES
ENTERO BLANCO
TORTA BIZCOCHUELO
BIZCOCHOS MEMBRILLO
BIZCOCHO CREMA PASTELERA
TORTA MIXTA
BIZC BATATA
TORTA HOJALDRE
BIZC CRIOLLITOS
TORTA PORCION
BIZC CASERITOS
MASAS FINAS
CUPCAKES
BIZC AGUA
BIZC SALADO HOJALDRE
MAGDALENAS
BUDINES
X
120 C/QUESO
BIZC CHICHA
MADRIL GRANDES
PAN DULCE
MADRIL MEDIANOS
SANTAFESINOS
PANETTON
MAIZENAS CHICOS
PALMERITAS
MASITAS SABORIZADAS
DEVOLUCIONES:
MERENGUES CHICOS
MERENG.C/ DULCE/CREMA
CAÑONCITOS C/DULCE
PASTA FROLA GRANDES
PASTA FROLA CHICA
```