# Receipts and invoices digitalization

## Objectives
Compare the data between the invoiced items and the received items

## Requirements
- [tesseract-ocr](https://github.com/tesseract-ocr/tesseract)
- [pytesseract](https://pypi.org/project/pytesseract/)
- [opencv-python](https://pypi.org/project/opencv-python/)
- [matplotlib](https://matplotlib.org/)
- [numpy](https://numpy.org/install/)

## Pipeline

### Setup

Load the required libraries

In [None]:
import copy
import json
import re

import cv2
import pytesseract
import numpy as np
from matplotlib import pyplot as plt

### Preprocessing

Here we correct the image's rotation angle and apply an adaptive thresholding filter.

In [None]:
def get_angle(image):
    """Estimate the rotation angle to apply to ``image``.

    The image is binarized with ``get_thresholded_image``, the coordinates
    of every pixel greater than zero are gathered, and the angle reported
    by ``cv2.minAreaRect`` for those points is converted into the value
    returned to the caller.

    Args:
        image: Image array accepted by ``get_thresholded_image``.

    Returns:
        ``-(90 + a)`` when the rectangle angle ``a`` is below -45,
        otherwise ``-a``.
    """
    binary = get_thresholded_image(image)
    # One row per selected pixel, one column per array axis.
    points = np.column_stack(np.nonzero(binary > 0))
    rect_angle = cv2.minAreaRect(points)[-1]
    # NOTE(review): the angle range reported by cv2.minAreaRect is believed
    # to differ between OpenCV releases - confirm the -45 cut-off against
    # the installed version.
    if rect_angle < -45:
        return -(90 + rect_angle)
    return -rect_angle

def get_rotated_image(image, angle):
    """Return ``image`` rotated by ``angle`` around its centre.

    The output keeps the input's width and height. Pixels are resampled
    with cubic interpolation and the border is filled by replicating the
    edge pixels.

    Args:
        image: Array whose first two dimensions are (height, width).
        angle: Rotation passed straight to ``cv2.getRotationMatrix2D``.

    Returns:
        The rotated image produced by ``cv2.warpAffine``.
    """
    height, width = image.shape[:2]
    pivot = (width // 2, height // 2)
    # Scale factor 1.0: rotate only, no zoom.
    rotation = cv2.getRotationMatrix2D(pivot, angle, 1.0)
    output_size = (width, height)
    return cv2.warpAffine(
        image,
        rotation,
        output_size,
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

def get_thresholded_image(image):
    """Binarize ``image`` with OpenCV's mean adaptive threshold.

    Args:
        image: Image array handed directly to ``cv2.adaptiveThreshold``.

    Returns:
        The thresholded image, with 255 as the maximum output value.
    """
    max_value = 255
    # Fifth and sixth positional arguments of cv2.adaptiveThreshold
    # (block size and the constant C in the OpenCV documentation).
    block_size = 11
    constant = 11
    return cv2.adaptiveThreshold(
        image,
        max_value,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        block_size,
        constant,
    )


### Text Detection

In [None]:
def get_detected_text(image):
    """Return a copy of ``image`` with a box drawn around each detection.

    Tesseract is run through ``pytesseract.image_to_data`` using the
    module-level ``tesseract_config``; every entry it reports is outlined
    with a 1-pixel rectangle. The input image itself is left untouched
    because all drawing happens on a deep copy.

    Args:
        image: Image array to analyse.

    Returns:
        The annotated copy of ``image``.
    """
    canvas = copy.deepcopy(image)
    boxes = pytesseract.image_to_data(
        canvas,
        output_type=pytesseract.Output.DICT,
        config=tesseract_config,
    )
    outline_colour = (0, 0, 255)
    for idx, _level in enumerate(boxes['level']):
        left, top = boxes['left'][idx], boxes['top'][idx]
        right = left + boxes['width'][idx]
        bottom = top + boxes['height'][idx]
        canvas = cv2.rectangle(canvas, (left, top), (right, bottom), outline_colour, 1)
    return canvas

### Text Recognition

In [None]:
def get_text(image, **kwargs):
    """Recognize the text in ``image`` and drop its blank lines.

    Args:
        image: Image handed to ``pytesseract.image_to_string``.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``pytesseract.image_to_string``.

    Returns:
        The recognized text with every empty or whitespace-only line
        removed; the remaining lines are kept as-is and joined by
        newlines.
    """
    raw_text = pytesseract.image_to_string(image, **kwargs)
    non_blank_rows = [row for row in raw_text.split('\n') if row.strip()]
    return '\n'.join(non_blank_rows)

### Implementation

In [None]:
# Basic config: images to process and the options passed to Tesseract.
images = {
    'invoice': 'invoice.jpg',
    'receipt': 'receipt.jpg',
}
tesseract_config = '--oem 3'

# Run the pipeline
for filename in images.values():
    # Banner showing the current file name centred in a 30-character rule.
    print(f'{filename:=^30}')
    image = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals an unreadable file by returning None instead
        # of raising, so fail here with a clear message.
        raise FileNotFoundError(f'Could not read image: {filename}')

    # Preprocessing
    angle = get_angle(image)
    if angle:
        image = get_rotated_image(image, angle)
        cv2.imwrite(f'rotated.{filename}', image)
    thresh = get_thresholded_image(image)
    cv2.imwrite(f'preprocessed.{filename}', thresh)

    # Text detection: the preprocessed image is read back from disk with
    # cv2.imread's default flags before the boxes are drawn on it.
    preprocessed_image = cv2.imread(f'preprocessed.{filename}')
    marked_image = get_detected_text(preprocessed_image)
    cv2.imwrite(f'marked_image.{filename}', marked_image)

    # Text recognition
    text = get_text(thresh, config=tesseract_config)
    print(text)