In [64]:
from PIL import Image
import pandas as pd
from pytesseract import Output
from sys import platform
from pathlib import Path
import pytesseract
import cv2


#  Module global variables.
#  On Windows the tesseract executable is addressed by an explicit path;
#  on other platforms pytesseract's default command is left unchanged.
if platform == "win32":
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
#  Running character offset; starts at zero.
last_index = 0
#  Buffer for token dictionaries; starts empty.
tokens = []


def resize_image(image, size):
    """
    Scale an image so that its height becomes (about) ``size`` pixels.

    :image: - image array, ``shape[0]`` is the height and ``shape[1]`` the width;
    :size: - target height in pixels.

    Returns a tuple (resized image, multiplier), where multiplier is the
    single scale factor applied to both sides.
    """
    src_height, src_width = image.shape[0], image.shape[1]

    #  One factor for both axes keeps the aspect ratio.
    multiplier = size / src_height
    target_size = (int(src_width * multiplier), int(src_height * multiplier))

    #  The target size is passed to cv2.resize as (width, height).
    return cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC), multiplier


def prepare_image(img_path, size):
    """
    Load an image from disk and prepare it for recognition.

    :img_path: - path to the image file (string or path-like object);
    :size: - target height in pixels, forwarded to resize_image.

    Returns a tuple (original image, resized image, resize multiplier,
    resized image converted from BGR to RGB channel order).

    Raises FileNotFoundError when the image cannot be read.
    """
    #  cv2.imread signals failure by returning None instead of raising,
    #  so report the problem here rather than failing later on `.shape`.
    image = cv2.imread(str(img_path))
    if image is None:
        raise FileNotFoundError("Cannot read image: {}".format(img_path))

    #  Resize and keep the multiplier (how much bigger the resized image is than the original).
    resized_image, multiplier = resize_image(image, size)

    #  Channel order is reversed (BGR -> RGB) before the image is passed to recognition.
    converted_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB)
    return image, resized_image, multiplier, converted_image


def recognize_image(cv2_image, config, lang):
    """
    Run tesseract on an image and return its OCR data.

    :cv2_image: - image to recognize;
    :config: - config string for tesseract;
    :lang: - language of recognition;

    Returns the dataframe produced by pytesseract.image_to_data.
    """
    #  Ask for the dataframe output format directly.
    return pytesseract.image_to_data(
        cv2_image,
        lang=lang,
        config=config,
        output_type=Output.DATAFRAME,
    )


def filter_ocr_data(ocr_data):
    """
    Keep only the usable rows and columns of the tesseract output.

    :ocr_data: - dataframe with at least the columns
                 'text', 'left', 'top', 'width', 'height' and 'conf'.

    Returns a new dataframe with a fresh 0..n-1 index; the input dataframe
    is not modified. Rows are dropped when conf equals -1 or when the text
    is missing. Rows with zero confidence are kept.
    """
    cols = ['text', 'left', 'top', 'width', 'height', 'conf']

    #  conf == -1 entries are removed (same rule as before);
    #  rows without text are removed too, because later steps call len()
    #  and str.join on the text and would fail on a missing value.
    keep = (ocr_data['conf'] != -1) & ocr_data['text'].notna()

    #  A single .loc selection followed by a non-inplace reset_index yields
    #  an independent frame, so later column assignments are safe.
    return ocr_data.loc[keep, cols].reset_index(drop=True)


def process_text(prepared_ocr_data, concatenation_sep):
    """
    Function processing text from recognized image.

    :prepared_ocr_data: - prepared dataframe of data from recognition
                          (columns 'text', 'left', 'top', 'width', 'height', 'conf');
    :concatenation_sep: - separator to be used in strings concatenation.

    Returns a tuple (concatenated text, text dataframe). The text dataframe
    keeps the 'text' column and gets a 'span' column holding
    (start index, stop index) of every word inside the concatenated text.

    The running offset is a local variable, so every call starts from zero
    even if a previous call failed half-way. An empty dataframe gives an
    empty string and an empty text dataframe.
    """
    words = prepared_ocr_data["text"].tolist()
    sep_len = len(concatenation_sep)

    #  Walk the words once, tracking where each one starts in the joined text.
    spans = []
    offset = 0
    for word in words:
        stop = offset + len(word)
        spans.append((offset, stop))
        offset = stop + sep_len

    #  dtype=object keeps every span as a single tuple cell; reusing the
    #  source index makes the new column line up row by row.
    span_col = pd.Series(spans, index=prepared_ocr_data.index, dtype=object)

    #  Adding the span column and removing unused columns.
    text_df = prepared_ocr_data.assign(span=span_col).drop(
        columns=["conf", "left", "top", "width", "height"]
    )

    concatenated_text = concatenation_sep.join(words)
    return concatenated_text, text_df


def process_layout(prepared_ocr_data):
    """
    Extract the layout part of the recognition data.

    :prepared_ocr_data: - prepared dataframe of data from recognition.

    Returns a new dataframe equal to the input without the 'conf' column.
    """
    #  Everything except the confidence column belongs to the layout.
    return prepared_ocr_data.drop("conf", axis=1)


def build_result(merged_df, extracted_text, image_shape):
    """
    Function preparing final result.

    :merged_df: - dataframe with the columns 'text', 'left', 'top',
                  'width', 'height' and 'span' (span is a (start, stop) pair);
    :extracted_text: - concatenated recognized text;
    :image_shape: - image shape, height first and width second.

    Returns a dict with the keys:
        "text"   - extracted_text as given;
        "tokens" - one dict per dataframe row with "text", "position"
                   (left/top/width/height) and "offset" (span start);
        "source" - {"width": ..., "height": ...} taken from image_shape.

    Tokens are collected in a local list, so nothing is shared between
    calls and a failed call cannot leak tokens into the next result.
    """
    fields = ("text", "left", "top", "width", "height", "span")

    #  tolist() hands back plain Python values; zip walks the columns row by row.
    rows = zip(*(merged_df[name].tolist() for name in fields))

    #  Tokens creation from merged dataframe.
    token_list = [
        {
            "text": text,
            "position": {
                "left": left,
                "top": top,
                "width": width,
                "height": height,
            },
            "offset": span[0],
        }
        for text, left, top, width, height, span in rows
    ]

    source = {
        "width": image_shape[1],
        "height": image_shape[0],
    }

    return {
        "text": extracted_text,
        "tokens": token_list,
        "source": source,
    }


def debug_draw(result, image, save_path):
    """
    Debugging helper: draws a rectangle for every token of the result
    on a copy of the image and writes that copy to save_path.

    :result: - dict with a "tokens" list, each token having a "position"
               dict with "left", "top", "width" and "height";
    :image: - image to draw on (the passed image itself is not changed);
    :save_path: - where to write the picture.

    Returns save_path.
    """
    canvas = image.copy()
    box_color = (89, 28, 252)
    thickness = 4

    for token in result["tokens"]:
        box = token["position"]
        top_left = (box["left"], box["top"])
        bottom_right = (box["left"] + box["width"], box["top"] + box["height"])
        cv2.rectangle(canvas, top_left, bottom_right, box_color, thickness)

    cv2.imwrite(save_path, canvas)
    return save_path


def transform_coordinates(filtered_ocr_data, multiplier):
    """
    Function returns a dataframe with coordinates, that fit original image.

    :filtered_ocr_data: - dataframe with the columns 'left', 'top',
                          'width' and 'height' measured on the resized image;
    :multiplier: - how much bigger the resized image is than the original.

    Returns a new dataframe in which those four columns are divided by the
    multiplier and truncated to integers; other columns are carried over
    unchanged. The input dataframe is not modified.
    """
    #  Work on a copy so the caller's dataframe keeps its own values.
    rescaled = filtered_ocr_data.copy()

    for col in ("left", "top", "width", "height"):
        #  Whole-column division; the cast to int64 truncates the same way int() does.
        rescaled[col] = (filtered_ocr_data[col] / multiplier).astype("int64")

    return rescaled


def main_recognition(img_path, *, size=4000, lang='eng', config='--psm 11', sep=' ',
                     debug=True, debug_path="debug_layout.jpg"):
    """
    Main function of recognition module.
    Goes through all recognition steps and returns the final result.

    :img_path: - path to the image to recognize;
    :size: - height in pixels the image is resized to before recognition;
    :lang: - language of recognition;
    :config: - config string for tesseract;
    :sep: - separator used to concatenate the recognized words;
    :debug: - when true, the recognized rectangles are drawn on the
              original image and saved to debug_path;
    :debug_path: - file the debug picture is written to.

    All parameters after img_path are keyword-only and default to the
    values that used to be hard-coded, so main_recognition(img_path)
    behaves as before.

    Returns the result built by build_result.
    """
    #  Image preparations.
    #  Interpolation, channels conversion from bgr to rgb to pass into tesseract.
    image, resized_image, multiplier, converted_image = prepare_image(img_path, size)

    #  Getting recognition data and removing unused data from tesseract output.
    ocr_data = recognize_image(converted_image, config, lang)
    filtered_ocr_data = filter_ocr_data(ocr_data)

    #  Coordinate transformation after resizing.
    #  Recognition ran on the resized image, so coordinates are mapped back to the original one.
    prepared_ocr_data = transform_coordinates(filtered_ocr_data, multiplier)

    #  Processing only text data from recognition.
    #  Affects only text information.
    extracted_text, text_df = process_text(prepared_ocr_data, sep)

    #  Processing extracted layout information (coordinates).
    layout_df = process_layout(prepared_ocr_data)

    #  Combined dataframe (text spans info + layout).
    #  Both parts carry a 'text' column, so repeated columns are dropped after the concat.
    concatenated_df = pd.concat([text_df, layout_df], axis=1)
    merged_df = concatenated_df.loc[:, ~concatenated_df.columns.duplicated()]

    #  Result preparation.
    result = build_result(merged_df, extracted_text, image.shape)

    #  Debug depicting.
    if debug:
        save_path = str(Path(debug_path))
        debug_draw(result, image, save_path)

    return result

In [65]:
#  Example run of the whole pipeline on one image; the notebook shows the returned dict below.
img_path = "./pages/source/azb11c00.tif"
main_recognition(img_path)

{'text': 'wet fp) C BROWN & WILLIAMSON TOBACCO CORPORATION LETTER OF AUTHORIZATION July 26, 1985 Ms. Nancy Fletcher Kapuler Inc. 3436 N. Kennicott Avenue North Arlington Atrium Arlington Heights, IL 60004 Dear Nancy: This is your letter of authorization to conduct the ELI CUTTER Pack Color/Style Mix Evaluation for Brown & Williamson Tobacco Corporation. This letter constitutes a "Project Document” and is subject to all of the terms and conditions of that certain agreement between Brown & Williamson Tobacco Corporation and Kapuler, Inc., dated as of June 1, 1981, and together with said agreement constitutes the entire agreement for Kapuler, Inc. to conduct the ELI CUTTER Pack Color/Style Mix Evaluation for Brown & Williamson Tobacco Corporation, at the cost of $93,950. In the event of a conflict between the terms and conditions of said agreement and the terms and conditions of this document, the terms and conditions of this document shall control. Please assign Project No. 1985-97NP to 