<a href="https://colab.research.google.com/github/matthewleechen/digitize_woodcroft_patents/blob/main/notebooks/inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook was designed for running inference in Google Colab Pro.

In [None]:
%%capture
# Install all dependencies (the %%capture cell magic above hides the pip output)
# layoutparser: editable (-e) install from the matthewleechen/layout-parser fork on GitHub
! pip install -e git+https://github.com/matthewleechen/layout-parser.git#egg=layoutparser
# torchvision, then detectron2 installed from the facebookresearch git repository at ref v0.5
! pip install torchvision && pip install "git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2"
# google-cloud-vision package (presumably required by lp.GCVAgent used later in this notebook)
! pip install google-cloud-vision 

Restart runtime before proceeding.

In [None]:
import layoutparser as lp
import os
import cv2
from concurrent.futures import ThreadPoolExecutor

The notebook assumes the images you want to run inference on are uploaded to your Google Drive.

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be
# reached through the runtime's file system
from google.colab import drive 
drive.mount('/content/drive') # will prompt sign-in

Mounted at /content/drive


Upload the model configuration file (`config.yaml`) and the model weights file (in the format `model_{number of iterations}.pth`). Modify the label map appropriately according to your COCO annotations file.

The argument `MODEL.ROI_HEADS.SCORE_THRESH_TEST` is set to 0.5 by default, meaning that bounding boxes with a confidence score below 0.5 are suppressed (see https://detectron2.readthedocs.io/en/latest/modules/config.html). You can change this parameter.

In [None]:
# Locations of the uploaded model files (placeholders: edit before running)
model_config_path = "/path/to/config/file"
model_weights_path = "/path/to/model/weights/file"

# Detection options: key/value pair forwarded to the model configuration
detection_options = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5]

# Class index -> block type name; adjust to match your COCO annotations file
block_label_map = {
    0: "date_box",
    1: "full_box",
    2: "header",
    3: "text",
}

model = lp.Detectron2LayoutModel(
    config_path=model_config_path,
    model_path=model_weights_path,
    extra_config=detection_options,
    label_map=block_label_map,
)

Upload your Google Cloud Vision credentials file to the current directory.

In [None]:
# Initialize the Google Cloud Vision OCR agent from the uploaded credentials file
gcv_credential_path = "/content/patents-381912-ca752b6251d6.json"
ocr_languages = ['en']

ocr_agent = lp.GCVAgent.with_credential(
    gcv_credential_path,
    languages=ocr_languages,
)

In [None]:
# Set folder path: the folder holding the .jpg page images to process.
# The per-page .txt outputs and the final merged.txt are written into this same folder.
folder_path = "/path/to/folder"

In [None]:
# Loop over documents and run inference
def process_image(filepath):
    """Run layout detection and OCR on one page image and save the text.

    The image is passed to the notebook-level ``model``. Detected blocks of
    type ``text``, ``date_box`` or ``full_box`` are kept, ordered by
    ``coordinates[1]`` (the y-axis), padded by 5 on every side, cropped out of
    the page and sent to the notebook-level ``ocr_agent``. For each block the
    cleaned-up text is written, followed by a ``---`` line, to a ``.txt`` file
    that has the same path as the image apart from its extension.

    Args:
        filepath: Path of the image file to process.

    Raises:
        ValueError: If OpenCV cannot read ``filepath`` as an image.
    """
    # Output file sits next to the input image, with a .txt extension
    output_filepath = os.path.splitext(filepath)[0] + '.txt'

    # cv2.imread signals failure by returning None instead of raising, so
    # check here rather than failing obscurely inside the model
    image = cv2.imread(filepath)
    if image is None:
        raise ValueError("Could not read image: " + str(filepath))

    # Layout detection on the full page; 'header' blocks are dropped
    page_layout = model.detect(image)
    kept_types = ('text', 'date_box', 'full_box')
    blocks = lp.Layout([b for b in page_layout if b.type in kept_types])
    sorted_blocks = sorted(blocks, key=lambda b: b.coordinates[1]) # order by y-axis

    with open(output_filepath, 'w') as f:
        for block in sorted_blocks:
            # Pad the block slightly before cropping it out of the page
            segment_image = (block
                                .pad(left=5, right=5, top=5, bottom=5)
                                .crop_image(image))

            ocr_layout = ocr_agent.detect(segment_image)

            # Join the OCR elements: an element ending in a full stop closes
            # the line, any other element is followed by a space
            pieces = []
            for line in ocr_layout:
                text = line.text
                pieces.append(text + ('\n' if text.endswith('.') else ' '))
            full_text = ''.join(pieces)

            # Remove spaces before commas and full stops. This is done on the
            # joined text so that the space inserted by the join itself
            # (e.g. "word" + " " + ",") is removed as well.
            full_text = full_text.replace(' ,', ',').replace(' .', '.')

            # Write the output to the file, one '---' separator per block
            f.write(full_text.strip() + "\n")
            f.write('---\n')

if __name__ == '__main__':

    # Get a list of all the .jpg files in the folder (sorted for a reproducible order)
    filenames = sorted(
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.lower().endswith('.jpg')
    )

    # Batch process the images using multiple threads.
    # Each task's result is retrieved explicitly: an exception raised inside a
    # worker is only surfaced when its result is requested, so without this
    # step failed pages would go unnoticed.
    failed = []
    with ThreadPoolExecutor(max_workers=2) as executor: # set max_workers = #cpu cores (2 on Colab)
        futures = [(path, executor.submit(process_image, path)) for path in filenames]
        for path, future in futures:
            try:
                future.result()
            except Exception as exc:
                # Keep going with the remaining pages, but report the failure
                failed.append(path)
                print("Failed to process {}: {!r}".format(path, exc))

    if failed:
        print("{} of {} images could not be processed.".format(len(failed), len(filenames)))

In [36]:
# Create merged text file from all individual pages.
#
# Every non-empty page file is appended in filename order. Blocks are separated
# by a single '---' line: a separator is inserted between two files only when
# the previous file did not already end with one, and consecutive separator
# lines are collapsed into one.

output_file = "merged.txt"
separator = "---"

# Get a list of all the .txt files in the directory, sorted by name.
# The merged file lives in the same folder, so it is excluded; otherwise a
# re-run of this cell would read its own output back in as an input page.
files = sorted(
    f for f in os.listdir(folder_path)
    if f.endswith(".txt") and f != output_file
)

merged_lines = []
for filename in files:
    with open(os.path.join(folder_path, filename), "r") as infile:
        content = infile.read().strip()
    if not content:  # Skip pages without any text
        continue

    # Add separator between files, on its own line, unless one is already there
    if merged_lines and merged_lines[-1].strip() != separator:
        merged_lines.append(separator)

    for line in content.split("\n"):
        # Remove double separators
        is_repeat = (
            line.strip() == separator
            and merged_lines
            and merged_lines[-1].strip() == separator
        )
        if not is_repeat:
            merged_lines.append(line)

with open(os.path.join(folder_path, output_file), "w") as outfile:
    if merged_lines:
        outfile.write("\n".join(merged_lines) + "\n")