# OCR with docTR

In [1]:
# Imports
import base64
import re
from tempfile import TemporaryDirectory
from math import atan, cos, sin
from typing import Dict, Optional, Tuple
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element

import numpy as np
import PyPDF2
from PyPDF2 import PdfFileMerger
# from doctr.io import DocumentFile
# from doctr.models import ocr_predictor
from PIL import Image
from reportlab.lib.colors import black
from reportlab.lib.units import inch
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen.canvas import Canvas

In [3]:
# folder containing the input pdf, and name of the pdf to process
PDF_PATH = '../DATA/'
pdf_file = "test.pdf"
# file name without its last extension; reused by other cells to build file names
file = '.'.join(pdf_file.split('.')[:-1])

# getting one array of RGB values per page of the pdf file (rendered with zooming=5)
new_docs = pdf_to_array(PDF_PATH, pdf_file, zooming=5)

In [4]:
# keep only two rendered pages (indices 9 and 10 of new_docs) for the cells that use `docs`
docs = [new_docs[9], new_docs[10]]

db_resnet50-adcafc63.zip
crnn_vgg16_bn-76b7f2c6.zip
crnn_mobilenet_v3_small-7f36edec.zip
classif_mobilenet_v3_small-1ea8db03.zip

mv db_resnet50-adcafc63.zip /home/jovyan/.cache/doctr/models/
mv crnn_mobilenet_v3_small-7f36edec.zip /home/jovyan/.cache/doctr/models/
mv classif_mobilenet_v3_small-1ea8db03.zip /home/jovyan/.cache/doctr/models/
mv crnn_vgg16_bn-76b7f2c6.zip /home/jovyan/.cache/doctr/models/

In [None]:
import pickle

from doctr.io import DocumentFile
from doctr.utils.visualization import visualize_page
from doctr.models import ocr_predictor

# folder where intermediate OCR results are cached
WORK_PATH = '../DATA/WORK_OCR/'

# loading docTR predictor
model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

# ocr of every rendered page of the pdf file
xml_outputs = array_to_ocr_xml(new_docs, model)

# cache the OCR result on disk; the context manager guarantees that the file
# is flushed and closed even if pickling fails
with open(WORK_PATH+file+'_xml_outputs.pkl', 'wb') as pkl_file:
    pickle.dump(xml_outputs, pkl_file)

In [None]:
# confidence threshold under which a recognised word is discarded
thresh = 0.05

# run the OCR predictor on the selected pages
ocred_docs = model(docs)

# every word gets a trailing white space; a word whose confidence is below
# the threshold is replaced by a single white space
for page in ocred_docs.pages:
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                word.value = ' ' if word.confidence < thresh else word.value + ' '
                print(word.value)

# exporting to xml
xml_outputs = ocred_docs.export_as_xml()


In [5]:
import pickle
PATH = '../DATA/'

# reload the cached OCR result; the context manager closes the file handle
# NOTE: unpickling can execute arbitrary code, only load cache files that
# were produced by this notebook
with open(WORK_PATH+file+'_xml_outputs.pkl', 'rb') as pkl_file:
    xml_outputs = pickle.load(pkl_file)

TODO: delete temporary files

In [6]:
# folder where the searchable pdf is written
SAVE_PATH = '../DATA/PDFA/'
# build the searchable pdf and collect the extracted text of each page
# NOTE(review): `docs` holds two pages while `xml_outputs` may come from an OCR run
# on all pages (new_docs) — confirm that images and OCR results line up
pdfa_dict = xml_to_pdfa(SAVE_PATH, pdf_file, docs, xml_outputs)

In [12]:
# opening the pdf file document
# NOTE(review): xml_to_pdfa, as defined in this notebook, writes to PATH + <name> + '.pdf'
# (no '_pdfa' suffix) — confirm that this '_pdfa.pdf' file actually exists
doc = fitz.open(SAVE_PATH+file+'_pdfa.pdf')

In [None]:
# text page object of the first page of the opened pdf
tp = doc[0].get_textpage()

In [30]:
# save the page-number -> text dictionary; the context manager guarantees
# that the file is flushed and closed
with open(WORK_PATH+file+'.pkl', 'wb') as pkl_file:
    pickle.dump(pdfa_dict, pkl_file)

In [2]:
import sys
sys.path.append('../packages/')
import os
from HocrParser import HocrParser
import fitz

def pdf_to_array(PATH, file, zooming=3):
    """ Converts pdf file to numpy array of RGB values.
    
    Parameters
    ----------
     - PATH, string: folder where to find .pdf file (with or without trailing separator)
     - file, string: .pdf file name; its extension, if any, is replaced by '.pdf'
     - zooming, int, default 3: scale factor applied to both x and y axes when rendering
     
    Returns
    ----------
     - doc_array, list(numpy.array): list of arrays of RGB uint8 for each page
    """
    
    # remove extension of file name; a name without any dot is kept unchanged
    # (splitting on '.' would turn it into an empty string)
    file_without_extension = os.path.splitext(file)[0]
    pdf_path = os.path.join(PATH, file_without_extension + '.pdf')

    # scaling matrix: same zoom factor on both axes
    image_matrix = fitz.Matrix(zooming, zooming)

    doc_array = []
    # the context manager closes the document once every page has been rendered
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # snake_case PyMuPDF name, consistent with get_text / get_textpage
            # used elsewhere in this notebook
            pix = page.get_pixmap(alpha=False, matrix=image_matrix)
            # converting pixels to an array of RGB (0 to 255) values; done page by
            # page so that only one pixmap is alive at a time
            rgb_image = Image.frombytes('RGB', (pix.width, pix.height), pix.samples)
            doc_array.append(np.array(rgb_image))

    return doc_array



def array_to_ocr_xml(docs, model, thresh=0.05):
    """ Applies docTR model on page images and exports the result to xml.
    
    Parameters
    ----------
     - docs, list(numpy.array): list of arrays of RGB uint8 for each page
     - model, ocr_predictor: ocr model from doctr.models
     - thresh, float, default 0.05: words with a confidence strictly below
       this value are replaced by a single white space
     
    Returns
    ----------
     - xml_outputs, list(tuple(string, xml.etree.ElementTree)): xml output from docTR
    """
    
    # ocr prediction on all pages at once
    prediction = model(docs)

    # walk through pages > blocks > lines > words in reading order
    recognised_words = (
        word
        for page in prediction.pages
        for block in page.blocks
        for line in block.lines
        for word in line.words
    )

    # a trailing white space separates consecutive words;
    # low-confidence words are blanked out
    for word in recognised_words:
        word.value = ' ' if word.confidence < thresh else word.value + ' '

    # exporting to xml
    return prediction.export_as_xml()

def xml_to_pdfa(PATH, file, doc_array, xml_outputs):
    """ Converts xml outputs from docTR to searchable PDF/A file.
    
    The merged document is written to <PATH>/<file without extension>.pdf.
    Per-page pdf files are created in a temporary directory that is removed
    before returning, so nothing but the merged file is left in PATH.
    
    Parameters
    ----------
     - PATH, string: folder where to save .pdf file (with or without trailing separator)
     - file, string: .pdf file name; its extension, if any, is replaced by '.pdf'
     - doc_array, list(numpy.array): list of arrays of RGB uint8 for each page
     - xml_outputs, list(tuple(string, xml.etree.ElementTree)): xml output from docTR
     
    Returns
    ----------
     - pdfa_dict, dict: dictionary of int page number keys (starting at 0) and string text value 
    """    
    
    # remove extension of file name; a name without any dot is kept unchanged
    file_without_extension = os.path.splitext(file)[0]
    merged_path = os.path.join(PATH, file_without_extension + '.pdf')
    
    # init parser
    parser = HocrParser()
    
    # per-page files live in a temporary directory: they cannot overwrite an
    # existing '<name><i>.pdf' in PATH and are deleted automatically
    with TemporaryDirectory() as tmpdir:
        # init merged pdf file
        merger = PdfFileMerger()
        try:
            # iterate through the xml outputs and images and export to pdf/a
            # the image is optional else you can set invisible_text=False and the text will be printed on a blank page
            # xml outputs and images are paired by position; zip stops at the shorter of the two
            for i, (xml, img) in enumerate(zip(xml_outputs, doc_array)):
                
                # accessing xml.etree.ElementTree.ElementTree object
                xml_element_tree = xml[1]
                
                # exporting the page to pdf/a file
                page_path = os.path.join(tmpdir, f'{i}.pdf')
                parser.export_pdfa(page_path, hocr=xml_element_tree, image=img)
                # adding the page to merged pdf/a file
                merger.append(page_path)
                
            # saving merged pdf/a file while the per-page files still exist
            merger.write(merged_path)
        finally:
            # release the per-page file handles before the directory is removed
            merger.close()
    
    # accessing merged pdf/a file; the context manager closes it afterwards
    with fitz.open(merged_path) as pdfa_doc:
        # extracting the text of each page
        pdfa_dict = {page_number: page.get_text() for page_number, page in enumerate(pdfa_doc)}
    
    return pdfa_dict