# Objectives

1. Applying Mindee's docTR Optical Character Recognition (OCR) to collect multiple PDFs' texts.

2. Translating those texts using Meta AI's M2M-100.

3. Building translated PDF/A documents (searchable PDFs).

# Code

## From .pdf to RGB arrays

Getting the list of all PDF files (names without the `.pdf` extension)

In [10]:
import os

# folders used by the notebook: input PDFs, generated PDF/A files, intermediate pickles
PDF_PATH = '../DATA/'
PDFA_PATH = '../DATA/PDFA/'
WORK_PATH = '../WORK/'

# names of the PDF files found in PDF_PATH, with the trailing '.pdf' removed
PDF_SUFFIX = '.pdf'
files = [name[:-len(PDF_SUFFIX)] for name in os.listdir(PDF_PATH) if name.endswith(PDF_SUFFIX)]

Converting to RGB numpy arrays:
* Scaling (zoom)
* Gray scaling
* Deskewing

In [18]:
import pickle

from tqdm.notebook import tqdm

# scaling parameter to be applied to original PDF files
zooming = 3

# iteration over each PDF file
for file in tqdm(files):

    # getting array of RGB values from pdf file (rotated for straight pages)
    docs = pdf_to_array(PDF_PATH, file+'.pdf', zooming=zooming)

    # caching the arrays on disk; the context manager closes the file
    # as soon as the dump is complete
    with open(f'{WORK_PATH+file}_array.pkl', 'wb') as pkl_file:
        pickle.dump(docs, pkl_file)

    # to reload a cached result instead of recomputing it:
    # with open(f'{WORK_PATH+file}_array.pkl', 'rb') as pkl_file:
    #     docs = pickle.load(pkl_file)

  0%|          | 0/3 [00:00<?, ?it/s]

func:'pdf_to_array' took: 21.8331 sec
func:'pdf_to_array' took: 7.2664 sec
func:'pdf_to_array' took: 17.1023 sec


## Applying OCR

* Optical Character Recognition with Mindee's docTR:

https://mindee.github.io/doctr/

In [None]:
import pickle

from doctr.models import ocr_predictor

# docTR pretrained models for rotated text
# model = ocr_predictor(
#     det_arch='linknet_resnet18_rotation', reco_arch='crnn_vgg16_bn', pretrained=True,
#     assume_straight_pages=False, export_as_straight_boxes=True)

# docTR pretrained models for straight text
model = ocr_predictor(
    det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True,
    assume_straight_pages=True, export_as_straight_boxes=True)

# NOTE: `docs` and `file` are the values left by the last iteration of the
# conversion loop, so this cell only processes the last PDF of `files`

# ocr of the pdf file
xml_outputs = array_to_ocr_xml(docs, model)

# caching the OCR result on disk; the context manager closes the file
# as soon as the dump is complete
with open(f'{WORK_PATH+file}_xml_outputs.pkl', 'wb') as pkl_file:
    pickle.dump(xml_outputs, pkl_file)

# to reload a cached result instead of recomputing it:
# with open(f'{WORK_PATH+file}_xml_outputs.pkl', 'rb') as pkl_file:
#     xml_outputs = pickle.load(pkl_file)

 The versions of TensorFlow you are currently using is 2.6.5 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons



Converting to a PDF/A file

In [194]:
# building text and adding it to the original PDF file (image)
# xml_to_pdfa strips the extension from the name it receives, so the name is
# passed with '.pdf' appended, as in the pdf_to_array call
pdfa_dict = xml_to_pdfa(PDFA_PATH, file+'.pdf', docs, xml_outputs)

# caching the extracted text on disk; the context manager closes the file
# as soon as the dump is complete
with open(f'{WORK_PATH+file}_dict.pkl', 'wb') as pkl_file:
    pickle.dump(pdfa_dict, pkl_file)

# to reload a cached result instead of recomputing it:
# with open(f'{WORK_PATH+file}_dict.pkl', 'rb') as pkl_file:
#     pdfa_dict = pickle.load(pkl_file)

func:'xml_to_pdfa' took: 8.2860 sec


In [17]:
import sys
sys.path.append('../packages/')
import os
from HocrParser import HocrParser
import fitz
import re
import numpy as np
from PIL import Image
import cv2
from deskew import determine_skew

from functools import wraps
from time import time


def timing(f):
    """ Decorator printing the wall-clock time spent in each call of `f`.

    The wrapped function keeps the name and docstring of `f`; its return
    value is passed through unchanged. Nothing is printed when `f` raises.
    """

    @wraps(f)
    def timed(*args, **kw):
        started = time()
        outcome = f(*args, **kw)
        finished = time()
        elapsed = finished - started
        print('func:%r took: %2.4f sec' % (f.__name__, elapsed))
        return outcome

    return timed

from random import sample
import numpy
from deskew import determine_skew

def compute_doc_angle(docs, min_n_page=5, max_angle=5):
    """ Estimates the skew angle of a document from a random sample of its pages.

    Pages are drawn at random (the same page can be drawn several times) and
    passed to `determine_skew`. Angles whose absolute value is below
    `max_angle` are kept, until `min_n_page` of them are collected or
    `2 * min_n_page` draws have been made.

    Parameters
    ----------
     - docs, list: pages of the document, in the form expected by `determine_skew`
     - min_n_page, int, default 5: number of accepted angles to collect
     - max_angle, float, default 5: angles with an absolute value greater than
       or equal to this bound are discarded

    Returns
    ----------
     - angle, float: median of the accepted angles, or 0 when the document
       is empty or no angle was accepted
    """

    # nothing to sample from: no rotation
    if len(docs) == 0:
        return 0

    # NOTE(review): pages are passed to determine_skew as received; deskew's
    # examples feed it grayscale images -- confirm RGB input is supported
    angles = []
    retries = 0
    max_retries = 2 * min_n_page
    while len(angles) < min_n_page and retries < max_retries:
        page = sample(docs, 1)[0]
        angle = determine_skew(page)
        # deskew documents the result as optional (None when no angle is
        # found), so a missing angle is skipped like an out-of-range one
        if angle is not None and abs(angle) < max_angle:
            angles.append(angle)
        retries += 1

    # without any accepted angle the median would be NaN, which callers would
    # treat as a rotation to apply: fall back to "no rotation" instead
    if not angles:
        return 0

    return numpy.median(angles)

def rotate_image(image, angle):
    """ Rotates an image around its center, keeping the original size.

    Parameters
    ----------
     - image, numpy.array: image whose first two dimensions are (height, width)
     - angle, float: rotation angle, passed to cv2.getRotationMatrix2D

    Returns
    ----------
     - rotated image, same width and height as the input; uncovered borders
       are filled by replicating the edge pixels
    """

    height, width = image.shape[:2]

    # rotation around the (integer) center of the image, without scaling
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)

    return cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

def rotate_docs(docs, min_n_page=5, max_angle=5):
    """ Straightens every page of a document using one common angle.

    The angle comes from `compute_doc_angle`. When it equals 0 the input
    list itself is returned; otherwise a new list of rotated pages is built.
    """

    # single angle estimated for the whole document
    angle = compute_doc_angle(docs, min_n_page, max_angle)

    # no rotation needed: hand back the initial images untouched
    if angle == 0:
        return docs

    # same rotation applied to each page
    return [rotate_image(page, angle) for page in docs]

@timing
def pdf_to_array(PATH, file, zooming=3, min_n_page=5, max_angle=5):
    """ Converts pdf file to numpy array of RGB values.
    
    Parameters
    ----------
     - PATH, string: folder where to find .pdf file
     - file, string: .pdf file name (the extension is stripped and '.pdf' re-appended)
     - zooming, int, default 3: intensity of zooming x and y axes
     - min_n_page, int, default 5: forwarded to rotate_docs (pages sampled to estimate the skew)
     - max_angle, float, default 5: forwarded to rotate_docs (largest skew angle accepted)
     
    Returns
    ----------
     - doc_array, list(numpy.array): list of arrays of RGB uint8 for each page
    """
    
    # remove extension of file name
    file_without_extension = '.'.join(file.split('.')[:-1])

    # scaling matrix applied when rendering each page
    image_matrix = fitz.Matrix(zooming, zooming)

    # opening the pdf file document; the context manager closes it once every
    # page has been rendered and copied into a numpy array
    with fitz.open(PATH+file_without_extension+'.pdf') as pdf_doc:

        # getting pixels from each page of the document
        # (snake_case PyMuPDF name, like the get_text() call in xml_to_pdfa)
        pixmaps = [page.get_pixmap(alpha=False, matrix=image_matrix) for page in pdf_doc]

        # converting pixels to an array of RGB (0 to 255) values for each page of the document
        pages = [
            np.array(Image.frombytes('RGB', [pix.width, pix.height], pix.samples))
            for pix in pixmaps
        ]
    
    # rotating pages if needed
    return rotate_docs(pages, min_n_page, max_angle)


@timing
def array_to_ocr_xml(docs, model, thresh=0.05):
    """ Runs a docTR model on the pages of a document and exports the result as xml.
    
    Parameters
    ----------
     - docs, list(numpy.array): list of arrays of RGB uint8 for each page
     - model, ocr_predictor: ocr model from doctr.models
     - thresh, float, default 0.05: words recognised with a confidence below
       this value are replaced by a single white space
     
    Returns
    ----------
     - xml_outputs, list(tuple(string, xml.etree.ElementTree)): xml output from docTR
    """
    
    # prediction on all pages at once
    ocr_result = model(docs)

    # every word of every line of every block of every page
    all_words = (
        word
        for page in ocr_result.pages
        for block in page.blocks
        for line in block.lines
        for word in line.words
    )

    # a trailing white space separates the words once exported;
    # low-confidence words are blanked out
    for word in all_words:
        if word.confidence < thresh:
            word.value = ' '
        else:
            word.value = word.value + ' '
    
    return ocr_result.export_as_xml()

@timing
def xml_to_pdfa(PATH, file, doc_array, xml_outputs):
    """ Converts xml outputs from docTR to searchable PDF/A file.

    One pdf is exported per page (`<name><page index>.pdf`), then all pages are
    merged into `<name>.pdf`. The per-page files are left in PATH.
    
    Parameters
    ----------
     - PATH, string: folder where to save .pdf file
     - file, string: .pdf file name (the extension is stripped)
     - doc_array, list(numpy.array): list of arrays of RGB uint8 for each page
     - xml_outputs, list(tuple(string, xml.etree.ElementTree)): xml output from docTR
     
    Returns
    ----------
     - pdfa_dict, dict: dictionary of int page number keys and string text value;
       empty (and no merged file written) when there is no page to export
    """    
    
    # remove extension of file name
    file_without_extension = '.'.join(file.split('.')[:-1])
    merged_path = f'{PATH+file_without_extension}.pdf'
    
    # init parser
    parser = HocrParser()
    
    # init merged pdf file
    # PyMuPDF (already imported as fitz) does the merge: the PdfFileMerger name
    # used previously is not imported anywhere in this module
    merged = fitz.open()
    try:
        # iterate through the xml outputs and images and export to pdf/a
        # the image is optional else you can set invisible_text=False and the text will be printed on a blank page
        for i, (xml, img) in enumerate(zip(xml_outputs, doc_array)):

            page_path = f'{PATH+file_without_extension+str(i)}.pdf'

            # exporting the page to pdf/a file
            # (xml[1] is the xml.etree.ElementTree.ElementTree object)
            parser.export_pdfa(page_path, hocr=xml[1], image=img)

            # adding the page at the end of the merged pdf/a file
            with fitz.open(page_path) as page_doc:
                merged.insert_pdf(page_doc)

        # a pdf without any page cannot be saved: nothing to write or read back
        if merged.page_count == 0:
            return {}

        # saving merged pdf/a file
        merged.save(merged_path)
    finally:
        merged.close()
    
    # reading the text back from the merged pdf/a file, page by page
    with fitz.open(merged_path) as pdfa_doc:
        pdfa_dict = {k: v.get_text() for k, v in enumerate(pdfa_doc)}
    
    return pdfa_dict