mindee · fg-mindee · Nov 2, 2021 · Oct 26, 2021 · Oct 26, 2021 · Oct 26, 2021
diff --git a/docs/source/using_models.rst b/docs/source/using_models.rst
@@ -295,4 +295,35 @@ For reference, here is the JSON export for the same `Document` as above::
             ]
         }
     ]
-  }
+  }
+
+To export the output as XML (hOCR format), you can use the `export_as_xml` method::
+
+  xml_output = result.export_as_xml()
+  for output in xml_output:
+    xml_bytes_string = output[0]
+    xml_element = output[1]
+
+For reference, here is a short snippet of the XML output::
+
+  <?xml version="1.0" encoding="UTF-8"?>
+  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+    <head>
+      <title>docTR - XML export (hOCR)</title>
+      <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+      <meta name="ocr-system" content="python-doctr 0.5.0" />
+      <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
+    </head>
+    <body>
+      <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
+      <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
+        <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
+          <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
+            <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
+            <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
+            <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
+          </span>
+        </p>
+      </div>
+    </body>
+  </html>
diff --git a/doctr/io/elements.py b/doctr/io/elements.py
@@ -2,15 +2,18 @@
 
 # This program is licensed under the Apache License version 2.
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Any, Dict, List, Optional, Tuple, Union
+from xml.dom import minidom
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element as ETElement, SubElement
 
-import numpy as np
 import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page, synthesize_page
+import numpy as np
+import doctr
 from doctr.utils.common_types import BoundingBox, RotatedBbox
+from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
 from doctr.utils.repr import NestedObject
+from doctr.utils.visualization import synthesize_page, visualize_page
 
 __all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
 
@@ -253,6 +256,79 @@ def synthesize(self, **kwargs) -> np.ndarray:
 
         return synthesize_page(self.export(), **kwargs)
 
+    def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)', **kwargs) \
+            -> Tuple[bytes, ET.ElementTree]:
+        """Export the page as XML (hOCR format)
+
+        Args:
+            file_title: the title of the XML file
+            **kwargs: accepted for signature compatibility, currently unused
+
+        Returns:
+            a tuple with the XML serialized as UTF-8 bytes and the corresponding ElementTree
+        """
+        # NOTE(review): assumes `dimensions` is ordered (width, height) - confirm against the Page constructor
+        width, height = self.dimensions
+
+        def _bbox(geometry) -> str:
+            # Flatten the nested corner coordinates and scale them by the page size to integer pixels
+            xmin, ymin, xmax, ymax = [coord for coordinates in geometry for coord in coordinates]
+            return f'bbox {int(xmin * width)} {int(ymin * height)} {int(xmax * width)} {int(ymax * height)}'
+
+        # xml:lang is the 'language' entry itself (not the repr of the whole mapping), 'en' when it is missing
+        language = self.language['language'] if 'language' in self.language else 'en'
+        # Create the XML root element
+        page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
+        # Create the header / SubElements of the root element
+        head = SubElement(page_hocr, 'head')
+        SubElement(head, 'title').text = file_title
+        SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
+        SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': f"python-doctr {doctr.__version__}"})
+        SubElement(head, 'meta', attrib={'name': 'ocr-capabilities',
+                                         'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word'})
+        # Create the body
+        body = SubElement(page_hocr, 'body')
+        SubElement(body, 'div', attrib={
+            'class': 'ocr_page',
+            'id': f'page_{self.page_idx + 1}',
+            'title': f'image; bbox 0 0 {width} {height}; ppageno 0'
+        })
+        # Ids are numbered with counters that run over the whole page (no restart per block / line)
+        block_count, line_count, word_count = 1, 1, 1
+        for block in self.blocks:
+            block_bbox = _bbox(block.geometry)
+            block_div = SubElement(body, 'div', attrib={
+                'class': 'ocr_carea',
+                'id': f'block_1_{block_count}',
+                'title': block_bbox
+            })
+            # A single paragraph is emitted per block and reuses the block's bounding box
+            paragraph = SubElement(block_div, 'p', attrib={
+                'class': 'ocr_par',
+                'id': f'par_1_{block_count}',
+                'title': block_bbox
+            })
+            block_count += 1
+            for line in block.lines:
+                # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0
+                line_span = SubElement(paragraph, 'span', attrib={
+                    'class': 'ocr_line',
+                    'id': f'line_1_{line_count}',
+                    'title': f'{_bbox(line.geometry)}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'
+                })
+                line_count += 1
+                for word in line.words:
+                    word_span = SubElement(line_span, 'span', attrib={
+                        'class': 'ocrx_word',
+                        'id': f'word_1_{word_count}',
+                        'title': f'{_bbox(word.geometry)}; x_wconf {int(word.confidence * 100)}'
+                    })
+                    word_span.text = word.value
+                    word_count += 1
+
+        # Both items describe the same root element: its serialization and a tree wrapping it
+        return (ET.tostring(page_hocr, encoding='utf-8', method='xml'), ET.ElementTree(page_hocr))
+
     @classmethod
     def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
@@ -298,6 +374,18 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:
 
         return [page.synthesize() for page in self.pages]
 
+    def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)', **kwargs) \
+            -> List[Tuple[bytes, ET.ElementTree]]:
+        """Export each page as XML, forwarding `file_title` and `**kwargs` to the page's `export_as_xml`
+
+        Args:
+            file_title: the title of the XML file
+
+        Returns:
+            one (bytes, ElementTree) tuple per page, in the order of `self.pages`
+        """
+        return [doc_page.export_as_xml(file_title, **kwargs) for doc_page in self.pages]
+
     @classmethod
     def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}

diff --git a/doctr/models/builder.py b/doctr/models/builder.py
@@ -278,5 +278,4 @@ def __call__(
             )
             for _idx, shape, page_boxes, word_preds in zip(range(len(boxes)), page_shapes, boxes, text_preds)
         ]
-
         return Document(_pages)
diff --git a/examples/generate_pdfa_with_doctr_output.py b/examples/generate_pdfa_with_doctr_output.py
@@ -0,0 +1,176 @@
+#
+# Let's take a look at how to generate a PDF/A document with the doctr output.
+#
+
+import re
+import numpy as np
+from math import atan, cos, sin
+from typing import Optional, Tuple, Dict
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element
+
+from reportlab.lib.colors import black
+from reportlab.lib.units import inch
+from reportlab.pdfgen.canvas import Canvas
+from reportlab.lib.utils import ImageReader
+from PIL import Image
+
+# First define the hOCR (xml) parser
+
+
+class HocrParser:
+    """Reads hOCR (XML) trees and renders them as a PDF text layer with reportlab."""
+
+    def __init__(self):
+        # Patterns for the 'bbox x1 y1 x2 y2' and 'baseline slope intercept' entries of a 'title' attribute
+        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
+        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')
+
+    def _element_coordinates(self, element: Element) -> Dict:
+        """
+        Returns a dict with the corners ('x1', 'y1', 'x2', 'y2') of the bounding box
+        around an element, all set to 0 when the element declares no bbox
+        """
+        out = {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
+        if 'title' in element.attrib:
+            matches = self.box_pattern.search(element.attrib['title'])
+            if matches:
+                coords = matches.group(1).split()
+                out = {'x1': int(coords[0]), 'y1': int(coords[1]), 'x2': int(coords[2]), 'y2': int(coords[3])}
+        return out
+
+    def _get_baseline(self, element: Element) -> Tuple[float, float]:
+        """
+        Returns a tuple containing the baseline slope and intercept,
+        (0.0, 0.0) when the element declares no baseline
+        """
+        if 'title' in element.attrib:
+            # search() returns None when the title has no baseline entry, so test the match before reading it
+            matches = self.baseline_pattern.search(element.attrib['title'])
+            if matches:
+                slope, intercept = matches.group(1).split()
+                return float(slope), float(intercept)
+        return (0.0, 0.0)
+
+    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
+        """
+        Returns the quantity in PDF units (pt) given quantity in pixels
+        """
+        # Convert key by key instead of relying on the order in which the dict yields its values
+        return {key: pxl[key] / dpi * inch for key in ('x1', 'y1', 'x2', 'y2')}
+
+    def _get_element_text(self, element: Element) -> str:
+        """
+        Return the textual content of the element and its children
+        """
+        # itertext() covers the element's text and everything nested in it; its own tail is appended too
+        return ''.join(element.itertext()) + (element.tail or '')
+
+    def export_pdfa(self,
+                    out_filename: str,
+                    hocr: ET.ElementTree,
+                    image: Optional[np.ndarray] = None,
+                    fontname: str = "Times-Roman",
+                    fontsize: int = 12,
+                    invisible_text: bool = True,
+                    dpi: int = 300):
+        """
+        Generates a PDF document with an OCR text layer from a hOCR document.
+
+        Args:
+            out_filename: path of the PDF file to write
+            hocr: hOCR tree to render; the page size is read from its first 'ocr_page' div
+            image: optional page image, drawn over the full page after the text
+            fontname: name of the font used for the text layer
+            fontsize: size of the font used for the text layer
+            invisible_text: whether to use text render mode 3 (invisible text)
+            dpi: resolution used to convert pixel coordinates to PDF points
+
+        Raises:
+            ValueError: if the tree holds no 'ocr_page' div to read the page size from
+        """
+        # Get the page dimensions from the bbox of the first 'ocr_page' div
+        page_div = hocr.find(".//div[@class='ocr_page']")
+        if page_div is None:
+            raise ValueError("Could not determine page size")
+        page_box = self._pt_from_pixel(self._element_coordinates(page_div), dpi)
+        width, height = page_box['x2'] - page_box['x1'], page_box['y2'] - page_box['y1']
+
+        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)
+        # replacement table for unsupported characters, built once for all words
+        ligatures = str.maketrans({'ﬀ': 'ff', 'ﬃ': 'f‌f‌i', 'ﬄ': 'f‌f‌l', 'ﬁ': 'fi', 'ﬂ': 'fl'})
+
+        for line in hocr.iterfind(".//span[@class='ocr_line']"):
+            # get information from xml
+            line_box = self._pt_from_pixel(self._element_coordinates(line), dpi)
+
+            # compute baseline
+            slope, pxl_intercept = self._get_baseline(line)
+            if abs(slope) < 0.005:
+                slope = 0.0
+            angle = atan(slope)
+            cos_a, sin_a = cos(angle), sin(angle)
+            intercept = pxl_intercept / dpi * inch
+            baseline_y2 = height - (line_box['y2'] + intercept)
+
+            # configure options
+            text = pdf.beginText()
+            text.setFont(fontname, fontsize)
+            pdf.setFillColor(black)
+            if invisible_text:
+                text.setTextRenderMode(3)  # invisible text
+
+            # transform overlaid text
+            text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)
+
+            for elem in line.iterfind(".//span[@class='ocrx_word']"):
+                elemtxt = self._get_element_text(elem).strip().translate(ligatures)
+                if not elemtxt:
+                    continue
+
+                # compute string width
+                box = self._pt_from_pixel(self._element_coordinates(elem), dpi)
+                box_width = box['x2'] - box['x1']
+                font_width = pdf.stringWidth(elemtxt, fontname, fontsize)
+
+                # Adjust relative position of cursor
+                cursor = text.getStartOfLine()
+                dx = box['x1'] - cursor[0]
+                dy = baseline_y2 - cursor[1]
+                text.moveCursor(dx, dy)
+
+                # suppress text if it is 0 units wide
+                if font_width > 0:
+                    text.setHorizScale(100 * box_width / font_width)
+                    text.textOut(elemtxt)
+            pdf.drawText(text)
+
+        # overlay image if provided
+        if image is not None:
+            pdf.drawImage(ImageReader(Image.fromarray(image)), 0, 0, width=width, height=height)
+        pdf.save()
+
+
+# Second, run the doctr OCR to get the results
+from doctr.models import ocr_predictor
+from doctr.io import DocumentFile
+
+predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
+
+page_images = DocumentFile.from_images(["1.jpg", "2.jpg"])
+
+ocr_result = predictor(page_images)
+
+# Third, export the results to PDF files whose overlaid text layer is searchable in any PDF viewer
+
+# one tuple per page: the first element is the (bytes) XML string, the second is the ElementTree
+hocr_exports = ocr_result.export_as_xml()
+
+# instantiate the parser defined above
+hocr_parser = HocrParser()
+
+# pair each XML export with its page image and write one PDF per page
+# the image is optional: without it, set invisible_text=False to print the text on a blank page
+for page_number, (page_export, page_image) in enumerate(zip(hocr_exports, page_images)):
+    _, hocr_tree = page_export
+    hocr_parser.export_pdfa(f'{page_number}.pdf', hocr=hocr_tree, image=page_image)
diff --git a/test/common/test_io_elements.py b/test/common/test_io_elements.py
@@ -1,5 +1,5 @@
-import pytest
 import numpy as np
+import pytest
 from doctr.io import elements
 
 
@@ -190,6 +190,10 @@ def test_page():
     assert page.export() == {"blocks": [b.export() for b in blocks], "page_idx": page_idx, "dimensions": page_size,
                              "orientation": orientation, "language": language}
 
+    # Export XML
+    assert isinstance(page.export_as_xml(), tuple) and isinstance(
+        page.export_as_xml()[0], (bytes, bytearray))
+
     # Repr
     assert '\n'.join(repr(page).split('\n')[:2]) == f'Page(\n  dimensions={repr(page_size)}'
 
@@ -217,6 +221,9 @@ def test_document():
     # Export
     assert doc.export() == {"pages": [p.export() for p in pages]}
 
+    # Export XML
+    assert isinstance(doc.export_as_xml(), list)
+
     # Show
     doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)