mindee · fg-mindee · Nov 2, 2021 · Oct 26, 2021 · Oct 26, 2021 · Oct 26, 2021
diff --git a/docs/source/using_models.rst b/docs/source/using_models.rst
@@ -295,4 +295,32 @@ For reference, here is the JSON export for the same `Document` as above::
             ]
         }
     ]
-  }
+  }
+
+To export the outpout as XML (hocr-format) you can use the `export_as_xml` method::
+
+  xml_output = result.export_as_xml(return_plain=True)
+
+For reference, here is a short snippet of the XML output::
+
+  <?xml version="1.0" encoding="UTF-8"?>
+  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+    <head>
+      <title>docTR - hOCR</title>
+      <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+      <meta name="ocr-system" content="doctr 0.5.0" />
+      <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
+    </head>
+    <body>
+      <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
+      <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
+        <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
+          <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
+            <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
+            <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
+            <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
+          </span>
+        </p>
+      </div>
+    </body>
+  </html>
diff --git a/doctr/io/elements.py b/doctr/io/elements.py
@@ -2,15 +2,17 @@
 
 # This program is licensed under the Apache License version 2.
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Any, Dict, List, Optional, Tuple, Union
+from xml.dom import minidom
+from xml.etree import ElementTree as ET
 
-import numpy as np
 import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page, synthesize_page
+import numpy as np
 from doctr.utils.common_types import BoundingBox, RotatedBbox
+from doctr.utils.geometry import (resolve_enclosing_bbox,
+                                  resolve_enclosing_rbbox)
 from doctr.utils.repr import NestedObject
+from doctr.utils.visualization import synthesize_page, visualize_page
 
 __all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
 
@@ -273,8 +275,10 @@ class Document(Element):
     def __init__(
         self,
         pages: List[Page],
+        hocr_pages: List[ET.Element]
     ) -> None:
         super().__init__(pages=pages)
+        self.hocr_pages = hocr_pages
 
     def render(self, page_break: str = '\n\n\n\n') -> str:
         """Renders the full text of the element"""
@@ -298,6 +302,19 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:
 
         return [page.synthesize() for page in self.pages]
 
+    def export_as_xml(self, return_plain: bool = False, **kwargs):
+        """Export the document as a list of binary hocr (xml) strings or ElementTree objects
+
+        Args:
+            return_plain: whether to return the plain text or the hocr
+        Returns:
+            list of binary hocr (xml) strings or ElementTree objects
+        """
+        if return_plain:
+            return [ET.tostring(hocr_element, encoding='utf-8', method='xml') for hocr_element in self.hocr_pages]
+        else:
+            return [ET.ElementTree(hocr_element) for hocr_element in self.hocr_pages]
+
     @classmethod
     def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}

diff --git a/doctr/models/builder.py b/doctr/models/builder.py
@@ -4,13 +4,16 @@
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
 
 
-import numpy as np
-from scipy.cluster.hierarchy import fclusterdata
-from typing import List, Tuple, Dict
+from typing import Dict, List, Tuple
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element, SubElement
 
-from doctr.io.elements import Word, Line, Block, Page, Document
+import numpy as np
+from doctr.io.elements import Block, Document, Line, Page, Word
+from doctr.utils.geometry import (resolve_enclosing_bbox,
+                                  resolve_enclosing_rbbox)
 from doctr.utils.repr import NestedObject
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
+from scipy.cluster.hierarchy import fclusterdata
 
 __all__ = ['DocumentBuilder']
 
@@ -242,6 +245,72 @@ def _build_blocks(self, boxes: np.ndarray, word_preds: List[Tuple[str, float]])
 
         return blocks
 
+    def _generate_hocr(self, pages: List[Page]) -> List[Element]:
+        """Generate hOCR format from pages
+
+        Args:
+            pages: list of pages
+
+        Returns:
+            list of hOCR elements
+        """
+        hocr_pages: List[Element] = list()
+        for p_idx, page in enumerate(pages):
+            block_count: int = 1
+            line_count: int = 1
+            word_count: int = 1
+            width, height = page.dimensions
+            language = page.language if 'language' in page.language.keys() else 'en'
+            page_hocr = Element('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
+            head = SubElement(page_hocr, 'head')
+            SubElement(head, 'title').text = 'docTR - hOCR'
+            SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
+            SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': 'doctr 0.5.0'})
+            SubElement(head, 'meta', attrib={'name': 'ocr-capabilities',
+                       'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word'})
+            body = SubElement(page_hocr, 'body')
+            SubElement(body, 'div', attrib={
+                'class': 'ocr_page',
+                'id': f'page_{p_idx + 1}',
+                'title': f'image; bbox 0 0 {width} {height}; ppageno 0'
+            })
+            for block in page.blocks:
+                xmin, ymin, xmax, ymax = [coord for coordinates in block.geometry for coord in coordinates]
+                block_div = SubElement(body, 'div', attrib={
+                    'class': 'ocr_carea',
+                    'id': f'block_1_{block_count}',
+                    'title': f'bbox {int(xmin * width)} {int(ymin * height)} {int(xmax * width)} {int(ymax * height)}'
+                })
+                paragraph = SubElement(block_div, 'p', attrib={
+                    'class': 'ocr_par',
+                    'id': f'par_1_{block_count}',
+                    'title': f'bbox {int(xmin * width)} {int(ymin * height)} {int(xmax * width)} {int(ymax * height)}'
+                })
+                block_count += 1
+                for line in block.lines:
+                    xmin, ymin, xmax, ymax = [coord for coordinates in line.geometry for coord in coordinates]
+                    # NOTE: baseline, x_size, x_descenders, x_ascenders is currently initalized to 0
+                    line_span = SubElement(paragraph, 'span', attrib={
+                        'class': 'ocr_line',
+                        'id': f'line_1_{line_count}',
+                        'title': f'bbox {int(xmin * width)} {int(ymin * height)} {int(xmax * width)} {int(ymax * height)}; \
+                            baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'
+                    })
+                    line_count += 1
+                    for word in line.words:
+                        xmin, ymin, xmax, ymax = [coord for coordinates in word.geometry for coord in coordinates]
+                        conf = word.confidence
+                        word_div = SubElement(line_span, 'span', attrib={
+                            'class': 'ocrx_word',
+                            'id': f'word_1_{word_count}',
+                            'title': f'bbox {int(xmin * width)} {int(ymin * height)} {int(xmax * width)} {int(ymax * height)}; \
+                                x_wconf {int(conf * 100)}'
+                        })
+                        word_div.text = word.value
+                        word_count += 1
+            hocr_pages.append(page_hocr)
+        return hocr_pages
+
     def extra_repr(self) -> str:
         return (f"resolve_lines={self.resolve_lines}, resolve_blocks={self.resolve_blocks}, "
                 f"paragraph_break={self.paragraph_break}")
@@ -278,5 +347,5 @@ def __call__(
             )
             for _idx, shape, page_boxes, word_preds in zip(range(len(boxes)), page_shapes, boxes, text_preds)
         ]
-
-        return Document(_pages)
+        hocr_pages = self._generate_hocr(_pages)
+        return Document(_pages, hocr_pages)
diff --git a/test/common/test_core.py b/test/common/test_core.py
@@ -10,4 +10,4 @@ def test_is_tf_available():
 
 
 def test_is_torch_available():
-    assert not doctr.is_torch_available()
+    assert doctr.is_torch_available()
diff --git a/test/common/test_io_elements.py b/test/common/test_io_elements.py
@@ -1,5 +1,9 @@
-import pytest
+from typing import List
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element
+
 import numpy as np
+import pytest
 from doctr.io import elements
 
 
@@ -204,7 +208,8 @@ def test_page():
 
 def test_document():
     pages = _mock_pages()
-    doc = elements.Document(pages)
+    hocr_page = ET.fromstring('<xml></xml>')
+    doc = elements.Document(pages, hocr_pages=[hocr_page])
 
     # Attribute checks
     assert len(doc.pages) == len(pages)
@@ -217,6 +222,10 @@ def test_document():
     # Export
     assert doc.export() == {"pages": [p.export() for p in pages]}
 
+    # Export XML
+    assert isinstance(doc.export_as_xml(return_plain=False), list) and isinstance(
+        doc.export_as_xml(return_plain=True)[0], (bytes, bytearray))
+
     # Show
     doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)