mindee · fg-mindee · Nov 2, 2021 · Oct 26, 2021 · Oct 26, 2021 · Oct 26, 2021
diff --git a/docs/source/using_models.rst b/docs/source/using_models.rst
@@ -295,4 +295,32 @@ For reference, here is the JSON export for the same `Document` as above::
             ]
         }
     ]
-  }
+  }
+
+To export the output as XML (hOCR format), you can use the `export_as_xml` method::
+
+  xml_output = result.export_as_xml(return_plain=True)
+
+For reference, here is a short snippet of the XML output::
+
+  <?xml version="1.0" encoding="UTF-8"?>
+  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+    <head>
+      <title>docTR - hOCR</title>
+      <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+      <meta name="ocr-system" content="doctr 0.5.0" />
+      <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
+    </head>
+    <body>
+      <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
+      <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
+        <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
+          <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
+            <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
+            <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
+            <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
+          </span>
+        </p>
+      </div>
+    </body>
+  </html>
diff --git a/doctr/io/elements.py b/doctr/io/elements.py
@@ -2,15 +2,17 @@
 
 # This program is licensed under the Apache License version 2.
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Any, Dict, List, Optional, Tuple, Union
+from xml.dom import minidom
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element as ETElement, SubElement
 
-import numpy as np
 import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page, synthesize_page
+import numpy as np
 from doctr.utils.common_types import BoundingBox, RotatedBbox
+from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
 from doctr.utils.repr import NestedObject
+from doctr.utils.visualization import synthesize_page, visualize_page
 
 __all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
 
@@ -253,6 +255,73 @@ def synthesize(self, **kwargs) -> np.ndarray:
 
         return synthesize_page(self.export(), **kwargs)
 
+    def export_as_xml(self, return_plain: bool = False, **kwargs) -> Union[bytes, ET.ElementTree]:
+        """Export the page as XML (hOCR format)
+
+        Args:
+            return_plain: whether to return the plain (bytes) XML string or an ElementTree object
+            **kwargs: additional arguments to pass to the exporter (currently unused by this method)
+
+        Returns:
+            the XML as UTF-8 encoded bytes if `return_plain` is True, an ElementTree object otherwise
+        """
+        p_idx = self.page_idx
+        block_count: int = 1
+        line_count: int = 1
+        word_count: int = 1
+        # NOTE(review): confirm that `self.dimensions` is ordered (width, height) and not (height, width)
+        width, height = self.dimensions
+        language = self.language if 'language' in self.language.keys() else 'en'
+        def _bbox(geometry) -> str:
+            # Flatten the pairs of coordinates and scale them by the page dimensions
+            xmin, ymin, xmax, ymax = [coord for coordinates in geometry for coord in coordinates]
+            return f'bbox {int(xmin * width)} {int(ymin * height)} {int(xmax * width)} {int(ymax * height)}'
+        page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
+        head = SubElement(page_hocr, 'head')
+        SubElement(head, 'title').text = 'docTR - hOCR'
+        SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
+        SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': 'doctr 0.5.0'})
+        SubElement(head, 'meta', attrib={'name': 'ocr-capabilities',
+                                         'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word'})
+        body = SubElement(page_hocr, 'body')
+        SubElement(body, 'div', attrib={
+            'class': 'ocr_page',
+            'id': f'page_{p_idx + 1}',
+            'title': f'image; bbox 0 0 {width} {height}; ppageno 0'
+        })
+        for block in self.blocks:
+            block_div = SubElement(body, 'div', attrib={
+                'class': 'ocr_carea',
+                'id': f'block_1_{block_count}',
+                'title': _bbox(block.geometry)
+            })
+            paragraph = SubElement(block_div, 'p', attrib={
+                'class': 'ocr_par',
+                'id': f'par_1_{block_count}',
+                'title': _bbox(block.geometry)
+            })
+            block_count += 1
+            for line in block.lines:
+                # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0
+                line_span = SubElement(paragraph, 'span', attrib={
+                    'class': 'ocr_line',
+                    'id': f'line_1_{line_count}',
+                    # Single literal: a backslash continuation inside the string would embed the indentation
+                    'title': f'{_bbox(line.geometry)}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'
+                })
+                line_count += 1
+                for word in line.words:
+                    word_span = SubElement(line_span, 'span', attrib={
+                        'class': 'ocrx_word',
+                        'id': f'word_1_{word_count}',
+                        'title': f'{_bbox(word.geometry)}; x_wconf {int(word.confidence * 100)}'
+                    })
+                    word_span.text = word.value
+                    word_count += 1
+        if return_plain:
+            return ET.tostring(page_hocr, encoding='utf-8', method='xml')
+        return ET.ElementTree(page_hocr)
+
     @classmethod
     def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
@@ -298,6 +367,17 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:
 
         return [page.synthesize() for page in self.pages]
 
+    def export_as_xml(self, return_plain: bool = False, **kwargs) -> List[Union[bytes, ET.ElementTree]]:
+        """Export each page of the document as XML (hOCR format)
+
+        Args:
+            return_plain: whether to return the plain (bytes) XML strings or ElementTree objects
+            **kwargs: additional arguments passed to the `export_as_xml` method of each page
+        Returns:
+            list of XML (hOCR format) elements, one per page
+        """
+        return [doc_page.export_as_xml(return_plain=return_plain, **kwargs) for doc_page in self.pages]
+
     @classmethod
     def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}

diff --git a/doctr/models/builder.py b/doctr/models/builder.py
@@ -278,5 +278,4 @@ def __call__(
             )
             for _idx, shape, page_boxes, word_preds in zip(range(len(boxes)), page_shapes, boxes, text_preds)
         ]
-
         return Document(_pages)
diff --git a/test/common/test_io_elements.py b/test/common/test_io_elements.py
@@ -1,5 +1,9 @@
-import pytest
+from typing import List
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element
+
 import numpy as np
+import pytest
 from doctr.io import elements
 
 
@@ -217,6 +221,10 @@ def test_document():
     # Export
     assert doc.export() == {"pages": [p.export() for p in pages]}
 
+    # Export XML
+    assert isinstance(doc.export_as_xml(return_plain=False), list) and isinstance(
+        doc.export_as_xml(return_plain=True)[0], (bytes, bytearray))
+
     # Show
     doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)