Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds XML export method to DocumentBuilder #544

Merged
merged 15 commits into from
Nov 2, 2021
33 changes: 32 additions & 1 deletion docs/source/using_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,4 +295,35 @@ For reference, here is the JSON export for the same `Document` as above::
]
}
]
}
}

To export the output as XML (hOCR format), you can use the `export_as_xml` method::

xml_output = result.export_as_xml()
for output in xml_output:
xml_bytes_string = output[0]
xml_element = output[1]

For reference, here is a sample XML byte string output::

<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>docTR - hOCR</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="ocr-system" content="doctr 0.5.0" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
</head>
<body>
<div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
<div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
<p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
<span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
<span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
<span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
<span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
</span>
</p>
</div>
</body>
</html>
92 changes: 92 additions & 0 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
from doctr.utils.common_types import BoundingBox, RotatedBbox
from doctr.utils.repr import NestedObject

import doctr
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element as ETElement, SubElement

__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']


Expand Down Expand Up @@ -253,6 +257,83 @@ def synthesize(self, **kwargs) -> np.ndarray:

return synthesize_page(self.export(), **kwargs)

def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)') -> Tuple[bytes, ET.ElementTree]:
    """Export the page as XML (hOCR-format)
    convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md

    Args:
        file_title: the title of the XML file

    Returns:
        a tuple of the XML byte string, and its ElementTree

    Raises:
        TypeError: if the geometry of a block does not have exactly 2 entries
            (i.e. it is not a straight ((xmin, ymin), (xmax, ymax)) bounding box)
    """
    width, height = self.dimensions

    def _bbox(geometry) -> str:
        # Multiply the box corners by the page dimensions and round them to integers.
        # The string is assembled from adjacent literals (no backslash continuation inside
        # the literal), so the attribute value never picks up source-code indentation.
        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
        return (
            f'bbox {int(round(xmin * width))} {int(round(ymin * height))} '
            f'{int(round(xmax * width))} {int(round(ymax * height))}'
        )

    # Use the value stored under the 'language' key when there is one, otherwise default to 'en'
    # (the whole mapping must not be stringified into `xml:lang`).
    # NOTE(review): confirm the key under which the language code is stored in `self.language`.
    language = self.language['language'] if 'language' in self.language.keys() else 'en'

    # Create the XML root element
    page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})

    # Create the header / SubElements of the root element
    head = SubElement(page_hocr, 'head')
    SubElement(head, 'title').text = file_title
    SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
    SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': f"python-doctr {doctr.__version__}"})
    SubElement(head, 'meta', attrib={
        'name': 'ocr-capabilities',
        'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word',
    })

    # Create the body
    body = SubElement(page_hocr, 'body')
    # NOTE: the ocr_page div is emitted as an empty element; the blocks below are appended
    # to the body next to it, not nested inside it
    SubElement(body, 'div', attrib={
        'class': 'ocr_page',
        'id': f'page_{self.page_idx + 1}',
        'title': f'image; bbox 0 0 {width} {height}; ppageno 0',
    })

    # Line and word ids keep counting across blocks, so they are unique within the page
    line_count = 1
    word_count = 1

    # iterate over the blocks / lines / words and create the XML elements in body with their attributes
    for block_count, block in enumerate(self.blocks, start=1):
        if len(block.geometry) != 2:
            raise TypeError("XML export is only available for straight bounding boxes for now.")
        # The block area and its single paragraph share the same bounding box
        block_title = _bbox(block.geometry)
        block_div = SubElement(body, 'div', attrib={
            'class': 'ocr_carea',
            'id': f'block_{block_count}',
            'title': block_title,
        })
        paragraph = SubElement(block_div, 'p', attrib={
            'class': 'ocr_par',
            'id': f'par_{block_count}',
            'title': block_title,
        })
        for line in block.lines:
            # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0
            line_span = SubElement(paragraph, 'span', attrib={
                'class': 'ocr_line',
                'id': f'line_{line_count}',
                'title': f'{_bbox(line.geometry)}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0',
            })
            line_count += 1
            for word in line.words:
                # The word confidence is exported as an integer percentage
                word_span = SubElement(line_span, 'span', attrib={
                    'class': 'ocrx_word',
                    'id': f'word_{word_count}',
                    'title': f'{_bbox(word.geometry)}; x_wconf {int(round(word.confidence * 100))}',
                })
                # set the text
                word_span.text = word.value
                word_count += 1

    return (ET.tostring(page_hocr, encoding='utf-8', method='xml'), ET.ElementTree(page_hocr))

@classmethod
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
kwargs = {k: save_dict[k] for k in cls._exported_keys}
Expand Down Expand Up @@ -298,6 +379,17 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:

return [page.synthesize() for page in self.pages]

def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
    """Export each page of the document as XML (hOCR-format)

    Args:
        **kwargs: keyword arguments forwarded unchanged to the `export_as_xml` method of every page

    Returns:
        a list holding the result of each page export, in the order of `self.pages`
    """
    xml_pages = []
    for current_page in self.pages:
        xml_pages.append(current_page.export_as_xml(**kwargs))
    return xml_pages

@classmethod
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
kwargs = {k: save_dict[k] for k in cls._exported_keys}
Expand Down
10 changes: 9 additions & 1 deletion test/common/test_io_elements.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
import numpy as np
import pytest
from doctr.io import elements
from xml.etree.ElementTree import ElementTree


def _mock_words(size=(1., 1.), offset=(0, 0), confidence=0.9):
Expand Down Expand Up @@ -190,6 +191,10 @@ def test_page():
assert page.export() == {"blocks": [b.export() for b in blocks], "page_idx": page_idx, "dimensions": page_size,
"orientation": orientation, "language": language}

# Export XML
assert isinstance(page.export_as_xml(), tuple) and isinstance(
page.export_as_xml()[0], (bytes, bytearray)) and isinstance(page.export_as_xml()[1], ElementTree)

# Repr
assert '\n'.join(repr(page).split('\n')[:2]) == f'Page(\n dimensions={repr(page_size)}'

Expand Down Expand Up @@ -217,6 +222,9 @@ def test_document():
# Export
assert doc.export() == {"pages": [p.export() for p in pages]}

# Export XML
assert isinstance(doc.export_as_xml(), list) and len(doc.export_as_xml()) == len(pages)

# Show
doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)

Expand Down