Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds XML export method to DocumentBuilder #544

Merged
merged 15 commits into from
Nov 2, 2021
30 changes: 29 additions & 1 deletion docs/source/using_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,4 +295,32 @@ For reference, here is the JSON export for the same `Document` as above::
]
}
]
}
}

To export the output as XML (hOCR format), you can use the `export_as_xml` method::

xml_output = result.export_as_xml(return_plain=True)

For reference, here is a short snippet of the XML output::
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>docTR - hOCR</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="ocr-system" content="doctr 0.5.0" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
</head>
<body>
<div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
<div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
<p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
<span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
<span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
<span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
<span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
</span>
</p>
</div>
</body>
</html>
90 changes: 85 additions & 5 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@

# This program is licensed under the Apache License version 2.
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
from typing import Any, Dict, List, Optional, Tuple, Union
from xml.dom import minidom
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element as ETElement, SubElement

import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, Dict, List, Any, Optional, Union

from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
from doctr.utils.visualization import visualize_page, synthesize_page
import numpy as np
from doctr.utils.common_types import BoundingBox, RotatedBbox
from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
from doctr.utils.repr import NestedObject
from doctr.utils.visualization import synthesize_page, visualize_page

__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']

Expand Down Expand Up @@ -253,6 +255,73 @@ def synthesize(self, **kwargs) -> np.ndarray:

return synthesize_page(self.export(), **kwargs)

def export_as_xml(self, return_plain: bool = False, **kwargs) -> Union[bytes, ET.ElementTree]:
    """Export the page as XML (hOCR format)

    Args:
        return_plain: whether to return the plain (bytes) XML string or an ElementTree object
        **kwargs: additional arguments to pass to the exporter (accepted, but not used by this method)

    Returns:
        the UTF-8 encoded XML (bytes) if `return_plain` is True, otherwise an ElementTree
        whose root is the `html` element
    """
    p_idx = self.page_idx
    # NOTE(review): assumes `self.dimensions` is ordered (width, height) - confirm against the caller
    width, height = self.dimensions
    # Use the value stored under the 'language' key (falling back to 'en') rather than
    # stringifying the whole `self.language` mapping into the `xml:lang` attribute.
    language = self.language.get('language', 'en')

    def _bbox(geometry) -> str:
        """Format a geometry as an hOCR `bbox` property scaled by the page dimensions."""
        # The geometry is flattened into four coordinates, each multiplied by the page
        # width / height and truncated to an integer.
        xmin, ymin, xmax, ymax = (coord for point in geometry for coord in point)
        return f'bbox {int(xmin * width)} {int(ymin * height)} {int(xmax * width)} {int(ymax * height)}'

    # Document skeleton: <html><head>...</head><body>...</body></html>
    page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
    head = SubElement(page_hocr, 'head')
    SubElement(head, 'title').text = 'docTR - hOCR'
    SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
    SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': 'doctr 0.5.0'})
    SubElement(head, 'meta', attrib={
        'name': 'ocr-capabilities',
        'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word',
    })
    body = SubElement(page_hocr, 'body')
    # The page element is emitted empty and the blocks below are attached to <body> as its siblings.
    # NOTE(review): hOCR nests ocr_carea inside ocr_page; kept as-is to preserve the current
    # output - confirm whether consumers need the nesting.
    SubElement(body, 'div', attrib={
        'class': 'ocr_page',
        'id': f'page_{p_idx + 1}',
        'title': f'image; bbox 0 0 {width} {height}; ppageno 0',
    })

    # Line and word ids keep counting across blocks, so they are unique within the page.
    line_count = 1
    word_count = 1
    for block_count, block in enumerate(self.blocks, start=1):
        block_title = _bbox(block.geometry)
        block_div = SubElement(body, 'div', attrib={
            'class': 'ocr_carea',
            'id': f'block_1_{block_count}',
            'title': block_title,
        })
        # Each block holds exactly one paragraph, which reuses the block's bounding box.
        paragraph = SubElement(block_div, 'p', attrib={
            'class': 'ocr_par',
            'id': f'par_1_{block_count}',
            'title': block_title,
        })
        for line in block.lines:
            # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0
            line_span = SubElement(paragraph, 'span', attrib={
                'class': 'ocr_line',
                'id': f'line_1_{line_count}',
                'title': f'{_bbox(line.geometry)}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0',
            })
            line_count += 1
            for word in line.words:
                word_span = SubElement(line_span, 'span', attrib={
                    'class': 'ocrx_word',
                    'id': f'word_1_{word_count}',
                    # The confidence is written as a truncated integer percentage.
                    'title': f'{_bbox(word.geometry)}; x_wconf {int(word.confidence * 100)}',
                })
                word_span.text = word.value
                word_count += 1

    if return_plain:
        return ET.tostring(page_hocr, encoding='utf-8', method='xml')
    return ET.ElementTree(page_hocr)
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

@classmethod
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
kwargs = {k: save_dict[k] for k in cls._exported_keys}
Expand Down Expand Up @@ -298,6 +367,17 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:

return [page.synthesize() for page in self.pages]

def export_as_xml(self, return_plain: bool = False, **kwargs) -> List[Union[bytes, ET.ElementTree]]:
    """Export every page of the document as XML

    Args:
        return_plain: whether each page is returned as a plain (bytes) XML string or as an ElementTree object
        **kwargs: additional arguments forwarded unchanged to each page's `export_as_xml`

    Returns:
        list with one XML (hOCR format) export per page, in the order of `self.pages`
    """
    # Delegate to each page in turn; the export itself is done by the page's own `export_as_xml`.
    pages_xml: List[Union[bytes, ET.ElementTree]] = [
        current_page.export_as_xml(return_plain, **kwargs) for current_page in self.pages
    ]
    return pages_xml

@classmethod
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
kwargs = {k: save_dict[k] for k in cls._exported_keys}
Expand Down
1 change: 0 additions & 1 deletion doctr/models/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,5 +278,4 @@ def __call__(
)
for _idx, shape, page_boxes, word_preds in zip(range(len(boxes)), page_shapes, boxes, text_preds)
]

felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved
return Document(_pages)
10 changes: 9 additions & 1 deletion test/common/test_io_elements.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import pytest
from typing import List
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

import numpy as np
import pytest
from doctr.io import elements


Expand Down Expand Up @@ -217,6 +221,10 @@ def test_document():
# Export
assert doc.export() == {"pages": [p.export() for p in pages]}

# Export XML
assert isinstance(doc.export_as_xml(return_plain=False), list) and isinstance(
doc.export_as_xml(return_plain=True)[0], (bytes, bytearray))

# Show
doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)

Expand Down