Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds XML export method to DocumentBuilder #544

Merged
merged 15 commits into from
Nov 2, 2021
30 changes: 29 additions & 1 deletion docs/source/using_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,4 +295,32 @@ For reference, here is the JSON export for the same `Document` as above::
]
}
]
}
}

To export the output as XML (hOCR format), you can use the `export_as_xml` method::

xml_output = result.export_as_xml(return_plain=True)

For reference, here is a short snippet of the XML output::
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>docTR - hOCR</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="ocr-system" content="doctr 0.5.0" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
</head>
<body>
<div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
<div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
<p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
<span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
<span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
<span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
<span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
</span>
</p>
</div>
</body>
</html>
27 changes: 22 additions & 5 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@

# This program is licensed under the Apache License version 2.
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
from typing import Any, Dict, List, Optional, Tuple, Union
from xml.dom import minidom
from xml.etree import ElementTree as ET

import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, Dict, List, Any, Optional, Union

from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
from doctr.utils.visualization import visualize_page, synthesize_page
import numpy as np
from doctr.utils.common_types import BoundingBox, RotatedBbox
from doctr.utils.geometry import (resolve_enclosing_bbox,
resolve_enclosing_rbbox)
from doctr.utils.repr import NestedObject
from doctr.utils.visualization import synthesize_page, visualize_page
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']

Expand Down Expand Up @@ -273,8 +275,10 @@ class Document(Element):
def __init__(
    self,
    pages: List[Page],
    hocr_pages: Optional[List[ET.Element]] = None,
) -> None:
    """Create a document from its pages and, optionally, their hOCR trees

    Args:
        pages: list of page elements
        hocr_pages: hOCR root elements to be returned by `export_as_xml`; when omitted,
            the document is created with an empty list of hOCR pages
    """
    super().__init__(pages=pages)
    # `hocr_pages` is optional so that callers which only pass `pages` keep working
    self.hocr_pages: List[ET.Element] = hocr_pages if hocr_pages is not None else []
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

def render(self, page_break: str = '\n\n\n\n') -> str:
"""Renders the full text of the element"""
Expand All @@ -298,6 +302,19 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:

return [page.synthesize() for page in self.pages]

def export_as_xml(
    self,
    return_plain: bool = False,
    **kwargs: Any,
) -> Union[List[bytes], List[ET.ElementTree]]:
    """Export every hOCR page of the document, either serialized or as ElementTree objects

    Args:
        return_plain: if True, each page is serialized to a UTF-8 encoded XML byte string;
            if False, each page is wrapped in an `ElementTree` object
        **kwargs: accepted but not used by this method

    Returns:
        list with one item per entry of `self.hocr_pages`: `bytes` when `return_plain` is True,
        `ElementTree` objects otherwise
    """
    if return_plain:
        return [ET.tostring(hocr_element, encoding='utf-8', method='xml') for hocr_element in self.hocr_pages]
    return [ET.ElementTree(hocr_element) for hocr_element in self.hocr_pages]

@classmethod
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
kwargs = {k: save_dict[k] for k in cls._exported_keys}
Expand Down
83 changes: 76 additions & 7 deletions doctr/models/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.


import numpy as np
from scipy.cluster.hierarchy import fclusterdata
from typing import List, Tuple, Dict
from typing import Dict, List, Tuple
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element, SubElement

from doctr.io.elements import Word, Line, Block, Page, Document
import numpy as np
from doctr.io.elements import Block, Document, Line, Page, Word
from doctr.utils.geometry import (resolve_enclosing_bbox,
resolve_enclosing_rbbox)
from doctr.utils.repr import NestedObject
from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
from scipy.cluster.hierarchy import fclusterdata

__all__ = ['DocumentBuilder']

Expand Down Expand Up @@ -242,6 +245,72 @@ def _build_blocks(self, boxes: np.ndarray, word_preds: List[Tuple[str, float]])

return blocks

def _generate_hocr(self, pages: List["Page"]) -> List[Element]:
    """Generate one standalone hOCR (XHTML) tree per page

    Args:
        pages: list of pages; each one is read through its `dimensions`, `language` and `blocks`

    Returns:
        list of hOCR root (`html`) elements, one per input page and in the same order
    """

    def _bbox(geometry, width: int, height: int) -> str:
        """Format a relative ((xmin, ymin), (xmax, ymax)) box as an absolute hOCR `bbox` property"""
        xmin, ymin, xmax, ymax = [coord for point in geometry for coord in point]
        return f'bbox {int(xmin * width)} {int(ymin * height)} {int(xmax * width)} {int(ymax * height)}'

    hocr_pages: List[Element] = []
    for p_idx, page in enumerate(pages):
        # 1-based page number used in every element id of this page
        page_no = p_idx + 1
        # Element counters restart on every page
        block_count = 1
        line_count = 1
        word_count = 1
        # NOTE(review): unpacking order (width, height) is kept as it was; confirm it matches
        # the convention used by `Page.dimensions`
        width, height = page.dimensions
        # `xml:lang` needs the language code itself, not the whole language mapping;
        # fall back to English when the page carries no usable language information
        lang_info = page.language
        language = 'en'
        if isinstance(lang_info, dict) and lang_info.get('language'):
            language = lang_info['language']

        page_hocr = Element('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
        head = SubElement(page_hocr, 'head')
        SubElement(head, 'title').text = 'docTR - hOCR'
        SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
        SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': 'doctr 0.5.0'})
        SubElement(head, 'meta', attrib={
            'name': 'ocr-capabilities',
            'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word',
        })
        body = SubElement(page_hocr, 'body')
        # NOTE(review): the page div stays empty and the blocks are added to <body> as its
        # siblings (unchanged behaviour); confirm whether they should be nested inside it
        SubElement(body, 'div', attrib={
            'class': 'ocr_page',
            'id': f'page_{page_no}',
            'title': f'image; bbox 0 0 {width} {height}; ppageno {p_idx}',
        })
        for block in page.blocks:
            block_bbox = _bbox(block.geometry, width, height)
            block_div = SubElement(body, 'div', attrib={
                'class': 'ocr_carea',
                'id': f'block_{page_no}_{block_count}',
                'title': block_bbox,
            })
            # Each block holds exactly one paragraph, which shares the block's box and index
            paragraph = SubElement(block_div, 'p', attrib={
                'class': 'ocr_par',
                'id': f'par_{page_no}_{block_count}',
                'title': block_bbox,
            })
            block_count += 1
            for line in block.lines:
                # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0
                line_span = SubElement(paragraph, 'span', attrib={
                    'class': 'ocr_line',
                    'id': f'line_{page_no}_{line_count}',
                    'title': (
                        f'{_bbox(line.geometry, width, height)}; '
                        'baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'
                    ),
                })
                line_count += 1
                for word in line.words:
                    word_span = SubElement(line_span, 'span', attrib={
                        'class': 'ocrx_word',
                        'id': f'word_{page_no}_{word_count}',
                        'title': f'{_bbox(word.geometry, width, height)}; x_wconf {int(word.confidence * 100)}',
                    })
                    word_span.text = word.value
                    word_count += 1
        hocr_pages.append(page_hocr)
    return hocr_pages

def extra_repr(self) -> str:
    """Summarize the builder settings as a comma-separated list of `name=value` pairs"""
    lines_part = f"resolve_lines={self.resolve_lines}"
    blocks_part = f"resolve_blocks={self.resolve_blocks}"
    break_part = f"paragraph_break={self.paragraph_break}"
    return ", ".join((lines_part, blocks_part, break_part))
Expand Down Expand Up @@ -278,5 +347,5 @@ def __call__(
)
for _idx, shape, page_boxes, word_preds in zip(range(len(boxes)), page_shapes, boxes, text_preds)
]

felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved
return Document(_pages)
hocr_pages = self._generate_hocr(_pages)
return Document(_pages, hocr_pages)
2 changes: 1 addition & 1 deletion test/common/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ def test_is_tf_available():


def test_is_torch_available():
assert not doctr.is_torch_available()
assert doctr.is_torch_available()
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved
13 changes: 11 additions & 2 deletions test/common/test_io_elements.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import pytest
from typing import List
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

import numpy as np
import pytest
from doctr.io import elements


Expand Down Expand Up @@ -204,7 +208,8 @@ def test_page():

def test_document():
pages = _mock_pages()
doc = elements.Document(pages)
hocr_page = ET.fromstring('<xml></xml>')
doc = elements.Document(pages, hocr_pages=[hocr_page])

# Attribute checks
assert len(doc.pages) == len(pages)
Expand All @@ -217,6 +222,10 @@ def test_document():
# Export
assert doc.export() == {"pages": [p.export() for p in pages]}

# Export XML
assert isinstance(doc.export_as_xml(return_plain=False), list) and isinstance(
doc.export_as_xml(return_plain=True)[0], (bytes, bytearray))

# Show
doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)

Expand Down