Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds XML export method to DocumentBuilder #544

Merged
merged 15 commits into from
Nov 2, 2021
33 changes: 32 additions & 1 deletion docs/source/using_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,4 +295,35 @@ For reference, here is the JSON export for the same `Document` as above::
]
}
]
}
}

To export the output as XML (hOCR format), you can use the `export_as_xml` method::

xml_output = result.export_as_xml()
for output in xml_output:
xml_bytes_string = output[0]
xml_element = output[1]

For reference, here is a short snippet of the XML output::
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>docTR - hOCR</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="ocr-system" content="doctr 0.5.0" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
</head>
<body>
<div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
<div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
<p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
<span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
<span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
<span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
<span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
</span>
</p>
</div>
</body>
</html>
98 changes: 93 additions & 5 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@

# This program is licensed under the Apache License version 2.
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
from typing import Any, Dict, List, Optional, Tuple, Union
from xml.dom import minidom
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element as ETElement, SubElement

import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, Dict, List, Any, Optional, Union

from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
from doctr.utils.visualization import visualize_page, synthesize_page
import numpy as np
import doctr
from doctr.utils.common_types import BoundingBox, RotatedBbox
from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
from doctr.utils.repr import NestedObject
from doctr.utils.visualization import synthesize_page, visualize_page
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']

Expand Down Expand Up @@ -253,6 +256,79 @@ def synthesize(self, **kwargs) -> np.ndarray:

return synthesize_page(self.export(), **kwargs)

def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)', **kwargs) \
        -> Tuple[bytes, ET.ElementTree]:
    """Export the page as an hOCR-style XML document

    Args:
        file_title: text written into the `<title>` element of the XML head
        **kwargs: accepted for interface compatibility; not used by this exporter

    Returns:
        a tuple of (the XML document serialized as UTF-8 encoded bytes, the same document as an ElementTree)
    """
    p_idx = self.page_idx
    # NOTE(review): this unpacks `self.dimensions` as (width, height); confirm against the attribute's
    # documented order, otherwise every bbox below is transposed.
    width, height = self.dimensions

    def _bbox(geometry) -> str:
        # Flatten ((xmin, ymin), (xmax, ymax)) and scale each coordinate by the page size.
        # Presumably the geometry is expressed relative to the page - TODO confirm.
        xmin, ymin, xmax, ymax = [coord for point in geometry for coord in point]
        return f'bbox {int(xmin * width)} {int(ymin * height)} {int(xmax * width)} {int(ymax * height)}'

    # Use the entry stored under 'language' (not the whole mapping) for `xml:lang`; default to English.
    # NOTE(review): confirm which key of `self.language` actually holds the language code.
    lang_info = self.language
    language = lang_info['language'] if 'language' in lang_info else 'en'

    # Create the XML root element
    page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
    # Create the header / SubElements of the root element
    head = SubElement(page_hocr, 'head')
    SubElement(head, 'title').text = file_title
    SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
    SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': f"python-doctr {doctr.__version__}"})
    SubElement(head, 'meta', attrib={'name': 'ocr-capabilities',
                                     'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word'})
    # Create the body
    body = SubElement(page_hocr, 'body')
    SubElement(body, 'div', attrib={
        'class': 'ocr_page',
        'id': f'page_{p_idx + 1}',
        'title': f'image; bbox 0 0 {width} {height}; ppageno 0'
    })

    # Counters run over the whole page so that ids stay unique across blocks.
    # NOTE(review): the page component of the block/par/line/word ids is hard-coded to 1 (as is `ppageno 0`)
    # rather than derived from `p_idx`; kept unchanged here.
    block_count: int = 1
    line_count: int = 1
    word_count: int = 1
    # iterate over the blocks / lines / words and create the XML elements in body with their attributes
    for block in self.blocks:
        block_bbox = _bbox(block.geometry)
        block_div = SubElement(body, 'div', attrib={
            'class': 'ocr_carea',
            'id': f'block_1_{block_count}',
            'title': block_bbox
        })
        # Each block holds exactly one paragraph sharing the block's bounding box
        paragraph = SubElement(block_div, 'p', attrib={
            'class': 'ocr_par',
            'id': f'par_1_{block_count}',
            'title': block_bbox
        })
        block_count += 1
        for line in block.lines:
            # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0
            line_span = SubElement(paragraph, 'span', attrib={
                'class': 'ocr_line',
                'id': f'line_1_{line_count}',
                'title': f'{_bbox(line.geometry)}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'
            })
            line_count += 1
            for word in line.words:
                word_span = SubElement(line_span, 'span', attrib={
                    'class': 'ocrx_word',
                    'id': f'word_1_{word_count}',
                    'title': f'{_bbox(word.geometry)}; x_wconf {int(word.confidence * 100)}'
                })
                # set the text
                word_span.text = word.value
                word_count += 1

    return (ET.tostring(page_hocr, encoding='utf-8', method='xml'), ET.ElementTree(page_hocr))

@classmethod
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
kwargs = {k: save_dict[k] for k in cls._exported_keys}
Expand Down Expand Up @@ -298,6 +374,18 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:

return [page.synthesize() for page in self.pages]

def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)', **kwargs) \
        -> List[Tuple[bytes, ET.ElementTree]]:
    """Export every page of the document as XML

    Args:
        file_title: the title of the XML file, forwarded to each page export
        **kwargs: additional keyword arguments forwarded unchanged to each page export

    Returns:
        list with one (bytes, ElementTree) tuple per page, in the order of `self.pages`
    """
    return [page.export_as_xml(file_title, **kwargs) for page in self.pages]

@classmethod
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
kwargs = {k: save_dict[k] for k in cls._exported_keys}
Expand Down
1 change: 0 additions & 1 deletion doctr/models/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,5 +278,4 @@ def __call__(
)
for _idx, shape, page_boxes, word_preds in zip(range(len(boxes)), page_shapes, boxes, text_preds)
]

felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved
return Document(_pages)
176 changes: 176 additions & 0 deletions examples/generate_pdfa_with_doctr_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#
# Let's take a look at how to generate a PDF/A document from the docTR output.
#

felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved
import re
import numpy as np
from math import atan, cos, sin
from typing import Optional, Tuple, Dict
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element

from reportlab.lib.colors import black
from reportlab.lib.units import inch
from reportlab.pdfgen.canvas import Canvas
from reportlab.lib.utils import ImageReader
from PIL import Image

# First define the hOCR (xml) parser


class HocrParser():
    """Read hOCR (XML) trees and render them to a PDF with a positioned text layer."""

    # Translation table expanding typographic ligature code points into plain letters.
    # `str.maketrans` only accepts single-character string keys, so the ligatures are spelled out as
    # escapes. The zero-width non-joiners (U+200C) in the ffi / ffl replacements are kept from the
    # original table.
    _LIGATURES = str.maketrans({
        '\ufb00': 'ff',
        '\ufb03': 'f\u200cf\u200ci',
        '\ufb04': 'f\u200cf\u200cl',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
    })

    def __init__(self):
        # Patterns applied to the hOCR `title` attribute, e.g. "bbox 0 0 10 10; baseline 0 0"
        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')

    def _element_coordinates(self, element: Element) -> Dict:
        """
        Returns a dict with keys 'x1', 'y1', 'x2', 'y2' holding the bounding box found in the
        element's `title` attribute; all values are 0 when no bbox is present
        """
        out = {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
        matches = self.box_pattern.search(element.attrib.get('title', ''))
        if matches:
            x1, y1, x2, y2 = (int(coord) for coord in matches.group(1).split())
            out = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
        return out

    def _get_baseline(self, element: Element) -> Tuple[float, float]:
        """
        Returns a tuple containing the baseline slope and intercept, or (0.0, 0.0) when the
        element's `title` attribute is missing or carries no baseline entry
        """
        matches = self.baseline_pattern.search(element.attrib.get('title', ''))
        if matches is None:
            # Titles without a baseline (or elements without a title) fall back to a flat baseline
            return (0.0, 0.0)
        slope, intercept = matches.group(1).split()
        return float(slope), float(intercept)

    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
        """
        Returns the quantity in PDF units (pt) given quantity in pixels
        """
        # Look the corners up by key so the result does not depend on the dict's insertion order
        return {key: pxl[key] / dpi * inch for key in ('x1', 'y1', 'x2', 'y2')}

    def _get_element_text(self, element: Element) -> str:
        """
        Return the textual content of the element and its children (including tails)
        """
        parts = [element.text or '']
        parts.extend(self._get_element_text(child) for child in element)
        parts.append(element.tail or '')
        return ''.join(parts)

    def export_pdfa(self,
                    out_filename: str,
                    hocr: ET.ElementTree,
                    image: Optional[np.ndarray] = None,
                    fontname: str = "Times-Roman",
                    fontsize: int = 12,
                    invisible_text: bool = True,
                    dpi: int = 300) -> None:
        """
        Generates a PDF/A document from a hOCR document.

        Args:
            out_filename: path of the PDF file to write
            hocr: parsed hOCR tree; its first `ocr_page` div provides the page size
            image: optional page image drawn over the full page
            fontname: font used for the text layer
            fontsize: font size used for the text layer
            invisible_text: whether the text layer is rendered invisibly
            dpi: resolution used to convert the hOCR pixel coordinates to PDF points

        Raises:
            ValueError: if the tree contains no `ocr_page` div to take the page size from
        """
        # The page size comes from the first ocr_page element only
        page_div = hocr.find(".//div[@class='ocr_page']")
        if page_div is None:
            raise ValueError("Could not determine page size")
        page_box = self._pt_from_pixel(self._element_coordinates(page_div), dpi)
        width, height = page_box['x2'] - page_box['x1'], page_box['y2'] - page_box['y1']

        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)

        for line in hocr.iterfind(".//span[@class='ocr_line']"):
            # get information from xml
            line_box = self._pt_from_pixel(self._element_coordinates(line), dpi)

            # compute baseline; near-zero slopes are treated as perfectly horizontal
            slope, pxl_intercept = self._get_baseline(line)
            if abs(slope) < 0.005:
                slope = 0.0
            angle = atan(slope)
            cos_a, sin_a = cos(angle), sin(angle)
            intercept = pxl_intercept / dpi * inch
            # hOCR's y axis grows downwards, hence the subtraction from the page height
            baseline_y2 = height - (line_box['y2'] + intercept)

            # configure options
            text = pdf.beginText()
            text.setFont(fontname, fontsize)
            pdf.setFillColor(black)
            if invisible_text:
                text.setTextRenderMode(3)  # invisible text

            # transform overlayed text
            text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)

            for elem in line.findall(".//span[@class='ocrx_word']"):
                # replace unsupported characters
                elemtxt = self._get_element_text(elem).strip().translate(self._LIGATURES)
                if not elemtxt:
                    continue

                # compute string width
                box = self._pt_from_pixel(self._element_coordinates(elem), dpi)
                box_width = box['x2'] - box['x1']
                font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

                # Adjust relative position of cursor
                cursor = text.getStartOfLine()
                text.moveCursor(box['x1'] - cursor[0], baseline_y2 - cursor[1])

                # suppress text if it is 0 units wide
                if font_width > 0:
                    # stretch the word horizontally so it spans its recognized box
                    text.setHorizScale(100 * box_width / font_width)
                    text.textOut(elemtxt)
            pdf.drawText(text)

        # overlay image if provided
        if image is not None:
            pdf.drawImage(ImageReader(Image.fromarray(image)), 0, 0, width=width, height=height)
        pdf.save()


# Second, run the docTR OCR pipeline on the input images
from doctr.models import ocr_predictor
from doctr.io import DocumentFile

model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
img_docs = DocumentFile.from_images(["1.jpg", "2.jpg"])
result = model(img_docs)

# Third, export the results to PDF/A with an overlaid text layer, which makes the file searchable in a PDF viewer

# one (bytes XML string, ElementTree) tuple per page
xml_outputs = result.export_as_xml()

# instantiate the parser defined above
parser = HocrParser()

# Walk the XML outputs and the page images in lockstep and write one PDF per page.
# The image is optional: alternatively set invisible_text=False to print the text on a blank page.
for i, ((_, xml_element_tree), img) in enumerate(zip(xml_outputs, img_docs)):
    parser.export_pdfa(f'{i}.pdf', hocr=xml_element_tree, image=img)
9 changes: 8 additions & 1 deletion test/common/test_io_elements.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
import numpy as np
import pytest
from doctr.io import elements


Expand Down Expand Up @@ -190,6 +190,10 @@ def test_page():
assert page.export() == {"blocks": [b.export() for b in blocks], "page_idx": page_idx, "dimensions": page_size,
"orientation": orientation, "language": language}

# Export XML
assert isinstance(page.export_as_xml(), tuple) and isinstance(
page.export_as_xml()[0], (bytes, bytearray))
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

# Repr
assert '\n'.join(repr(page).split('\n')[:2]) == f'Page(\n dimensions={repr(page_size)}'

Expand Down Expand Up @@ -217,6 +221,9 @@ def test_document():
# Export
assert doc.export() == {"pages": [p.export() for p in pages]}

# Export XML
assert isinstance(doc.export_as_xml(), list)
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

# Show
doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)

Expand Down