From e6e34251c6a7d103d0b5fc44cc3614631d781c6f Mon Sep 17 00:00:00 2001 From: Tucker Barbour Date: Thu, 1 Mar 2018 13:23:14 +0000 Subject: [PATCH 1/4] Add option to explicitly add interword spaces to HOCR pdf-renderer This commit includes an optional workaround for limitations of the PDF.js viewer described in https://github.com/jbarlow83/OCRmyPDF/issues/133. Here we explicitly add an additional space to text elements before drawing them on the PDF canvas when using the HOCR renderer. This option does not apply to other pdf renderers in OCRmyPDF and is turned off by default. --- ocrmypdf/__main__.py | 9 ++++++++- ocrmypdf/hocrtransform.py | 14 +++++++++++--- ocrmypdf/pipeline.py | 8 ++++---- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py index 16b7f662f..6af7af480 100755 --- a/ocrmypdf/__main__.py +++ b/ocrmypdf/__main__.py @@ -304,6 +304,9 @@ def complain(message): advanced.add_argument( '--user-patterns', metavar='FILE', help="Specify the location of the Tesseract user patterns file.") +advanced.add_argument( + '--interword-spaces', action='store_true', + help="Add spaces between words with HOCR transformation.") debugging = parser.add_argument_group( "Debugging", @@ -463,7 +466,11 @@ def check_options_advanced(options, log): "--pdfa-image-compression argument has no effect when " "--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'" ) - + if options.interword_spaces and options.pdf_renderer != 'hocr': + log.warning( + "--interword-spaces argument has no effect when " + "--pdf-renderer is not 'hocr'" + ) def check_options_metadata(options, log): import unicodedata diff --git a/ocrmypdf/hocrtransform.py b/ocrmypdf/hocrtransform.py index 75faf8d35..244eec0e8 100755 --- a/ocrmypdf/hocrtransform.py +++ b/ocrmypdf/hocrtransform.py @@ -137,7 +137,7 @@ def replace_unsupported_chars(self, s): return s def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False, - fontname="Helvetica", 
invisibleText=False): + fontname="Helvetica", invisibleText=False, interwordSpaces=False): """ Creates a PDF file with an image superimposed on top of the text. Text is positioned according to the bounding box of the lines in @@ -180,7 +180,7 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False, ".//%sspan[@class='ocrx_word']" % (self.xmlns)) is not None: elemclass = "ocrx_word" - # itterate all text elements + # iterate all text elements # light green for bounding box of word/line pdf.setStrokeColorRGB(1, 0, 0) pdf.setLineWidth(0.5) # bounding box line width @@ -196,6 +196,12 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False, if len(elemtxt) == 0: continue + # if the advanced option `--interword-spaces` is true, append a space + # to the end of each text element to allow simpler PDF viewers such + # as PDF.js to better recognize words in search and copy and paste + if interwordSpaces: + elemtxt += ' ' + pxl_coords = self.element_coordinates(elem) pt = self.pt_from_pixel(pxl_coords) @@ -242,10 +248,12 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False, help='Resolution of the image that was OCRed') parser.add_argument('-i', '--image', default=None, help='Path to the image to be placed above the text') + parser.add_argument('--interword-spaces', action='store_true', + default=False, help='Add spaces between words') parser.add_argument('hocrfile', help='Path to the hocr file to be parsed') parser.add_argument( 'outputfile', help='Path to the PDF file to be generated') args = parser.parse_args() hocr = HocrTransform(args.hocrfile, args.resolution) - hocr.to_pdf(args.outputfile, args.image, args.boundingboxes) + hocr.to_pdf(args.outputfile, args.image, args.boundingboxes, interwordSpaces=args.interword_spaces) diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py index 79f129f5b..35326b885 100644 --- a/ocrmypdf/pipeline.py +++ b/ocrmypdf/pipeline.py @@ -639,8 +639,8 @@ def render_hocr_page( 
hocrtransform = HocrTransform(hocr, dpi) hocrtransform.to_pdf(output_file, imageFileName=None, - showBoundingboxes=False, invisibleText=True) - + showBoundingboxes=False, invisibleText=True, + interwordSpaces=options.interword_spaces) def flatten_groups(groups): for obj in groups: @@ -664,8 +664,8 @@ def render_hocr_debug_page( hocrtransform = HocrTransform(hocr, dpi) hocrtransform.to_pdf(output_file, imageFileName=None, - showBoundingboxes=True, invisibleText=False) - + showBoundingboxes=True, invisibleText=False, + interwordSpaces=options.interword_spaces) def combine_layers( infiles, From 422e61997856cc70674952baaa28d6a603b8884d Mon Sep 17 00:00:00 2001 From: Charles Forcey Date: Thu, 1 Mar 2018 12:37:41 -0500 Subject: [PATCH 2/4] Add a note to the documentation about interword-spaces --- docs/advanced.rst | 13 +++++++++++++ docs/introduction.rst | 3 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/advanced.rst b/docs/advanced.rst index e86cfd3cd..47a810e7b 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -131,6 +131,8 @@ The ``hocr`` renderer The ``hocr`` renderer works with older versions of Tesseract. The image layer is copied from the original PDF page if possible, avoiding potentially lossy transcoding or loss of other PDF information. If preprocessing is specified, then the image layer is a new PDF. +When combined with the additional option ``--interword-spaces``, this renderer will append a space at the end of each recognized text element to help simpler viewers such as PDF.js correctly recognize words for search and copy and paste operations. + This works in all versions of Tesseract. The ``tesseract`` renderer @@ -141,3 +143,14 @@ The ``tesseract`` renderer creates a PDF with the image and text layers precompo If a PDF created with this renderer using Tesseract versions older than 3.05.00 is then passed through Ghostscript's pdfwrite feature, the OCR text *may* be corrupted. 
The ``--output-type=pdfa`` argument will produce a warning in this situation. *This renderer is deprecated and will be removed whenever support for older versions of Tesseract is dropped.* + +Adding Interword Spaces +------------------------- + +OCRmyPDF has an option ``--interword-spaces`` that appends a space at the end of each text element. Without the space, simpler PDF viewers such as PDF.js have difficulty detecting individual words and maintaining white space between them. As a result, searching for multi-word phrases and selecting text for copy and paste are severely impacted. With this option set, these viewers are able to locate multi-word phrases while more advanced viewers remain unaffected. + +.. code-block:: bash + + ocrmypdf --output-type pdf --interword-spaces --pdf-renderer hocr input.pdf output.pdf + +This option defaults to ``False`` and must be combined with ``--pdf-renderer hocr`` or it will be ignored with a warning. This works in all versions of Tesseract. diff --git a/docs/introduction.rst b/docs/introduction.rst index 988454fca..8264b0014 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -82,7 +82,8 @@ OCRmyPDF is limited by the Tesseract OCR engine. As such it experiences these l OCRmyPDF is also limited by the PDF specification: -* PDF encodes the position of text glyphs but does not encode document structure. There is no markup that divides a document in sections, paragraphs, sentences, or even words (since blank spaces are not represented). 
As such all elements of document structure including the spaces between words must be derived heuristically. Some PDF viewers do a better job of this than others. +* Because some popular open-source PDF viewers have a particularly hard time with spaces between words, OCRmyPDF does provide an optional command option ``--interword-spaces`` that appends a space to each text element as a workaround, but discourages its use unless absolutely necessary as it mixes document structure with graphical information that ideally should be left to the PDF viewer to interpret. This option produces output similar to the aptly named ``-sloppy-text`` option of pdfsandwich mentioned in the Similar Programs section below. Ghostscript also imposes some limitations: From 9fd9c7a51fb37c0ebf5d5f0f69ca58832c775ec0 Mon Sep 17 00:00:00 2001 From: Tucker Barbour Date: Fri, 2 Mar 2018 11:13:47 +0000 Subject: [PATCH 3/4] Scale BoundingBox and Text elements to account for additional space. Here we are manually scaling the pt width used for the BoundingBox and the Text element when manually adding whitespace to account for limitations of the PDF.js viewer. This fixes an initial regression noticed when selecting text elements in Chrome and PDFium. The width of the Text element and BoundingBox had not been adjusted for the additional whitespace so the highlighting was offset slightly. 
--- ocrmypdf/hocrtransform.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ocrmypdf/hocrtransform.py b/ocrmypdf/hocrtransform.py index 244eec0e8..dd7723913 100755 --- a/ocrmypdf/hocrtransform.py +++ b/ocrmypdf/hocrtransform.py @@ -196,14 +196,16 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False, if len(elemtxt) == 0: continue + pxl_coords = self.element_coordinates(elem) + pt = self.pt_from_pixel(pxl_coords) + # if the advanced option `--interword-spaces` is true, append a space # to the end of each text element to allow simpler PDF viewers such # as PDF.js to better recognize words in search and copy and paste if interwordSpaces: elemtxt += ' ' - - pxl_coords = self.element_coordinates(elem) - pt = self.pt_from_pixel(pxl_coords) + pt = Rect._make((pt.x1, pt.y1, + pt.x2 + pdf.stringWidth(' ', fontname, pt.y2 - pt.y1), pt.y2)) # draw the bbox border if showBoundingboxes: From f6c70312c9d5c5688028a0b73bde5ff9884d00e6 Mon Sep 17 00:00:00 2001 From: Tucker Barbour Date: Fri, 2 Mar 2018 14:26:13 +0000 Subject: [PATCH 4/4] Fix Homebrew python package Homebrew removed python3 and python now defaults to version 3. Here we use `brew upgrade python` to upgrade the pre-installed version of python to python3. --- .travis/osx_before_install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis/osx_before_install.sh b/.travis/osx_before_install.sh index 91ad02767..3a47e571d 100644 --- a/.travis/osx_before_install.sh +++ b/.travis/osx_before_install.sh @@ -8,7 +8,7 @@ brew update brew install openjpeg jbig2dec libtiff # image libraries brew install qpdf brew install ghostscript -brew install python3 +brew upgrade python # Brew removed python3 and python now defaults to python3 brew install libxml2 libffi leptonica brew install unpaper # optional brew install tesseract