From e6e34251c6a7d103d0b5fc44cc3614631d781c6f Mon Sep 17 00:00:00 2001
From: Tucker Barbour <barbct5@gmail.com>
Date: Thu, 1 Mar 2018 13:23:14 +0000
Subject: [PATCH 1/4] Add option to explicitly add interword spaces to HOCR
 pdf-renderer

This commit includes an optional work around for limitations of the
PDF.js viewer described in
https://github.com/jbarlow83/OCRmyPDF/issues/133. Here we explicitly
add an additional space to text elements before drawing them on the PDF
canvas when using the HOCR renderer. This option does not apply to
other pdf renderers in OCRmyPDF and is turned off by default.
---
 ocrmypdf/__main__.py      |  9 ++++++++-
 ocrmypdf/hocrtransform.py | 14 +++++++++++---
 ocrmypdf/pipeline.py      |  8 ++++----
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py
index 16b7f662f..6af7af480 100755
--- a/ocrmypdf/__main__.py
+++ b/ocrmypdf/__main__.py
@@ -304,6 +304,9 @@ def complain(message):
 advanced.add_argument(
     '--user-patterns', metavar='FILE',
     help="Specify the location of the Tesseract user patterns file.")
+advanced.add_argument(
+    '--interword-spaces', action='store_true',
+    help="Add spaces between words with HOCR transformation.")
 
 debugging = parser.add_argument_group(
     "Debugging",
@@ -463,7 +466,11 @@ def check_options_advanced(options, log):
             "--pdfa-image-compression argument has no effect when "
             "--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
         )
-
+    if options.interword_spaces and options.pdf_renderer != 'hocr':
+        log.warning(
+            "--interword-spaces argument has no effect when "
+            "--pdf-renderer is not 'hocr'"
+        )
 
 def check_options_metadata(options, log):
     import unicodedata
diff --git a/ocrmypdf/hocrtransform.py b/ocrmypdf/hocrtransform.py
index 75faf8d35..244eec0e8 100755
--- a/ocrmypdf/hocrtransform.py
+++ b/ocrmypdf/hocrtransform.py
@@ -137,7 +137,7 @@ def replace_unsupported_chars(self, s):
         return s
 
     def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False,
-               fontname="Helvetica", invisibleText=False):
+               fontname="Helvetica", invisibleText=False, interwordSpaces=False):
         """
         Creates a PDF file with an image superimposed on top of the text.
         Text is positioned according to the bounding box of the lines in
@@ -180,7 +180,7 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False,
                 ".//%sspan[@class='ocrx_word']" % (self.xmlns)) is not None:
             elemclass = "ocrx_word"
 
-        # itterate all text elements
+        # iterate all text elements
         # light green for bounding box of word/line
         pdf.setStrokeColorRGB(1, 0, 0)
         pdf.setLineWidth(0.5)		# bounding box line width
@@ -196,6 +196,12 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False,
             if len(elemtxt) == 0:
                 continue
 
+            # if the advanced option `--interword-spaces` is true, append a space
+            # to the end of each text element to allow simpler PDF viewers such
+            # as PDF.js to better recognize words in search and copy and paste
+            if interwordSpaces:
+                elemtxt += ' '
+
             pxl_coords = self.element_coordinates(elem)
             pt = self.pt_from_pixel(pxl_coords)
 
@@ -242,10 +248,12 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False,
                         help='Resolution of the image that was OCRed')
     parser.add_argument('-i', '--image', default=None,
                         help='Path to the image to be placed above the text')
+    parser.add_argument('--interword-spaces', action='store_true',
+                         default=False, help='Add spaces between words')
     parser.add_argument('hocrfile', help='Path to the hocr file to be parsed')
     parser.add_argument(
         'outputfile', help='Path to the PDF file to be generated')
     args = parser.parse_args()
 
     hocr = HocrTransform(args.hocrfile, args.resolution)
-    hocr.to_pdf(args.outputfile, args.image, args.boundingboxes)
+    hocr.to_pdf(args.outputfile, args.image, args.boundingboxes, interwordSpaces=args.interword_spaces)
diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py
index 79f129f5b..35326b885 100644
--- a/ocrmypdf/pipeline.py
+++ b/ocrmypdf/pipeline.py
@@ -639,8 +639,8 @@ def render_hocr_page(
 
     hocrtransform = HocrTransform(hocr, dpi)
     hocrtransform.to_pdf(output_file, imageFileName=None,
-                         showBoundingboxes=False, invisibleText=True)
-
+                         showBoundingboxes=False, invisibleText=True,
+                         interwordSpaces=options.interword_spaces)
 
 def flatten_groups(groups):
     for obj in groups:
@@ -664,8 +664,8 @@ def render_hocr_debug_page(
 
     hocrtransform = HocrTransform(hocr, dpi)
     hocrtransform.to_pdf(output_file, imageFileName=None,
-                         showBoundingboxes=True, invisibleText=False)
-
+                         showBoundingboxes=True, invisibleText=False,
+                         interwordSpaces=options.interword_spaces)
 
 def combine_layers(
         infiles,

From 422e61997856cc70674952baaa28d6a603b8884d Mon Sep 17 00:00:00 2001
From: Charles Forcey <cforcey@me.com>
Date: Thu, 1 Mar 2018 12:37:41 -0500
Subject: [PATCH 2/4] Add a note to the documentation about interword-spaces

---
 docs/advanced.rst     | 13 +++++++++++++
 docs/introduction.rst |  3 ++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/docs/advanced.rst b/docs/advanced.rst
index e86cfd3cd..47a810e7b 100644
--- a/docs/advanced.rst
+++ b/docs/advanced.rst
@@ -131,6 +131,8 @@ The ``hocr`` renderer
 
 The ``hocr`` renderer works with older versions of Tesseract. The image layer is copied from the original PDF page if possible, avoiding potentially lossy transcoding or loss of other PDF information. If preprocessing is specified, then the image layer is a new PDF.
 
+When combined with the additional option ``--interword-spaces``, this renderer will append a space at the end of each recognized text element to help simpler viewers such as PDF.js correctly recognize words for search and copy and paste operations.
+
 This works in all versions of Tesseract.
 
 The ``tesseract`` renderer
@@ -141,3 +143,14 @@ The ``tesseract`` renderer creates a PDF with the image and text layers precompo
 If a PDF created with this renderer using Tesseract versions older than 3.05.00 is then passed through Ghostscript's pdfwrite feature, the OCR text *may* be corrupted. The ``--output-type=pdfa`` argument will produce a warning in this situation.
 
 *This renderer is deprecated and will be removed whenever support for older versions of Tesseract is dropped.*
+
+Adding Interword Spaces
+-------------------------
+
+OCRmyPDF has an option ``--interword-spaces`` that appends a space at the end of each text element.  Without the space, simpler PDF viewers such as PDF.js have difficulty detecting individual words and maintaining white space between them.  As a result, searching for multi-word phrases and selecting text for copy and paste are severely impacted.  With this option set, these viewers are able to locate multi-word phrases while more advanced viewers remain unaffected.
+
+.. code-block:: bash
+
+	ocrmypdf --output-type pdf --interword-spaces --pdf-renderer hocr input.pdf output.pdf
+
+This option defaults to ``False`` and must be combined with ``--pdf-renderer hocr`` or it will be ignored with a warning. This works in all versions of Tesseract.
diff --git a/docs/introduction.rst b/docs/introduction.rst
index 988454fca..8264b0014 100644
--- a/docs/introduction.rst
+++ b/docs/introduction.rst
@@ -82,7 +82,8 @@ OCRmyPDF is limited by the Tesseract OCR engine.  As such it experiences these l
   
 OCRmyPDF is also limited by the PDF specification:
 
-* PDF encodes the position of text glyphs but does not encode document structure.  There is no markup that divides a document in sections, paragraphs, sentences, or even words (since blank spaces are not represented). As such all elements of document structure including the spaces between words must be derived heuristically.  Some PDF viewers do a better job of this than others.
+* PDF encodes the position of text glyphs but does not encode document structure.  There is no markup that divides a document in sections, paragraphs, sentences, or even words (since blank spaces are not represented). As such all elements of document structure including the spaces between words must be derived heuristically.  Some PDF viewers do a better job of this than others.  
+* Because some popular open-source PDF viewers have a particularly hard time with spaces between words, OCRmyPDF provides an optional command-line option ``--interword-spaces`` that appends a space to each text element as a workaround, but discourages its use unless absolutely necessary as it mixes document structure with graphical information that ideally should be left to the PDF viewer to interpret.  This option produces output similar to the aptly named ``-sloppy-text`` option of pdfsandwich mentioned in the Similar Programs section below.
 
 Ghostscript also imposes some limitations:
 

From 9fd9c7a51fb37c0ebf5d5f0f69ca58832c775ec0 Mon Sep 17 00:00:00 2001
From: Tucker Barbour <barbct5@gmail.com>
Date: Fri, 2 Mar 2018 11:13:47 +0000
Subject: [PATCH 3/4] Scale BoundingBox and Text elements to account for
 additional space.

Here we are manually scaling the pt width used for the BoundingBox and
the Text element when manually adding whitespace to account for
limitations of the PDF.js viewer. This fixes an initial regression
noticed when selecting text elements in Chrome and PDFium. The width
of the Text element and BoundingBox had not been adjusted for the
additional whitespace so the highlighting was offset slightly.
---
 ocrmypdf/hocrtransform.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ocrmypdf/hocrtransform.py b/ocrmypdf/hocrtransform.py
index 244eec0e8..dd7723913 100755
--- a/ocrmypdf/hocrtransform.py
+++ b/ocrmypdf/hocrtransform.py
@@ -196,14 +196,16 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False,
             if len(elemtxt) == 0:
                 continue
 
+            pxl_coords = self.element_coordinates(elem)
+            pt = self.pt_from_pixel(pxl_coords)
+
             # if the advanced option `--interword-spaces` is true, append a space
             # to the end of each text element to allow simpler PDF viewers such
             # as PDF.js to better recognize words in search and copy and paste
             if interwordSpaces:
                 elemtxt += ' '
-
-            pxl_coords = self.element_coordinates(elem)
-            pt = self.pt_from_pixel(pxl_coords)
+                pt = Rect._make((pt.x1, pt.y1,
+                                 pt.x2 + pdf.stringWidth(' ', fontname, pt.y2 - pt.y1), pt.y2))
 
             # draw the bbox border
             if showBoundingboxes:

From f6c70312c9d5c5688028a0b73bde5ff9884d00e6 Mon Sep 17 00:00:00 2001
From: Tucker Barbour <barbct5@gmail.com>
Date: Fri, 2 Mar 2018 14:26:13 +0000
Subject: [PATCH 4/4] Fix Homebrew python package

Homebrew removed python3 and python now defaults to version 3. Here we
use `brew upgrade python` to upgrade the pre-installed version of
python to python3.
---
 .travis/osx_before_install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis/osx_before_install.sh b/.travis/osx_before_install.sh
index 91ad02767..3a47e571d 100644
--- a/.travis/osx_before_install.sh
+++ b/.travis/osx_before_install.sh
@@ -8,7 +8,7 @@ brew update
 brew install openjpeg jbig2dec libtiff     # image libraries
 brew install qpdf
 brew install ghostscript
-brew install python3
+brew upgrade python # Brew removed python3 and python now defaults to python3
 brew install libxml2 libffi leptonica
 brew install unpaper   # optional
 brew install tesseract