ocrmypdf · cforcey · Mar 1, 2018 · Mar 1, 2018 · Mar 2, 2018 · Mar 2, 2018
diff --git a/docs/advanced.rst b/docs/advanced.rst
@@ -131,6 +131,8 @@ The ``hocr`` renderer
 
 The ``hocr`` renderer works with older versions of Tesseract. The image layer is copied from the original PDF page if possible, avoiding potentially lossy transcoding or loss of other PDF information. If preprocessing is specified, then the image layer is a new PDF.
 
+When combined with the additional option ``--interword-spaces``, this renderer will append a space at the end of each recognized text element to help simpler viewers such as PDF.js correctly recognize words for search and copy and paste operations.
+
 This works in all versions of Tesseract.
 
 The ``tesseract`` renderer
@@ -141,3 +143,14 @@ The ``tesseract`` renderer creates a PDF with the image and text layers precompo
 If a PDF created with this renderer using Tesseract versions older than 3.05.00 is then passed through Ghostscript's pdfwrite feature, the OCR text *may* be corrupted. The ``--output-type=pdfa`` argument will produce a warning in this situation.
 
 *This renderer is deprecated and will be removed whenever support for older versions of Tesseract is dropped.*
+
+Adding Interword Spaces
+-------------------------
+
+OCRmyPDF has an option ``--interword-spaces`` that appends a space at the end of each text element.  Without the space, simpler PDF viewers such as PDF.js have difficulty detecting individual words and maintaining white space between them.  As a result, searching for multi-word phrases and selecting text for copy and paste are severely impacted.  With this option set, these viewers are able to locate multi-word phrases while more advanced viewers remain unaffected.
+
+.. code-block:: bash
+
+	ocrmypdf --output-type pdf --interword-spaces --pdf-renderer hocr input.pdf output.pdf
+
+This option defaults to ``False`` and must be combined with ``--pdf-renderer hocr`` or it will be ignored with a warning. This works in all versions of Tesseract.
diff --git a/docs/introduction.rst b/docs/introduction.rst
@@ -82,7 +82,8 @@ OCRmyPDF is limited by the Tesseract OCR engine.  As such it experiences these l
 
 OCRmyPDF is also limited by the PDF specification:
 
-* PDF encodes the position of text glyphs but does not encode document structure.  There is no markup that divides a document in sections, paragraphs, sentences, or even words (since blank spaces are not represented). As such all elements of document structure including the spaces between words must be derived heuristically.  Some PDF viewers do a better job of this than others.
+* PDF encodes the position of text glyphs but does not encode document structure.  There is no markup that divides a document in sections, paragraphs, sentences, or even words (since blank spaces are not represented). As such all elements of document structure including the spaces between words must be derived heuristically.  Some PDF viewers do a better job of this than others.  
+* Because some popular open source PDF viewers have a particularly hard time with spaces between words, OCRmyPDF does provide an optional command option ``--interword-spaces`` that appends a space to each text element as a workaround, but discourages its use unless absolutely necessary as it mixes document structure with graphical information that ideally should be left to the PDF viewer to interpret.  This option produces output similar to the aptly named ``-sloppy_text`` option of pdfsandwich mentioned in the Similar Programs section below.
 
 Ghostscript also imposes some limitations:
 

diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py
@@ -304,6 +304,9 @@ def complain(message):
 advanced.add_argument(
     '--user-patterns', metavar='FILE',
     help="Specify the location of the Tesseract user patterns file.")
+advanced.add_argument(
+    '--interword-spaces', action='store_true',
+    help="Add spaces between words with HOCR transformation.")
 
 debugging = parser.add_argument_group(
     "Debugging",
@@ -463,7 +466,11 @@ def check_options_advanced(options, log):
             "--pdfa-image-compression argument has no effect when "
             "--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
         )
-
+    if options.interword_spaces and options.pdf_renderer != 'hocr':
+        log.warning(
+            "--interword-spaces argument has no effect when "
+            "--pdf-renderer is not 'hocr'"
+        )
 
 def check_options_metadata(options, log):
     import unicodedata

diff --git a/ocrmypdf/hocrtransform.py b/ocrmypdf/hocrtransform.py
@@ -137,7 +137,7 @@ def replace_unsupported_chars(self, s):
         return s
 
     def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False,
-               fontname="Helvetica", invisibleText=False):
+               fontname="Helvetica", invisibleText=False, interwordSpaces=False):
         """
         Creates a PDF file with an image superimposed on top of the text.
         Text is positioned according to the bounding box of the lines in
@@ -180,7 +180,7 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False,
                 ".//%sspan[@class='ocrx_word']" % (self.xmlns)) is not None:
             elemclass = "ocrx_word"
 
-        # itterate all text elements
+        # iterate all text elements
         # light green for bounding box of word/line
         pdf.setStrokeColorRGB(1, 0, 0)
         pdf.setLineWidth(0.5)		# bounding box line width
@@ -199,6 +199,14 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False,
             pxl_coords = self.element_coordinates(elem)
             pt = self.pt_from_pixel(pxl_coords)
 
+            # if the advanced option `--interword-spaces` is true, append a space
+            # to the end of each text element to allow simpler PDF viewers such
+            # as PDF.js to better recognize words in search and copy and paste
+            if interwordSpaces:
+                elemtxt += ' '
+                pt = Rect._make((pt.x1, pt.y1,
+                                 pt.x2 + pdf.stringWidth(' ', fontname, pt.y2 - pt.y1), pt.y2))
+
             # draw the bbox border
             if showBoundingboxes:
                 pdf.rect(
@@ -242,10 +250,12 @@ def to_pdf(self, outFileName, imageFileName=None, showBoundingboxes=False,
                         help='Resolution of the image that was OCRed')
     parser.add_argument('-i', '--image', default=None,
                         help='Path to the image to be placed above the text')
+    parser.add_argument('--interword-spaces', action='store_true',
+                         default=False, help='Add spaces between words')
     parser.add_argument('hocrfile', help='Path to the hocr file to be parsed')
     parser.add_argument(
         'outputfile', help='Path to the PDF file to be generated')
     args = parser.parse_args()
 
     hocr = HocrTransform(args.hocrfile, args.resolution)
-    hocr.to_pdf(args.outputfile, args.image, args.boundingboxes)
+    hocr.to_pdf(args.outputfile, args.image, args.boundingboxes, interwordSpaces=args.interword_spaces)
diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py
@@ -640,8 +640,8 @@ def render_hocr_page(
 
     hocrtransform = HocrTransform(hocr, dpi)
     hocrtransform.to_pdf(output_file, imageFileName=None,
-                         showBoundingboxes=False, invisibleText=True)
-
+                         showBoundingboxes=False, invisibleText=True,
+                         interwordSpaces=options.interword_spaces)
 
 def flatten_groups(groups):
     for obj in groups:
@@ -665,8 +665,8 @@ def render_hocr_debug_page(
 
     hocrtransform = HocrTransform(hocr, dpi)
     hocrtransform.to_pdf(output_file, imageFileName=None,
-                         showBoundingboxes=True, invisibleText=False)
-
+                         showBoundingboxes=True, invisibleText=False,
+                         interwordSpaces=options.interword_spaces)
 
 def combine_layers(
         infiles,