Implement sidecar text files (#126)

ocrmypdf · May 10, 2017 · 183eafa · 183eafa
1 parent 47a2997
commit 183eafa
Show file tree

Hide file tree

Showing 4 changed files with 114 additions and 32 deletions.
diff --git a/ocrmypdf/exec/tesseract.py b/ocrmypdf/exec/tesseract.py
@@ -200,7 +200,7 @@ def page_timedout(log, input_file):
     log.warning(prefix + " took too long to OCR - skipping")
 
 
-def _generate_null_hocr(output_hocr, image):
+def _generate_null_hocr(output_hocr, output_sidecar, image):
     """Produce a .hocr file that reports no text detected on a page that is
     the same size as the input image."""
     from PIL import Image
@@ -210,12 +210,16 @@ def _generate_null_hocr(output_hocr, image):
 
     with open(output_hocr, 'w', encoding="utf-8") as f:
         f.write(HOCR_TEMPLATE.format(w, h))
+    with open(output_sidecar, 'w', encoding='utf-8') as f:
+        f.write('[skipped page]')
 
 
-def generate_hocr(input_file, output_hocr, language: list, engine_mode,
+def generate_hocr(input_file, output_files, language: list, engine_mode,
                   tessconfig: list,
                   timeout: float, pagesegmode: int, log):
 
+    output_hocr = next(o for o in output_files if o.endswith('.hocr'))
+    output_sidecar = next(o for o in output_files if o.endswith('.txt'))
     badxml = os.path.splitext(output_hocr)[0] + '.badxml'
 
     args_tesseract = tess_base_args(language, engine_mode)
@@ -226,7 +230,8 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
     args_tesseract.extend([
         input_file,
         badxml,
-        'hocr'
+        'hocr',
+        'txt'
     ] + tessconfig)
     try:
         log.debug(args_tesseract)
@@ -238,13 +243,13 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
         # Temporary workaround to hocrTransform not being able to function if
         # it does not have a valid hOCR file.
         page_timedout(log, input_file)
-        _generate_null_hocr(output_hocr, input_file)
+        _generate_null_hocr(output_hocr, output_sidecar, input_file)
     except CalledProcessError as e:
         tesseract_log_output(log, e.output, input_file)
         if 'read_params_file: parameter not found' in e.output:
             raise TesseractConfigError() from e
         if 'Image too large' in e.output:
-            _generate_null_hocr(output_hocr, input_file)
+            _generate_null_hocr(output_hocr, output_sidecar, input_file)
             return
 
         raise e from e
@@ -258,6 +263,9 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
             # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
             shutil.move(badxml + '.hocr', badxml)
 
+        if os.path.exists(badxml + '.txt'):
+            shutil.move(badxml + '.txt', output_sidecar)
+
         # Tesseract 3.03 inserts source filename into hocr file without
         # escaping it, creating invalid XML and breaking the parser.
         # As a workaround, rewrite the hocr file, replacing the filename
@@ -273,7 +281,10 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
                 f_out.write(line)
 
 
-def use_skip_page(text_only, skip_pdf, output_pdf):
+def use_skip_page(text_only, skip_pdf, output_pdf, output_text):
+    with open(output_text, 'w') as f:
+        f.write('[skipped page]')
+
     if not text_only:
         os.symlink(skip_pdf, output_pdf)
         return
@@ -291,14 +302,15 @@ def use_skip_page(text_only, skip_pdf, output_pdf):
         pdf_out.write(out)
 
 
-def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
-                 engine_mode, text_only: bool,
+def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text,
+                 language: list, engine_mode, text_only: bool,
                  tessconfig: list, timeout: float, pagesegmode: int, log):
     '''Use Tesseract to render a PDF.
 
     input_image -- image to analyze
     skip_pdf -- if we time out, use this file as output
     output_pdf -- file to generate
+    output_text -- OCR text file
     language -- list of languages to consider
     engine_mode -- engine mode argument for tess v4
     text_only -- enable tesseract text only mode?
@@ -315,27 +327,31 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
     if text_only:
         args_tesseract.extend(['-c', 'textonly_pdf=1'])
 
+    prefix = os.path.splitext(output_pdf)[0]  # Tesseract appends suffixes
+
     args_tesseract.extend([
         input_image,
-        os.path.splitext(output_pdf)[0],  # Tesseract appends suffix
-        'pdf'
+        prefix,
+        'pdf', 'txt'
     ] + tessconfig)
 
     try:
         log.debug(args_tesseract)
         stdout = check_output(
             args_tesseract, close_fds=True, stderr=STDOUT,
             universal_newlines=True, timeout=timeout)
+        if os.path.exists(prefix + '.txt'):
+            shutil.move(prefix + '.txt', output_text)
     except TimeoutExpired:
         page_timedout(log, input_image)
-        use_skip_page(text_only, skip_pdf, output_pdf)
+        use_skip_page(text_only, skip_pdf, output_pdf, output_text)
     except CalledProcessError as e:
         tesseract_log_output(log, e.output, input_image)
         if 'read_params_file: parameter not found' in e.output:
             raise TesseractConfigError() from e
 
         if 'Image too large' in e.output:
-            use_skip_page(text_only, skip_pdf, output_pdf)
+            use_skip_page(text_only, skip_pdf, output_pdf, output_text)
             return
         raise e from e
     else:

diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py
@@ -483,13 +483,13 @@ def select_ocr_image(
 
 def ocr_tesseract_hocr(
         input_file,
-        output_file,
+        output_files,
         log,
         context):
     options = context.get_options()
     tesseract.generate_hocr(
         input_file=input_file,
-        output_hocr=output_file,
+        output_files=output_files,
         language=options.language,
         engine_mode=options.tesseract_oem,
         tessconfig=options.tesseract_config,
@@ -579,12 +579,12 @@ def select_image_layer(
 
 
 def render_hocr_page(
-        input_file,
+        infiles,
         output_file,
         log,
         context):
     options = context.get_options()
-    hocr = input_file
+    hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
     pageinfo = get_pageinfo(hocr, context)
     dpi = get_page_square_dpi(pageinfo, options)
 
@@ -610,13 +610,23 @@ def render_hocr_debug_page(
                          showBoundingboxes=True, invisibleText=False)
 
 
+def flatten_groups(groups):
+    """Yield the members of groups with one level of nesting removed.
+
+    groups -- iterable whose entries are either single items or collections
+
+    An entry for which is_iterable_notstr() is true is expanded into its own
+    items; any other entry is yielded unchanged.
+    """
+    for entry in groups:
+        if not is_iterable_notstr(entry):
+            yield entry
+            continue
+        for member in entry:
+            yield member
+
+
 def combine_layers(
         infiles,
         output_file,
         log,
         context):
-    text = next(ii for ii in infiles if ii.endswith('.text.pdf'))
-    image = next(ii for ii in infiles if ii.endswith('.image-layer.pdf'))
+    text = next(ii for ii in flatten_groups(infiles)
+                if ii.endswith('.text.pdf'))
+    image = next(ii for ii in flatten_groups(infiles)
+                 if ii.endswith('.image-layer.pdf'))
 
     pdf_text = pypdf.PdfFileReader(open(text, "rb"))
     pdf_image = pypdf.PdfFileReader(open(image, "rb"))
@@ -682,21 +692,27 @@ def combine_layers(
 
 def ocr_tesseract_and_render_pdf(
         infiles,
-        output_file,
+        outfiles,
         log,
         context):
     options = context.get_options()
     input_image = next((ii for ii in infiles if ii.endswith('.image')), '')
     input_pdf = next((ii for ii in infiles if ii.endswith('.pdf')))
+    output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf')))
+    output_text = next((ii for ii in outfiles if ii.endswith('.txt')))
+
     if not input_image:
         # Skipping this page
-        re_symlink(input_pdf, output_file, log)
+        re_symlink(input_pdf, output_pdf, log)
+        with open(output_text, 'w') as f:
+            f.write('[skipped page]')
         return
 
     tesseract.generate_pdf(
         input_image=input_image,
         skip_pdf=input_pdf,
-        output_pdf=output_file,
+        output_pdf=output_pdf,
+        output_text=output_text,
         language=options.language,
         engine_mode=options.tesseract_oem,
         text_only=False,
@@ -708,19 +724,23 @@ def ocr_tesseract_and_render_pdf(
 
 def ocr_tesseract_textonly_pdf(
         infiles,
-        output_file,
+        outfiles,
         log,
         context):
     options = context.get_options()
     input_image = next((ii for ii in infiles if ii.endswith('.ocr.png')), '')
     if not input_image:
         raise ValueError("No image rendered?")
-
     skip_pdf = next((ii for ii in infiles if ii.endswith('.pdf')))
+
+    output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf')))
+    output_text = next((ii for ii in outfiles if ii.endswith('.txt')))
+
     tesseract.generate_pdf(
         input_image=input_image,
         skip_pdf=skip_pdf,
-        output_pdf=output_file,
+        output_pdf=output_pdf,
+        output_text=output_text,
         language=options.language,
         engine_mode=options.tesseract_oem,
         text_only=True,
@@ -787,7 +807,7 @@ def skip_page(
 
 
 def merge_pages_ghostscript(
-        input_files,
+        input_files_groups,
         output_file,
         log,
         context):
@@ -805,6 +825,8 @@ def input_file_order(s):
             key += 1
         return key
 
+    input_files = (f for f in flatten_groups(input_files_groups)
+                   if not f.endswith('.txt'))
     pdf_pages = sorted(input_files, key=input_file_order)
     log.debug("Final pages: " + "\n".join(pdf_pages))
     ghostscript.generate_pdfa(
@@ -813,11 +835,14 @@ def input_file_order(s):
 
 
 def merge_pages_qpdf(
-        input_files,
+        input_files_groups,
         output_file,
         log,
         context):
     options = context.get_options()
+
+    input_files = list(f for f in flatten_groups(input_files_groups)
+                       if not f.endswith('.txt'))
     metadata_file = next(
         (ii for ii in input_files if ii.endswith('.repaired.pdf')))
     input_files.remove(metadata_file)
@@ -851,6 +876,31 @@ def input_file_order(s):
     qpdf.merge(pdf_pages, output_file)
 
 
+def merge_sidecars(
+        input_files_groups,
+        output_file,
+        log,
+        context):
+    '''Concatenate the per-page sidecar text files into a single text output.
+
+    input_files_groups -- file names, possibly nested one level deep in
+        groups; only names ending in '.txt' are used, in sorted order
+    output_file -- path of the merged text file, or '-' to write to stdout
+    log -- not used by this function
+    context -- not used by this function
+
+    Consecutive pages are separated by a form feed character.
+    '''
+    txt_files = sorted(f for f in flatten_groups(input_files_groups)
+                       if f.endswith('.txt'))
+
+    def write_pages(stream):
+        for page_number, txt_file in enumerate(txt_files):
+            if page_number != 0:
+                stream.write('\f')  # Form feed between pages
+            # The page files are UTF-8 (Tesseract's text output, and the
+            # placeholder written for null pages), so decode them explicitly
+            # rather than relying on the locale's default encoding.
+            with open(txt_file, 'r', encoding='utf-8') as in_:
+                stream.write(in_.read())
+
+    if output_file == '-':
+        write_pages(sys.stdout)
+        sys.stdout.flush()
+    else:
+        with open(output_file, 'w', encoding='utf-8') as out:
+            write_pages(out)
+
+
 def copy_final(
         input_files,
         output_file,
@@ -955,7 +1005,7 @@ def build_pipeline(options, work_folder, log, context):
         task_func=ocr_tesseract_hocr,
         input=task_select_ocr_image,
         filter=suffix(".ocr.png"),
-        output=".hocr",
+        output=[".hocr", ".txt"],
         extras=[log, context])
     task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
     task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
@@ -987,8 +1037,8 @@ def build_pipeline(options, work_folder, log, context):
     task_render_hocr_page = main_pipeline.transform(
         task_func=render_hocr_page,
         input=task_ocr_tesseract_hocr,
-        filter=suffix('.hocr'),
-        output='.text.pdf',
+        filter=regex(r".*/(\d{6})(?:\.hocr)"),
+        output=os.path.join(work_folder, r'\1.text.pdf'),
         extras=[log, context])
     task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
     task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')
@@ -1008,7 +1058,8 @@ def build_pipeline(options, work_folder, log, context):
         task_func=ocr_tesseract_textonly_pdf,
         input=[task_select_ocr_image, task_orient_page],
         filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"),
-        output=os.path.join(work_folder, r'\1.text.pdf'),
+        output=[os.path.join(work_folder, r'\1.text.pdf'),
+                os.path.join(work_folder, r'\1.text.txt')],
         extras=[log, context])
     task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
     task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4')
@@ -1031,7 +1082,8 @@ def build_pipeline(options, work_folder, log, context):
         task_func=ocr_tesseract_and_render_pdf,
         input=[task_select_visible_page_image, task_orient_page],
         filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
-        output=os.path.join(work_folder, r'\1.rendered.pdf'),
+        output=[os.path.join(work_folder, r'\1.rendered.pdf'),
+                os.path.join(work_folder, r'\1.rendered.txt')],
         extras=[log, context])
     task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
     task_ocr_tesseract_and_render_pdf.active_if(options.pdf_renderer == 'tesseract')
@@ -1080,6 +1132,15 @@ def build_pipeline(options, work_folder, log, context):
         extras=[log, context])
     task_merge_pages_qpdf.active_if(options.output_type == 'pdf')
 
+    task_merge_sidecars = main_pipeline.merge(
+        task_func=merge_sidecars,
+        input=[task_ocr_tesseract_hocr,
+               task_ocr_tesseract_and_render_pdf,
+               task_ocr_tesseract_textonly_pdf],
+        output=options.sidecar,
+        extras=[log, context])
+    task_merge_sidecars.active_if(options.sidecar)
+
     # Finalize
     task_copy_final = main_pipeline.merge(
         task_func=copy_final,

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -147,14 +147,18 @@ def test_remove_background(spoof_tesseract_noop, resources, outdir):
 @pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
 def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type,
                       resources, outdir):
+    outfile = outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer)
     check_ocrmypdf(
         resources / pdf,
-        outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer),
+        outfile,
         '-dc',
         '-v', '1',
         '--output-type', output_type,
+        '--sidecar',
         '--pdf-renderer', renderer, env=spoof_tesseract_cache)
 
+    assert outfile.with_suffix('.pdf.txt').exists()
+
 
 @pytest.mark.parametrize("output_type", [
     'pdfa', 'pdf'

diff --git a/tests/test_tess4.py b/tests/test_tess4.py
@@ -53,6 +53,7 @@ def test_textonly_pdf(ensure_tess4, resources, outdir):
     check_ocrmypdf(
         resources / 'linn.pdf',
         outdir / 'linn_textonly.pdf', '--pdf-renderer', 'tess4',
+        '--sidecar', 'foo',
         env=ensure_tess4)