From 19e35db2b7036d1dbf1c9ebdab2155b0acb95ea3 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sun, 10 Feb 2019 01:52:31 -0800 Subject: [PATCH] Fix issue when weave handoff occurs with no OCR font present If using --tesseract-timeout 0 and any image processing on a file with more than 100 pages, the weave handoff will occur. Ensure this works correctly even if no Glyphless font is present. Closes #347 --- src/ocrmypdf/_weave.py | 20 +++++++++++++------- src/ocrmypdf/exec/tesseract.py | 2 +- tests/test_weave.py | 32 ++++++++++++++++++++++++++++---- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/src/ocrmypdf/_weave.py b/src/ocrmypdf/_weave.py index a29510424..54c21549b 100644 --- a/src/ocrmypdf/_weave.py +++ b/src/ocrmypdf/_weave.py @@ -17,6 +17,7 @@ from itertools import groupby from pathlib import Path +import os import pikepdf @@ -24,6 +25,9 @@ from .helpers import flatten_groups, page_number +MAX_OPEN_PAGE_PDFS = int(os.environ.get('_OCRMYPDF_MAX_OPEN_PAGE_PDFS', 100)) + + def _update_page_resources(*, page, font, font_key, procset): """Update this page's fonts with a reference to the Glyphless font""" @@ -34,7 +38,7 @@ def _update_page_resources(*, page, font, font_key, procset): fonts = resources['/Font'] except KeyError: fonts = pikepdf.Dictionary({}) - if font_key not in fonts: + if font_key is not None and font_key not in fonts: fonts[font_key] = font resources['/Font'] = fonts @@ -177,6 +181,8 @@ def _find_font(text, pdf_base): break if pdf_text_font: font = pdf_base.copy_foreign(pdf_text_font) + if font_key is None: + print('font_key is None') return font, font_key @@ -246,10 +252,10 @@ def remap_dest(dest_node): invalidated to its new one. """ try: - pageref = dest_node[0] - if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap: - new_objgen = pageref_remap[pageref.objgen] - dest_node[0] = pdf_base.get_object(new_objgen) + pageref = dest_node[0] + if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap: + new_objgen = pageref_remap[pageref.objgen] + dest_node[0] = pdf_base.get_object(new_objgen) except (IndexError, TypeError) as e: log.warning("This file may contain invalid table of contents entries") log.debug(e) @@ -392,7 +398,7 @@ def input_sorter(key): content_rotation - autorotate_correction ) % 360 - if len(keep_open) > 100: + if len(keep_open) > MAX_OPEN_PAGE_PDFS: # qpdf limitations require us to keep files open when we intend # to copy content from them before saving. However, we want to keep # a lid on file handles and memory usage, so for big files we're @@ -409,7 +415,7 @@ def input_sorter(key): pdf_base = pikepdf.open(interim) procset = pdf_base.pages[0].Resources.ProcSet - font = pdf_base.pages[0].Resources.Font.get(font_key) + font, font_key = None, None # Reacquire this information _fix_toc(pdf_base, pagerefs, log) pdf_base.save(output_file) diff --git a/src/ocrmypdf/exec/tesseract.py b/src/ocrmypdf/exec/tesseract.py index b788d29b4..a81d591d0 100644 --- a/src/ocrmypdf/exec/tesseract.py +++ b/src/ocrmypdf/exec/tesseract.py @@ -349,7 +349,7 @@ def generate_pdf( if os.path.exists(prefix + '.txt'): shutil.move(prefix + '.txt', output_text) except TimeoutExpired: - page_timedout(log, input_image) + page_timedout(log, input_image, timeout) use_skip_page(text_only, skip_pdf, output_pdf, output_text) except CalledProcessError as e: tesseract_log_output(log, e.output, input_image) diff --git a/tests/test_weave.py b/tests/test_weave.py index ae7d8465b..34452215e 100644 --- a/tests/test_weave.py +++ b/tests/test_weave.py @@ -17,20 +17,24 @@ from unittest.mock import MagicMock import logging +import os import pytest import pikepdf -from ocrmypdf._weave import _fix_toc +from ocrmypdf._weave import _fix_toc, _update_page_resources -def test_invalid_toc(resources, tmpdir, caplog): +check_ocrmypdf = pytest.helpers.check_ocrmypdf + + +def test_invalid_toc(resources, outdir, caplog): pdf = pikepdf.open(resources / 'toc.pdf') # Corrupt a TOC entry pdf.Root.Outlines.Last.Dest = pikepdf.Array([None, 0.0, 0.1, 0.2]) - pdf.save(tmpdir / 'test.pdf') + pdf.save(outdir / 'test.pdf') - pdf = pikepdf.open(tmpdir / 'test.pdf') + pdf = pikepdf.open(outdir / 'test.pdf') remap = {} remap[pdf.pages[0].objgen] = pdf.pages[0].objgen # Dummy remap @@ -38,3 +42,23 @@ def test_invalid_toc(resources, tmpdir, caplog): log = logging.getLogger() _fix_toc(pdf, remap, log) assert 'invalid table of contents entries' in caplog.text + + +def test_no_glyphless_weave(resources, outdir): + pdf = pikepdf.open(resources / 'francais.pdf') + pdf_aspect = pikepdf.open(resources / 'aspect.pdf') + pdf_cmyk = pikepdf.open(resources / 'cmyk.pdf') + pdf.pages.extend(pdf_aspect.pages) + pdf.pages.extend(pdf_cmyk.pages) + pdf.save(outdir / 'test.pdf') + + env = os.environ.copy() + env['_OCRMYPDF_MAX_OPEN_PAGE_PDFS'] = '2' + check_ocrmypdf( + outdir / 'test.pdf', + outdir / 'out.pdf', + '--deskew', + '--tesseract-timeout', + '0', + env=env, + )