Skip to content

Commit

Permalink
Fix issue when weave handoff occurs with no OCR font present
Browse files Browse the repository at this point in the history
When --tesseract-timeout 0 is used together with any image processing on a
file with more than 100 pages, the weave handoff will occur. Ensure the
handoff works correctly even if no Glyphless font is present.

Closes #347
  • Loading branch information
James R. Barlow committed Feb 10, 2019
1 parent df68874 commit 19e35db
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 12 deletions.
20 changes: 13 additions & 7 deletions src/ocrmypdf/_weave.py
Expand Up @@ -17,13 +17,17 @@

from itertools import groupby
from pathlib import Path
import os

import pikepdf

from .exec import tesseract
from .helpers import flatten_groups, page_number


# Ceiling on how many per-page PDFs the weave step keeps open before it
# performs its handoff. The underscore-prefixed environment variable lets the
# limit be lowered (e.g. by tests) without needing a very large document.
MAX_OPEN_PAGE_PDFS = int(os.getenv('_OCRMYPDF_MAX_OPEN_PAGE_PDFS', '100'))


def _update_page_resources(*, page, font, font_key, procset):
"""Update this page's fonts with a reference to the Glyphless font"""

Expand All @@ -34,7 +38,7 @@ def _update_page_resources(*, page, font, font_key, procset):
fonts = resources['/Font']
except KeyError:
fonts = pikepdf.Dictionary({})
if font_key not in fonts:
if font_key is not None and font_key not in fonts:
fonts[font_key] = font
resources['/Font'] = fonts

Expand Down Expand Up @@ -177,6 +181,8 @@ def _find_font(text, pdf_base):
break
if pdf_text_font:
font = pdf_base.copy_foreign(pdf_text_font)
if font_key is None:
print('font_key is None')
return font, font_key


Expand Down Expand Up @@ -246,10 +252,10 @@ def remap_dest(dest_node):
invalidated to its new one.
"""
try:
pageref = dest_node[0]
if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
new_objgen = pageref_remap[pageref.objgen]
dest_node[0] = pdf_base.get_object(new_objgen)
pageref = dest_node[0]
if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
new_objgen = pageref_remap[pageref.objgen]
dest_node[0] = pdf_base.get_object(new_objgen)
except (IndexError, TypeError) as e:
log.warning("This file may contain invalid table of contents entries")
log.debug(e)
Expand Down Expand Up @@ -392,7 +398,7 @@ def input_sorter(key):
content_rotation - autorotate_correction
) % 360

if len(keep_open) > 100:
if len(keep_open) > MAX_OPEN_PAGE_PDFS:
# qpdf limitations require us to keep files open when we intend
# to copy content from them before saving. However, we want to keep
# a lid on file handles and memory usage, so for big files we're
Expand All @@ -409,7 +415,7 @@ def input_sorter(key):

pdf_base = pikepdf.open(interim)
procset = pdf_base.pages[0].Resources.ProcSet
font = pdf_base.pages[0].Resources.Font.get(font_key)
font, font_key = None, None # Reacquire this information

_fix_toc(pdf_base, pagerefs, log)
pdf_base.save(output_file)
2 changes: 1 addition & 1 deletion src/ocrmypdf/exec/tesseract.py
Expand Up @@ -349,7 +349,7 @@ def generate_pdf(
if os.path.exists(prefix + '.txt'):
shutil.move(prefix + '.txt', output_text)
except TimeoutExpired:
page_timedout(log, input_image)
page_timedout(log, input_image, timeout)
use_skip_page(text_only, skip_pdf, output_pdf, output_text)
except CalledProcessError as e:
tesseract_log_output(log, e.output, input_image)
Expand Down
32 changes: 28 additions & 4 deletions tests/test_weave.py
Expand Up @@ -17,24 +17,48 @@

from unittest.mock import MagicMock
import logging
import os

import pytest

import pikepdf
from ocrmypdf._weave import _fix_toc
from ocrmypdf._weave import _fix_toc, _update_page_resources

def test_invalid_toc(resources, tmpdir, caplog):
check_ocrmypdf = pytest.helpers.check_ocrmypdf


def test_invalid_toc(resources, outdir, caplog):
pdf = pikepdf.open(resources / 'toc.pdf')

# Corrupt a TOC entry
pdf.Root.Outlines.Last.Dest = pikepdf.Array([None, 0.0, 0.1, 0.2])
pdf.save(tmpdir / 'test.pdf')
pdf.save(outdir / 'test.pdf')

pdf = pikepdf.open(tmpdir / 'test.pdf')
pdf = pikepdf.open(outdir / 'test.pdf')
remap = {}
remap[pdf.pages[0].objgen] = pdf.pages[0].objgen # Dummy remap

# Confirm we complain about the TOC and don't throw an exception
log = logging.getLogger()
_fix_toc(pdf, remap, log)
assert 'invalid table of contents entries' in caplog.text


def test_no_glyphless_weave(resources, outdir):
    """Run OCRmyPDF with the open-page limit forced down to 2.

    Builds an input file by appending the pages of ``aspect.pdf`` and
    ``cmyk.pdf`` to ``francais.pdf``, then runs OCRmyPDF on it with
    ``--deskew`` and ``--tesseract-timeout 0`` while
    ``_OCRMYPDF_MAX_OPEN_PAGE_PDFS`` is set to ``'2'`` in the environment.
    Presumably this triggers the weave handoff on a small file with no OCR
    font in the output -- confirm against the weave implementation.
    """
    input_file = outdir / 'test.pdf'
    output_file = outdir / 'out.pdf'

    base = pikepdf.open(resources / 'francais.pdf')
    # Hold references to the donor PDFs until after save(), as the original
    # did by keeping each one in its own local variable.
    donors = [
        pikepdf.open(resources / name) for name in ('aspect.pdf', 'cmyk.pdf')
    ]
    for donor in donors:
        base.pages.extend(donor.pages)
    base.save(input_file)

    # Copy of the current environment with the page limit overridden.
    env = dict(os.environ, _OCRMYPDF_MAX_OPEN_PAGE_PDFS='2')
    check_ocrmypdf(
        input_file,
        output_file,
        '--deskew',
        '--tesseract-timeout',
        '0',
        env=env,
    )

0 comments on commit 19e35db

Please sign in to comment.