From 94150f414adf154bda55573e1c442f91cbd86119 Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Sat, 23 Jun 2018 00:43:40 -0700
Subject: [PATCH] Remove qpdf.merge

We no longer need to merge pages this way. Much of the functionality was
there to implement page splitting without hitting the ulimit on open file
descriptors, which will be fixed in qpdf > 8.0.2. The tests were expensive
to run.

Also remove pytest-timeout since it breaks the Linux build.
---
 src/ocrmypdf/exec/qpdf.py | 121 --------------------------------------
 src/ocrmypdf/pipeline.py  |   8 ---
 test_requirements.txt     |   1 -
 tests/test_qpdf.py        |  70 ----------------------
 4 files changed, 200 deletions(-)
 delete mode 100644 tests/test_qpdf.py
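
For reference (commentary only; git am ignores text between the --- line and
the first diff): the removed get_npages()/split_pages() helpers boiled down to
qpdf's --show-npages plus one --pages call per page. A condensed sketch of
that flow follows; the function name split_to_pages and the file names are
illustrative only, not code that exists in the tree.

    import os
    from subprocess import PIPE, run

    def split_to_pages(input_file, work_folder):
        # 'qpdf --show-npages' prints the page count on stdout
        proc = run(
            ['qpdf', '--show-npages', input_file],
            universal_newlines=True, check=True, stdout=PIPE)
        npages = int(proc.stdout)
        for n in range(npages):
            # Copy page n+1 of input_file into its own single-page PDF,
            # e.g. 000001.page.pdf, 000002.page.pdf, ...
            target = os.path.join(
                work_folder, '{:06d}.page.pdf'.format(n + 1))
            run(['qpdf', input_file,
                 '--pages', input_file, str(n + 1), '--',
                 target],
                check=True)
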
diff --git a/src/ocrmypdf/exec/qpdf.py b/src/ocrmypdf/exec/qpdf.py
index 7646561bc..512f7aa3f 100644
--- a/src/ocrmypdf/exec/qpdf.py
+++ b/src/ocrmypdf/exec/qpdf.py
@@ -95,33 +95,6 @@ def repair(input_file, output_file, log):
         raise SubprocessOutputError() from e
 
 
-def get_npages(input_file, log):
-    try:
-        pages = run(
-            ['qpdf', '--show-npages', input_file],
-            universal_newlines=True, check=True, stdout=PIPE, stderr=STDOUT)
-    except CalledProcessError as e:
-        if e.returncode == 2 and e.output.find('No such file'):
-            log.error(e.output)
-            raise InputFileError() from e
-    return int(pages)
-
-
-def split_pages(input_file, work_folder, npages):
-    """Split multipage PDF into individual pages.
-
-    Incredibly enough, this multiple process approach is about 70 times
-    faster than using Ghostscript.
-    """
-    for n in range(int(npages)):
-        args_qpdf = [
-            'qpdf', input_file,
-            '--pages', input_file, '{0}'.format(n + 1), '--',
-            os.path.join(work_folder, '{0:06d}.page.pdf'.format(n + 1))
-        ]
-        run(args_qpdf, check=True)
-
-
 def extract_page(input_file, output_file, pageno):
     args_qpdf = [
         'qpdf', input_file,
@@ -129,97 +102,3 @@ def extract_page(input_file, output_file, pageno):
         output_file
     ]
     run(args_qpdf, check=True)
-
-
-def _merge_inner(input_files, output_file, min_version=None, log=None):
-    """Merge the list of input files (all filenames) into the output file.
-
-    The input files may contain one or more pages.
-    """
-
-    # Single page 'merges' should still be attempted to that the same error
-    # checking is applied to single page case
-
-    version_arg = ['--min-version={}'.format(min_version)] \
-        if min_version else []
-
-    if log is None:
-        import logging as log
-
-    args_qpdf = [
-        'qpdf'
-    ] + version_arg + [
-        input_files[0], '--pages'
-    ] + input_files + ['--', output_file]
-
-    try:
-        run(args_qpdf, check=True, stderr=PIPE, universal_newlines=True)
-    except CalledProcessError as e:
-        if e.returncode == 3 and \
-                e.stderr.find("unknown token while reading object") and \
-                e.stderr.find("operation succeeded"):
-            # Only whitelist the 'unknown token' problem (decimal/string issue)
-            # qpdf issue #165
-            log.warning('qpdf found and fixed errors: ' + e.stderr)
-            return
-        raise e from e
-
-
-def merge(input_files, output_file, min_version=None, log=None, max_files=None):
-    """Merge the list of input files (all filenames) into the output file.
-
-    The input files may contain one or more pages.
-
-    """
-    # qpdf requires that every file that contributes to the output has a file
-    # descriptor that remains open. That means, given our approach of one
-    # intermediate PDF per, we can practically hit the number of file
-    # descriptors.
-
-    if max_files is None or max_files < 2:
-        # Find out how many open file descriptors we can get away with
-        ulimits = resource.getrlimit(resource.RLIMIT_NOFILE)
-        max_open_files = ulimits[0]
-        max_files = max_open_files // 2  # Conservative guess
-
-    # We'll write things alongside the output file
-    output_dir = os.path.dirname(output_file)
-
-    import random
-    import string
-
-    def randstr():
-        return ''.join(random.sample(string.ascii_lowercase, 6))
-
-    # How many files to grab at once, merging all their contents
-    step_size = max_files
-
-    workqueue = input_files.copy()
-    counter = 1
-    next_workqueue = []
-    while len(workqueue) > 1 or len(next_workqueue) > 0:
-        # Take n files out of the queue
-        n = min(step_size, len(workqueue))
-        job = workqueue[0:n]
-        del workqueue[0:n]
-        log.debug('merging ' + repr(job))
-
-        # Merge them into 1 file, which will contain n^depth pages
-        merge_file = os.path.join(
-            output_dir, "merge-{:06d}-{}.pdf".format(counter, randstr()))
-        counter += 1
-        _merge_inner(job, merge_file, min_version=min_version, log=log)
-
-        # On the next
-        next_workqueue.append(merge_file)
-        log.debug('next_workqueue ' + repr(next_workqueue))
-
-        # If we're out of things to do in this queue, move on to the next
-        # queue. On the counter-th pass of the workqueue we can chew through
-        # (step_size)**N pages, so on most systems the second pass finishes
-        # the job.
-        if len(workqueue) == 0:
-            workqueue = next_workqueue
-            next_workqueue = []
-
-    re_symlink(workqueue.pop(), output_file, log)
diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py
index 72d5e296a..0e8e9036f 100644
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@@ -887,14 +887,6 @@ def build_pipeline(options, work_folder, log, context):
         os.path.join(work_folder, '*.marker.pdf'),
         extras=[log, context])
 
-    # task_split_pages = main_pipeline.transform(
-    #     task_func=split_page,
-    #     input=task_pre_split_pages,
-    #     filter=suffix('.presplit.pdf'),
-    #     output='.page.pdf',
-    #     output_dir=work_folder,
-    #     extras=[log, context])
-
     task_ocr_or_skip = main_pipeline.split(
         ocr_or_skip,
         task_marker_pages,
diff --git a/test_requirements.txt b/test_requirements.txt
index 10a71ae91..c1d250b55 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -2,7 +2,6 @@ pytest >= 3.2
 pytest-helpers-namespace
 pytest-xdist
 pytest-cov
-pytest-timeout
 python-xmp-toolkit   # requires apt-get install libexempi3
                      # or brew install exempi
 PyPDF2 >= 1.26.0
diff --git a/tests/test_qpdf.py b/tests/test_qpdf.py
deleted file mode 100644
index ed9bd0b83..000000000
--- a/tests/test_qpdf.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# © 2017 James R. Barlow: github.com/jbarlow83
-#
-# This file is part of OCRmyPDF.
-#
-# OCRmyPDF is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# OCRmyPDF is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
-
-import logging
-import resource
-import pytest
-
-from ocrmypdf.exec import ghostscript, tesseract, qpdf
-from ocrmypdf.pdfinfo import PdfInfo
-
-
-@pytest.mark.skipif(
-    qpdf.version() < '7.0.0',
-    reason="negzero.pdf crashes earlier versions")
-def test_qpdf_negative_zero(resources, outpdf):
-    negzero = resources / 'negzero.pdf'
-    hugemono = resources / 'hugemono.pdf'
-    # raises exception on err
-    qpdf.merge([str(negzero), str(hugemono)], outpdf, log=logging.getLogger())
-
-
-@pytest.mark.timeout(15)
-@pytest.mark.parametrize('max_files,skip', [
-    (2, 0),   # Can we merge correctly without opening more than 2 files at once?
-    (16, 0),  # And does this work properly when we can one-shot it?
-    (2, 1),   # Or playing with even/odd
-    (3, 0)    # Or odd step size
-    ])
-def test_qpdf_merge_correctness(resources, outpdf, max_files, skip):
-    # All of these must be only one page long
-    inputs = [
-        '2400dpi.pdf', 'aspect.pdf', 'blank.pdf', 'ccitt.pdf',
-        'linn.pdf', 'masks.pdf', 'poster.pdf', 'overlay.pdf',
-        'skew.pdf', 'trivial.pdf']
-
-    input_files = [str(resources / f) for f in inputs]
-
-    qpdf.merge(
-        input_files[skip:], outpdf, log=logging.getLogger(),
-        max_files=max_files)
-    assert len(PdfInfo(outpdf).pages) == len(input_files[skip:])
-
-
-@pytest.mark.timeout(15)
-@pytest.mark.skipif(
-    True,
-    reason='qpdf binary cannot open multiple files multiple times')
-def test_page_merge_ulimit(resources, outpdf):
-    # Ensure we can merge pages without opening one file descriptor per page
-    ulimits = resource.getrlimit(resource.RLIMIT_NOFILE)
-    page_count = ulimits[0]
-    print(page_count)
-    input_files = [str(resources / 'trivial.pdf')] * page_count
-
-    qpdf.merge(input_files, outpdf, log=logging.getLogger())
-
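
For reference, commentary only (this trails the patch and is not applied): the
merge path deleted above ultimately reduced to one qpdf call per batch, with
the batch size capped near half of the soft RLIMIT_NOFILE value because qpdf
keeps a file descriptor open for every contributing input. A minimal sketch of
that one-shot call is below; merge_one_shot and its error handling are
illustrative, not the removed implementation verbatim.

    import resource
    from subprocess import PIPE, run

    def merge_one_shot(input_files, output_file, min_version=None):
        # qpdf keeps every input open, so stay well under the soft limit on
        # open file descriptors (the removed merge() batched at half of it).
        soft_limit, _hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        if len(input_files) > soft_limit // 2:
            raise ValueError('too many inputs for a single qpdf invocation')

        args = ['qpdf']
        if min_version:
            args.append('--min-version={}'.format(min_version))
        # The first input is qpdf's primary input; '--pages' then concatenates
        # the pages of every listed file into output_file.
        args += [input_files[0], '--pages', *input_files, '--', output_file]
        run(args, check=True, stderr=PIPE, universal_newlines=True)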