From 94150f414adf154bda55573e1c442f91cbd86119 Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Sat, 23 Jun 2018 00:43:40 -0700
Subject: [PATCH] Remove qpdf.merge

We no longer need to merge pages this way. Much of the functionality was
there to implement page splitting without hitting the ulimit on open file
descriptors, which will be fixed in qpdf > 8.0.2. The tests were expensive
to run.

Also remove pytest-timeout since it breaks the Linux build.
---
 src/ocrmypdf/exec/qpdf.py | 121 --------------------------------------
 src/ocrmypdf/pipeline.py  |   8 ---
 test_requirements.txt     |   1 -
 tests/test_qpdf.py        |  70 ----------------------
 4 files changed, 200 deletions(-)
 delete mode 100644 tests/test_qpdf.py
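
For reference (commentary only; git am ignores text between the --- line and
the first diff): the removed get_npages()/split_pages() helpers boiled down to
qpdf's --show-npages plus one --pages call per page. A condensed sketch of
that flow follows; the function name split_to_pages and the file names are
illustrative only, not code that exists in the tree.

    import os
    from subprocess import PIPE, run

    def split_to_pages(input_file, work_folder):
        # 'qpdf --show-npages' prints the page count on stdout
        proc = run(
            ['qpdf', '--show-npages', input_file],
            universal_newlines=True, check=True, stdout=PIPE)
        npages = int(proc.stdout)
        for n in range(npages):
            # Copy page n+1 of input_file into its own single-page PDF,
            # e.g. 000001.page.pdf, 000002.page.pdf, ...
            target = os.path.join(
                work_folder, '{:06d}.page.pdf'.format(n + 1))
            run(['qpdf', input_file,
                 '--pages', input_file, str(n + 1), '--',
                 target],
                check=True)
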
diff --git a/src/ocrmypdf/exec/qpdf.py b/src/ocrmypdf/exec/qpdf.py
index 7646561bc..512f7aa3f 100644
--- a/src/ocrmypdf/exec/qpdf.py
+++ b/src/ocrmypdf/exec/qpdf.py
@@ -95,33 +95,6 @@ def repair(input_file, output_file, log):
         raise SubprocessOutputError() from e
 
 
-def get_npages(input_file, log):
-    try:
-        pages = run(
-            ['qpdf', '--show-npages', input_file],
-            universal_newlines=True, check=True, stdout=PIPE, stderr=STDOUT)
-    except CalledProcessError as e:
-        if e.returncode == 2 and e.output.find('No such file'):
-            log.error(e.output)
-            raise InputFileError() from e
-    return int(pages)
-
-
-def split_pages(input_file, work_folder, npages):
-    """Split multipage PDF into individual pages.
-
-    Incredibly enough, this multiple process approach is about 70 times
-    faster than using Ghostscript.
-    """
-    for n in range(int(npages)):
-        args_qpdf = [
-            'qpdf', input_file,
-            '--pages', input_file, '{0}'.format(n + 1), '--',
-            os.path.join(work_folder, '{0:06d}.page.pdf'.format(n + 1))
-        ]
-        run(args_qpdf, check=True)
-
-
 def extract_page(input_file, output_file, pageno):
     args_qpdf = [
         'qpdf', input_file,
@@ -129,97 +102,3 @@ def extract_page(input_file, output_file, pageno):
         output_file
     ]
     run(args_qpdf, check=True)
-
-
-def _merge_inner(input_files, output_file, min_version=None, log=None):
-    """Merge the list of input files (all filenames) into the output file.
-
-    The input files may contain one or more pages.
-    """
-
-    # Single page 'merges' should still be attempted to that the same error
-    # checking is applied to single page case
-
-    version_arg = ['--min-version={}'.format(min_version)] \
-        if min_version else []
-
-    if log is None:
-        import logging as log
-
-    args_qpdf = [
-        'qpdf'
-    ] + version_arg + [
-        input_files[0], '--pages'
-    ] + input_files + ['--', output_file]
-
-    try:
-        run(args_qpdf, check=True, stderr=PIPE, universal_newlines=True)
-    except CalledProcessError as e:
-        if e.returncode == 3 and \
-                e.stderr.find("unknown token while reading object") and \
-                e.stderr.find("operation succeeded"):
-            # Only whitelist the 'unknown token' problem (decimal/string issue)
-            # qpdf issue #165
-            log.warning('qpdf found and fixed errors: ' + e.stderr)
-            return
-        raise e from e
-
-
-def merge(input_files, output_file, min_version=None, log=None, max_files=None):
-    """Merge the list of input files (all filenames) into the output file.
-
-    The input files may contain one or more pages.
-
-    """
-    # qpdf requires that every file that contributes to the output has a file
-    # descriptor that remains open. That means, given our approach of one
-    # intermediate PDF per, we can practically hit the number of file
-    # descriptors.
-
-    if max_files is None or max_files < 2:
-        # Find out how many open file descriptors we can get away with
-        ulimits = resource.getrlimit(resource.RLIMIT_NOFILE)
-        max_open_files = ulimits[0]
-        max_files = max_open_files // 2  # Conservative guess
-
-    # We'll write things alongside the output file
-    output_dir = os.path.dirname(output_file)
-
-    import random
-    import string
-
-    def randstr():
-        return ''.join(random.sample(string.ascii_lowercase, 6))
-
-    # How many files to grab at once, merging all their contents
-    step_size = max_files
-
-    workqueue = input_files.copy()
-    counter = 1
-    next_workqueue = []
-    while len(workqueue) > 1 or len(next_workqueue) > 0:
-        # Take n files out of the queue
-        n = min(step_size, len(workqueue))
-        job = workqueue[0:n]
-        del workqueue[0:n]
-        log.debug('merging ' + repr(job))
-
-        # Merge them into 1 file, which will contain n^depth pages
-        merge_file = os.path.join(
-            output_dir, "merge-{:06d}-{}.pdf".format(counter, randstr()))
-        counter += 1
-        _merge_inner(job, merge_file, min_version=min_version, log=log)
-
-        # On the next
-        next_workqueue.append(merge_file)
-        log.debug('next_workqueue ' + repr(next_workqueue))
-
-        # If we're out of things to do in this queue, move on to the next
-        # queue. On the counter-th pass of the workqueue we can chew through
-        # (step_size)**N pages, so on most systems the second pass finishes
-        # the job.
-        if len(workqueue) == 0:
-            workqueue = next_workqueue
-            next_workqueue = []
-
-    re_symlink(workqueue.pop(), output_file, log)
diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py
index 72d5e296a..0e8e9036f 100644
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@@ -887,14 +887,6 @@ def build_pipeline(options, work_folder, log, context):
         os.path.join(work_folder, '*.marker.pdf'),
         extras=[log, context])
 
-    # task_split_pages = main_pipeline.transform(
-    #     task_func=split_page,
-    #     input=task_pre_split_pages,
-    #     filter=suffix('.presplit.pdf'),
-    #     output='.page.pdf',
-    #     output_dir=work_folder,
-    #     extras=[log, context])
-
     task_ocr_or_skip = main_pipeline.split(
         ocr_or_skip,
         task_marker_pages,
diff --git a/test_requirements.txt b/test_requirements.txt
index 10a71ae91..c1d250b55 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -2,7 +2,6 @@ pytest >= 3.2
 pytest-helpers-namespace
 pytest-xdist
 pytest-cov
-pytest-timeout
 python-xmp-toolkit   # requires apt-get install libexempi3
                      # or brew install exempi
 PyPDF2 >= 1.26.0
diff --git a/tests/test_qpdf.py b/tests/test_qpdf.py
deleted file mode 100644
index ed9bd0b83..000000000
--- a/tests/test_qpdf.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# © 2017 James R. Barlow: github.com/jbarlow83
-#
-# This file is part of OCRmyPDF.
-#
-# OCRmyPDF is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# OCRmyPDF is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
-
-import logging
-import resource
-import pytest
-
-from ocrmypdf.exec import ghostscript, tesseract, qpdf
-from ocrmypdf.pdfinfo import PdfInfo
-
-
-@pytest.mark.skipif(
-    qpdf.version() < '7.0.0',
-    reason="negzero.pdf crashes earlier versions")
-def test_qpdf_negative_zero(resources, outpdf):
-    negzero = resources / 'negzero.pdf'
-    hugemono = resources / 'hugemono.pdf'
-    # raises exception on err
-    qpdf.merge([str(negzero), str(hugemono)], outpdf, log=logging.getLogger())
-
-
-@pytest.mark.timeout(15)
-@pytest.mark.parametrize('max_files,skip', [
-    (2, 0),   # Can we merge correctly without opening more than 2 files at once?
-    (16, 0),  # And does this work properly when we can one-shot it?
-    (2, 1),   # Or playing with even/odd
-    (3, 0)    # Or odd step size
-    ])
-def test_qpdf_merge_correctness(resources, outpdf, max_files, skip):
-    # All of these must be only one page long
-    inputs = [
-        '2400dpi.pdf', 'aspect.pdf', 'blank.pdf', 'ccitt.pdf',
-        'linn.pdf', 'masks.pdf', 'poster.pdf', 'overlay.pdf',
-        'skew.pdf', 'trivial.pdf']
-
-    input_files = [str(resources / f) for f in inputs]
-
-    qpdf.merge(
-        input_files[skip:], outpdf, log=logging.getLogger(),
-        max_files=max_files)
-    assert len(PdfInfo(outpdf).pages) == len(input_files[skip:])
-
-
-@pytest.mark.timeout(15)
-@pytest.mark.skipif(
-    True,
-    reason='qpdf binary cannot open multiple files multiple times')
-def test_page_merge_ulimit(resources, outpdf):
-    # Ensure we can merge pages without opening one file descriptor per page
-    ulimits = resource.getrlimit(resource.RLIMIT_NOFILE)
-    page_count = ulimits[0]
-    print(page_count)
-    input_files = [str(resources / 'trivial.pdf')] * page_count
-
-    qpdf.merge(input_files, outpdf, log=logging.getLogger())
-
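
For reference, commentary only (this trails the patch and is not applied): the
merge path deleted above ultimately reduced to one qpdf call per batch, with
the batch size capped near half of the soft RLIMIT_NOFILE value because qpdf
keeps a file descriptor open for every contributing input. A minimal sketch of
that one-shot call is below; merge_one_shot and its error handling are
illustrative, not the removed implementation verbatim.

    import resource
    from subprocess import PIPE, run

    def merge_one_shot(input_files, output_file, min_version=None):
        # qpdf keeps every input open, so stay well under the soft limit on
        # open file descriptors (the removed merge() batched at half of it).
        soft_limit, _hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        if len(input_files) > soft_limit // 2:
            raise ValueError('too many inputs for a single qpdf invocation')

        args = ['qpdf']
        if min_version:
            args.append('--min-version={}'.format(min_version))
        # The first input is qpdf's primary input; '--pages' then concatenates
        # the pages of every listed file into output_file.
        args += [input_files[0], '--pages', *input_files, '--', output_file]
        run(args, check=True, stderr=PIPE, universal_newlines=True)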