mindee · ianardee · Oct 13, 2025 · Oct 13, 2025 · Oct 13, 2025 · Oct 13, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -27,7 +27,7 @@ repos:
       - id: gitleaks
 
   - repo: https://github.com/PyCQA/pylint
-    rev: v3.3.1
+    rev: v3.3.9
     hooks:
       - id: pylint
         name: pylint

diff --git a/examples/auto_invoice_splitter_extraction_example.py b/examples/auto_invoice_splitter_extraction_example.py
@@ -11,7 +11,7 @@
 def parse_invoice(file_path):
     input_source = PathInput(file_path)
 
-    if input_source.is_pdf() and input_source.count_doc_pages() > 1:
+    if input_source.is_pdf() and input_source.page_count > 1:
         parse_multi_page(input_source)
     else:
         parse_single_page(input_source)

diff --git a/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py b/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py
@@ -24,7 +24,7 @@ def extract_receipts(
         raise MindeeError(
             "No possible receipts candidates found for MultiReceipts extraction."
         )
-    for page_id in range(input_source.count_doc_pages()):
+    for page_id in range(input_source.page_count):
         receipt_positions = [
             receipt.bounding_box
             for receipt in inference.pages[page_id].prediction.receipts

diff --git a/mindee/input/sources/local_input_source.py b/mindee/input/sources/local_input_source.py
@@ -36,6 +36,7 @@ class LocalInputSource:
     file_mimetype: str
     input_type: InputType
     filepath: Optional[str]
+    _page_count: Optional[int] = None
 
     def __init__(self, input_type: InputType):
         self.input_type = input_type
@@ -100,17 +101,25 @@ def is_pdf(self) -> bool:
         """:return: True if the file is a PDF."""
         return self.file_mimetype == "application/pdf"
 
-    def count_doc_pages(self) -> int:
+    @property
+    def page_count(self) -> int:
         """
-        Count the pages in the PDF.
+        Count the pages in the document, caching the result in ``_page_count``.
 
-        :return: the number of pages.
+        :return: The number of pages (always 1 when the file is not a PDF).
         """
-        if self.is_pdf():
-            self.file_object.seek(0)
-            pdf = pdfium.PdfDocument(self.file_object)
-            return len(pdf)
-        return 1
+        if self._page_count is None and not self.is_pdf():
+            self._page_count = 1
+        if self._page_count is None:
+            self.file_object.seek(0)
+            pdf = pdfium.PdfDocument(self.file_object)
+            self._page_count = len(pdf)
+            pdf.close()  # release the PDFium document now that the count is cached
+        return self._page_count
+
+    def count_doc_pages(self) -> int:
+        """Deprecated alias of the ``page_count`` property; returns the same value."""
+        return self.page_count
 
     def apply_page_options(self, page_options: PageOptions) -> None:
         """Apply cut and merge options on multipage documents."""
@@ -131,10 +140,10 @@ def process_pdf(
         """Run any required processing on a PDF file."""
         if self.is_pdf_empty():
             raise MindeeSourceError(f"PDF pages are empty in: {self.filename}")
-        pages_count = self.count_doc_pages()
-        if on_min_pages > pages_count:
+        page_count = self.page_count
+        if on_min_pages > page_count:
             return
-        all_pages = list(range(pages_count))
+        all_pages = list(range(page_count))
         if behavior == KEEP_ONLY:
             pages_to_keep = set()
             for page_id in page_indexes:
@@ -161,7 +170,7 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
         """
         Create a new PDF from pages and set it to ``file_object``.
 
-        :param page_numbers: List of pages number to use for merging in the original PDF.
+        :param page_numbers: List of page numbers to use for merging in the original PDF.
         :return: None
         """
         self.file_object.seek(0)
@@ -172,6 +181,7 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
         bytes_io = io.BytesIO()
         new_pdf.save(bytes_io)
         self.file_object = bytes_io
+        self._page_count = len(new_pdf)
 
     def is_pdf_empty(self) -> bool:
         """

diff --git a/pyproject.toml b/pyproject.toml
@@ -44,15 +44,15 @@ Changelog = "https://github.com/mindee/mindee-api-python/blob/main/CHANGELOG.md"
 
 [project.optional-dependencies]
 lint = [
-  "pylint==3.3.1",
-  "pre-commit~=3.2.2",
-  "types-pytz>=2023.3",
+  "pylint==3.3.9",
+  "pre-commit~=3.6.0",
+  "types-pytz>=2024.2",
   "types-requests>=2.31",
 ]
 test = [
   "toml~=0.10.2",
   "pytest~=7.4",
-  "pytest-cov~=4.1",
+  "pytest-cov~=5.0",
 ]
 docs = [
   "sphinx~=5.3",

diff --git a/tests/extraction/test_image_extractor.py b/tests/extraction/test_image_extractor.py
@@ -6,7 +6,7 @@
 from mindee.extraction.common.image_extractor import extract_multiple_images_from_source
 from mindee.input.sources.path_input import PathInput
 from mindee.product.barcode_reader.barcode_reader_v1 import BarcodeReaderV1
-from tests.test_inputs import PRODUCT_DATA_DIR
+from tests.utils import PRODUCT_DATA_DIR
 
 
 @pytest.fixture

diff --git a/tests/extraction/test_invoice_splitter_auto_extraction.py b/tests/extraction/test_invoice_splitter_auto_extraction.py
@@ -9,8 +9,7 @@
 from mindee.product.invoice.invoice_v4 import InvoiceV4
 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
 from tests.product import get_id, get_version
-from tests.test_inputs import PRODUCT_DATA_DIR
-from tests.utils import levenshtein_ratio
+from tests.utils import PRODUCT_DATA_DIR, levenshtein_ratio
 
 
 @pytest.fixture

diff --git a/tests/extraction/test_multi_receipts_extractor.py b/tests/extraction/test_multi_receipts_extractor.py
@@ -10,7 +10,7 @@
 from mindee.product.multi_receipts_detector.multi_receipts_detector_v1 import (
     MultiReceiptsDetectorV1,
 )
-from tests.test_inputs import PRODUCT_DATA_DIR
+from tests.utils import PRODUCT_DATA_DIR
 
 
 @pytest.fixture

diff --git a/tests/extraction/test_pdf_extractor.py b/tests/extraction/test_pdf_extractor.py
@@ -8,7 +8,7 @@
 from mindee.product.invoice_splitter.invoice_splitter_v1_document import (
     InvoiceSplitterV1Document,
 )
-from tests.test_inputs import PRODUCT_DATA_DIR
+from tests.utils import PRODUCT_DATA_DIR
 
 
 @pytest.fixture

diff --git a/tests/extras/test_extras_integration.py b/tests/extras/test_extras_integration.py
@@ -3,7 +3,7 @@
 from mindee import Client
 from mindee.product.international_id.international_id_v2 import InternationalIdV2
 from mindee.product.invoice.invoice_v4 import InvoiceV4
-from tests.product import PRODUCT_DATA_DIR
+from tests.utils import PRODUCT_DATA_DIR
 
 
 @pytest.fixture

diff --git a/tests/input/test_apply_page_options.py b/tests/input/test_apply_page_options.py
@@ -0,0 +1,163 @@
+import io
+
+import pypdfium2 as pdfium
+import pytest
+
+from mindee.error import MindeeError
+from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions
+from mindee.input.sources import (
+    Base64Input,
+    BytesInput,
+    FileInput,
+    LocalInputSource,
+    PathInput,
+)
+from tests.utils import FILE_TYPES_DIR, PRODUCT_DATA_DIR
+
+
+def _assert_page_options(input_source: LocalInputSource, numb_pages: int):
+    """Compare each page of the ``multipage_cut-N`` fixture with the processed PDF."""
+    assert input_source.is_pdf() is True
+    # pypdfium2 has no terse way to compare pages, so each page of both
+    # documents is rendered to a bitmap and the raw bytes are compared.
+    actual_doc = pdfium.PdfDocument(input_source.file_object)
+    expected_path = FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf"
+    expected_doc = pdfium.PdfDocument(expected_path)
+    for page_index in range(len(expected_doc)):
+        expected_page = expected_doc.get_page(page_index)
+        expected_render = pdfium.PdfPage.render(expected_page)
+        actual_page = actual_doc.get_page(page_index)
+        actual_render = pdfium.PdfPage.render(actual_page)
+        assert bytes(expected_render.buffer) == bytes(actual_render.buffer)
+    actual_doc.close()
+    expected_doc.close()
+
+
+def test_pdf_reconstruct_ok():
+    pdf_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    pdf_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=range(5))
+    assert isinstance(pdf_source.file_object, io.BytesIO)  # rebuilt in memory
+
+
+@pytest.mark.parametrize("numb_pages", [1, 2, 3])
+def test_process_pdf_cut_n_pages(numb_pages: int):
+    """Cut the document down to ``numb_pages`` pages through ``process_pdf``."""
+    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    assert source.page_count == 12
+    kept_indexes = [0, -2, -1][:numb_pages]
+    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=kept_indexes)
+    assert source.page_count == numb_pages
+    _assert_page_options(source, numb_pages)
+
+
+@pytest.mark.parametrize("numb_pages", [1, 2, 3])
+def test_apply_pages_pdf_cut_n_pages(numb_pages: int):
+    """Cut the document down to ``numb_pages`` pages through ``apply_page_options``."""
+    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    assert source.page_count == 12
+    options = PageOptions(on_min_pages=2, page_indexes=[0, -2, -1][:numb_pages])
+    source.apply_page_options(options)
+    assert source.page_count == numb_pages
+    _assert_page_options(source, numb_pages)
+
+
+def test_pdf_keep_5_first_pages():
+    """Keeping indexes 0 to 4 is expected to leave 5 pages."""
+    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    assert source.page_count == 12
+    first_five = [0, 1, 2, 3, 4]
+    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=first_five)
+    assert source.page_count == 5
+
+
+def test_pdf_keep_invalid_pages():
+    """Keeping indexes 0, 1 and 17 is expected to leave only 2 pages."""
+    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    assert source.page_count == 12
+    requested = [0, 1, 17]
+    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=requested)
+    assert source.page_count == 2
+
+
+def test_pdf_remove_5_last_pages():
+    """Removing indexes -5 to -1 is expected to leave 7 pages."""
+    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    assert source.is_pdf() is True
+    last_five = [-5, -4, -3, -2, -1]
+    source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=last_five)
+    assert source.page_count == 7
+
+
+def test_pdf_remove_5_first_pages():
+    """Removing indexes 0 to 4 is expected to leave 7 pages."""
+    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    assert source.is_pdf() is True
+    first_five = list(range(5))
+    source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=first_five)
+    assert source.page_count == 7
+
+
+def test_pdf_remove_invalid_pages():
+    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    assert source.is_pdf() is True
+    source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=[16])
+    assert source.page_count == 12  # removing index 16 leaves the count unchanged
+
+
+def test_pdf_keep_no_pages():
+    """``KEEP_ONLY`` is expected to raise when no valid page index is given."""
+    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    assert source.is_pdf() is True
+    # Nothing is selected.
+    with pytest.raises(RuntimeError):
+        source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[])
+    # Only invalid indexes are selected.
+    invalid = [16, 17]
+    with pytest.raises(RuntimeError):
+        source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=invalid)
+
+
+def test_pdf_remove_all_pages():
+    """``REMOVE`` is expected to raise when indexes 0 to 14 are all selected."""
+    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
+    assert source.is_pdf() is True
+    every_page = list(range(15))
+    with pytest.raises(RuntimeError):
+        source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=every_page)
+
+
+def test_pdf_input_from_file():
+    with open(FILE_TYPES_DIR / "pdf" / "multipage.pdf", "rb") as pdf_file:
+        source = FileInput(pdf_file)
+        assert source.is_pdf() is True
+        source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
+    assert source.page_count == 1  # checked after the file handle is closed
+
+
+def test_pdf_input_from_base64():
+    with open(PRODUCT_DATA_DIR / "invoices" / "invoice_10p.txt", "rt") as b64_file:
+        source = Base64Input(b64_file.read(), filename="invoice_10p.pdf")
+    assert source.is_pdf() is True
+    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
+    assert source.page_count == 1  # only index 0 was requested
+
+
+def test_pdf_input_from_bytes():
+    with open(PRODUCT_DATA_DIR / "invoices" / "invoice_10p.pdf", "rb") as pdf_file:
+        source = BytesInput(pdf_file.read(), filename="invoice_10p.pdf")
+    assert source.is_pdf() is True
+    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
+    assert source.page_count == 1  # only index 0 was requested
+
+
+def test_pdf_blank_check():
+    """Loading and processing the blank fixtures is expected to raise ``MindeeError``."""
+    pdf_dir = FILE_TYPES_DIR / "pdf"
+    for blank_name in ("blank.pdf", "blank_1.pdf"):
+        with pytest.raises(MindeeError):
+            blank = PathInput(pdf_dir / blank_name)
+            blank.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
+
+    # The image-only fixture is only checked for its page count here.
+    image_only = PathInput(pdf_dir / "not_blank_image_only.pdf")
+    assert image_only.page_count == 1
diff --git a/tests/input/test_fix_pdf.py b/tests/input/test_fix_pdf.py
@@ -0,0 +1,22 @@
+import pytest
+
+from mindee import PathInput
+from mindee.error import MimeTypeError
+from tests.utils import FILE_TYPES_DIR
+
+
+def test_broken_unfixable_pdf():
+    with pytest.raises(MimeTypeError):  # raised while loading or while fixing
+        broken = PathInput(FILE_TYPES_DIR / "pdf" / "broken_unfixable.pdf")
+        broken.fix_pdf()
+
+
+def test_broken_fixable_pdf():
+    repaired = PathInput(FILE_TYPES_DIR / "pdf" / "broken_fixable.pdf")
+    repaired.fix_pdf()
+    assert repaired.page_count == 1  # the page count is read after the fix
+
+
+def test_broken_fixable_invoice_pdf():
+    invoice = PathInput(FILE_TYPES_DIR / "pdf" / "broken_invoice.pdf")
+    invoice.fix_pdf()  # no assertion: the test only checks that nothing raises