Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ repos:
- id: gitleaks

- repo: https://github.com/PyCQA/pylint
rev: v3.3.1
rev: v3.3.9
hooks:
- id: pylint
name: pylint
Expand Down
2 changes: 1 addition & 1 deletion examples/auto_invoice_splitter_extraction_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
def parse_invoice(file_path):
input_source = PathInput(file_path)

if input_source.is_pdf() and input_source.count_doc_pages() > 1:
if input_source.is_pdf() and input_source.page_count > 1:
parse_multi_page(input_source)
else:
parse_single_page(input_source)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def extract_receipts(
raise MindeeError(
"No possible receipts candidates found for MultiReceipts extraction."
)
for page_id in range(input_source.count_doc_pages()):
for page_id in range(input_source.page_count):
receipt_positions = [
receipt.bounding_box
for receipt in inference.pages[page_id].prediction.receipts
Expand Down
34 changes: 22 additions & 12 deletions mindee/input/sources/local_input_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class LocalInputSource:
file_mimetype: str
input_type: InputType
filepath: Optional[str]
_page_count: Optional[int] = None

def __init__(self, input_type: InputType):
self.input_type = input_type
Expand Down Expand Up @@ -100,17 +101,25 @@ def is_pdf(self) -> bool:
""":return: True if the file is a PDF."""
return self.file_mimetype == "application/pdf"

def count_doc_pages(self) -> int:
@property
def page_count(self) -> int:
"""
Count the pages in the PDF.
Count the pages in the document.

:return: the number of pages.
:return: The number of pages.
"""
if self.is_pdf():
self.file_object.seek(0)
pdf = pdfium.PdfDocument(self.file_object)
return len(pdf)
return 1
if self._page_count is None:
if self.is_pdf():
self.file_object.seek(0)
pdf = pdfium.PdfDocument(self.file_object)
self._page_count = len(pdf)
else:
self._page_count = 1
return self._page_count

def count_doc_pages(self) -> int:
"""Deprecated. Use ``page_count`` instead."""
return self.page_count

def apply_page_options(self, page_options: PageOptions) -> None:
"""Apply cut and merge options on multipage documents."""
Expand All @@ -131,10 +140,10 @@ def process_pdf(
"""Run any required processing on a PDF file."""
if self.is_pdf_empty():
raise MindeeSourceError(f"PDF pages are empty in: {self.filename}")
pages_count = self.count_doc_pages()
if on_min_pages > pages_count:
page_count = self.page_count
if on_min_pages > page_count:
return
all_pages = list(range(pages_count))
all_pages = list(range(page_count))
if behavior == KEEP_ONLY:
pages_to_keep = set()
for page_id in page_indexes:
Expand All @@ -161,7 +170,7 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
"""
Create a new PDF from pages and set it to ``file_object``.

:param page_numbers: List of pages number to use for merging in the original PDF.
:param page_numbers: List of page numbers to use for merging in the original PDF.
:return: None
"""
self.file_object.seek(0)
Expand All @@ -172,6 +181,7 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
bytes_io = io.BytesIO()
new_pdf.save(bytes_io)
self.file_object = bytes_io
self._page_count = len(new_pdf)

def is_pdf_empty(self) -> bool:
"""
Expand Down
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,15 @@ Changelog = "https://github.com/mindee/mindee-api-python/blob/main/CHANGELOG.md"

[project.optional-dependencies]
lint = [
"pylint==3.3.1",
"pre-commit~=3.2.2",
"types-pytz>=2023.3",
"pylint==3.3.9",
"pre-commit~=3.6.0",
"types-pytz>=2024.2",
"types-requests>=2.31",
]
test = [
"toml~=0.10.2",
"pytest~=7.4",
"pytest-cov~=4.1",
"pytest-cov~=5.0",
]
docs = [
"sphinx~=5.3",
Expand Down
2 changes: 1 addition & 1 deletion tests/extraction/test_image_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from mindee.extraction.common.image_extractor import extract_multiple_images_from_source
from mindee.input.sources.path_input import PathInput
from mindee.product.barcode_reader.barcode_reader_v1 import BarcodeReaderV1
from tests.test_inputs import PRODUCT_DATA_DIR
from tests.utils import PRODUCT_DATA_DIR


@pytest.fixture
Expand Down
3 changes: 1 addition & 2 deletions tests/extraction/test_invoice_splitter_auto_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
from mindee.product.invoice.invoice_v4 import InvoiceV4
from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
from tests.product import get_id, get_version
from tests.test_inputs import PRODUCT_DATA_DIR
from tests.utils import levenshtein_ratio
from tests.utils import PRODUCT_DATA_DIR, levenshtein_ratio


@pytest.fixture
Expand Down
2 changes: 1 addition & 1 deletion tests/extraction/test_multi_receipts_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from mindee.product.multi_receipts_detector.multi_receipts_detector_v1 import (
MultiReceiptsDetectorV1,
)
from tests.test_inputs import PRODUCT_DATA_DIR
from tests.utils import PRODUCT_DATA_DIR


@pytest.fixture
Expand Down
2 changes: 1 addition & 1 deletion tests/extraction/test_pdf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from mindee.product.invoice_splitter.invoice_splitter_v1_document import (
InvoiceSplitterV1Document,
)
from tests.test_inputs import PRODUCT_DATA_DIR
from tests.utils import PRODUCT_DATA_DIR


@pytest.fixture
Expand Down
2 changes: 1 addition & 1 deletion tests/extras/test_extras_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from mindee import Client
from mindee.product.international_id.international_id_v2 import InternationalIdV2
from mindee.product.invoice.invoice_v4 import InvoiceV4
from tests.product import PRODUCT_DATA_DIR
from tests.utils import PRODUCT_DATA_DIR


@pytest.fixture
Expand Down
163 changes: 163 additions & 0 deletions tests/input/test_apply_page_options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import io

import pypdfium2 as pdfium
import pytest

from mindee.error import MindeeError
from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions
from mindee.input.sources import (
Base64Input,
BytesInput,
FileInput,
LocalInputSource,
PathInput,
)
from tests.utils import FILE_TYPES_DIR, PRODUCT_DATA_DIR


def _assert_page_options(input_source: LocalInputSource, numb_pages: int):
    """
    Check that a processed PDF renders identically to its reference file.

    The reference is ``multipage_cut-{numb_pages}.pdf`` in the PDF test data
    directory. Every page of the reference is compared with the page at the
    same index in ``input_source``.

    :param input_source: The input source holding the processed PDF.
    :param numb_pages: Number used to select the reference file.
    """
    # Local import: keeps this helper self-contained.
    from contextlib import closing

    assert input_source.is_pdf() is True
    reference_path = FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf"
    # Currently the least verbose way of comparing pages with pypdfium2:
    # each page is rendered as a rasterized image, and the images are then
    # compared as raw byte sequences.
    # ``closing`` guarantees both documents are released even when an
    # assertion fails part-way through the comparison.
    with closing(pdfium.PdfDocument(input_source.file_object)) as cut_pdf, closing(
        pdfium.PdfDocument(reference_path)
    ) as pdf:
        for idx in range(len(pdf)):
            expected_render = pdf.get_page(idx).render()
            actual_render = cut_pdf.get_page(idx).render()

            assert bytes(expected_render.buffer) == bytes(actual_render.buffer)


def test_pdf_reconstruct_ok():
    """After processing, the source's file object is an in-memory buffer."""
    pdf_path = FILE_TYPES_DIR / "pdf" / "multipage.pdf"
    source = PathInput(pdf_path)
    source.process_pdf(
        behavior=KEEP_ONLY,
        on_min_pages=2,
        page_indexes=range(5),
    )
    assert isinstance(source.file_object, io.BytesIO)


@pytest.mark.parametrize("numb_pages", [1, 2, 3])
def test_process_pdf_cut_n_pages(numb_pages: int):
    """Keep 1 to 3 pages via ``process_pdf`` and compare with the reference."""
    kept_indexes = [0, -2, -1][:numb_pages]
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.page_count == 12

    source.process_pdf(
        behavior=KEEP_ONLY,
        on_min_pages=2,
        page_indexes=kept_indexes,
    )

    assert source.page_count == numb_pages
    _assert_page_options(source, numb_pages)


@pytest.mark.parametrize("numb_pages", [1, 2, 3])
def test_apply_pages_pdf_cut_n_pages(numb_pages: int):
    """Keep 1 to 3 pages via ``apply_page_options`` and compare with the reference."""
    kept_indexes = [0, -2, -1][:numb_pages]
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.page_count == 12

    options = PageOptions(on_min_pages=2, page_indexes=kept_indexes)
    source.apply_page_options(options)

    assert source.page_count == numb_pages
    _assert_page_options(source, numb_pages)


def test_pdf_keep_5_first_pages():
    """Keeping indexes 0 to 4 of the 12-page file must leave 5 pages."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.page_count == 12

    first_five = list(range(5))
    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=first_five)

    assert source.page_count == 5


def test_pdf_keep_invalid_pages():
    """Requesting indexes 0, 1 and 17 is expected to keep only 2 pages."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.page_count == 12

    requested = [0, 1, 17]
    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=requested)

    assert source.page_count == 2


def test_pdf_remove_5_last_pages():
    """Removing indexes -5 to -1 is expected to leave 7 pages."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True

    last_five = [-5, -4, -3, -2, -1]
    source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=last_five)

    assert source.page_count == 7


def test_pdf_remove_5_first_pages():
    """Removing indexes 0 to 4 is expected to leave 7 pages."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True

    first_five = [0, 1, 2, 3, 4]
    source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=first_five)

    assert source.page_count == 7


def test_pdf_remove_invalid_pages():
    """Removing index 16 is expected to leave all 12 pages in place."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True

    out_of_range = [16]
    source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=out_of_range)

    assert source.page_count == 12


def test_pdf_keep_no_pages():
    """Keeping no pages at all must raise ``RuntimeError``."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True

    # First an empty index list, then a list of indexes 16 and 17.
    for rejected_indexes in ([], [16, 17]):
        with pytest.raises(RuntimeError):
            source.process_pdf(
                behavior=KEEP_ONLY,
                on_min_pages=2,
                page_indexes=rejected_indexes,
            )


def test_pdf_remove_all_pages():
    """Removing indexes 0 to 14 must raise ``RuntimeError``."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True

    every_index = list(range(15))
    with pytest.raises(RuntimeError):
        source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=every_index)


def test_pdf_input_from_file():
    """A ``FileInput`` built from an open binary file can be cut to one page."""
    pdf_path = FILE_TYPES_DIR / "pdf" / "multipage.pdf"
    # Everything stays inside the ``with`` so the handle given to
    # ``FileInput`` is still open while the source is used.
    with open(pdf_path, "rb") as fp:
        source = FileInput(fp)
        assert source.is_pdf() is True
        source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
        assert source.page_count == 1


def test_pdf_input_from_base64():
    """A ``Base64Input`` built from the encoded invoice can be cut to one page."""
    encoded_path = PRODUCT_DATA_DIR / "invoices" / "invoice_10p.txt"
    with open(encoded_path, "rt") as fp:
        encoded = fp.read()

    source = Base64Input(encoded, filename="invoice_10p.pdf")
    assert source.is_pdf() is True
    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
    assert source.page_count == 1


def test_pdf_input_from_bytes():
    """A ``BytesInput`` built from the raw invoice bytes can be cut to one page."""
    invoice_path = PRODUCT_DATA_DIR / "invoices" / "invoice_10p.pdf"
    with open(invoice_path, "rb") as fp:
        raw = fp.read()

    source = BytesInput(raw, filename="invoice_10p.pdf")
    assert source.is_pdf() is True
    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
    assert source.page_count == 1


def test_pdf_blank_check():
    """Blank PDFs must raise ``MindeeError``; the image-only PDF has one page."""
    pdf_dir = FILE_TYPES_DIR / "pdf"

    # Loading and processing both sit inside the ``raises`` block, so an
    # error from either step satisfies the expectation.
    for blank_name in ("blank.pdf", "blank_1.pdf"):
        with pytest.raises(MindeeError):
            blank_source = PathInput(pdf_dir / blank_name)
            blank_source.process_pdf(
                behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0]
            )

    image_only_source = PathInput(pdf_dir / "not_blank_image_only.pdf")
    assert image_only_source.page_count == 1
22 changes: 22 additions & 0 deletions tests/input/test_fix_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pytest

from mindee import PathInput
from mindee.error import MimeTypeError
from tests.utils import FILE_TYPES_DIR


def test_broken_unfixable_pdf():
    """Loading or fixing the unfixable PDF must raise ``MimeTypeError``."""
    broken_path = FILE_TYPES_DIR / "pdf" / "broken_unfixable.pdf"
    # Construction stays inside the ``raises`` block: the error may come
    # from either loading the file or from ``fix_pdf``.
    with pytest.raises(MimeTypeError):
        source = PathInput(broken_path)
        source.fix_pdf()


def test_broken_fixable_pdf():
    """After ``fix_pdf``, the fixable PDF is expected to report one page."""
    fixable_path = FILE_TYPES_DIR / "pdf" / "broken_fixable.pdf"
    source = PathInput(fixable_path)
    source.fix_pdf()
    assert source.page_count == 1


def test_broken_fixable_invoice_pdf():
    """Calling ``fix_pdf`` on the broken invoice PDF must not raise."""
    invoice_path = FILE_TYPES_DIR / "pdf" / "broken_invoice.pdf"
    source = PathInput(invoice_path)
    source.fix_pdf()
Loading