Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion mindee/extraction/common/extracted_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def save_to_file(
if not file_format:
if len(resolved_path.suffix) < 1:
raise ValueError("Invalid file format.")
# Let PIL infer format from filename extension
self.buffer.seek(0)
image = Image.open(self.buffer)
if file_format:
Expand Down
7 changes: 4 additions & 3 deletions mindee/v2/file_operations/crop_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
class CropFiles(List[ExtractedImage]):
"""Crop files."""

def save_all_to_disk(self, path: Union[Path, str]):
def save_all_to_disk(self, path: Union[Path, str], prefix: str = "crop"):
"""
Save all extracted crops to disk.

:param path: Path to save the extracted splits to
:param path: Path to save the extracted crops to.
:param prefix: Prefix to add to the filename, defaults to 'crop'.
"""
if isinstance(path, str):
path = Path(path)
path.mkdir(parents=True, exist_ok=True)
for idx, split in enumerate(self, start=1):
split.save_to_file(path / f"crop_{idx:03}.jpg")
split.save_to_file(path / f"{prefix}_{idx:03}.jpg")
27 changes: 17 additions & 10 deletions mindee/v2/file_operations/split.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,28 @@
from typing import List, Union

from mindee.error import MindeeError
from mindee.extraction import PdfExtractor
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.v2.file_operations.split_files import SplitFiles
from mindee.v2.product.split.split_range import SplitRange


def extract_single_split(
    input_source: LocalInputSource, split: List[int]
) -> ExtractedPdf:
    """
    Extract one split from the document as a standalone PDF.

    Delegates to ``extract_splits`` with a one-element list of splits and
    hands back the only entry of the result.

    :param input_source: Input source to split.
    :param split: Page bounds of the split to keep; ``extract_splits`` reads
        ``split[0]`` and ``split[1]`` as the inclusive first and last page.
    :return: Extracted PDF for the requested split.
    """
    # Wrap the single split so the multi-split routine can process it.
    extracted = extract_splits(input_source, [split])
    return extracted[0]


def extract_splits(
input_source: LocalInputSource,
splits: Union[List[SplitRange], List[List[int]]],
splits: Union[List[List[int]]],
) -> SplitFiles:
"""
Extracts splits as complete PDFs from the document.
Expand All @@ -21,13 +34,7 @@ def extract_splits(
pdf_extractor = PdfExtractor(input_source)
page_groups = []
for split in splits:
if isinstance(split, SplitRange):
lower_bound = split.page_range[0]
upper_bound = split.page_range[1]
else:
lower_bound = split[0]
upper_bound = split[1]
page_groups.append(list(range(lower_bound, upper_bound + 1)))
page_groups.append(list(range(split[0], split[1] + 1)))
if len(splits) < 1:
raise MindeeError("No indexes provided.")
return SplitFiles(pdf_extractor.extract_sub_documents(page_groups))
7 changes: 4 additions & 3 deletions mindee/v2/file_operations/split_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
class SplitFiles(List[ExtractedPdf]):
"""Split files."""

def save_all_to_disk(self, path: Union[str, Path]):
def save_all_to_disk(self, path: Union[str, Path], prefix: str = "split"):
"""
Save all extracted splits to disk.

:param path: Path to save the extracted splits to
:param path: Path to save the extracted splits to.
:param prefix: Prefix to add to the filename, defaults to 'split'.
"""
if isinstance(path, str):
path = Path(path)
path.mkdir(parents=True, exist_ok=True)
for idx, split in enumerate(self, start=1):
split.save_to_file(path / f"split_{idx:03}.pdf")
split.save_to_file(path / f"{prefix}_{idx:03}.pdf")
5 changes: 2 additions & 3 deletions mindee/v2/product/split/split_range.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import List

from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict
from mindee.v2.file_operations.split import extract_single_split


class SplitRange:
Expand Down Expand Up @@ -32,5 +32,4 @@ def extract_from_file(self, input_source: LocalInputSource) -> ExtractedPdf:
:param input_source: Local file to apply the inference to
:return: Extracted PDF
"""
pdf_extractor = PdfExtractor(input_source)
return pdf_extractor.extract_sub_documents([self.page_range])[0]
return extract_single_split(input_source, self.page_range)
5 changes: 2 additions & 3 deletions tests/v2/file_operations/test_split_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import pytest

from mindee.v2.file_operations.split import extract_splits
from mindee.input.sources.path_input import PathInput
from mindee.v2.product.split.split_response import (
SplitResponse,
Expand Down Expand Up @@ -37,7 +36,7 @@ def test_single_page_split_split(splits_default, splits_single_page_json_path):
with open(splits_single_page_json_path, "rb") as f:
response = json.load(f)
doc = SplitResponse(response)
extracted_splits = extract_splits(input_sample, doc.inference.result.splits)
extracted_splits = doc.extract_from_file(input_sample)
assert len(extracted_splits) == 1

assert extracted_splits[0].get_page_count() == 1
Expand All @@ -48,7 +47,7 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path):
with open(splits_multi_page_json_path, "rb") as f:
response = json.load(f)
doc = SplitResponse(response)
extracted_splits = extract_splits(input_sample, doc.inference.result.splits)
extracted_splits = doc.extract_from_file(input_sample)
assert len(extracted_splits) == 3

assert extracted_splits[0].get_page_count() == 1
Expand Down
3 changes: 1 addition & 2 deletions tests/v2/file_operations/test_split_operation_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
SplitResponse,
)
from mindee.input.sources.path_input import PathInput
from mindee.v2.file_operations.split import extract_splits
from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files


Expand Down Expand Up @@ -38,7 +37,7 @@ def test_pdf_should_extract_splits():
)
assert response.inference.file.page_count == 2

extracted_pdfs = extract_splits(split_input, response.inference.result.splits)
extracted_pdfs = response.extract_from_file(split_input)

assert len(extracted_pdfs) == 2
assert extracted_pdfs[0].filename == "default_sample_001-001.pdf"
Expand Down
Loading