18 changes: 9 additions & 9 deletions mindee/input/inference_parameters.py
@@ -10,16 +10,16 @@ class InferenceParameters:

     model_id: str
     """ID of the model, required."""
-    rag: bool = False
-    """Use Retrieval-Augmented Generation during inference."""
-    raw_text: bool = False
-    """Extract the entire text from the document as strings, and fill the ``raw_text`` attribute."""
-    polygon: bool = False
-    """Calculate bounding box polygons for values, and fill the ``locations`` attribute of fields"""
-    confidence: bool = False
+    rag: Optional[bool] = None
+    """Enhance extraction accuracy with Retrieval-Augmented Generation."""
+    raw_text: Optional[bool] = None
+    """Extract the full text content from the document as strings, and fill the ``raw_text`` attribute."""
+    polygon: Optional[bool] = None
+    """Calculate bounding box polygons for all fields, and fill their ``locations`` attribute."""
+    confidence: Optional[bool] = None
     """
-    Calculate confidence scores for values, and fill the ``confidence`` attribute of fields.
-    Useful for automation.
+    Boost the precision and accuracy of all extractions.
+    Calculate confidence scores for all fields, and fill their ``confidence`` attribute.
     """
     alias: Optional[str] = None
     """Use an alias to link the file to your own DB. If empty, no alias will be used."""
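For context, a minimal usage sketch of the new optional flags (the import path and model ID below are illustrative assumptions inferred from the module layout, not taken from this PR): a flag left as None is simply not sent with the request, while an explicit True or False is forwarded to the API.

# A minimal sketch, assuming InferenceParameters is importable from the module shown above.
from mindee.input.inference_parameters import InferenceParameters

params = InferenceParameters(
    model_id="00000000-0000-0000-0000-000000000000",  # placeholder model ID
    raw_text=True,       # explicitly enable raw text extraction
    confidence=False,    # explicitly disable confidence scores
    # rag and polygon stay None: they are omitted from the request,
    # so the server-side defaults apply.
)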
16 changes: 8 additions & 8 deletions mindee/mindee_http/mindee_api_v2.py
@@ -82,14 +82,14 @@ def req_post_inference_enqueue(
         data = {"model_id": params.model_id}
         url = f"{self.url_root}/inferences/enqueue"

-        if params.rag:
-            data["rag"] = "true"
-        if params.raw_text:
-            data["raw_text"] = "true"
-        if params.confidence:
-            data["confidence"] = "true"
-        if params.polygon:
-            data["polygon"] = "true"
+        if params.rag is not None:
+            data["rag"] = str(params.rag).lower()
+        if params.raw_text is not None:
+            data["raw_text"] = str(params.raw_text).lower()
+        if params.confidence is not None:
+            data["confidence"] = str(params.confidence).lower()
+        if params.polygon is not None:
+            data["polygon"] = str(params.polygon).lower()
         if params.webhook_ids and len(params.webhook_ids) > 0:
             data["webhook_ids"] = ",".join(params.webhook_ids)
         if params.alias and len(params.alias):
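To make the behaviour change concrete, here is a small standalone sketch of the same serialization pattern (plain Python, no SDK imports; the helper name is made up for illustration): a None flag is never sent, while both True and False are now serialized as lowercase strings, which lets an explicit False override a server-side default instead of being silently dropped.

from typing import Optional

def build_form_data(model_id: str, rag: Optional[bool], raw_text: Optional[bool]) -> dict:
    # Mirrors the pattern in req_post_inference_enqueue: only non-None flags are
    # serialized, and both True and False become the strings "true"/"false".
    data = {"model_id": model_id}
    if rag is not None:
        data["rag"] = str(rag).lower()
    if raw_text is not None:
        data["raw_text"] = str(raw_text).lower()
    return data

print(build_form_data("my-model", rag=None, raw_text=False))
# -> {'model_id': 'my-model', 'raw_text': 'false'}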
10 changes: 8 additions & 2 deletions mindee/parsing/v2/raw_text.py
@@ -8,10 +8,16 @@ class RawText:
     """Raw text extracted from the document."""

     pages: List[RawTextPage]
-    """Page the raw text was found on."""
+    """Pages of raw text content."""

     def __init__(self, raw_response: StringDict):
         self.pages = [RawTextPage(page) for page in raw_response.get("pages", [])]

     def __str__(self) -> str:
-        return "\n\n".join([page.content for page in self.pages])
+        """
+        Text content of all pages.
+
+        Each page is separated by 2 newline characters.
+        """
+        page_contents = "\n\n".join([page.content for page in self.pages])
+        return page_contents + "\n"
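A quick sketch of the new __str__ contract (the payload below is invented for the example, and it assumes RawTextPage reads a "content" key from each page dict, matching the attribute it exposes): pages are joined with two newlines and the rendered string now ends with a single trailing newline.

from mindee.parsing.v2.raw_text import RawText

# Hypothetical raw response payload, shaped like the "pages" list consumed above.
raw_text = RawText({"pages": [{"content": "Page one text"}, {"content": "Page two text"}]})
assert str(raw_text) == "Page one text\n\nPage two text\n"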
2 changes: 1 addition & 1 deletion tests/data
37 changes: 32 additions & 5 deletions tests/test_client_v2_integration.py
@@ -33,7 +33,7 @@ def test_parse_file_empty_multiple_pages_must_succeed(
     v2_client: ClientV2, findoc_model_id: str
 ) -> None:
     """
-    Upload a 2-page blank PDF and make sure the returned inference contains the
+    Upload a 2-page almost blank PDF and make sure the returned inference contains the
     file & model metadata.
     """
     input_path: Path = FILE_TYPES_DIR / "pdf" / "multipage_cut-2.pdf"
@@ -73,6 +73,37 @@ def test_parse_file_empty_multiple_pages_must_succeed(
     assert len(response.inference.result.raw_text.pages) == 2


+@pytest.mark.integration
+@pytest.mark.v2
+def test_parse_file_empty_single_page_options_must_succeed(
+    v2_client: ClientV2, findoc_model_id: str
+) -> None:
+    """
+    Upload a blank PDF and make sure the options are set correctly.
+    """
+    input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf"
+
+    input_source = PathInput(input_path)
+    params = InferenceParameters(
+        model_id=findoc_model_id,
+        rag=True,
+        raw_text=True,
+        polygon=True,
+        confidence=True,
+        webhook_ids=[],
+        alias="py_integration_empty_page_options",
+    )
+    response: InferenceResponse = v2_client.enqueue_and_get_inference(
+        input_source, params
+    )
+
+    assert response.inference.active_options is not None
+    assert response.inference.active_options.rag is True
+    assert response.inference.active_options.raw_text is True
+    assert response.inference.active_options.polygon is True
+    assert response.inference.active_options.confidence is True
+
+
 @pytest.mark.integration
 @pytest.mark.v2
 def test_parse_file_filled_single_page_must_succeed(
@@ -86,10 +86,6 @@ def test_parse_file_filled_single_page_must_succeed(
     input_source = PathInput(input_path)
     params = InferenceParameters(
         model_id=findoc_model_id,
-        rag=False,
-        raw_text=False,
-        polygon=False,
-        confidence=False,
         webhook_ids=[],
         alias="py_integration_filled_single",
     )
2 changes: 1 addition & 1 deletion tests/v2/test_inference_response.py
@@ -186,7 +186,7 @@ def test_standard_field_simple_list():

 @pytest.mark.v2
 def test_raw_texts():
-    json_sample, rst_sample = _get_inference_samples("raw_texts")
+    json_sample, _ = _get_inference_samples("raw_texts")
     inference_result = InferenceResponse(json_sample)
     assert isinstance(inference_result.inference, Inference)