From f42283088cc940aee31024ed72b3d938e4f2c7ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Wed, 3 Sep 2025 13:26:39 +0200 Subject: [PATCH] :bug: options should default to None --- mindee/input/inference_parameters.py | 18 +++++++------- mindee/mindee_http/mindee_api_v2.py | 16 ++++++------ mindee/parsing/v2/raw_text.py | 10 ++++++-- tests/data | 2 +- tests/test_client_v2_integration.py | 37 ++++++++++++++++++++++++---- tests/v2/test_inference_response.py | 2 +- 6 files changed, 59 insertions(+), 26 deletions(-) diff --git a/mindee/input/inference_parameters.py b/mindee/input/inference_parameters.py index 13c4dcbc..fd9f0dc6 100644 --- a/mindee/input/inference_parameters.py +++ b/mindee/input/inference_parameters.py @@ -10,16 +10,16 @@ class InferenceParameters: model_id: str """ID of the model, required.""" - rag: bool = False - """Use Retrieval-Augmented Generation during inference.""" - raw_text: bool = False - """Extract the entire text from the document as strings, and fill the ``raw_text`` attribute.""" - polygon: bool = False - """Calculate bounding box polygons for values, and fill the ``locations`` attribute of fields""" - confidence: bool = False + rag: Optional[bool] = None + """Enhance extraction accuracy with Retrieval-Augmented Generation.""" + raw_text: Optional[bool] = None + """Extract the full text content from the document as strings, and fill the ``raw_text`` attribute.""" + polygon: Optional[bool] = None + """Calculate bounding box polygons for all fields, and fill their ``locations`` attribute.""" + confidence: Optional[bool] = None """ - Calculate confidence scores for values, and fill the ``confidence`` attribute of fields. - Useful for automation. + Boost the precision and accuracy of all extractions. + Calculate confidence scores for all fields, and fill their ``confidence`` attribute. """ alias: Optional[str] = None """Use an alias to link the file to your own DB. If empty, no alias will be used.""" diff --git a/mindee/mindee_http/mindee_api_v2.py b/mindee/mindee_http/mindee_api_v2.py index fdfa5d5c..03202f29 100644 --- a/mindee/mindee_http/mindee_api_v2.py +++ b/mindee/mindee_http/mindee_api_v2.py @@ -82,14 +82,14 @@ def req_post_inference_enqueue( data = {"model_id": params.model_id} url = f"{self.url_root}/inferences/enqueue" - if params.rag: - data["rag"] = "true" - if params.raw_text: - data["raw_text"] = "true" - if params.confidence: - data["confidence"] = "true" - if params.polygon: - data["polygon"] = "true" + if params.rag is not None: + data["rag"] = str(params.rag).lower() + if params.raw_text is not None: + data["raw_text"] = str(params.raw_text).lower() + if params.confidence is not None: + data["confidence"] = str(params.confidence).lower() + if params.polygon is not None: + data["polygon"] = str(params.polygon).lower() if params.webhook_ids and len(params.webhook_ids) > 0: data["webhook_ids"] = ",".join(params.webhook_ids) if params.alias and len(params.alias): diff --git a/mindee/parsing/v2/raw_text.py b/mindee/parsing/v2/raw_text.py index 0d9c6329..7f6c0d54 100644 --- a/mindee/parsing/v2/raw_text.py +++ b/mindee/parsing/v2/raw_text.py @@ -8,10 +8,16 @@ class RawText: """Raw text extracted from the document.""" pages: List[RawTextPage] - """Page the raw text was found on.""" + """Pages of raw text content.""" def __init__(self, raw_response: StringDict): self.pages = [RawTextPage(page) for page in raw_response.get("pages", [])] def __str__(self) -> str: - return "\n\n".join([page.content for page in self.pages]) + """ + Text content of all pages. + + Each page is separated by 2 newline characters. + """ + page_contents = "\n\n".join([page.content for page in self.pages]) + return page_contents + "\n" diff --git a/tests/data b/tests/data index 11c2edc3..bc8356c1 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 11c2edc3d2778b121644317b0fc3efc0102ec83a +Subproject commit bc8356c1ce52d60351ed3430d336f33366025012 diff --git a/tests/test_client_v2_integration.py b/tests/test_client_v2_integration.py index 5b31fd3e..82525289 100644 --- a/tests/test_client_v2_integration.py +++ b/tests/test_client_v2_integration.py @@ -33,7 +33,7 @@ def test_parse_file_empty_multiple_pages_must_succeed( v2_client: ClientV2, findoc_model_id: str ) -> None: """ - Upload a 2-page blank PDF and make sure the returned inference contains the + Upload a 2-page almost blank PDF and make sure the returned inference contains the file & model metadata. """ input_path: Path = FILE_TYPES_DIR / "pdf" / "multipage_cut-2.pdf" @@ -73,6 +73,37 @@ def test_parse_file_empty_multiple_pages_must_succeed( assert len(response.inference.result.raw_text.pages) == 2 +@pytest.mark.integration +@pytest.mark.v2 +def test_parse_file_empty_single_page_options_must_succeed( + v2_client: ClientV2, findoc_model_id: str +) -> None: + """ + Upload a blank PDF and make sure the options are set correctly. + """ + input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf" + + input_source = PathInput(input_path) + params = InferenceParameters( + model_id=findoc_model_id, + rag=True, + raw_text=True, + polygon=True, + confidence=True, + webhook_ids=[], + alias="py_integration_empty_page_options", + ) + response: InferenceResponse = v2_client.enqueue_and_get_inference( + input_source, params + ) + + assert response.inference.active_options is not None + assert response.inference.active_options.rag is True + assert response.inference.active_options.raw_text is True + assert response.inference.active_options.polygon is True + assert response.inference.active_options.confidence is True + + @pytest.mark.integration @pytest.mark.v2 def test_parse_file_filled_single_page_must_succeed( @@ -86,10 +117,6 @@ def test_parse_file_filled_single_page_must_succeed( input_source = PathInput(input_path) params = InferenceParameters( model_id=findoc_model_id, - rag=False, - raw_text=False, - polygon=False, - confidence=False, webhook_ids=[], alias="py_integration_filled_single", ) diff --git a/tests/v2/test_inference_response.py b/tests/v2/test_inference_response.py index 0ac98433..c5829dec 100644 --- a/tests/v2/test_inference_response.py +++ b/tests/v2/test_inference_response.py @@ -186,7 +186,7 @@ def test_standard_field_simple_list(): @pytest.mark.v2 def test_raw_texts(): - json_sample, rst_sample = _get_inference_samples("raw_texts") + json_sample, _ = _get_inference_samples("raw_texts") inference_result = InferenceResponse(json_sample) assert isinstance(inference_result.inference, Inference)