From 9bb0fbdedb2ece3093a0c9441dc5fad81ccad790 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 19 Sep 2024 16:13:39 +0200 Subject: [PATCH 1/9] update test lib --- tests/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data b/tests/data index f737d62b..9677f5e5 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit f737d62bb2fb6b064e324f31f25c75767793aa1a +Subproject commit 9677f5e54ae57db7b95a73bb49e2383ea1c6b4bb From 0ec97fd84eb5b7ec89a314c50dfcd1df9ed7a8d8 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 19 Sep 2024 17:03:18 +0200 Subject: [PATCH 2/9] update lib again --- tests/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data b/tests/data index 9677f5e5..43984fb9 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 9677f5e54ae57db7b95a73bb49e2383ea1c6b4bb +Subproject commit 43984fb924bb22e1182a5eb218fafddf4ce0ec4d From de86c78da8a0daae92240b969ed6a29dd5f95d83 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 19 Sep 2024 17:51:39 +0200 Subject: [PATCH 3/9] fix typo & update generatedV1 guide documentation --- docs/extras/guide/generated_v1.md | 10 +++++++++- mindee/product/generated/generated_v1_document.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/extras/guide/generated_v1.md b/docs/extras/guide/generated_v1.md index 31a3f15e..47d53095 100644 --- a/docs/extras/guide/generated_v1.md +++ b/docs/extras/guide/generated_v1.md @@ -87,12 +87,20 @@ Generated builds always have access to at least two attributes: ## Fields -**fields** (`Dict[str`: `List[Union[`[GeneratedListField](#generated-list-field)[GeneratedObjectField](#generated-object-field), `(#stringfield)[StringField]]]`): +**fields** (`Dict[str`: `List[Union[`[GeneratedListField](#generated-list-field), [GeneratedObjectField](#generated-object-field), `(#stringfield)[StringField]]]`): ```python print(str(result.document.inference.prediction.fields["my-field"])) ``` +### Nested fields + +If your field `my-field` is a `GeneratedObjectField`, you can access its individual properties using the following syntax: + +```python +print(str(result.document.inference.prediction.fields["my-field"].my_attribute)) +``` + # Questions? [Join our Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-2d0ds7dtz-DPAF81ZqTy20chsYpQBW5g) diff --git a/mindee/product/generated/generated_v1_document.py b/mindee/product/generated/generated_v1_document.py index 72785d65..1378fc63 100644 --- a/mindee/product/generated/generated_v1_document.py +++ b/mindee/product/generated/generated_v1_document.py @@ -35,5 +35,5 @@ def __init__(self, raw_prediction: StringDict) -> None: ): field_contents_str["value"] = str( field_contents_str["value"] - ) # str cohersion for numbers + ) # str coercion for numbers self.fields[field_name] = StringField(field_contents_str) From b3cd932bafc8072c203cd3f339cebd1427e59453 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 20 Sep 2024 11:20:08 +0200 Subject: [PATCH 4/9] update test to include levenshtein difference --- tests/api/__init__.py | 0 .../test_invoice_splitter_auto_extraction.py | 3 +- tests/input/__init__.py | 0 tests/{Input => input}/test_local_response.py | 0 tests/mindee_http/__init__.py | 0 tests/utils.py | 51 +++++++++++++++++++ 6 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/api/__init__.py create mode 100644 tests/input/__init__.py rename tests/{Input => input}/test_local_response.py (100%) create mode 100644 tests/mindee_http/__init__.py diff --git a/tests/api/__init__.py b/tests/api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/extraction/test_invoice_splitter_auto_extraction.py b/tests/extraction/test_invoice_splitter_auto_extraction.py index 66ced3de..5d160476 100644 --- a/tests/extraction/test_invoice_splitter_auto_extraction.py +++ b/tests/extraction/test_invoice_splitter_auto_extraction.py @@ -9,6 +9,7 @@ from mindee.product import InvoiceSplitterV1, InvoiceV4 from tests.product import get_id, get_version from tests.test_inputs import PRODUCT_DATA_DIR +from tests.utils import levenshtein_ratio @pytest.fixture @@ -52,4 +53,4 @@ def test_pdf_should_extract_invoices_strict(): PRODUCT_DATA_DIR / "invoices" / "response_v4" / "summary_full_invoice_p1.rst", invoice_0.document, ) - assert test_string_rst_invoice_0 == str(invoice_0.document) + assert levenshtein_ratio(test_string_rst_invoice_0, str(invoice_0.document)) >= 0.97 diff --git a/tests/input/__init__.py b/tests/input/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/Input/test_local_response.py b/tests/input/test_local_response.py similarity index 100% rename from tests/Input/test_local_response.py rename to tests/input/test_local_response.py diff --git a/tests/mindee_http/__init__.py b/tests/mindee_http/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/utils.py b/tests/utils.py index 263ee0d4..1630adda 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -25,3 +25,54 @@ def dummy_envvars(monkeypatch) -> None: EXTRAS_DIR = Path("./tests/data/extras/") + + +def levenshtein_distance(reference_str: str, target_str: str) -> int: + """ + Calculate the Levenshtein distance between two strings. + + The Levenshtein distance is a measure of the difference between two sequences. + Informally, the Levenshtein distance between two words is the minimum number + of single-character edits (insertions, deletions or substitutions) required + to change one word into the other. + + + :param reference_str: The reference string. + :param target_str: The target string. + + :return: The distance between the two strings. + """ + reference_len, target_len = len(reference_str), len(target_str) + previous_row = list(range(target_len + 1)) + current_row = [0] * (target_len + 1) + + for i in range(reference_len): + current_row[0] = i + 1 + + for j in range(target_len): + deletion_cost = previous_row[j + 1] + 1 + insertion_cost = current_row[j] + 1 + substitution_cost = previous_row[j] if reference_str[i] == target_str[j] else previous_row[j] + 1 + + current_row[j + 1] = min(deletion_cost, insertion_cost, substitution_cost) + + previous_row, current_row = current_row, previous_row + + return previous_row[target_len] + + +def levenshtein_ratio(ref_str: str, target_str: str) -> float: + """ + Calculates the Levenshtein ratio between two strings. + + :param ref_str: Reference string. + :param target_str: Target String. + :return: Ratio between the two strings + """ + lev_distance = levenshtein_distance(ref_str, target_str) + max_len = max(len(ref_str), len(target_str)) + + if max_len == 0: + return 1.0 + + return 1.0 - (lev_distance / max_len) From f7d56fdb7957acaa9640ef55dd2f62e3c7b452ef Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 20 Sep 2024 11:26:27 +0200 Subject: [PATCH 5/9] fix lint --- tests/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 1630adda..28724931 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -52,7 +52,11 @@ def levenshtein_distance(reference_str: str, target_str: str) -> int: for j in range(target_len): deletion_cost = previous_row[j + 1] + 1 insertion_cost = current_row[j] + 1 - substitution_cost = previous_row[j] if reference_str[i] == target_str[j] else previous_row[j] + 1 + substitution_cost = ( + previous_row[j] + if reference_str[i] == target_str[j] + else previous_row[j] + 1 + ) current_row[j + 1] = min(deletion_cost, insertion_cost, substitution_cost) From 9c282c5f62ed282d2f2643b325d57876fe035d24 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 20 Sep 2024 11:33:23 +0200 Subject: [PATCH 6/9] bump cache & python setup actions versions --- .github/workflows/docs.yml | 4 ++-- .github/workflows/license.yml | 4 ++-- .github/workflows/linting.yml | 6 +++--- .github/workflows/publish.yml | 4 ++-- .github/workflows/test-code-samples.yml | 4 ++-- .github/workflows/test-integration.yml | 4 ++-- .github/workflows/test-regression.yml | 4 ++-- .github/workflows/unit-test.yml | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 57cdd4e8..44168c64 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -21,12 +21,12 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Cache dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-docs-${{ hashFiles('setup.cfg') }} diff --git a/.github/workflows/license.yml b/.github/workflows/license.yml index ec3ffa6d..640fa666 100644 --- a/.github/workflows/license.yml +++ b/.github/workflows/license.yml @@ -18,12 +18,12 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Cache dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-lic-${{ hashFiles('**/setup.cfg') }} diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 55d76447..0c76dcb4 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -17,12 +17,12 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Cache dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-dev-${{ hashFiles('setup.cfg') }} @@ -36,7 +36,7 @@ jobs: pip install -e .[dev] - name: Cache pre-commit - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pre-commit key: ${{ runner.os }}-prec-${{ hashFiles('.pre-commit-config.yaml') }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 98185461..6a1d3cee 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,12 +21,12 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Cache dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-build-${{ hashFiles('setup.cfg') }} diff --git a/.github/workflows/test-code-samples.yml b/.github/workflows/test-code-samples.yml index 1aff76d1..2adfae32 100644 --- a/.github/workflows/test-code-samples.yml +++ b/.github/workflows/test-code-samples.yml @@ -25,12 +25,12 @@ jobs: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Cache dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-samples-${{ hashFiles('**/setup.cfg') }} diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 27574e29..ef15b0a3 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -32,12 +32,12 @@ jobs: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Cache dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-dev-${{ hashFiles('setup.cfg') }} diff --git a/.github/workflows/test-regression.yml b/.github/workflows/test-regression.yml index e56a834b..bcb0ffb5 100644 --- a/.github/workflows/test-regression.yml +++ b/.github/workflows/test-regression.yml @@ -32,12 +32,12 @@ jobs: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Cache dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-dev-${{ hashFiles('setup.cfg') }} diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 77fc6fc9..4042a754 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -29,12 +29,12 @@ jobs: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Cache dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-dev-${{ hashFiles('setup.cfg') }} From 7f591e098f74b0aec68c83e8cdb11e7b78077576 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 20 Sep 2024 11:39:30 +0200 Subject: [PATCH 7/9] prevent workflows from triggering in main from other branches --- .github/workflows/test-integration.yml | 2 -- .github/workflows/test-regression.yml | 2 -- 2 files changed, 4 deletions(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index ef15b0a3..1f62e04d 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -7,8 +7,6 @@ on: pull_request: workflow_run: workflows: ["Test Code Samples"] - branches: - - '*' types: - completed diff --git a/.github/workflows/test-regression.yml b/.github/workflows/test-regression.yml index bcb0ffb5..bba7fbea 100644 --- a/.github/workflows/test-regression.yml +++ b/.github/workflows/test-regression.yml @@ -7,8 +7,6 @@ on: pull_request: workflow_run: workflows: ["Test Code Samples"] - branches: - - '*' types: - completed From 2a8769c36fe9c9e8fa48ed5b7ac84c034104f2ef Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 20 Sep 2024 11:43:16 +0200 Subject: [PATCH 8/9] decomplexify levensthein computation --- tests/utils.py | 48 ++---------------------------------------------- 1 file changed, 2 insertions(+), 46 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 28724931..3245adfa 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,4 @@ +from difflib import SequenceMatcher from pathlib import Path from mindee.mindee_http.mindee_api import ( @@ -27,56 +28,11 @@ def dummy_envvars(monkeypatch) -> None: EXTRAS_DIR = Path("./tests/data/extras/") -def levenshtein_distance(reference_str: str, target_str: str) -> int: - """ - Calculate the Levenshtein distance between two strings. - - The Levenshtein distance is a measure of the difference between two sequences. - Informally, the Levenshtein distance between two words is the minimum number - of single-character edits (insertions, deletions or substitutions) required - to change one word into the other. - - - :param reference_str: The reference string. - :param target_str: The target string. - - :return: The distance between the two strings. - """ - reference_len, target_len = len(reference_str), len(target_str) - previous_row = list(range(target_len + 1)) - current_row = [0] * (target_len + 1) - - for i in range(reference_len): - current_row[0] = i + 1 - - for j in range(target_len): - deletion_cost = previous_row[j + 1] + 1 - insertion_cost = current_row[j] + 1 - substitution_cost = ( - previous_row[j] - if reference_str[i] == target_str[j] - else previous_row[j] + 1 - ) - - current_row[j + 1] = min(deletion_cost, insertion_cost, substitution_cost) - - previous_row, current_row = current_row, previous_row - - return previous_row[target_len] - - def levenshtein_ratio(ref_str: str, target_str: str) -> float: """ Calculates the Levenshtein ratio between two strings. - :param ref_str: Reference string. :param target_str: Target String. :return: Ratio between the two strings """ - lev_distance = levenshtein_distance(ref_str, target_str) - max_len = max(len(ref_str), len(target_str)) - - if max_len == 0: - return 1.0 - - return 1.0 - (lev_distance / max_len) + return SequenceMatcher(None, ref_str, target_str).ratio() From 25156343831989702785e5430773fd6268eb3853 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 20 Sep 2024 16:07:14 +0200 Subject: [PATCH 9/9] fix typo in example --- examples/auto_invoice_splitter_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/auto_invoice_splitter_extraction.py b/examples/auto_invoice_splitter_extraction.py index 56f7212e..4a456a3a 100644 --- a/examples/auto_invoice_splitter_extraction.py +++ b/examples/auto_invoice_splitter_extraction.py @@ -1,7 +1,7 @@ import os from mindee import Client -from mindee.extraction.common.pdf_extractor import PdfExtractor +from mindee.extraction.pdf_extractor import PdfExtractor from mindee.input import PathInput from mindee.product import InvoiceSplitterV1, InvoiceV4