From 200cc8ec0f4c57014ad64c56d736bd30913873b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Fri, 30 Jun 2023 17:55:20 +0200 Subject: [PATCH 1/3] :sparkles: add support for line items --- mindee/documents/custom/line_items.py | 103 +++++++++++++++++++ tests/documents/test_custom_v1_line_items.py | 25 +++++ 2 files changed, 128 insertions(+) create mode 100644 mindee/documents/custom/line_items.py create mode 100644 tests/documents/test_custom_v1_line_items.py diff --git a/mindee/documents/custom/line_items.py b/mindee/documents/custom/line_items.py new file mode 100644 index 00000000..edc286af --- /dev/null +++ b/mindee/documents/custom/line_items.py @@ -0,0 +1,103 @@ +from typing import Dict, List, Sequence + +from mindee.geometry import ( + get_bounding_box_for_polygons, + get_centroid, + get_min_max_y, + is_point_in_y, +) + + +def array_product(array: Sequence[float]) -> float: + """ + Get the product of a sequence of floats. + + :array: List of floats + """ + product = 1.0 + for k in array: + product = product * k + return product + + +def find_best_anchor(anchors: Sequence[str], fields: Dict[str, dict]) -> str: + """ + Find the anchor with the most rows, in the order specified by `anchors`. + + Anchor will be the name of the field. + """ + anchor = "" + anchor_rows = 0 + for field in anchors: + values = fields[field]["values"] + if len(values) > anchor_rows: + anchor_rows = len(values) + anchor = field + return anchor + + +def get_empty_field() -> dict: + """Return sample field with empty values.""" + return {"content": "", "polygon": [], "confidence": 0.0} + + +def get_line_items( + anchors: Sequence[str], columns: Sequence[str], fields: Dict[str, dict] +) -> List[dict]: + """ + Reconstruct line items from fields. + + :anchors: Possible fields to use as an anchor + :columns: All fields which are columns + :fields: List of field names to reconstruct table with + """ + line_items: List[dict] = [] + anchor = find_best_anchor(anchors, fields) + if not anchor: + print(Warning("Could not find an anchor!")) + return line_items + + # Loop on anchor items and create an item for each anchor item. + # This will create all rows with just the anchor column value. + for item in fields[anchor]["values"]: + line_item = {f: get_empty_field() for f in columns} + line_item[anchor] = item + line_items.append(line_item) + + # Loop on all created rows + for idx, _ in enumerate(line_items): + # Compute sliding window between anchor item and the next + min_y, _ = get_min_max_y(line_items[idx][anchor]["polygon"]) + if idx != len(line_items) - 1: + max_y, _ = get_min_max_y(line_items[idx + 1][anchor]["polygon"]) + else: + max_y = 1.0 # bottom of page + # Get candidates of each field included in sliding window and add it in line item + for field in columns: + field_words = [ + word + for word in fields[field]["values"] + if is_point_in_y(get_centroid(word["polygon"]), min_y, max_y) + ] + line_items[idx][field]["content"] = " ".join( + [v["content"] for v in field_words] + ) + try: + line_items[idx][field]["polygon"] = get_bounding_box_for_polygons( + [v["polygon"] for v in field_words] + ) + except ValueError: + pass + line_items[idx][field]["confidence"] = array_product( + [v["confidence"] for v in field_words] + ) + # Create coordinates and id attributes for frontend SDK of line item + all_polygons = [line_items[idx][anchor]["polygon"]] + for field in columns: + try: + all_polygons.append(line_items[idx][field]["polygon"]) + except IndexError: + pass + line_items[idx]["bounding_box"] = get_bounding_box_for_polygons(all_polygons) + line_items[idx]["id"] = idx + return line_items diff --git a/tests/documents/test_custom_v1_line_items.py b/tests/documents/test_custom_v1_line_items.py new file mode 100644 index 00000000..5d642d11 --- /dev/null +++ b/tests/documents/test_custom_v1_line_items.py @@ -0,0 +1,25 @@ +import json + +from mindee.documents import CustomV1 +from mindee.documents.custom.line_items import get_line_items +from tests import CUSTOM_DATA_DIR + + +def test_line_items(): + json_data_path = f"{CUSTOM_DATA_DIR}/response_v1/line_items/single_table_01.json" + json_data = json.load(open(json_data_path, "r")) + doc = CustomV1( + "field_test", api_prediction=json_data["document"]["inference"], page_n=None + ) + anchors = ["beneficiary_birth_date"] + columns = [ + "beneficiary_name", + "beneficiary_birth_date", + "beneficiary_rank", + "beneficiary_number", + ] + fields = json_data["document"]["inference"]["prediction"] + line_items = get_line_items(anchors, columns, fields) + assert line_items[0]["beneficiary_name"]["content"] == "JAMES BOND 007" + assert line_items[1]["beneficiary_name"]["content"] == "HARRY POTTER" + assert line_items[2]["beneficiary_name"]["content"] == "DRAGO MALFOY" From f323b776d2960df178b0aba5ce397a43e0686d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Fri, 30 Jun 2023 17:55:47 +0200 Subject: [PATCH 2/3] :sparkles: add basic support for line items --- mindee/documents/custom/line_items.py | 70 +++++++++++--------- mindee/geometry.py | 33 ++++++++- tests/documents/test_custom_v1_line_items.py | 18 +++-- tests/test_geometry.py | 11 ++- 4 files changed, 93 insertions(+), 39 deletions(-) diff --git a/mindee/documents/custom/line_items.py b/mindee/documents/custom/line_items.py index edc286af..3e3e25da 100644 --- a/mindee/documents/custom/line_items.py +++ b/mindee/documents/custom/line_items.py @@ -1,14 +1,16 @@ from typing import Dict, List, Sequence +from mindee.documents.custom.custom_v1_fields import ListField, ListFieldValue from mindee.geometry import ( - get_bounding_box_for_polygons, - get_centroid, + Quadrilateral, get_min_max_y, is_point_in_y, + merge_polygons_as_bounding_box, + merge_polygons_as_polygon, ) -def array_product(array: Sequence[float]) -> float: +def _array_product(array: Sequence[float]) -> float: """ Get the product of a sequence of floats. @@ -20,7 +22,7 @@ def array_product(array: Sequence[float]) -> float: return product -def find_best_anchor(anchors: Sequence[str], fields: Dict[str, dict]) -> str: +def _find_best_anchor(anchors: Sequence[str], fields: Dict[str, ListField]) -> str: """ Find the anchor with the most rows, in the order specified by `anchors`. @@ -29,21 +31,29 @@ def find_best_anchor(anchors: Sequence[str], fields: Dict[str, dict]) -> str: anchor = "" anchor_rows = 0 for field in anchors: - values = fields[field]["values"] + values = fields[field].values if len(values) > anchor_rows: anchor_rows = len(values) anchor = field return anchor -def get_empty_field() -> dict: +def _get_empty_field() -> ListFieldValue: """Return sample field with empty values.""" - return {"content": "", "polygon": [], "confidence": 0.0} + return ListFieldValue({"content": "", "polygon": [], "confidence": 0.0}) + + +class Line: + """Represent a single line.""" + + row_number: int + fields: Dict[str, ListFieldValue] + bounding_box: Quadrilateral def get_line_items( - anchors: Sequence[str], columns: Sequence[str], fields: Dict[str, dict] -) -> List[dict]: + anchors: Sequence[str], columns: Sequence[str], fields: Dict[str, ListField] +) -> List[Line]: """ Reconstruct line items from fields. @@ -51,53 +61,51 @@ def get_line_items( :columns: All fields which are columns :fields: List of field names to reconstruct table with """ - line_items: List[dict] = [] - anchor = find_best_anchor(anchors, fields) + line_items: List[Line] = [] + anchor = _find_best_anchor(anchors, fields) if not anchor: print(Warning("Could not find an anchor!")) return line_items # Loop on anchor items and create an item for each anchor item. # This will create all rows with just the anchor column value. - for item in fields[anchor]["values"]: - line_item = {f: get_empty_field() for f in columns} - line_item[anchor] = item + for item in fields[anchor].values: + line_item = Line() + line_item.fields = {f: _get_empty_field() for f in columns} + line_item.fields[anchor] = item line_items.append(line_item) # Loop on all created rows - for idx, _ in enumerate(line_items): + for idx, line in enumerate(line_items): # Compute sliding window between anchor item and the next - min_y, _ = get_min_max_y(line_items[idx][anchor]["polygon"]) + min_y, _ = get_min_max_y(line.fields[anchor].polygon) if idx != len(line_items) - 1: - max_y, _ = get_min_max_y(line_items[idx + 1][anchor]["polygon"]) + max_y, _ = get_min_max_y(line_items[idx + 1].fields[anchor].polygon) else: max_y = 1.0 # bottom of page # Get candidates of each field included in sliding window and add it in line item for field in columns: field_words = [ word - for word in fields[field]["values"] - if is_point_in_y(get_centroid(word["polygon"]), min_y, max_y) + for word in fields[field].values + if is_point_in_y(word.polygon.centroid, min_y, max_y) ] - line_items[idx][field]["content"] = " ".join( - [v["content"] for v in field_words] - ) + line.fields[field].content = " ".join([v.content for v in field_words]) try: - line_items[idx][field]["polygon"] = get_bounding_box_for_polygons( - [v["polygon"] for v in field_words] + line.fields[field].polygon = merge_polygons_as_polygon( + [v.polygon for v in field_words] ) except ValueError: pass - line_items[idx][field]["confidence"] = array_product( - [v["confidence"] for v in field_words] + line.fields[field].confidence = _array_product( + [v.confidence for v in field_words] ) - # Create coordinates and id attributes for frontend SDK of line item - all_polygons = [line_items[idx][anchor]["polygon"]] + all_polygons = [line.fields[anchor].polygon] for field in columns: try: - all_polygons.append(line_items[idx][field]["polygon"]) + all_polygons.append(line.fields[field].polygon) except IndexError: pass - line_items[idx]["bounding_box"] = get_bounding_box_for_polygons(all_polygons) - line_items[idx]["id"] = idx + line.bounding_box = merge_polygons_as_bounding_box(all_polygons) + line.row_number = idx return line_items diff --git a/mindee/geometry.py b/mindee/geometry.py index b1b16a85..d64166a2 100644 --- a/mindee/geometry.py +++ b/mindee/geometry.py @@ -28,6 +28,11 @@ class Quadrilateral(NamedTuple): bottom_left: Point """Bottom left Point""" + @property + def centroid(self) -> Point: + """The central point (centroid) of the quadrilateral.""" + return get_centroid(self) + class BBox(NamedTuple): """Contains exactly 4 coordinates.""" @@ -73,6 +78,11 @@ class Polygon(list): Inherits from base class ``list`` so is compatible with type ``Points``. """ + @property + def centroid(self) -> Point: + """The central point (centroid) of the polygon.""" + return get_centroid(self) + Points = Sequence[Point] @@ -132,7 +142,7 @@ def get_bbox(points: Points) -> BBox: return BBox(x_min, y_min, x_max, y_max) -def get_bounding_box_for_polygons(vertices: Sequence[Polygon]) -> Quadrilateral: +def merge_polygons_as_bounding_box(vertices: Sequence[Polygon]) -> Quadrilateral: """ Given a sequence of polygons, calculate a bounding box that encompasses all polygons. @@ -151,6 +161,27 @@ def get_bounding_box_for_polygons(vertices: Sequence[Polygon]) -> Quadrilateral: ) +def merge_polygons_as_polygon(vertices: Sequence[Polygon]) -> Polygon: + """ + Given a sequence of polygons, calculate a polygon box that encompasses all polygons. + + :param vertices: List of polygons + :return: A bounding box that encompasses all polygons + """ + y_min = min(y for v in vertices for _, y in v) + y_max = max(y for v in vertices for _, y in v) + x_min = min(x for v in vertices for x, _ in v) + x_max = max(x for v in vertices for x, _ in v) + return Polygon( + [ + Point(x_min, y_min), + Point(x_max, y_min), + Point(x_max, y_max), + Point(x_min, y_max), + ] + ) + + def get_centroid(points: Points) -> Point: """ Get the central point (centroid) given a sequence of points. diff --git a/tests/documents/test_custom_v1_line_items.py b/tests/documents/test_custom_v1_line_items.py index 5d642d11..d592ffe8 100644 --- a/tests/documents/test_custom_v1_line_items.py +++ b/tests/documents/test_custom_v1_line_items.py @@ -5,7 +5,7 @@ from tests import CUSTOM_DATA_DIR -def test_line_items(): +def test_single_table_01(): json_data_path = f"{CUSTOM_DATA_DIR}/response_v1/line_items/single_table_01.json" json_data = json.load(open(json_data_path, "r")) doc = CustomV1( @@ -18,8 +18,14 @@ def test_line_items(): "beneficiary_rank", "beneficiary_number", ] - fields = json_data["document"]["inference"]["prediction"] - line_items = get_line_items(anchors, columns, fields) - assert line_items[0]["beneficiary_name"]["content"] == "JAMES BOND 007" - assert line_items[1]["beneficiary_name"]["content"] == "HARRY POTTER" - assert line_items[2]["beneficiary_name"]["content"] == "DRAGO MALFOY" + line_items = get_line_items(anchors, columns, doc.fields) + assert len(line_items) == 3 + assert line_items[0].fields["beneficiary_name"].content == "JAMES BOND 007" + assert line_items[0].fields["beneficiary_birth_date"].content == "1970-11-11" + assert line_items[0].row_number == 0 + assert line_items[1].fields["beneficiary_name"].content == "HARRY POTTER" + assert line_items[1].fields["beneficiary_birth_date"].content == "2010-07-18" + assert line_items[1].row_number == 1 + assert line_items[2].fields["beneficiary_name"].content == "DRAGO MALFOY" + assert line_items[2].fields["beneficiary_birth_date"].content == "2015-07-05" + assert line_items[2].row_number == 2 diff --git a/tests/test_geometry.py b/tests/test_geometry.py index b6aee4ca..2a26ee28 100644 --- a/tests/test_geometry.py +++ b/tests/test_geometry.py @@ -87,9 +87,18 @@ def test_get_centroid(rectangle_a): def test_bounding_box_several_polygons(rectangle_b, quadrangle_a): - assert geometry.get_bounding_box_for_polygons((rectangle_b, quadrangle_a)) == ( + assert geometry.merge_polygons_as_bounding_box((rectangle_b, quadrangle_a)) == ( (0.124, 0.407), (0.381, 0.407), (0.381, 0.546), (0.124, 0.546), ) + + +def test_polygon_merge(rectangle_b, quadrangle_a): + assert geometry.merge_polygons_as_polygon((rectangle_b, quadrangle_a)) == [ + (0.124, 0.407), + (0.381, 0.407), + (0.381, 0.546), + (0.124, 0.546), + ] From ad700a6047558741384585ec5f3916c95b4c4444 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Mon, 3 Jul 2023 10:47:21 +0200 Subject: [PATCH 3/3] remove redundant merge_polygons_as_bounding_box --- mindee/documents/custom/line_items.py | 8 ++++---- mindee/geometry.py | 21 +-------------------- tests/test_geometry.py | 5 +++-- 3 files changed, 8 insertions(+), 26 deletions(-) diff --git a/mindee/documents/custom/line_items.py b/mindee/documents/custom/line_items.py index 3e3e25da..3989614e 100644 --- a/mindee/documents/custom/line_items.py +++ b/mindee/documents/custom/line_items.py @@ -3,10 +3,10 @@ from mindee.documents.custom.custom_v1_fields import ListField, ListFieldValue from mindee.geometry import ( Quadrilateral, + get_bounding_box, get_min_max_y, is_point_in_y, - merge_polygons_as_bounding_box, - merge_polygons_as_polygon, + merge_polygons, ) @@ -92,7 +92,7 @@ def get_line_items( ] line.fields[field].content = " ".join([v.content for v in field_words]) try: - line.fields[field].polygon = merge_polygons_as_polygon( + line.fields[field].polygon = merge_polygons( [v.polygon for v in field_words] ) except ValueError: @@ -106,6 +106,6 @@ def get_line_items( all_polygons.append(line.fields[field].polygon) except IndexError: pass - line.bounding_box = merge_polygons_as_bounding_box(all_polygons) + line.bounding_box = get_bounding_box(merge_polygons(all_polygons)) line.row_number = idx return line_items diff --git a/mindee/geometry.py b/mindee/geometry.py index d64166a2..7f76149b 100644 --- a/mindee/geometry.py +++ b/mindee/geometry.py @@ -142,26 +142,7 @@ def get_bbox(points: Points) -> BBox: return BBox(x_min, y_min, x_max, y_max) -def merge_polygons_as_bounding_box(vertices: Sequence[Polygon]) -> Quadrilateral: - """ - Given a sequence of polygons, calculate a bounding box that encompasses all polygons. - - :param vertices: List of polygons - :return: A bounding box that encompasses all polygons - """ - y_min = min(y for v in vertices for _, y in v) - y_max = max(y for v in vertices for _, y in v) - x_min = min(x for v in vertices for x, _ in v) - x_max = max(x for v in vertices for x, _ in v) - return Quadrilateral( - Point(x_min, y_min), - Point(x_max, y_min), - Point(x_max, y_max), - Point(x_min, y_max), - ) - - -def merge_polygons_as_polygon(vertices: Sequence[Polygon]) -> Polygon: +def merge_polygons(vertices: Sequence[Polygon]) -> Polygon: """ Given a sequence of polygons, calculate a polygon box that encompasses all polygons. diff --git a/tests/test_geometry.py b/tests/test_geometry.py index 2a26ee28..4c98d19b 100644 --- a/tests/test_geometry.py +++ b/tests/test_geometry.py @@ -87,7 +87,8 @@ def test_get_centroid(rectangle_a): def test_bounding_box_several_polygons(rectangle_b, quadrangle_a): - assert geometry.merge_polygons_as_bounding_box((rectangle_b, quadrangle_a)) == ( + merged = geometry.merge_polygons((rectangle_b, quadrangle_a)) + assert geometry.get_bounding_box(merged) == ( (0.124, 0.407), (0.381, 0.407), (0.381, 0.546), @@ -96,7 +97,7 @@ def test_bounding_box_several_polygons(rectangle_b, quadrangle_a): def test_polygon_merge(rectangle_b, quadrangle_a): - assert geometry.merge_polygons_as_polygon((rectangle_b, quadrangle_a)) == [ + assert geometry.merge_polygons((rectangle_b, quadrangle_a)) == [ (0.124, 0.407), (0.381, 0.407), (0.381, 0.546),