Skip to content

Commit

Permalink
feat: table threshold option now enabled by default
Browse files Browse the repository at this point in the history
  • Loading branch information
yshalenyk committed Apr 22, 2021
1 parent 9e85095 commit 42283e6
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 10 deletions.
22 changes: 15 additions & 7 deletions spoonbill/spec.py
Expand Up @@ -4,7 +4,7 @@
from typing import List, Mapping, Sequence

from spoonbill.common import DEFAULT_FIELDS
from spoonbill.utils import combine_path, generate_table_name, get_root, prepare_title
from spoonbill.utils import combine_path, common_prefix, generate_table_name, get_root, prepare_title

LOGGER = logging.getLogger("spoonbill")

Expand Down Expand Up @@ -106,7 +106,6 @@ def add_column(
parent,
combined_only=False,
additional=False,
joinable=False,
):
"""Add new column to the table
Expand All @@ -116,32 +115,41 @@ def add_column(
:param parent: Parent object schema description
:param combined_only: Make this column available only in combined version of table
:param additional: Mark this column as missing in schema
:param joinable: Mark this column as array of strings
"""
title = prepare_title(item, parent)
column = Column(title, item_type, path)
root = get_root(self)
# combined_path = combine_path(root, path) if not joinable else path
is_array = self.is_array(path)
combined_path = combine_path(root, path)

column = Column(title, item_type, path)
self.combined_columns[combined_path] = Column(title, item_type, combined_path)

for p in (path, combined_path):
self.titles[p] = title

if not combined_only:
self.columns[path] = column
if combined_only and is_array:
self.columns[combined_path] = Column(title, item_type, combined_path)
if not self.is_root:
root.add_column(
self.parent.add_column(
path,
item,
item_type,
parent=parent,
combined_only=True,
joinable=joinable,
)

if additional:
self.additional_columns[path] = column

def is_array(self, path):
    """Check whether ``path`` lies inside one of the root table's arrays.

    :param path: JSON-pointer-like column path to test
    :return: The matching array path when one contains ``path``, otherwise False
    """
    root_arrays = get_root(self).arrays
    # An array "contains" the path when the array path is a full prefix of it.
    matches = (a for a in root_arrays if common_prefix(a, path) == a)
    return next(matches, False)

def inc_column(self, header, combined=False):
"""Increment data counter in column
Expand Down
7 changes: 5 additions & 2 deletions spoonbill/stats.py
Expand Up @@ -124,7 +124,7 @@ def parse_schema(self):
# This means we are in an array of strings, so this becomes a single joinable column
type_ = ARRAY.format(items_type)
self.current_table.types[pointer] = JOINABLE
self.current_table.add_column(pointer, item, type_, parent=prop, joinable=True)
self.current_table.add_column(pointer, item, type_, parent=prop)
else:
if self.current_table.is_combined:
pointer = separator + separator.join((parent_key, key))
Expand Down Expand Up @@ -239,7 +239,7 @@ def process_items(self, releases, with_preview=True):
else:
root = get_root(self.current_table)
if root.set_array(pointer, item):
recalculate_headers(root, abs_path, key, item, separator)
recalculate_headers(root, abs_path, key, item, self.table_threshold, separator)

for i, value in enumerate(item):
if isinstance(value, dict):
Expand Down Expand Up @@ -280,6 +280,9 @@ def process_items(self, releases, with_preview=True):
self.current_table.inc_column(pointer)
root.inc_column(abs_pointer, combined=True)
if with_preview and count < PREVIEW_ROWS:
array = root.is_array(pointer)
if array and root.arrays[array] < self.table_threshold:
root.preview_rows[-1][abs_pointer] = item
self.current_table.preview_rows[-1][pointer] = item
root.preview_rows_combined[-1][abs_pointer] = item
yield count
Expand Down
10 changes: 9 additions & 1 deletion spoonbill/utils.py
Expand Up @@ -200,20 +200,24 @@ def generate_row_id(ocid, item_id, parent_key=None, top_level_id=None):
return f"{ocid}/{tail}"


def recalculate_headers(root, abs_path, key, item, separator="/"):
def recalculate_headers(root, abs_path, key, item, max_items, separator="/"):
"""Rebuild table headers when array is expanded with attempt to preserve order
Also deletes combined columns from tables columns if array becomes bigger than threshold
:param root: Table for which headers should be rebuild
:param abs_path: Full jsonpath to array
:param key: Array field name
:param item: Array items
:param max_items: Maximum elements in array before it should be split into table
:param separator: header path separator
"""
head = OrderedDict()
tail = OrderedDict()
cols = head
base_prefix = separator.join((abs_path, key))
zero_prefix = separator.join((base_prefix, "0"))
should_split = len(item) > max_items

zero_cols = {
col_p: col
Expand All @@ -237,6 +241,10 @@ def recalculate_headers(root, abs_path, key, item, separator="/"):
else:
if col_p not in cols:
cols[col_p] = col
if should_split:
for col_path in chain(zero_cols, new_cols):
root.columns.pop(col_path, "")

for col_path, col in chain(head.items(), tail.items()):
root.combined_columns[col_path] = col

Expand Down
20 changes: 20 additions & 0 deletions tests/data/ocds-sample-data.json
Expand Up @@ -573,6 +573,26 @@
"id": "E09000003",
"legalName": "London Borough of Barnet",
"scheme": "GB-LAC"
},
{
"id": "E09000003",
"legalName": "London Borough of Barnet",
"scheme": "GB-LAC"
},
{
"id": "E09000003",
"legalName": "London Borough of Barnet",
"scheme": "GB-LAC"
},
{
"id": "E09000003",
"legalName": "London Borough of Barnet",
"scheme": "GB-LAC"
},
{
"id": "E09000003",
"legalName": "London Borough of Barnet",
"scheme": "GB-LAC"
}
],
"address": {
Expand Down
10 changes: 10 additions & 0 deletions tests/test_spec.py
Expand Up @@ -56,6 +56,16 @@ def test_set_array(root_table):
assert items == 10


def test_is_array(root_table):
    """Paths under a table array are detected; plain fields are not."""
    inside_arrays = (
        "/tender/items",
        "/tender/items/id",
        "/tender/items/0/id",
        "/tender/items/additionalClassifications",
    )
    outside_arrays = ("/tender/id", "/tender/title", "/tender/submissionMethod")

    for path in inside_arrays:
        assert root_table.is_array(path)
    for path in outside_arrays:
        assert not root_table.is_array(path)


def test_add_child_table(root_table):
data = root_table.dump()
assert not data["parent"]
Expand Down

0 comments on commit 42283e6

Please sign in to comment.