Merge pull request #164 from mariob0y/feat/schema-headers
feat: extracting headers from schema
mariob0y committed Sep 9, 2021
2 parents e5ad26c + a6aa558 commit 09f6f8a
Showing 12 changed files with 315 additions and 60 deletions.
1 change: 1 addition & 0 deletions setup.py
@@ -30,6 +30,7 @@
"xlsxwriter",
'dataclasses;python_version<"3.7"',
"flatten-dict",
"scalpl",
],
extras_require={
"test": [
41 changes: 31 additions & 10 deletions spoonbill/__init__.py
@@ -6,7 +6,7 @@
from ocdsextensionregistry import ProfileBuilder
from ocdskit.util import detect_format

from spoonbill.common import COMBINED_TABLES, CURRENT_SCHEMA_TAG, ROOT_TABLES, TABLE_THRESHOLD
from spoonbill.common import COMBINED_TABLES, CURRENT_SCHEMA_TAG, DEFAULT_SCHEMA_URL, ROOT_TABLES, TABLE_THRESHOLD
from spoonbill.flatten import Flattener
from spoonbill.i18n import LOCALE, _
from spoonbill.stats import DataPreprocessor
@@ -18,7 +18,6 @@

class FileAnalyzer:
"""Main utility for analyzing files
:param workdir: Working directory
:param schema: Json schema file to use with data
:param root_tables: Path configuration which should become root tables
@@ -82,6 +81,7 @@ def analyze_file(self, filenames, with_preview=True):
language=self.language,
table_threshold=self.table_threshold,
multiple_values=self.multiple_values,
pkg_type=self.pkg_type,
)
reader = get_reader(path)
with reader(path, "rb") as fd:
@@ -92,7 +92,6 @@

def dump_to_file(self, filenames):
"""Save analyzed information to file
:param filename: Output filename in working directory
"""
if not isinstance(filenames, list):
@@ -110,9 +109,10 @@ def parse_schema(self, input_format, schema=None):
else:
pkg_type = "records"
getter = attrgetter("record_package_schema")
url = DEFAULT_SCHEMA_URL[pkg_type][self.language]
if not schema:
LOGGER.info(_("No schema provided, using version {}").format(CURRENT_SCHEMA_TAG))
profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {})
profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {}, schema_base_url=url)
schema = getter(profile)()
title = schema.get("title", "").lower()
if not title:
@@ -151,7 +151,6 @@ def sort_tables(self):

class FileFlattener:
"""Main utility for flattening files
:param workdir: Working directory
:param options: Flattening configuration
:param analyzer: Analyzed data object
@@ -172,6 +171,7 @@ def __init__(
xlsx="result.xlsx",
language=LOCALE,
multiple_values=False,
schema=None,
):
self.tables = tables if tables else analyzer.spec.tables
self.flattener = Flattener(options, self.tables, language=language)
@@ -182,6 +182,7 @@
self.xlsx = xlsx
self.multiple_values = multiple_values if multiple_values else analyzer.multiple_values if analyzer else False
self.pkg_type = pkg_type if pkg_type else analyzer.pkg_type if analyzer else "releases"
self.schema = schema or getattr(getattr(analyzer, "spec"), "schema")

def _flatten(self, filenames, writers):
if not isinstance(filenames, list):
@@ -200,25 +201,45 @@ def _flatten(self, filenames, writers):

def flatten_file(self, filename):
"""Flatten file
:param filename: Input filename in working directory
"""
workdir = self.workdir

if isinstance(self.csv, Path):
workdir = self.csv
if not self.xlsx and self.csv:
with CSVWriter(workdir, self.flattener.tables, self.flattener.options) as writer:
with CSVWriter(
workdir,
self.flattener.tables,
self.flattener.options,
schema=self.schema,
) as writer:
for count in self._flatten(filename, [writer]):
yield count
if self.xlsx and not self.csv:
with XlsxWriter(self.workdir, self.flattener.tables, self.flattener.options, filename=self.xlsx) as writer:
with XlsxWriter(
self.workdir,
self.flattener.tables,
self.flattener.options,
filename=self.xlsx,
schema=self.schema,
) as writer:
for count in self._flatten(filename, [writer]):
yield count

if self.xlsx and self.csv:
with XlsxWriter(
self.workdir, self.flattener.tables, self.flattener.options, filename=self.xlsx
) as xlsx, CSVWriter(workdir, self.flattener.tables, self.flattener.options) as csv:
self.workdir,
self.flattener.tables,
self.flattener.options,
filename=self.xlsx,
schema=self.schema,
) as xlsx, CSVWriter(
workdir,
self.flattener.tables,
self.flattener.options,
schema=self.schema,
) as csv:
for count in self._flatten(filename, [xlsx, csv]):
yield count

15 changes: 15 additions & 0 deletions spoonbill/common.py
@@ -45,3 +45,18 @@
JOINABLE_SEPARATOR = ";"
TABLE_THRESHOLD = 5
CURRENT_SCHEMA_TAG = "1__1__5"
CURRENT_URL_TAG = "1.1"
DEFAULT_SCHEMA_URL = {
"releases": {
"en": f"https://standard.open-contracting.org/{CURRENT_URL_TAG}/en/release-package-schema.json",
"es": f"https://standard.open-contracting.org/{CURRENT_URL_TAG}/es/release-package-schema.json",
"en_US": f"https://standard.open-contracting.org/{CURRENT_URL_TAG}/en/release-package-schema.json",
"es_ES": f"https://standard.open-contracting.org/{CURRENT_URL_TAG}/es/release-package-schema.json",
},
"records": {
"en": f"https://standard.open-contracting.org/{CURRENT_URL_TAG}/en/record-package-schema.json",
"es": f"https://standard.open-contracting.org/{CURRENT_URL_TAG}/es/record-package-schema.json",
"en_US": f"https://standard.open-contracting.org/{CURRENT_URL_TAG}/en/record-package-schema.json",
"es_ES": f"https://standard.open-contracting.org/{CURRENT_URL_TAG}/es/record-package-schema.json",
},
}
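The new `DEFAULT_SCHEMA_URL` mapping is keyed by package type and then by locale; in `parse_schema` above it is resolved with `DEFAULT_SCHEMA_URL[pkg_type][self.language]` and handed to `ProfileBuilder` as `schema_base_url`, so the localized package schema is used when none is supplied. A minimal lookup sketch (the `"records"` / `"es"` inputs are illustrative, not taken from this commit):

```python
from spoonbill.common import DEFAULT_SCHEMA_URL

pkg_type, language = "records", "es"          # illustrative inputs
url = DEFAULT_SCHEMA_URL[pkg_type][language]
print(url)
# https://standard.open-contracting.org/1.1/es/record-package-schema.json
```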
11 changes: 7 additions & 4 deletions spoonbill/spec.py
@@ -25,6 +25,7 @@ class Column:
type: str
id: str
hits: int = 0
header: list = field(default_factory=list)


@dataclass
@@ -130,6 +131,7 @@ def add_column(
propagate=True,
additional=False,
abs_path=None,
header=[]
):
"""
Add a new column to the table.
@@ -145,20 +147,20 @@
is_array = self.is_array(path)
combined_path = combine_path(self, path)
if not combined_only:
self.columns[combined_path] = Column(title, item_type, combined_path)
self.columns[combined_path] = Column(title, item_type, combined_path, header=header)
# new column to track hits differently
self.combined_columns[combined_path] = Column(title, item_type, combined_path)
self.combined_columns[combined_path] = Column(title, item_type, combined_path, header=header)

if additional:
if is_array:
# when analyzing a file we need to keep the index from the data, not use 0
# e.g. /tender/items/166/relatedLot
combined_path = abs_path
LOGGER.debug(_("Detected additional column: %s in %s table") % (path, self.name))
self.additional_columns[combined_path] = Column(title, item_type, combined_path)
self.additional_columns[combined_path] = Column(title, item_type, combined_path, header=header)

for p in (path, combined_path):
self.titles[p] = title
self.titles[p] = header
if not self.is_root and propagate:
self.parent.add_column(
path,
@@ -167,6 +169,7 @@
combined_only=combined_only,
additional=additional,
abs_path=abs_path,
header=header,
)

def is_array(self, path):
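In `spec.py`, every `Column` now carries a `header` list next to its path-based identifier, `add_column` threads that list through (including propagation to parent tables), and `Table.titles` stores the header instead of the path title, so writers can label columns with the schema's human-readable titles. Illustration only; the first field being named `title` and the header being a list of per-segment schema titles are assumptions, not shown in these hunks.

```python
from spoonbill.spec import Column

col = Column(
    title="/tender/title",        # path-style identifier, as before
    type="string",
    id="/tender/title",
    header=["Tender", "Title"],   # assumed: human-readable titles extracted from the schema
)
print(col.header)                 # ['Tender', 'Title']
```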
18 changes: 15 additions & 3 deletions spoonbill/stats.py
@@ -15,6 +15,7 @@
from spoonbill.utils import (
PYTHON_TO_JSON_TYPE,
RepeatFilter,
add_paths_to_schema,
extract_type,
generate_table_name,
get_matching_tables,
@@ -54,6 +55,7 @@ def __init__(
header_separator="/",
language=LOCALE,
multiple_values=False,
pkg_type=None,
):
self.schema = schema
self.root_tables = root_tables
@@ -70,6 +72,7 @@
self.names_counter = defaultdict(int)
if not self.tables:
self.parse_schema()
self.pkg_type = pkg_type

def __getitem__(self, table):
return self.tables[table]
@@ -101,7 +104,8 @@ def parse_schema(self):
if self.combined_tables:
self.init_tables(self.combined_tables, is_combined=True)
separator = self.header_separator
to_analyze = deque([("", "", {}, self.schema)])
proxy = add_paths_to_schema(self.schema)
to_analyze = deque([("", "", {}, proxy)])

# TODO: check if recursion is better for field ordering
while to_analyze:
@@ -112,6 +116,8 @@
properties = prop.get("properties", {})
if properties:
for key, item in properties.items():
if key in ("$title", "$path"):
continue
if item.get("deprecated"):
continue
if hasattr(item, "__reference__") and item.__reference__.get("deprecated"):
@@ -138,11 +144,17 @@
# This means we are in an array of strings, so this becomes a single joinable column
typeset = ARRAY.format(items_type)
self.current_table.types[pointer] = JOINABLE
self.current_table.add_column(pointer, typeset, _(pointer, self.language))
self.current_table.add_column(
pointer, typeset, _(pointer, self.language), header=item["$title"]
)
else:
if self.current_table.is_combined:
pointer = separator + separator.join((parent_key, key))
self.current_table.add_column(pointer, typeset, _(pointer, self.language))

self.current_table.add_column(
pointer, typeset, _(pointer, self.language), header=item["$title"]
)

else:
# TODO: not sure what to do here
continue
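`add_paths_to_schema` itself lives in `spoonbill/utils.py`, which is not among the files shown here. From the way `parse_schema` uses it, it returns an annotated view of the schema in which properties carry `$path` and `$title` entries (the `scalpl` dependency added in `setup.py` presumably supports that navigation), and it appears to annotate every nesting level, which is why `parse_schema` now skips literal `$title` / `$path` keys. The sketch below is a simplified stand-in for that idea, not the code from this commit; it only annotates the property schemas themselves.

```python
def annotate_with_paths(schema, path="", titles=()):
    """Recursively add $path/$title annotations to every schema property.

    Simplified stand-in for spoonbill.utils.add_paths_to_schema (not shown in
    this diff); the real helper may differ, e.g. by returning a scalpl proxy.
    """
    for container in (schema, schema.get("items", {})):
        if not isinstance(container, dict):
            continue
        for key, prop in container.get("properties", {}).items():
            if key in ("$path", "$title") or not isinstance(prop, dict):
                continue
            prop_path = f"{path}/{key}"
            prop_titles = (*titles, prop.get("title", key))
            prop["$path"] = prop_path
            prop["$title"] = list(prop_titles)  # read back as item["$title"] in parse_schema
            annotate_with_paths(prop, prop_path, prop_titles)
    return schema
```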
