Skip to content

Commit

Permalink
test: Refactor calculate_coverage and fix non-lowercase pointers
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed May 6, 2022
1 parent 847ad8b commit 9e6cdb7
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 111 deletions.
5 changes: 5 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ Changed

- :func:`~ocdskingfishercolab.save_dataframe_to_sheet` and :func:`~ocdskingfishercolab.save_dataframe_to_spreadsheet` do nothing if the data frame is empty.

Fixed
~~~~~

- :func:`~ocdskingfishercolab.calculate_coverage` uses the ``relatedprocesses_summary`` table for fields starting with ``relatedProcesses/``, where appropriate.

0.3.8 (2022-04-27)
------------------

Expand Down
244 changes: 134 additions & 110 deletions ocdskingfishercolab/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,156 +375,180 @@ def render_json(json_string):
def calculate_coverage(fields, scope=None, sql=True, sql_only=False):
"""
Calculates the coverage of one or more fields using the summary tables produced by Kingfisher Summarize's
`--field-lists` option. Returns the coverage of each field and the co-occurrence coverage of all the fields.
``--field-lists`` option. Returns the coverage of each field and the co-occurrence coverage of all fields.
`scope` is the Kingfisher Summarize table to measure coverage against, e.g. `awards_summary`.
The number of rows in this table is used as the denominator when calculating the coverage.
``scope`` is the Kingfisher Summarize table to measure coverage against, e.g. ``"awards_summary"``.
Coverage is calculated using the number of rows in this table as the denominator.
`fields` is a list of fields to measure the coverage of, specified using JSON Pointer.
If ``scope`` is not set, it defaults to the parent table of the first field.
To specify fields that are children of the scope table, you can use either an absolute pointer or a relative
pointer prefixed with `:`, e.g. if `scope` is set to 'awards_summary', then `awards/value/amount` and
`:value/amount` refer to the same field. Coverage of such fields is measured against the number of rows
in the `scope` table.
``fields`` is a list of fields to measure the coverage of, specified using JSON Pointer.
To specify fields that are not children of the scope table, use an absolute path,
e.g. `tender/procurementMethod`. Coverage of such fields is measured against the number of releases/records.
If a field isn't a child of the ``scope`` table, use an absolute pointer:
For arrays, a field is counted if it appears in **any** object in the array,
e.g. if `scope` is set to `awards_summary` and `field` is set to `:items/description`,
at least one item must have a description for the coverage to be non-zero.
.. code-block:: python
To specify that a field must appear in **all** objects in the array, prepend the field with `ALL `,
e.g. if `scope` is set to `awards_summary` and `field` is set to `ALL :items/description`,
all items must have a description for the coverage to be non-zero.
calculate_coverage(["tender/procurementMethod"], "awards_summary")
If `scope` is set to `awards_summary`, specify fields on related contracts by prefixing the path with
`:contracts/`, e.g. to measure how many awards have a value and a related contract with a period, set `scope`
to `awards` and `fields` to `[':value', ':contracts/period']`. Similarly, if `scope` is set to
`contracts_summary`, specify fields on related awards by prefixing the path with `:awards/`.
If a field is a child of the ``scope`` table, use either an absolute pointer:
.. code-block:: python
:param list fields: a list of fields as described above.
:param str scope: table name as described above; defaults to the parent table of the first item in the fields list.
:param bool sql: print the SQL query generated by the function.
calculate_coverage(["awards/value/amount"], "awards_summary")
:returns: the coverage of each field and the co-occurrence coverage of all the fields as a pandas DataFrame or an
ipython-sql :ipython-sql:`ResultSet<src/sql/run.py#L99>`, depending on whether
``%config SqlMagic.autopandas`` is ``True`` or ``False`` respectively. This is the same behaviour as
ipython-sql's ``%sql`` magic.
:rtype: pandas.DataFrame or sql.run.ResultSet
"""
Or a relative pointer (prepend with ``":"``):
def get_table_and_path(field, scope_table):
.. code-block:: python
if field.startswith(':'):
return scope_table, field[1:]
calculate_coverage([":value/amount"], "awards_summary")
path = field.split("/")
table_candidates = [
"_".join(path[:-i]) for i in reversed(range(1, len(path)))
]
table = "release_summary"
If a field is within an array, it counts if it appears in **any** object in the array.
for num, table_candidate in enumerate(table_candidates):
if scope_table[:-8] == table_candidate: # remove "_summary" from `scope_table`
path = path[num+1:]
table = scope_table
break
return table, "/".join(path)
.. code-block:: python
def get_scope_table(field):
calculate_coverage([":items/description"], "awards_summary")
all_tables = _all_tables()
path = field.split("/")
table_candidates = {
"_".join(path[:-i]) for i in range(1, len(path))
}
table = "release"

for table_candidate in table_candidates:
if f"{table_candidate}_summary" in all_tables:
table = table_candidate
return f"{table}_summary"

def coverage_wrapper(condition, field):
field_name = field.replace("/", "_").replace(" ", "_").lower()
return f"ROUND(SUM(CASE WHEN {condition} THEN 1 ELSE 0 END) * 100.0 / count(*), 2) AS {field_name}_percentage"

def any_condition(field, current_scope_table):
return f"{current_scope_table}.field_list ? '{field}'"

def all_condition(field, current_scope_table):
split_field = field.split("/")
one_to_manys = [field for field in split_field[:-1] if field.endswith("s")]

if not one_to_manys:
nearest_parent_one_to_many = split_field[0]
else:
nearest_parent_one_to_many = one_to_manys[-1]
To require a field to appear in **all** objects in the array, prepend with ``"ALL "``:
if len(one_to_manys) > 1:
print(
'WARNING: Results might be inaccurate due to nested arrays. Check that there is exactly one '
f"`{'/'.join(one_to_manys[:-1])}` entry per `{current_scope_table[:-8]}`."
)
.. code-block:: python
return f"""coalesce({current_scope_table}.field_list->>'{field}' =
{current_scope_table}.field_list->>'{nearest_parent_one_to_many}', false)"""
calculate_coverage(["ALL :items/description"], "awards_summary")
def release_summary_join(scope_table, join_to_release):
if not join_to_release:
return ""
return f"""JOIN
release_summary ON release_summary.id = {scope_table}.id"""
.. note::
if not scope:
field = fields[0].split()[-1]
scope = get_scope_table(field)
Nested arrays, like the ``"awards/items/description"`` field with a ``"release_summary"`` scope, will yield
inaccurate results, unless the initial arrays are present and one-to-one with the scope table (i.e. there is
always exactly one award for each release).
scope_table = scope
If ``scope`` is ``"awards_summary"``, you can specify fields on related contracts by prepending ``":contracts/"``:
join_to_release = False
.. code-block:: python
conditions = []
calculate_coverage([":value/amount", ":contracts/period"], "awards_summary")
query_parts = []
If ``scope`` is ``"contracts_summary"``, you can specify fields on related awards by prepending ``":awards/"``:
for field in fields:
split_field = field.split()
field_name = split_field[-1]
.. code-block:: python
calculate_coverage([":value/amount", ":awards/date"], "contracts_summary")
:param list fields: the fields to measure coverage of
:param str scope: the table to measure coverage against
:param bool sql: print the SQL query
:param bool sql_only: return the SQL query instead of the results
:returns: the results as a pandas DataFrame or an ipython-sql :ipython-sql:`ResultSet<src/sql/run.py#L99>`,
depending on whether ``%config SqlMagic.autopandas`` is ``True`` or ``False`` respectively. This is the
same behaviour as ipython-sql's ``%sql`` magic.
:rtype: pandas.DataFrame or sql.run.ResultSet
"""

def get_table_and_pointer(scope, pointer):
# Handle relative pointers.
if pointer.startswith(":"):
return scope, pointer[1:]

table, path = get_table_and_path(field_name, scope)
# Handle absolute pointers.
parts = pointer.split("/")
table = "release_summary"

# Abbreviate absolute pointers to relative pointers if the pointer is on the scope table.
# For example: "awards/date" to "date" if the scope is "awards_summary".
for i in range(len(parts), 0, -1):
# Kingfisher Summarize tables are lowercase.
candidate = f"{'_'.join(parts[:i])}_summary".lower()
if scope == candidate:
parts = parts[i:]
table = scope
break

if table == "release_summary" and scope_table != "release_summary":
join_to_release = True
return table, "/".join(parts)

if len(split_field) == 2 and split_field[0].lower() == "all":
condition = all_condition(path, table)
def wrap(condition, pointer):
alias = pointer.replace("/", "_").lower()
return f"ROUND(SUM(CASE WHEN {condition} THEN 1 ELSE 0 END) * 100.0 / count(*), 2) AS {alias}_percentage"

# Default to the parent table of the first field.
if not scope:
all_tables = _all_tables()
parts = fields[0].split()[-1].split("/")
scope = "release_summary"

for i in range(len(parts), 0, -1):
# Kingfisher Summarize tables are lowercase.
candidate = f"{'_'.join(parts[:i])}_summary".lower()
if candidate in all_tables:
scope = candidate
break

join_release_summary = False
columns = []
conditions = []
for field in fields:
split = field.split()

table, pointer = get_table_and_pointer(scope, split[-1])

if table == "release_summary" and scope != "release_summary":
join_release_summary = True

# If the first token isn't "ALL" or if there are more than 2 tokens, behave as if only the last token was provided.
if len(split) == 2 and split[0].lower() == "all":
parts = pointer.split("/")
# https://github.com/open-contracting/kingfisher-colab/issues/62
one_to_manys = [part for part in parts[:-1] if part.endswith("s")]

if not one_to_manys:
nearest_one_to_many_parent = parts[0]
else:
nearest_one_to_many_parent = one_to_manys[-1]

if len(one_to_manys) > 1:
print(
'WARNING: Results might be inaccurate due to nested arrays. Check that there is exactly one '
f"`{'/'.join(one_to_manys[:-1])}` entry per `{table[:-8]}`."
)

# https://github.com/open-contracting/kingfisher-colab/issues/63#issuecomment-1120005015
condition = (
f"coalesce({table}.field_list->>'{pointer}' =\n"
f" {table}.field_list->>'{nearest_one_to_many_parent}', false)"
)
else:
condition = any_condition(path, table)
# Test for the presence of the field.
# https://www.postgresql.org/docs/11/functions-json.html
condition = f"{table}.field_list ? '{pointer}'"

# Add the field coverage.
columns.append(wrap(condition, pointer))

# Collect the conditions for co-occurrence coverage.
conditions.append(condition)
query_parts.append(coverage_wrapper(condition, path))

query_parts.append(
coverage_wrapper(" AND\n ".join(conditions), "total")
)
# Add the co-occurrence coverage.
columns.append(wrap(" AND\n ".join(conditions), "total"))

select = ",\n ".join(columns)
if join_release_summary:
join = f"JOIN\n release_summary ON release_summary.id = {scope}.id"
else:
join = ""

select = ",\n ".join(query_parts)
select = textwrap.dedent(f"""\
query = textwrap.dedent(f"""\
SELECT
count(*) AS total_{scope},
{select}
FROM {scope_table}
{release_summary_join(scope_table, join_to_release)}
FROM {scope}
{join}
""")

if sql:
print(select)
print(query)

if sql_only:
return select
return get_ipython().run_cell_magic("sql", "", select)
return query

return get_ipython().run_cell_magic("sql", "", query)


class OCDSKingfisherColabError(Exception):
Expand Down
1 change: 0 additions & 1 deletion tests/test_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,6 @@ def test_calculate_coverage_default_scope_tender_documents(db, tmpdir):
""") # noqa: E501


@pytest.mark.xfail()
@patch('ocdskingfishercolab._all_tables', _all_tables)
def test_calculate_coverage_default_scope_related_processes(db, tmpdir):
sql = calculate_coverage(["relatedProcesses/relationship"], sql_only=True)
Expand Down

0 comments on commit 9e6cdb7

Please sign in to comment.