Skip to content

Commit

Permalink
Merge pull request #55 from open-contracting/coverage-queries
Browse files Browse the repository at this point in the history
Coverage:  Add `calculate_coverage` function.
  • Loading branch information
kindly committed Feb 11, 2022
2 parents 623bea7 + 52b5a11 commit 5cff5b0
Show file tree
Hide file tree
Showing 2 changed files with 187 additions and 3 deletions.
166 changes: 166 additions & 0 deletions ocdskingfishercolab/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
import textwrap
import warnings
from urllib.parse import urljoin

Expand Down Expand Up @@ -365,6 +366,171 @@ def render_json(json_string):
""")


def calculate_coverage(fields, scope=None, sql=True, sql_only=False):
"""
Calculates the coverage of one or more fields using the summary tables produced by Kingfisher Summarize's
`--field-lists` option. Returns the coverage of each field and the co-occurrence coverage of all the fields.
`scope` is the Kingfisher Summarize table to measure coverage against, e.g. `awards_summary`.
The number of rows in this table is used as the denominator when calculating the coverage.
`fields` is a list of fields to measure the coverage of, specified using JSON Pointer.
To specify fields that are children of the scope table, you can use either an absolute pointer or a relative
pointer prefixed with `:`, e.g. if `scope` is set to 'awards', then `awards/value/amount` and `:value/amount`
refer to the same field. Coverage of such fields is measured against the number of rows in the `scope` table.
To specify fields that are not children of the scope table, use an absolute path,
e.g. `tender/procurementMethod`. Coverage of such fields is measured against the number of releases/records.
For arrays, a field is counted if it appears in **any** object in the array,
e.g. if `scope` is set to `awards` and `field` is set to `:items/description`,
at least one item must have a description for the coverage to be non-zero.
To specify that a field must appear in **all** objects in the array, prepend the field with `ALL `,
e.g. if `scope` is set to `awards` and `field` is set to `ALL :items/description`,
all items must have a description for the coverage to be non-zero.
If `scope` is set to `awards`, specify fields on related contracts by prefixing the path with `:contracts/`,
e.g. to measure how many awards have a value and a related contract with a period, set `scope` to `awards`
and `fields` to `[':value', ':contracts/period']`. Similarly, if `scope` is set to `contracts`, specify fields
on related awards by prefixing the path with `:awards/`.
:param list fields: a list of fields as described above.
:param str scope: table name as described above; defaults to the parent table of the first item in the fields list.
:param bool sql: print the SQL query generated by the function.
:returns: the coverage of each field and the co-occurrence coverage of all the fields as a pandas DataFrame or an
ipython-sql :ipython-sql:`ResultSet<src/sql/run.py#L99>`, depending on whether
``%config SqlMagic.autopandas`` is ``True`` or ``False`` respectively. This is the same behaviour as
ipython-sql's ``%sql`` magic.
:rtype: pandas.DataFrame or sql.run.ResultSet
"""

def get_all_tables():
views = get_ipython_sql_resultset_from_query(
"SELECT viewname FROM pg_catalog.pg_views WHERE schemaname = ANY (CURRENT_SCHEMAS(false))"
)
tables = get_ipython_sql_resultset_from_query(
"SELECT tablename FROM pg_catalog.pg_tables WHERE schemaname = ANY (CURRENT_SCHEMAS(false))"
)
return [x[0] for x in list(views) + list(tables)]

def get_table_and_path(field, scope_table):

if field.startswith(':'):
return scope_table, field[1:]

path = field.split("/")
table_candidates = [
"_".join(path[:-i]) for i in reversed(range(1, len(path)))
]
table = "release_summary"

for num, table_canditate in enumerate(table_candidates):
if scope_table == table_canditate:
path = path[num+1:]
table = scope_table
break
return table, "/".join(path)

def get_scope_table(field):

all_tables = get_all_tables()
path = field.split("/")
table_candidates = {
"_".join(path[:-i]) for i in range(1, len(path))
}
table = "release"

for table_canditate in table_candidates:
if table_canditate + '_summary' in all_tables:
table = table_canditate
return table + '_summary'

def coverage_wrapper(condition, field):
field_name = field.replace("/", "_").replace(" ", "_").lower()
return f"ROUND(SUM(CASE WHEN {condition} THEN 1 ELSE 0 END) * 100.0 / count(*), 2) AS {field_name}_percentage"

def any_condition(field, current_scope_table):
return f"{current_scope_table}.field_list ? '{field}'"

def all_condition(field, current_scope_table):
split_field = field.split("/")
one_to_manys = [field for field in split_field[:-1] if field.endswith("s")]

if not one_to_manys:
nearest_parent_one_to_many = split_field[0]
else:
nearest_parent_one_to_many = one_to_manys[-1]

if len(one_to_manys) > 1:
print(
f"""WARNING: The results of this query might be inacurate, you will need to check that
`{', '.join(one_to_manys[:-1])}` fields are one to one with `{current_scope_table[:-8]}`
and that `{', '.join(one_to_manys[:-1])}` exists for all `{current_scope_table[:-8]}` """
)

return f"""coalesce({current_scope_table}.field_list ->> '{field}' =
{current_scope_table}.field_list ->> '{nearest_parent_one_to_many}', false)"""

def release_summary_join(scope_table, join_to_release):
if not join_to_release:
return ""
return f"""JOIN
release_summary ON release_summary.id = {scope_table}.id"""

if not scope:
field = fields[0].split()[-1]
scope = get_scope_table(field)

scope_table = scope

join_to_release = False

conditions = []

query_parts = []

for field in fields:
split_field = field.split()
field_name = split_field[-1]

table, path = get_table_and_path(field_name, scope)

if table == "release_summary" and scope_table != "release_summary":
join_to_release = True

if len(split_field) == 2 and split_field[0].lower() == "all":
condition = all_condition(path, table)
else:
condition = any_condition(path, table)

conditions.append(condition)
query_parts.append(coverage_wrapper(condition, path))

query_parts.append(
coverage_wrapper(" AND \n ".join(conditions), "total")
)

select = ",\n ".join(query_parts)
select = textwrap.dedent(f"""
SELECT
count(*) AS total_{scope},
{select}
FROM
{scope_table}
{release_summary_join(scope_table, join_to_release)}
""")

if sql:
print(select)
if sql_only:
return select
return get_ipython().run_cell_magic("sql", "", select)


class OCDSKingfisherColabError(Exception):
"""Base class for exceptions from within this package"""

Expand Down
24 changes: 21 additions & 3 deletions tests/test_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
import pytest
from IPython import get_ipython

from ocdskingfishercolab import (UnknownPackageTypeError, download_dataframe_as_csv, download_package_from_ocid,
download_package_from_query, get_ipython_sql_resultset_from_query, list_collections,
list_source_ids, save_dataframe_to_spreadsheet, set_search_path)
from ocdskingfishercolab import (UnknownPackageTypeError, calculate_coverage, download_dataframe_as_csv,
download_package_from_ocid, download_package_from_query,
get_ipython_sql_resultset_from_query, list_collections, list_source_ids,
save_dataframe_to_spreadsheet, set_search_path)


def path(filename):
Expand Down Expand Up @@ -303,3 +304,20 @@ def test_save_dataframe_to_spreadsheet(save, capsys, tmpdir):
assert capsys.readouterr().out == "Uploaded file with ID 'test'\n"

save.assert_called_once_with({'title': 'yet_another_excel_file.xlsx'}, 'flattened.xlsx')


@patch('ocdskingfishercolab._notebook_id', _notebook_id)
def test_calculate_coverage(db, tmpdir):

sql = calculate_coverage(["ocid"], scope="release_summary", sql_only=True)

# only seperated to reduce line length
case_statement = "CASE WHEN release_summary.field_list ? 'ocid' THEN 1 ELSE 0 END"

assert sql.strip() == textwrap.dedent(f'''
SELECT
count(*) AS total_release_summary,
ROUND(SUM({case_statement}) * 100.0 / count(*), 2) AS ocid_percentage,
ROUND(SUM({case_statement}) * 100.0 / count(*), 2) AS total_percentage
FROM
release_summary''').strip()

0 comments on commit 5cff5b0

Please sign in to comment.