Merge pull request #55 from open-contracting/coverage-queries

Coverage: Add `calculate_coverage` function.
open-contracting · Feb 11, 2022 · 5cff5b0 · 5cff5b0
2 parents 623bea7 + 52b5a11
commit 5cff5b0
Show file tree

Hide file tree

Showing 2 changed files with 187 additions and 3 deletions.
diff --git a/ocdskingfishercolab/__init__.py b/ocdskingfishercolab/__init__.py
@@ -1,5 +1,6 @@
 import json
 import os
+import textwrap
 import warnings
 from urllib.parse import urljoin
 
@@ -365,6 +366,171 @@ def render_json(json_string):
         """)
 
 
+def calculate_coverage(fields, scope=None, sql=True, sql_only=False):
+    """
+    Calculates the coverage of one or more fields using the summary tables produced by Kingfisher Summarize's
+    `--field-lists` option. Returns the coverage of each field and the co-occurrence coverage of all the fields.
+
+    `scope` is the Kingfisher Summarize table to measure coverage against, e.g. `awards_summary`.
+    The number of rows in this table is used as the denominator when calculating the coverage.
+
+    `fields` is a list of fields to measure the coverage of, specified using JSON Pointer.
+
+    To specify fields that are children of the scope table, you can use either an absolute pointer or a relative
+    pointer prefixed with `:`, e.g. if `scope` is set to 'awards', then `awards/value/amount` and `:value/amount`
+    refer to the same field. Coverage of such fields is measured against the number of rows in the `scope` table.
+
+    To specify fields that are not children of the scope table, use an absolute path,
+    e.g. `tender/procurementMethod`. Coverage of such fields is measured against the number of releases/records.
+
+    For arrays, a field is counted if it appears in **any** object in the array,
+    e.g. if `scope` is set to `awards` and `field` is set to `:items/description`,
+    at least one item must have a description for the coverage to be non-zero.
+
+    To specify that a field must appear in **all** objects in the array, prepend the field with `ALL `,
+    e.g. if `scope` is set to `awards` and `field` is set to `ALL :items/description`,
+    all items must have a description for the coverage to be non-zero.
+
+    If `scope` is set to `awards`, specify fields on related contracts by prefixing the path with `:contracts/`,
+    e.g. to measure how many awards have a value and a related contract with a period, set `scope` to `awards`
+    and `fields` to `[':value', ':contracts/period']`. Similarly, if `scope` is set to `contracts`, specify fields
+    on related awards by prefixing the path with `:awards/`.
+
+
+    :param list fields: a list of fields as described above.
+    :param str scope: table name as described above; defaults to the parent table of the first item in the fields list.
+    :param bool sql: print the SQL query generated by the function.
+
+    :returns: the coverage of each field and the co-occurrence coverage of all the fields as a pandas DataFrame or an
+                  ipython-sql :ipython-sql:`ResultSet<src/sql/run.py#L99>`, depending on whether
+                  ``%config SqlMagic.autopandas`` is ``True`` or ``False`` respectively. This is the same behaviour as
+                  ipython-sql's ``%sql`` magic.
+    :rtype: pandas.DataFrame or sql.run.ResultSet
+    """
+
+    def get_all_tables():
+        views = get_ipython_sql_resultset_from_query(
+            "SELECT viewname FROM pg_catalog.pg_views WHERE schemaname = ANY (CURRENT_SCHEMAS(false))"
+        )
+        tables = get_ipython_sql_resultset_from_query(
+            "SELECT tablename FROM pg_catalog.pg_tables WHERE schemaname = ANY (CURRENT_SCHEMAS(false))"
+        )
+        return [x[0] for x in list(views) + list(tables)]
+
+    def get_table_and_path(field, scope_table):
+
+        if field.startswith(':'):
+            return scope_table, field[1:]
+
+        path = field.split("/")
+        table_candidates = [
+            "_".join(path[:-i]) for i in reversed(range(1, len(path)))
+        ]
+        table = "release_summary"
+
+        for num, table_canditate in enumerate(table_candidates):
+            if scope_table == table_canditate:
+                path = path[num+1:]
+                table = scope_table
+                break
+        return table, "/".join(path)
+
+    def get_scope_table(field):
+
+        all_tables = get_all_tables()
+        path = field.split("/")
+        table_candidates = {
+            "_".join(path[:-i]) for i in range(1, len(path))
+        }
+        table = "release"
+
+        for table_canditate in table_candidates:
+            if table_canditate + '_summary' in all_tables:
+                table = table_canditate
+        return table + '_summary'
+
+    def coverage_wrapper(condition, field):
+        field_name = field.replace("/", "_").replace(" ", "_").lower()
+        return f"ROUND(SUM(CASE WHEN {condition} THEN 1 ELSE 0 END) * 100.0 / count(*), 2) AS {field_name}_percentage"
+
+    def any_condition(field, current_scope_table):
+        return f"{current_scope_table}.field_list ? '{field}'"
+
+    def all_condition(field, current_scope_table):
+        split_field = field.split("/")
+        one_to_manys = [field for field in split_field[:-1] if field.endswith("s")]
+
+        if not one_to_manys:
+            nearest_parent_one_to_many = split_field[0]
+        else:
+            nearest_parent_one_to_many = one_to_manys[-1]
+
+        if len(one_to_manys) > 1:
+            print(
+                f"""WARNING: The results of this query might be inacurate, you will need to check that
+                 `{', '.join(one_to_manys[:-1])}` fields are one to one with `{current_scope_table[:-8]}`
+                 and that `{', '.join(one_to_manys[:-1])}` exists for all `{current_scope_table[:-8]}` """
+            )
+
+        return f"""coalesce({current_scope_table}.field_list ->> '{field}' =
+                  {current_scope_table}.field_list ->> '{nearest_parent_one_to_many}', false)"""
+
+    def release_summary_join(scope_table, join_to_release):
+        if not join_to_release:
+            return ""
+        return f"""JOIN
+        release_summary ON release_summary.id = {scope_table}.id"""
+
+    if not scope:
+        field = fields[0].split()[-1]
+        scope = get_scope_table(field)
+
+    scope_table = scope
+
+    join_to_release = False
+
+    conditions = []
+
+    query_parts = []
+
+    for field in fields:
+        split_field = field.split()
+        field_name = split_field[-1]
+
+        table, path = get_table_and_path(field_name, scope)
+
+        if table == "release_summary" and scope_table != "release_summary":
+            join_to_release = True
+
+        if len(split_field) == 2 and split_field[0].lower() == "all":
+            condition = all_condition(path, table)
+        else:
+            condition = any_condition(path, table)
+
+        conditions.append(condition)
+        query_parts.append(coverage_wrapper(condition, path))
+
+    query_parts.append(
+        coverage_wrapper(" AND \n              ".join(conditions), "total")
+    )
+
+    select = ",\n            ".join(query_parts)
+    select = textwrap.dedent(f"""
+        SELECT
+            count(*) AS total_{scope},
+            {select}
+        FROM
+        {scope_table}
+        {release_summary_join(scope_table, join_to_release)}
+    """)
+
+    if sql:
+        print(select)
+    if sql_only:
+        return select
+    return get_ipython().run_cell_magic("sql", "", select)
+
+
 class OCDSKingfisherColabError(Exception):
     """Base class for exceptions from within this package"""
 

diff --git a/tests/test_module.py b/tests/test_module.py
@@ -13,9 +13,10 @@
 import pytest
 from IPython import get_ipython
 
-from ocdskingfishercolab import (UnknownPackageTypeError, download_dataframe_as_csv, download_package_from_ocid,
-                                 download_package_from_query, get_ipython_sql_resultset_from_query, list_collections,
-                                 list_source_ids, save_dataframe_to_spreadsheet, set_search_path)
+from ocdskingfishercolab import (UnknownPackageTypeError, calculate_coverage, download_dataframe_as_csv,
+                                 download_package_from_ocid, download_package_from_query,
+                                 get_ipython_sql_resultset_from_query, list_collections, list_source_ids,
+                                 save_dataframe_to_spreadsheet, set_search_path)
 
 
 def path(filename):
@@ -303,3 +304,20 @@ def test_save_dataframe_to_spreadsheet(save, capsys, tmpdir):
         assert capsys.readouterr().out == "Uploaded file with ID 'test'\n"
 
         save.assert_called_once_with({'title': 'yet_another_excel_file.xlsx'}, 'flattened.xlsx')
+
+
+@patch('ocdskingfishercolab._notebook_id', _notebook_id)
+def test_calculate_coverage(db, tmpdir):
+
+    sql = calculate_coverage(["ocid"], scope="release_summary", sql_only=True)
+
+    # only seperated to reduce line length
+    case_statement = "CASE WHEN release_summary.field_list ? 'ocid' THEN 1 ELSE 0 END"
+
+    assert sql.strip() == textwrap.dedent(f'''
+    SELECT
+        count(*) AS total_release_summary,
+        ROUND(SUM({case_statement}) * 100.0 / count(*), 2) AS ocid_percentage,
+        ROUND(SUM({case_statement}) * 100.0 / count(*), 2) AS total_percentage
+    FROM
+    release_summary''').strip()