test: Refactor calculate_coverage and fix non-lowercase pointers

open-contracting · May 6, 2022 · 9e6cdb7 · 9e6cdb7
1 parent 847ad8b
commit 9e6cdb7
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 111 deletions.
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -9,6 +9,11 @@ Changed
 
 -  :func:`~ocdskingfishercolab.save_dataframe_to_sheet` and :func:`save_dataframe_to_spreadsheet` do nothing if the data frame is empty.
 
+Fixed
+~~~~~
+
+-  :func:`~ocdskingfishercolab.calculate_coverage` uses the ``relatedprocesses_summary`` table for fields starting with ``relatedProcesses/``, where appropriate.
+
 0.3.8 (2022-04-27)
 ------------------
 

diff --git a/ocdskingfishercolab/__init__.py b/ocdskingfishercolab/__init__.py
@@ -375,156 +375,180 @@ def render_json(json_string):
 def calculate_coverage(fields, scope=None, sql=True, sql_only=False):
     """
     Calculates the coverage of one or more fields using the summary tables produced by Kingfisher Summarize's
-    `--field-lists` option. Returns the coverage of each field and the co-occurrence coverage of all the fields.
+    ``--field-lists`` option. Returns the coverage of each field and the co-occurrence coverage of all fields.
 
-    `scope` is the Kingfisher Summarize table to measure coverage against, e.g. `awards_summary`.
-    The number of rows in this table is used as the denominator when calculating the coverage.
+    ``scope`` is the Kingfisher Summarize table to measure coverage against, e.g. ``"awards_summary"``.
+    Coverage is calculated using the number of rows in this table as the denominator.
 
-    `fields` is a list of fields to measure the coverage of, specified using JSON Pointer.
+    If ``scope`` is not set, it defaults to the parent table of the first field.
 
-    To specify fields that are children of the scope table, you can use either an absolute pointer or a relative
-    pointer prefixed with `:`, e.g. if `scope` is set to 'awards_summary', then `awards/value/amount` and
-    `:value/amount` refer to the same field. Coverage of such fields is measured against the number of rows
-    in the `scope` table.
+    ``fields`` is a list of fields to measure the coverage of, specified using JSON Pointer.
 
-    To specify fields that are not children of the scope table, use an absolute path,
-    e.g. `tender/procurementMethod`. Coverage of such fields is measured against the number of releases/records.
+    If a field isn't a child of the ``scope`` table, use an absolute pointer:
 
-    For arrays, a field is counted if it appears in **any** object in the array,
-    e.g. if `scope` is set to `awards_summary` and `field` is set to `:items/description`,
-    at least one item must have a description for the coverage to be non-zero.
+    .. code-block:: python
 
-    To specify that a field must appear in **all** objects in the array, prepend the field with `ALL `,
-    e.g. if `scope` is set to `awards_summary` and `field` is set to `ALL :items/description`,
-    all items must have a description for the coverage to be non-zero.
+       calculate_coverage(["tender/procurementMethod"], "awards_summary")
 
-    If `scope` is set to `awards_summary`, specify fields on related contracts by prefixing the path with
-    `:contracts/`, e.g. to measure how many awards have a value and a related contract with a period, set `scope`
-    to `awards` and `fields` to `[':value', ':contracts/period']`. Similarly, if `scope` is set to
-    `contracts_summary`, specify fields on related awards by prefixing the path with `:awards/`.
+    If a field is a child of the ``scope`` table, use either an absolute pointer:
 
+    .. code-block:: python
 
-    :param list fields: a list of fields as described above.
-    :param str scope: table name as described above; defaults to the parent table of the first item in the fields list.
-    :param bool sql: print the SQL query generated by the function.
+       calculate_coverage(["awards/value/amount"], "awards_summary")
 
-    :returns: the coverage of each field and the co-occurrence coverage of all the fields as a pandas DataFrame or an
-                  ipython-sql :ipython-sql:`ResultSet<src/sql/run.py#L99>`, depending on whether
-                  ``%config SqlMagic.autopandas`` is ``True`` or ``False`` respectively. This is the same behaviour as
-                  ipython-sql's ``%sql`` magic.
-    :rtype: pandas.DataFrame or sql.run.ResultSet
-    """
+    Or a relative pointer (prepend with ``":"``):
 
-    def get_table_and_path(field, scope_table):
+    .. code-block:: python
 
-        if field.startswith(':'):
-            return scope_table, field[1:]
+       calculate_coverage([":value/amount"], "awards_summary")
 
-        path = field.split("/")
-        table_candidates = [
-            "_".join(path[:-i]) for i in reversed(range(1, len(path)))
-        ]
-        table = "release_summary"
+    If a field is within an array, it counts if it appears in **any** object in the array.
 
-        for num, table_candidate in enumerate(table_candidates):
-            if scope_table[:-8] == table_candidate:  # remove "_summary" from `scope_table`
-                path = path[num+1:]
-                table = scope_table
-                break
-        return table, "/".join(path)
+    .. code-block:: python
 
-    def get_scope_table(field):
+       calculate_coverage([":items/description"], "awards_summary")
 
-        all_tables = _all_tables()
-        path = field.split("/")
-        table_candidates = {
-            "_".join(path[:-i]) for i in range(1, len(path))
-        }
-        table = "release"
-
-        for table_candidate in table_candidates:
-            if f"{table_candidate}_summary" in all_tables:
-                table = table_candidate
-        return f"{table}_summary"
-
-    def coverage_wrapper(condition, field):
-        field_name = field.replace("/", "_").replace(" ", "_").lower()
-        return f"ROUND(SUM(CASE WHEN {condition} THEN 1 ELSE 0 END) * 100.0 / count(*), 2) AS {field_name}_percentage"
-
-    def any_condition(field, current_scope_table):
-        return f"{current_scope_table}.field_list ? '{field}'"
-
-    def all_condition(field, current_scope_table):
-        split_field = field.split("/")
-        one_to_manys = [field for field in split_field[:-1] if field.endswith("s")]
-
-        if not one_to_manys:
-            nearest_parent_one_to_many = split_field[0]
-        else:
-            nearest_parent_one_to_many = one_to_manys[-1]
+    To require a field to appear in **all** objects in the array, prepend with ``"ALL "``:
 
-        if len(one_to_manys) > 1:
-            print(
-                'WARNING: Results might be inaccurate due to nested arrays. Check that there is exactly one '
-                f"`{'/'.join(one_to_manys[:-1])}` entry per `{current_scope_table[:-8]}`."
-            )
+    .. code-block:: python
 
-        return f"""coalesce({current_scope_table}.field_list->>'{field}' =
-                  {current_scope_table}.field_list->>'{nearest_parent_one_to_many}', false)"""
+       calculate_coverage(["ALL :items/description"], "awards_summary")
 
-    def release_summary_join(scope_table, join_to_release):
-        if not join_to_release:
-            return ""
-        return f"""JOIN
-            release_summary ON release_summary.id = {scope_table}.id"""
+    .. note::
 
-    if not scope:
-        field = fields[0].split()[-1]
-        scope = get_scope_table(field)
+       Nested arrays, like the ``"awards/items/description"`` field with a ``"release_summary"`` scope, will yield
+       inaccurate results, unless the initial arrays are present and one-to-one with the scope table (i.e. there is
+       always exactly one award for each release).
 
-    scope_table = scope
+    If ``scope`` is ``"awards_summary"``, you can specify fields on related contracts by prepending ``":contracts/"``:
 
-    join_to_release = False
+    .. code-block:: python
 
-    conditions = []
+       calculate_coverage([":value/amount", ":contracts/period"], "awards_summary")
 
-    query_parts = []
+    If ``scope`` is ``"contracts_summary"``, you can specify fields on related awards by prepending ``":awards/"``:
 
-    for field in fields:
-        split_field = field.split()
-        field_name = split_field[-1]
+    .. code-block:: python
+
+       calculate_coverage([":value/amount", ":awards/date"], "contracts_summary")
+
+    :param list fields: the fields to measure coverage of
+    :param str scope: the table to measure coverage against
+    :param bool sql: print the SQL query
+    :param bool sql_only: return the SQL query instead of the results
+
+    :returns: the results as a pandas DataFrame or an ipython-sql :ipython-sql:`ResultSet<src/sql/run.py#L99>`,
+              depending on whether ``%config SqlMagic.autopandas`` is ``True`` or ``False`` respectively. This is the
+              same behaviour as ipython-sql's ``%sql`` magic.
+    :rtype: pandas.DataFrame or sql.run.ResultSet
+    """
+
+    def get_table_and_pointer(scope, pointer):
+        # Handle relative pointers.
+        if pointer.startswith(":"):
+            return scope, pointer[1:]
 
-        table, path = get_table_and_path(field_name, scope)
+        # Handle absolute pointers.
+        parts = pointer.split("/")
+        table = "release_summary"
+
+        # Abbreviate absolute pointers to relative pointers if the pointer is on the scope table.
+        # For example: "awards/date" to "date" if the scope is "awards_summary".
+        for i in range(len(parts), 0, -1):
+            # Kingfisher Summarize tables are lowercase.
+            candidate = f"{'_'.join(parts[:i])}_summary".lower()
+            if scope == candidate:
+                parts = parts[i:]
+                table = scope
+                break
 
-        if table == "release_summary" and scope_table != "release_summary":
-            join_to_release = True
+        return table, "/".join(parts)
 
-        if len(split_field) == 2 and split_field[0].lower() == "all":
-            condition = all_condition(path, table)
+    def wrap(condition, pointer):
+        alias = pointer.replace("/", "_").lower()
+        return f"ROUND(SUM(CASE WHEN {condition} THEN 1 ELSE 0 END) * 100.0 / count(*), 2) AS {alias}_percentage"
+
+    # Default to the parent table of the first field.
+    if not scope:
+        all_tables = _all_tables()
+        parts = fields[0].split()[-1].split("/")
+        scope = "release_summary"
+
+        for i in range(len(parts), 0, -1):
+            # Kingfisher Summarize tables are lowercase.
+            candidate = f"{'_'.join(parts[:i])}_summary".lower()
+            if candidate in all_tables:
+                scope = candidate
+                break
+
+    join_release_summary = False
+    columns = []
+    conditions = []
+    for field in fields:
+        split = field.split()
+
+        table, pointer = get_table_and_pointer(scope, split[-1])
+
+        if table == "release_summary" and scope != "release_summary":
+            join_release_summary = True
+
+        # If the first token isn't "ALL", or if there are more than 2 tokens, behave as if only the last was provided.
+        if len(split) == 2 and split[0].lower() == "all":
+            parts = pointer.split("/")
+            # https://github.com/open-contracting/kingfisher-colab/issues/62
+            one_to_manys = [part for part in parts[:-1] if part.endswith("s")]
+
+            if not one_to_manys:
+                nearest_one_to_many_parent = parts[0]
+            else:
+                nearest_one_to_many_parent = one_to_manys[-1]
+
+            if len(one_to_manys) > 1:
+                print(
+                    'WARNING: Results might be inaccurate due to nested arrays. Check that there is exactly one '
+                    f"`{'/'.join(one_to_manys[:-1])}` entry per `{table[:-8]}`."
+                )
+
+            # https://github.com/open-contracting/kingfisher-colab/issues/63#issuecomment-1120005015
+            condition = (
+                f"coalesce({table}.field_list->>'{pointer}' =\n"
+                f"                  {table}.field_list->>'{nearest_one_to_many_parent}', false)"
+            )
         else:
-            condition = any_condition(path, table)
+            # Test for the presence of the field.
+            # https://www.postgresql.org/docs/11/functions-json.html
+            condition = f"{table}.field_list ? '{pointer}'"
 
+        # Add the field coverage.
+        columns.append(wrap(condition, pointer))
+
+        # Collect the conditions for co-occurrence coverage.
         conditions.append(condition)
-        query_parts.append(coverage_wrapper(condition, path))
 
-    query_parts.append(
-        coverage_wrapper(" AND\n                ".join(conditions), "total")
-    )
+    # Add the co-occurrence coverage.
+    columns.append(wrap(" AND\n                ".join(conditions), "total"))
+
+    select = ",\n            ".join(columns)
+    if join_release_summary:
+        join = f"JOIN\n            release_summary ON release_summary.id = {scope}.id"
+    else:
+        join = ""
 
-    select = ",\n            ".join(query_parts)
-    select = textwrap.dedent(f"""\
+    query = textwrap.dedent(f"""\
         SELECT
             count(*) AS total_{scope},
             {select}
-        FROM {scope_table}
-        {release_summary_join(scope_table, join_to_release)}
+        FROM {scope}
+        {join}
     """)
 
     if sql:
-        print(select)
+        print(query)
+
     if sql_only:
-        return select
-    return get_ipython().run_cell_magic("sql", "", select)
+        return query
+
+    return get_ipython().run_cell_magic("sql", "", query)
 
 
 class OCDSKingfisherColabError(Exception):

diff --git a/tests/test_module.py b/tests/test_module.py
@@ -497,7 +497,6 @@ def test_calculate_coverage_default_scope_tender_documents(db, tmpdir):
     """)  # noqa: E501
 
 
-@pytest.mark.xfail()
 @patch('ocdskingfishercolab._all_tables', _all_tables)
 def test_calculate_coverage_default_scope_related_processes(db, tmpdir):
     sql = calculate_coverage(["relatedProcesses/relationship"], sql_only=True)