Count the number excluded by each criterion
Add a new study def so we can count the excluded population.
This could live in one study def (to ensure both extractions run at the
same time), but then every variable would be extracted for the excluded
population too, and every measure would need `AND included` added to its
numerator and denominator (sketched below).

Add a processing script that counts the criteria of interest, then
redacts and rounds them. We could also use the measures framework, but
this is faster for now.
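
For context, a sketch of the rejected single-study-def alternative, assuming the cohortextractor `Measure` API (the measure id and variable names below are illustrative, not taken from this repo):

from cohortextractor import Measure

# Every measure would need its numerator and denominator restricted to the
# included population, via an `included`-qualified twin of each event
# variable, e.g. patients.satisfying("event_scarlet_fever AND included"):
measures = [
    Measure(
        id="scarlet_fever_rate",
        numerator="event_scarlet_fever_included",
        denominator="included",
        group_by="population",
    ),
]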
ccunningham101 committed Aug 23, 2023
1 parent a4868fc commit 0f7ae06
Showing 3 changed files with 386 additions and 0 deletions.
165 changes: 165 additions & 0 deletions analysis/count_excluded.py
@@ -0,0 +1,165 @@
import argparse
import glob
import itertools
import pathlib
import re

import pandas
from report.report_utils import round_values

FNAME_REGEX = re.compile(r"input_excluded_(?P<id>\S+)\.csv\.gz")


def _round_table(table, round_to, redact=False, redaction_threshold=5):
table = table.astype(float)

table = table.apply(
lambda x: round_values(
x, round_to, redact=redact, redaction_threshold=redaction_threshold
)
)
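    # Cells suppressed by round_values come back as NaN; label them explicitly.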
table = table.fillna("[REDACTED]")
return table


def _join_tables(tables):
return pandas.concat(tables)


def get_input_tables(input_files, exclude_files):
all_files = set(itertools.chain(*input_files))
all_exclude = set(itertools.chain(*exclude_files))
all_files = all_files - all_exclude
for input_file in all_files:
        measure_fname_match = re.match(FNAME_REGEX, input_file.name)
        if measure_fname_match is not None:
            measure_table = pandas.read_csv(input_file)
            # The file name encodes the extraction's index date; stash it in
            # the table's attrs so main() can label the counts.
            measure_table.attrs["id"] = measure_fname_match.group("id")
            yield measure_table


def compute_excluded(input_table):
    # Walk the inclusion waterfall, counting how many patients drop out at
    # each step; the steps mirror the `included` definition in
    # study_definition_excluded.py.
    d = {}
    d["total"] = len(input_table)
    registered = input_table[input_table.registered == 1]
    d["not_registered"] = len(input_table) - len(registered)
    alive = registered[registered.died == 0]
    d["died"] = len(registered) - len(alive)
    # Age is categorised into `age_band` (default "missing"), so filter on
    # that column rather than on the raw integer `age`.
    known_age = alive[alive.age_band != "missing"]
    d["unknown_age"] = len(alive) - len(known_age)
    known_sex = known_age[(known_age.sex == "M") | (known_age.sex == "F")]
    d["unknown_sex"] = len(known_age) - len(known_sex)
excluded = input_table[input_table.included == 0]
d["total_excluded"] = len(excluded)
d["clinical_any"] = (excluded.event_clinical_any == 1).sum()
d["medication_any"] = (excluded.event_medication_any == 1).sum()
counts = pandas.Series(d)
counts.name = "count"
counts.index.name = "attribute"
return counts


def write_table(measure_table, path, filename):
create_dir(path)
measure_table.to_csv(path / filename, index=False, header=True)


def create_dir(path):
pathlib.Path(path).mkdir(parents=True, exist_ok=True)


def get_path(*args):
return pathlib.Path(*args).resolve()


def match_input(input_list):
path = get_path(input_list)
if path.exists():
return path


def match_paths(pattern):
return [get_path(x) for x in glob.glob(pattern)]


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input-files",
required=True,
type=match_paths,
action="append",
help="Glob pattern(s) for matching one or more input files",
)
parser.add_argument(
"--exclude-files",
required=False,
type=match_paths,
action="append",
default=[],
help="Glob pattern(s) to exclude one or more input files",
)
parser.add_argument(
"--output-dir",
required=True,
type=pathlib.Path,
help="Path to the output directory",
)
parser.add_argument(
"--output-name",
required=True,
help="Name for joined measures file",
)
parser.add_argument(
"--round-to",
required=False,
default=10,
type=int,
help="Round to the nearest",
)
parser.add_argument(
"--redact",
action="store_true",
help="Redact values below a threshold",
)
parser.add_argument(
"--redaction-threshold",
required=False,
default=5,
type=int,
help="Redact values below or equal to this threshold",
)
return parser.parse_args()


def main():
args = parse_args()
input_files = args.input_files
exclude_files = args.exclude_files
output_dir = args.output_dir
output_name = args.output_name
round_to = args.round_to
redact = args.redact
redaction_threshold = args.redaction_threshold

tables = []
for input_table in get_input_tables(input_files, exclude_files):
table_date = input_table.attrs["id"]
excluded_counts = compute_excluded(input_table)
redacted_and_rounded = _round_table(
excluded_counts, round_to, redact, redaction_threshold
)
df = redacted_and_rounded.reset_index()
df["date"] = table_date
tables.append(df)

output = _join_tables(tables)

write_table(output, output_dir, output_name)


if __name__ == "__main__":
main()
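
As a quick check of the waterfall logic, a minimal sketch of compute_excluded (assumed in scope from above) on a four-patient dummy frame; column names follow study_definition_excluded.py and the values are illustrative only:

import pandas

dummy = pandas.DataFrame(
    {
        "registered": [1, 1, 1, 0],
        "died": [0, 0, 1, 0],
        "age_band": ["0-4", "missing", "5-9", "0-4"],
        "sex": ["F", "M", "F", "U"],
        "included": [1, 0, 0, 0],
        "event_clinical_any": [0, 1, 0, 1],
        "event_medication_any": [0, 0, 1, 0],
    }
)
counts = compute_excluded(dummy)
# attribute        count
# total                4
# not_registered       1   (row 3 is unregistered)
# died                 1   (row 2 died)
# unknown_age          1   (row 1 has age_band "missing")
# unknown_sex          0
# total_excluded       3   (rows 1-3 have included == 0)
# clinical_any         2
# medication_any       1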
198 changes: 198 additions & 0 deletions analysis/study_definition_excluded.py
@@ -0,0 +1,198 @@
from cohortextractor import (
StudyDefinition,
patients,
params,
combine_codelists,
)

from codelists import (
amoxicillin_codes,
azithromycin_codes,
clarithromycin_codes,
erythromycin_codes,
phenoxymethylpenicillin_codes,
cefalexin_codes,
co_amoxiclav_codes,
flucloxacillin_codes,
scarlet_fever_codes,
invasive_strep_a_codes,
sore_throat_tonsillitis_codes,
)

# Imported so we can log the generated variables and check they expanded
# correctly.
import logging
import json


medication_codelists = {
"amoxicillin": amoxicillin_codes,
"azithromycin": azithromycin_codes,
"clarithromycin": clarithromycin_codes,
"erythromycin": erythromycin_codes,
"phenoxymethylpenicillin": phenoxymethylpenicillin_codes,
"cefalexin": cefalexin_codes,
"co_amoxiclav": co_amoxiclav_codes,
"flucloxacillin": flucloxacillin_codes,
}


clinical_event_codelists = {
"scarlet_fever": scarlet_fever_codes,
"invasive_strep_a": invasive_strep_a_codes,
"sore_throat_tonsillitis": sore_throat_tonsillitis_codes,
}

all_medication_codes = combine_codelists(*list(medication_codelists.values()))
all_clinical_codes = combine_codelists(
*list(clinical_event_codelists.values())
)


frequency = params.get("frequency", None)
if frequency == "weekly":
ENDDATE = "index_date + 6 days"
else:
ENDDATE = "last_day_of_month(index_date)"


def generate_all_medications():
    var = {
        "event_medication_any": patients.satisfying(
            " OR ".join(f"event_{key}" for key in medication_codelists)
        ),
    }
    logging.info(json.dumps(var, indent=4))
    return var
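# The join above expands to:
#   "event_amoxicillin OR event_azithromycin OR event_clarithromycin OR
#    event_erythromycin OR event_phenoxymethylpenicillin OR event_cefalexin OR
#    event_co_amoxiclav OR event_flucloxacillin"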


def generate_all_clinical():
    var = {
        "event_clinical_any": patients.satisfying(
            " OR ".join(f"event_{key}" for key in clinical_event_codelists)
        ),
    }
    logging.info(json.dumps(var, indent=4))
    return var


if frequency == "weekly":
start_date = "2022-09-01"
end_date = "2023-02-15"
else:
start_date = "2018-01-01"
end_date = "2022-01-01"

demographics = {
"sex": patients.sex(
return_expectations={
"rate": "universal",
"category": {"ratios": {"M": 0.49, "F": 0.50, "U": 0.01}},
}
),
"age_band": (
patients.categorised_as(
{
"missing": "DEFAULT",
"0-4": """ age >= 0 AND age < 5""",
"5-9": """ age >= 5 AND age < 10""",
"10-14": """ age >= 10 AND age < 15""",
"15-44": """ age >= 15 AND age < 45""",
"45-64": """ age >= 45 AND age < 65""",
"65-74": """ age >= 65 AND age < 75""",
"75+": """ age >= 75 AND age < 120""",
},
return_expectations={
"rate": "universal",
"category": {
"ratios": {
"missing": 0.05,
"0-4": 0.25,
"5-9": 0.3,
"10-14": 0.1,
"15-44": 0.1,
"45-64": 0.1,
"75+": 0.1,
}
},
},
)
),
}


clinical_events = [
{
f"event_{clinical_key}": patients.with_these_clinical_events(
codelist=clinical_codelist,
between=["index_date", ENDDATE],
returning="binary_flag",
return_expectations={"incidence": 0.1},
),
}
for clinical_key, clinical_codelist in clinical_event_codelists.items()
]


medication_events = [
{
f"event_{medication_key}": patients.with_these_medications(
codelist=medication_codelist,
between=["index_date", ENDDATE],
returning="binary_flag",
return_expectations={"incidence": 0.1},
),
}
for medication_key, medication_codelist in medication_codelists.items()
]
# convert list of dicts into a single dict
medication_variables = {k: v for d in medication_events for k, v in d.items()}
clinical_event_variables = {
k: v for d in clinical_events for k, v in d.items()
}

study = StudyDefinition(
index_date="2019-01-01",
default_expectations={
"date": {"earliest": start_date, "latest": end_date},
"rate": "exponential_increase",
"incidence": 0.1,
},
population=patients.all(),
registered=patients.registered_as_of(
"index_date",
return_expectations={"incidence": 0.9},
),
died=patients.died_from_any_cause(
on_or_before="index_date",
returning="binary_flag",
return_expectations={"incidence": 0.1},
),
age=patients.age_as_of(
"index_date",
return_expectations={
"rate": "universal",
"int": {"distribution": "population_ages"},
},
),
**demographics,
**clinical_event_variables,
**medication_variables,
**generate_all_medications(),
**generate_all_clinical(),
included=patients.satisfying(
"""
registered AND
NOT died AND
age_band != "missing" AND
(sex = "M" OR sex = "F")
"""
),
)
23 changes: 23 additions & 0 deletions project.yaml
@@ -112,6 +112,29 @@ actions:
### End curation check ###

### MONTHLY ###
generate_study_population_report_excluded:
run: cohortextractor:latest generate_cohort
--study-definition study_definition_excluded
--index-date-range "2023-03-01 to 2023-03-01 by month"
--param frequency=monthly
--output-dir=output/report
--output-format=csv.gz
outputs:
highly_sensitive:
cohort: output/report/input_excluded_2023-03-01.csv.gz

count_excluded:
run: python:latest python analysis/count_excluded.py
--input-files output/report/input_excluded_2023-03-01.csv.gz
--output-dir output/report/results/paper
--output-name "excluded.csv"
--redact
needs: [generate_study_population_report_excluded]
outputs:
moderately_sensitive:
# Only output the single summary file
measure_csv: output/report/results/paper/excluded.csv
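      # The joined summary has one row per criterion per extraction date;
      # an illustrative sketch of its shape (counts rounded to the nearest
      # 10, with small cells shown as [REDACTED]):
      #
      #   attribute,count,date
      #   total,1000,2023-03-01
      #   not_registered,50,2023-03-01
      #   unknown_age,[REDACTED],2023-03-01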

generate_study_population_report_monthly_0:
run: cohortextractor:latest generate_cohort
--study-definition study_definition_report
