Merge pull request #81 from opensafely/count_excluded
Count the number excluded by each criterion
ccunningham101 committed Aug 23, 2023
2 parents a4868fc + 0f7ae06 commit c184925
Showing 3 changed files with 386 additions and 0 deletions.
165 changes: 165 additions & 0 deletions analysis/count_excluded.py
@@ -0,0 +1,165 @@
import argparse
import pathlib
import re
import glob
import pandas
import itertools
from report.report_utils import round_values

FNAME_REGEX = re.compile(r"input_excluded_(?P<id>\S+)\.csv.gz")


def _round_table(table, round_to, redact=False, redaction_threshold=5):
    table = table.astype(float)

    table = table.apply(
        lambda x: round_values(
            x, round_to, redact=redact, redaction_threshold=redaction_threshold
        )
    )
    table = table.fillna("[REDACTED]")
    return table


def _join_tables(tables):
    return pandas.concat(tables)


def get_input_tables(input_files, exclude_files):
    all_files = set(itertools.chain(*input_files))
    all_exclude = set(itertools.chain(*exclude_files))
    all_files = all_files - all_exclude
    for input_file in all_files:
        measure_fname_match = re.match(FNAME_REGEX, input_file.name)
        if measure_fname_match is not None:
            measure_table = pandas.read_csv(input_file)

            # The index date is encoded in the filename rather than in a
            # column; keep it on the table so main() can add a `date` column.
            measure_table.attrs["id"] = measure_fname_match.group("id")
            yield measure_table


def compute_excluded(input_table):
    d = {}
    d["total"] = len(input_table)
    registered = input_table[input_table.registered == 1]
    d["not_registered"] = len(input_table) - len(registered)
    alive = registered[registered.died == 0]
    d["died"] = len(registered) - len(alive)
    age = alive[alive.age_band != "missing"]
    d["unknown_age"] = len(alive) - len(age)
    sex = age[(age.sex == "M") | (age.sex == "F")]
    d["unknown_sex"] = len(age) - len(sex)
    excluded = input_table[input_table.included == 0]
    d["total_excluded"] = len(excluded)
    d["clinical_any"] = (excluded.event_clinical_any == 1).sum()
    d["medication_any"] = (excluded.event_medication_any == 1).sum()
    counts = pandas.Series(d)
    counts.name = "count"
    counts.index.name = "attribute"
    return counts


def write_table(measure_table, path, filename):
    create_dir(path)
    measure_table.to_csv(path / filename, index=False, header=True)


def create_dir(path):
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)


def get_path(*args):
    return pathlib.Path(*args).resolve()


def match_input(input_list):
    path = get_path(input_list)
    if path.exists():
        return path


def match_paths(pattern):
    return [get_path(x) for x in glob.glob(pattern)]


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-files",
        required=True,
        type=match_paths,
        action="append",
        help="Glob pattern(s) for matching one or more input files",
    )
    parser.add_argument(
        "--exclude-files",
        required=False,
        type=match_paths,
        action="append",
        default=[],
        help="Glob pattern(s) to exclude one or more input files",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        type=pathlib.Path,
        help="Path to the output directory",
    )
    parser.add_argument(
        "--output-name",
        required=True,
        help="Name for joined measures file",
    )
    parser.add_argument(
        "--round-to",
        required=False,
        default=10,
        type=int,
        help="Round to the nearest",
    )
    parser.add_argument(
        "--redact",
        action="store_true",
        help="Redact values below a threshold",
    )
    parser.add_argument(
        "--redaction-threshold",
        required=False,
        default=5,
        type=int,
        help="Redact values below or equal to this threshold",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    input_files = args.input_files
    exclude_files = args.exclude_files
    output_dir = args.output_dir
    output_name = args.output_name
    round_to = args.round_to
    redact = args.redact
    redaction_threshold = args.redaction_threshold

    tables = []
    for input_table in get_input_tables(input_files, exclude_files):
        table_date = input_table.attrs["id"]
        excluded_counts = compute_excluded(input_table)
        redacted_and_rounded = _round_table(
            excluded_counts, round_to, redact, redaction_threshold
        )
        df = redacted_and_rounded.reset_index()
        df["date"] = table_date
        tables.append(df)

    output = _join_tables(tables)

    write_table(output, output_dir, output_name)


if __name__ == "__main__":
    main()
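
Aside: `round_values` is imported from `report/report_utils.py`, which is not touched by this diff. A minimal sketch of the behaviour `_round_table` assumes of it — round each count to the nearest multiple of `round_to`, and blank counts at or below the threshold when `--redact` is set so that `fillna` can mark them "[REDACTED]" — could look like the following (hypothetical, for illustration only; the real helper lives in report_utils):

import numpy


def round_values(value, base=10, redact=False, redaction_threshold=5):
    # Hypothetical stand-in for report.report_utils.round_values.
    if numpy.isnan(value):
        return value
    if redact and 0 < value <= redaction_threshold:
        # NaN lets _round_table() replace the count with "[REDACTED]".
        return numpy.nan
    # Round to the nearest multiple of `base`, e.g. 12 -> 10 with base=10.
    return int(base * round(value / base))
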
198 changes: 198 additions & 0 deletions analysis/study_definition_excluded.py
@@ -0,0 +1,198 @@
from cohortextractor import (
    StudyDefinition,
    patients,
    params,
    combine_codelists,
)

from codelists import (
    amoxicillin_codes,
    azithromycin_codes,
    clarithromycin_codes,
    erythromycin_codes,
    phenoxymethylpenicillin_codes,
    cefalexin_codes,
    co_amoxiclav_codes,
    flucloxacillin_codes,
    scarlet_fever_codes,
    invasive_strep_a_codes,
    sore_throat_tonsillitis_codes,
)

# Import so we can inspect metadata logs for correct variable expansion
import logging
import json


medication_codelists = {
    "amoxicillin": amoxicillin_codes,
    "azithromycin": azithromycin_codes,
    "clarithromycin": clarithromycin_codes,
    "erythromycin": erythromycin_codes,
    "phenoxymethylpenicillin": phenoxymethylpenicillin_codes,
    "cefalexin": cefalexin_codes,
    "co_amoxiclav": co_amoxiclav_codes,
    "flucloxacillin": flucloxacillin_codes,
}


clinical_event_codelists = {
    "scarlet_fever": scarlet_fever_codes,
    "invasive_strep_a": invasive_strep_a_codes,
    "sore_throat_tonsillitis": sore_throat_tonsillitis_codes,
}

all_medication_codes = combine_codelists(*list(medication_codelists.values()))
all_clinical_codes = combine_codelists(
    *list(clinical_event_codelists.values())
)


frequency = params.get("frequency", None)
if frequency == "weekly":
    ENDDATE = "index_date + 6 days"
else:
    ENDDATE = "last_day_of_month(index_date)"


def generate_all_medications():
    var = {
        "event_medication_any": patients.satisfying(
            " OR ".join(
                list(map(lambda x: f"event_{x}", medication_codelists.keys()))
            )
        ),
    }
    logging.info(json.dumps(var, indent=4))
    return var


def generate_all_clinical():
    var = {
        "event_clinical_any": patients.satisfying(
            " OR ".join(
                list(
                    map(
                        lambda x: f"event_{x}",
                        clinical_event_codelists.keys(),
                    )
                )
            )
        ),
    }
    logging.info(json.dumps(var, indent=4))
    return var

if frequency == "weekly":
    start_date = "2022-09-01"
    end_date = "2023-02-15"
else:
    start_date = "2018-01-01"
    end_date = "2022-01-01"

demographics = {
    "sex": patients.sex(
        return_expectations={
            "rate": "universal",
            "category": {"ratios": {"M": 0.49, "F": 0.50, "U": 0.01}},
        }
    ),
    "age_band": (
        patients.categorised_as(
            {
                "missing": "DEFAULT",
                "0-4": """ age >= 0 AND age < 5""",
                "5-9": """ age >= 5 AND age < 10""",
                "10-14": """ age >= 10 AND age < 15""",
                "15-44": """ age >= 15 AND age < 45""",
                "45-64": """ age >= 45 AND age < 65""",
                "65-74": """ age >= 65 AND age < 75""",
                "75+": """ age >= 75 AND age < 120""",
            },
            return_expectations={
                "rate": "universal",
                "category": {
                    "ratios": {
                        "missing": 0.05,
                        "0-4": 0.25,
                        "5-9": 0.3,
                        "10-14": 0.1,
                        "15-44": 0.1,
                        "45-64": 0.1,
                        "75+": 0.1,
                    }
                },
            },
        )
    ),
}


clinical_events = [
    {
        f"event_{clinical_key}": patients.with_these_clinical_events(
            codelist=clinical_codelist,
            between=["index_date", ENDDATE],
            returning="binary_flag",
            return_expectations={"incidence": 0.1},
        ),
    }
    for clinical_key, clinical_codelist in clinical_event_codelists.items()
]


medication_events = [
    {
        f"event_{medication_key}": patients.with_these_medications(
            codelist=medication_codelist,
            between=["index_date", ENDDATE],
            returning="binary_flag",
            return_expectations={"incidence": 0.1},
        ),
    }
    for medication_key, medication_codelist in medication_codelists.items()
]
# convert list of dicts into a single dict
medication_variables = {k: v for d in medication_events for k, v in d.items()}
clinical_event_variables = {
    k: v for d in clinical_events for k, v in d.items()
}

study = StudyDefinition(
    index_date="2019-01-01",
    default_expectations={
        "date": {"earliest": start_date, "latest": end_date},
        "rate": "exponential_increase",
        "incidence": 0.1,
    },
    population=patients.all(),
    registered=patients.registered_as_of(
        "index_date",
        return_expectations={"incidence": 0.9},
    ),
    died=patients.died_from_any_cause(
        on_or_before="index_date",
        returning="binary_flag",
        return_expectations={"incidence": 0.1},
    ),
    age=patients.age_as_of(
        "index_date",
        return_expectations={
            "rate": "universal",
            "int": {"distribution": "population_ages"},
        },
    ),
    **demographics,
    **clinical_event_variables,
    **medication_variables,
    **generate_all_medications(),
    **generate_all_clinical(),
    included=patients.satisfying(
        """
        registered AND
        NOT died AND
        age_band != "missing" AND
        (sex = "M" OR sex = "F")
        """
    ),
)
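
For reference, `generate_all_medications()` and `generate_all_clinical()` simply hand `patients.satisfying()` an OR over the per-codelist flags defined above. A standalone illustration of the expression string that gets built (keys truncated here for brevity):

medication_keys = ["amoxicillin", "azithromycin", "clarithromycin"]
expression = " OR ".join(f"event_{key}" for key in medication_keys)
print(expression)
# event_amoxicillin OR event_azithromycin OR event_clarithromycin
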
23 changes: 23 additions & 0 deletions project.yaml
@@ -112,6 +112,29 @@ actions:
### End curation check ###

### MONTHLY ###
  generate_study_population_report_excluded:
    run: cohortextractor:latest generate_cohort
      --study-definition study_definition_excluded
      --index-date-range "2023-03-01 to 2023-03-01 by month"
      --param frequency=monthly
      --output-dir=output/report
      --output-format=csv.gz
    outputs:
      highly_sensitive:
        cohort: output/report/input_excluded_2023-03-01.csv.gz

  count_excluded:
    run: python:latest python analysis/count_excluded.py
      --input-files output/report/input_excluded_2023-03-01.csv.gz
      --output-dir output/report/results/paper
      --output-name "excluded.csv"
      --redact
    needs: [generate_study_population_report_excluded]
    outputs:
      moderately_sensitive:
        # Only output the single summary file
        measure_csv: output/report/results/paper/excluded.csv

  generate_study_population_report_monthly_0:
    run: cohortextractor:latest generate_cohort
      --study-definition study_definition_report
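
The cohort filename declared under `highly_sensitive` above is also how `count_excluded.py` dates its rows: `FNAME_REGEX` pulls the index date out of the filename and `main()` writes it to the `date` column. A standalone check of that extraction:

import re

FNAME_REGEX = re.compile(r"input_excluded_(?P<id>\S+)\.csv.gz")
match = FNAME_REGEX.match("input_excluded_2023-03-01.csv.gz")
print(match.group("id"))  # 2023-03-01
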