Count the number excluded by each criterion
Add a new study def so we can count the excluded population.
This could live in one study def (to ensure both extractions run at the
same time), but then every variable would be extracted for the excluded
population too, and every measure would need `AND included` added to its
numerator and denominator (sketched below).

Add a processing script that counts the criteria of interest, then
redacts and rounds them. We could also use the measures framework, but
this is faster for now.
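
For context, a sketch of the rejected single-study-def alternative, assuming the cohortextractor `Measure` API (the measure id and variable names below are illustrative, not taken from this repo):

from cohortextractor import Measure

# Every measure would need its numerator and denominator restricted to the
# included population, via an `included`-qualified twin of each event
# variable, e.g. patients.satisfying("event_scarlet_fever AND included"):
measures = [
    Measure(
        id="scarlet_fever_rate",
        numerator="event_scarlet_fever_included",
        denominator="included",
        group_by="population",
    ),
]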
ccunningham101 committed Aug 23, 2023
1 parent a4868fc commit 0f7ae06
Showing 3 changed files with 386 additions and 0 deletions.
165 changes: 165 additions & 0 deletions analysis/count_excluded.py
@@ -0,0 +1,165 @@
import argparse
import glob
import itertools
import pathlib
import re

import pandas
from report.report_utils import round_values

FNAME_REGEX = re.compile(r"input_excluded_(?P<id>\S+)\.csv\.gz")


def _round_table(table, round_to, redact=False, redaction_threshold=5):
table = table.astype(float)

table = table.apply(
lambda x: round_values(
x, round_to, redact=redact, redaction_threshold=redaction_threshold
)
)
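    # Cells suppressed by round_values come back as NaN; label them explicitly.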
table = table.fillna("[REDACTED]")
return table


def _join_tables(tables):
return pandas.concat(tables)


def get_input_tables(input_files, exclude_files):
all_files = set(itertools.chain(*input_files))
all_exclude = set(itertools.chain(*exclude_files))
all_files = all_files - all_exclude
for input_file in all_files:
        measure_fname_match = re.match(FNAME_REGEX, input_file.name)
        if measure_fname_match is not None:
            measure_table = pandas.read_csv(input_file)
            # The file name encodes the extraction's index date; stash it in
            # the table's attrs so main() can label the counts.
            measure_table.attrs["id"] = measure_fname_match.group("id")
            yield measure_table


def compute_excluded(input_table):
    # Walk the inclusion waterfall, counting how many patients drop out at
    # each step; the steps mirror the `included` definition in
    # study_definition_excluded.py.
    d = {}
    d["total"] = len(input_table)
    registered = input_table[input_table.registered == 1]
    d["not_registered"] = len(input_table) - len(registered)
    alive = registered[registered.died == 0]
    d["died"] = len(registered) - len(alive)
    # Age is categorised into `age_band` (default "missing"), so filter on
    # that column rather than on the raw integer `age`.
    known_age = alive[alive.age_band != "missing"]
    d["unknown_age"] = len(alive) - len(known_age)
    known_sex = known_age[(known_age.sex == "M") | (known_age.sex == "F")]
    d["unknown_sex"] = len(known_age) - len(known_sex)
excluded = input_table[input_table.included == 0]
d["total_excluded"] = len(excluded)
d["clinical_any"] = (excluded.event_clinical_any == 1).sum()
d["medication_any"] = (excluded.event_medication_any == 1).sum()
counts = pandas.Series(d)
counts.name = "count"
counts.index.name = "attribute"
return counts


def write_table(measure_table, path, filename):
create_dir(path)
measure_table.to_csv(path / filename, index=False, header=True)


def create_dir(path):
pathlib.Path(path).mkdir(parents=True, exist_ok=True)


def get_path(*args):
return pathlib.Path(*args).resolve()


def match_input(input_list):
path = get_path(input_list)
if path.exists():
return path


def match_paths(pattern):
return [get_path(x) for x in glob.glob(pattern)]


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input-files",
required=True,
type=match_paths,
action="append",
help="Glob pattern(s) for matching one or more input files",
)
parser.add_argument(
"--exclude-files",
required=False,
type=match_paths,
action="append",
default=[],
help="Glob pattern(s) to exclude one or more input files",
)
parser.add_argument(
"--output-dir",
required=True,
type=pathlib.Path,
help="Path to the output directory",
)
parser.add_argument(
"--output-name",
required=True,
help="Name for joined measures file",
)
parser.add_argument(
"--round-to",
required=False,
default=10,
type=int,
help="Round to the nearest",
)
parser.add_argument(
"--redact",
action="store_true",
help="Redact values below a threshold",
)
parser.add_argument(
"--redaction-threshold",
required=False,
default=5,
type=int,
help="Redact values below or equal to this threshold",
)
return parser.parse_args()


def main():
args = parse_args()
input_files = args.input_files
exclude_files = args.exclude_files
output_dir = args.output_dir
output_name = args.output_name
round_to = args.round_to
redact = args.redact
redaction_threshold = args.redaction_threshold

tables = []
for input_table in get_input_tables(input_files, exclude_files):
table_date = input_table.attrs["id"]
excluded_counts = compute_excluded(input_table)
redacted_and_rounded = _round_table(
excluded_counts, round_to, redact, redaction_threshold
)
df = redacted_and_rounded.reset_index()
df["date"] = table_date
tables.append(df)

output = _join_tables(tables)

write_table(output, output_dir, output_name)


if __name__ == "__main__":
main()
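
As a quick check of the waterfall logic, a minimal sketch of compute_excluded (assumed in scope from above) on a four-patient dummy frame; column names follow study_definition_excluded.py and the values are illustrative only:

import pandas

dummy = pandas.DataFrame(
    {
        "registered": [1, 1, 1, 0],
        "died": [0, 0, 1, 0],
        "age_band": ["0-4", "missing", "5-9", "0-4"],
        "sex": ["F", "M", "F", "U"],
        "included": [1, 0, 0, 0],
        "event_clinical_any": [0, 1, 0, 1],
        "event_medication_any": [0, 0, 1, 0],
    }
)
counts = compute_excluded(dummy)
# attribute        count
# total                4
# not_registered       1   (row 3 is unregistered)
# died                 1   (row 2 died)
# unknown_age          1   (row 1 has age_band "missing")
# unknown_sex          0
# total_excluded       3   (rows 1-3 have included == 0)
# clinical_any         2
# medication_any       1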
198 changes: 198 additions & 0 deletions analysis/study_definition_excluded.py
@@ -0,0 +1,198 @@
from cohortextractor import (
StudyDefinition,
patients,
params,
combine_codelists,
)

from codelists import (
amoxicillin_codes,
azithromycin_codes,
clarithromycin_codes,
erythromycin_codes,
phenoxymethylpenicillin_codes,
cefalexin_codes,
co_amoxiclav_codes,
flucloxacillin_codes,
scarlet_fever_codes,
invasive_strep_a_codes,
sore_throat_tonsillitis_codes,
)

# Imported so we can log the generated variables and check they expanded
# correctly.
import logging
import json


medication_codelists = {
"amoxicillin": amoxicillin_codes,
"azithromycin": azithromycin_codes,
"clarithromycin": clarithromycin_codes,
"erythromycin": erythromycin_codes,
"phenoxymethylpenicillin": phenoxymethylpenicillin_codes,
"cefalexin": cefalexin_codes,
"co_amoxiclav": co_amoxiclav_codes,
"flucloxacillin": flucloxacillin_codes,
}


clinical_event_codelists = {
"scarlet_fever": scarlet_fever_codes,
"invasive_strep_a": invasive_strep_a_codes,
"sore_throat_tonsillitis": sore_throat_tonsillitis_codes,
}

all_medication_codes = combine_codelists(*list(medication_codelists.values()))
all_clinical_codes = combine_codelists(
*list(clinical_event_codelists.values())
)


frequency = params.get("frequency", None)
if frequency == "weekly":
ENDDATE = "index_date + 6 days"
else:
ENDDATE = "last_day_of_month(index_date)"


def generate_all_medications():
    var = {
        "event_medication_any": patients.satisfying(
            " OR ".join(f"event_{key}" for key in medication_codelists)
        ),
    }
    logging.info(json.dumps(var, indent=4))
    return var
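# The join above expands to:
#   "event_amoxicillin OR event_azithromycin OR event_clarithromycin OR
#    event_erythromycin OR event_phenoxymethylpenicillin OR event_cefalexin OR
#    event_co_amoxiclav OR event_flucloxacillin"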


def generate_all_clinical():
    var = {
        "event_clinical_any": patients.satisfying(
            " OR ".join(f"event_{key}" for key in clinical_event_codelists)
        ),
    }
    logging.info(json.dumps(var, indent=4))
    return var


if frequency == "weekly":
start_date = "2022-09-01"
end_date = "2023-02-15"
else:
start_date = "2018-01-01"
end_date = "2022-01-01"

demographics = {
"sex": patients.sex(
return_expectations={
"rate": "universal",
"category": {"ratios": {"M": 0.49, "F": 0.50, "U": 0.01}},
}
),
"age_band": (
patients.categorised_as(
{
"missing": "DEFAULT",
"0-4": """ age >= 0 AND age < 5""",
"5-9": """ age >= 5 AND age < 10""",
"10-14": """ age >= 10 AND age < 15""",
"15-44": """ age >= 15 AND age < 45""",
"45-64": """ age >= 45 AND age < 65""",
"65-74": """ age >= 65 AND age < 75""",
"75+": """ age >= 75 AND age < 120""",
},
return_expectations={
"rate": "universal",
"category": {
"ratios": {
"missing": 0.05,
"0-4": 0.25,
"5-9": 0.3,
"10-14": 0.1,
"15-44": 0.1,
"45-64": 0.1,
"75+": 0.1,
}
},
},
)
),
}


clinical_events = [
{
f"event_{clinical_key}": patients.with_these_clinical_events(
codelist=clinical_codelist,
between=["index_date", ENDDATE],
returning="binary_flag",
return_expectations={"incidence": 0.1},
),
}
for clinical_key, clinical_codelist in clinical_event_codelists.items()
]


medication_events = [
{
f"event_{medication_key}": patients.with_these_medications(
codelist=medication_codelist,
between=["index_date", ENDDATE],
returning="binary_flag",
return_expectations={"incidence": 0.1},
),
}
for medication_key, medication_codelist in medication_codelists.items()
]
# convert list of dicts into a single dict
medication_variables = {k: v for d in medication_events for k, v in d.items()}
clinical_event_variables = {
k: v for d in clinical_events for k, v in d.items()
}

study = StudyDefinition(
index_date="2019-01-01",
default_expectations={
"date": {"earliest": start_date, "latest": end_date},
"rate": "exponential_increase",
"incidence": 0.1,
},
population=patients.all(),
registered=patients.registered_as_of(
"index_date",
return_expectations={"incidence": 0.9},
),
died=patients.died_from_any_cause(
on_or_before="index_date",
returning="binary_flag",
return_expectations={"incidence": 0.1},
),
age=patients.age_as_of(
"index_date",
return_expectations={
"rate": "universal",
"int": {"distribution": "population_ages"},
},
),
**demographics,
**clinical_event_variables,
**medication_variables,
**generate_all_medications(),
**generate_all_clinical(),
included=patients.satisfying(
"""
registered AND
NOT died AND
age_band != "missing" AND
(sex = "M" OR sex = "F")
"""
),
)
23 changes: 23 additions & 0 deletions project.yaml
@@ -112,6 +112,29 @@ actions:
### End curation check ###

### MONTHLY ###
generate_study_population_report_excluded:
run: cohortextractor:latest generate_cohort
--study-definition study_definition_excluded
--index-date-range "2023-03-01 to 2023-03-01 by month"
--param frequency=monthly
--output-dir=output/report
--output-format=csv.gz
outputs:
highly_sensitive:
cohort: output/report/input_excluded_2023-03-01.csv.gz

count_excluded:
run: python:latest python analysis/count_excluded.py
--input-files output/report/input_excluded_2023-03-01.csv.gz
--output-dir output/report/results/paper
--output-name "excluded.csv"
--redact
needs: [generate_study_population_report_excluded]
outputs:
moderately_sensitive:
# Only output the single summary file
measure_csv: output/report/results/paper/excluded.csv
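      # The joined summary has one row per criterion per extraction date;
      # an illustrative sketch of its shape (counts rounded to the nearest
      # 10, with small cells shown as [REDACTED]):
      #
      #   attribute,count,date
      #   total,1000,2023-03-01
      #   not_registered,50,2023-03-01
      #   unknown_age,[REDACTED],2023-03-01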

generate_study_population_report_monthly_0:
run: cohortextractor:latest generate_cohort
--study-definition study_definition_report
