Merge pull request #81 from opensafely/count_excluded
Count the number excluded by each criterion
ccunningham101 committed Aug 23, 2023
2 parents a4868fc + 0f7ae06 commit c184925
Showing 3 changed files with 386 additions and 0 deletions.
165 changes: 165 additions & 0 deletions analysis/count_excluded.py
@@ -0,0 +1,165 @@
import argparse
import pathlib
import re
import glob
import pandas
import itertools
from report.report_utils import round_values

FNAME_REGEX = re.compile(r"input_excluded_(?P<id>\S+)\.csv.gz")


def _round_table(table, round_to, redact=False, redaction_threshold=5):
    table = table.astype(float)

    table = table.apply(
        lambda x: round_values(
            x, round_to, redact=redact, redaction_threshold=redaction_threshold
        )
    )
    table = table.fillna("[REDACTED]")
    return table


def _join_tables(tables):
    return pandas.concat(tables)


def get_input_tables(input_files, exclude_files):
    all_files = set(itertools.chain(*input_files))
    all_exclude = set(itertools.chain(*exclude_files))
    all_files = all_files - all_exclude
    for input_file in all_files:
        measure_fname_match = re.match(FNAME_REGEX, input_file.name)
        if measure_fname_match is not None:
            measure_table = pandas.read_csv(input_file)

            # The index date is encoded in the filename rather than in a
            # column; keep it on the table so main() can add a `date` column.
            measure_table.attrs["id"] = measure_fname_match.group("id")
            yield measure_table


def compute_excluded(input_table):
    d = {}
    d["total"] = len(input_table)
    registered = input_table[input_table.registered == 1]
    d["not_registered"] = len(input_table) - len(registered)
    alive = registered[registered.died == 0]
    d["died"] = len(registered) - len(alive)
    age = alive[alive.age_band != "missing"]
    d["unknown_age"] = len(alive) - len(age)
    sex = age[(age.sex == "M") | (age.sex == "F")]
    d["unknown_sex"] = len(age) - len(sex)
    excluded = input_table[input_table.included == 0]
    d["total_excluded"] = len(excluded)
    d["clinical_any"] = (excluded.event_clinical_any == 1).sum()
    d["medication_any"] = (excluded.event_medication_any == 1).sum()
    counts = pandas.Series(d)
    counts.name = "count"
    counts.index.name = "attribute"
    return counts


def write_table(measure_table, path, filename):
    create_dir(path)
    measure_table.to_csv(path / filename, index=False, header=True)


def create_dir(path):
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)


def get_path(*args):
    return pathlib.Path(*args).resolve()


def match_input(input_list):
    path = get_path(input_list)
    if path.exists():
        return path


def match_paths(pattern):
    return [get_path(x) for x in glob.glob(pattern)]


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-files",
        required=True,
        type=match_paths,
        action="append",
        help="Glob pattern(s) for matching one or more input files",
    )
    parser.add_argument(
        "--exclude-files",
        required=False,
        type=match_paths,
        action="append",
        default=[],
        help="Glob pattern(s) to exclude one or more input files",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        type=pathlib.Path,
        help="Path to the output directory",
    )
    parser.add_argument(
        "--output-name",
        required=True,
        help="Name for joined measures file",
    )
    parser.add_argument(
        "--round-to",
        required=False,
        default=10,
        type=int,
        help="Round to the nearest",
    )
    parser.add_argument(
        "--redact",
        action="store_true",
        help="Redact values below a threshold",
    )
    parser.add_argument(
        "--redaction-threshold",
        required=False,
        default=5,
        type=int,
        help="Redact values below or equal to this threshold",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    input_files = args.input_files
    exclude_files = args.exclude_files
    output_dir = args.output_dir
    output_name = args.output_name
    round_to = args.round_to
    redact = args.redact
    redaction_threshold = args.redaction_threshold

    tables = []
    for input_table in get_input_tables(input_files, exclude_files):
        table_date = input_table.attrs["id"]
        excluded_counts = compute_excluded(input_table)
        redacted_and_rounded = _round_table(
            excluded_counts, round_to, redact, redaction_threshold
        )
        df = redacted_and_rounded.reset_index()
        df["date"] = table_date
        tables.append(df)

    output = _join_tables(tables)

    write_table(output, output_dir, output_name)


if __name__ == "__main__":
    main()
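
Aside: `round_values` is imported from `report/report_utils.py`, which is not touched by this diff. A minimal sketch of the behaviour `_round_table` assumes of it — round each count to the nearest multiple of `round_to`, and blank counts at or below the threshold when `--redact` is set so that `fillna` can mark them "[REDACTED]" — could look like the following (hypothetical, for illustration only; the real helper lives in report_utils):

import numpy


def round_values(value, base=10, redact=False, redaction_threshold=5):
    # Hypothetical stand-in for report.report_utils.round_values.
    if numpy.isnan(value):
        return value
    if redact and 0 < value <= redaction_threshold:
        # NaN lets _round_table() replace the count with "[REDACTED]".
        return numpy.nan
    # Round to the nearest multiple of `base`, e.g. 12 -> 10 with base=10.
    return int(base * round(value / base))
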
198 changes: 198 additions & 0 deletions analysis/study_definition_excluded.py
@@ -0,0 +1,198 @@
from cohortextractor import (
    StudyDefinition,
    patients,
    params,
    combine_codelists,
)

from codelists import (
    amoxicillin_codes,
    azithromycin_codes,
    clarithromycin_codes,
    erythromycin_codes,
    phenoxymethylpenicillin_codes,
    cefalexin_codes,
    co_amoxiclav_codes,
    flucloxacillin_codes,
    scarlet_fever_codes,
    invasive_strep_a_codes,
    sore_throat_tonsillitis_codes,
)

# Import so we can inspect metadata logs for correct variable expansion
import logging
import json


medication_codelists = {
    "amoxicillin": amoxicillin_codes,
    "azithromycin": azithromycin_codes,
    "clarithromycin": clarithromycin_codes,
    "erythromycin": erythromycin_codes,
    "phenoxymethylpenicillin": phenoxymethylpenicillin_codes,
    "cefalexin": cefalexin_codes,
    "co_amoxiclav": co_amoxiclav_codes,
    "flucloxacillin": flucloxacillin_codes,
}


clinical_event_codelists = {
    "scarlet_fever": scarlet_fever_codes,
    "invasive_strep_a": invasive_strep_a_codes,
    "sore_throat_tonsillitis": sore_throat_tonsillitis_codes,
}

all_medication_codes = combine_codelists(*list(medication_codelists.values()))
all_clinical_codes = combine_codelists(
    *list(clinical_event_codelists.values())
)


frequency = params.get("frequency", None)
if frequency == "weekly":
    ENDDATE = "index_date + 6 days"
else:
    ENDDATE = "last_day_of_month(index_date)"


def generate_all_medications():
    var = {
        "event_medication_any": patients.satisfying(
            " OR ".join(
                list(map(lambda x: f"event_{x}", medication_codelists.keys()))
            )
        ),
    }
    logging.info(json.dumps(var, indent=4))
    return var


def generate_all_clinical():
    var = {
        "event_clinical_any": patients.satisfying(
            " OR ".join(
                list(
                    map(
                        lambda x: f"event_{x}",
                        clinical_event_codelists.keys(),
                    )
                )
            )
        ),
    }
    logging.info(json.dumps(var, indent=4))
    return var

if frequency == "weekly":
    start_date = "2022-09-01"
    end_date = "2023-02-15"
else:
    start_date = "2018-01-01"
    end_date = "2022-01-01"

demographics = {
    "sex": patients.sex(
        return_expectations={
            "rate": "universal",
            "category": {"ratios": {"M": 0.49, "F": 0.50, "U": 0.01}},
        }
    ),
    "age_band": (
        patients.categorised_as(
            {
                "missing": "DEFAULT",
                "0-4": """ age >= 0 AND age < 5""",
                "5-9": """ age >= 5 AND age < 10""",
                "10-14": """ age >= 10 AND age < 15""",
                "15-44": """ age >= 15 AND age < 45""",
                "45-64": """ age >= 45 AND age < 65""",
                "65-74": """ age >= 65 AND age < 75""",
                "75+": """ age >= 75 AND age < 120""",
            },
            return_expectations={
                "rate": "universal",
                "category": {
                    "ratios": {
                        "missing": 0.05,
                        "0-4": 0.25,
                        "5-9": 0.3,
                        "10-14": 0.1,
                        "15-44": 0.1,
                        "45-64": 0.1,
                        "75+": 0.1,
                    }
                },
            },
        )
    ),
}


clinical_events = [
    {
        f"event_{clinical_key}": patients.with_these_clinical_events(
            codelist=clinical_codelist,
            between=["index_date", ENDDATE],
            returning="binary_flag",
            return_expectations={"incidence": 0.1},
        ),
    }
    for clinical_key, clinical_codelist in clinical_event_codelists.items()
]


medication_events = [
    {
        f"event_{medication_key}": patients.with_these_medications(
            codelist=medication_codelist,
            between=["index_date", ENDDATE],
            returning="binary_flag",
            return_expectations={"incidence": 0.1},
        ),
    }
    for medication_key, medication_codelist in medication_codelists.items()
]
# convert list of dicts into a single dict
medication_variables = {k: v for d in medication_events for k, v in d.items()}
clinical_event_variables = {
    k: v for d in clinical_events for k, v in d.items()
}

study = StudyDefinition(
    index_date="2019-01-01",
    default_expectations={
        "date": {"earliest": start_date, "latest": end_date},
        "rate": "exponential_increase",
        "incidence": 0.1,
    },
    population=patients.all(),
    registered=patients.registered_as_of(
        "index_date",
        return_expectations={"incidence": 0.9},
    ),
    died=patients.died_from_any_cause(
        on_or_before="index_date",
        returning="binary_flag",
        return_expectations={"incidence": 0.1},
    ),
    age=patients.age_as_of(
        "index_date",
        return_expectations={
            "rate": "universal",
            "int": {"distribution": "population_ages"},
        },
    ),
    **demographics,
    **clinical_event_variables,
    **medication_variables,
    **generate_all_medications(),
    **generate_all_clinical(),
    included=patients.satisfying(
        """
        registered AND
        NOT died AND
        age_band != "missing" AND
        (sex = "M" OR sex = "F")
        """
    ),
)
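
For reference, `generate_all_medications()` and `generate_all_clinical()` simply hand `patients.satisfying()` an OR over the per-codelist flags defined above. A standalone illustration of the expression string that gets built (keys truncated here for brevity):

medication_keys = ["amoxicillin", "azithromycin", "clarithromycin"]
expression = " OR ".join(f"event_{key}" for key in medication_keys)
print(expression)
# event_amoxicillin OR event_azithromycin OR event_clarithromycin
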
23 changes: 23 additions & 0 deletions project.yaml
@@ -112,6 +112,29 @@ actions:
### End curation check ###

### MONTHLY ###
  generate_study_population_report_excluded:
    run: cohortextractor:latest generate_cohort
      --study-definition study_definition_excluded
      --index-date-range "2023-03-01 to 2023-03-01 by month"
      --param frequency=monthly
      --output-dir=output/report
      --output-format=csv.gz
    outputs:
      highly_sensitive:
        cohort: output/report/input_excluded_2023-03-01.csv.gz

  count_excluded:
    run: python:latest python analysis/count_excluded.py
      --input-files output/report/input_excluded_2023-03-01.csv.gz
      --output-dir output/report/results/paper
      --output-name "excluded.csv"
      --redact
    needs: [generate_study_population_report_excluded]
    outputs:
      moderately_sensitive:
        # Only output the single summary file
        measure_csv: output/report/results/paper/excluded.csv

  generate_study_population_report_monthly_0:
    run: cohortextractor:latest generate_cohort
      --study-definition study_definition_report
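
The cohort filename declared under `highly_sensitive` above is also how `count_excluded.py` dates its rows: `FNAME_REGEX` pulls the index date out of the filename and `main()` writes it to the `date` column. A standalone check of that extraction:

import re

FNAME_REGEX = re.compile(r"input_excluded_(?P<id>\S+)\.csv.gz")
match = FNAME_REGEX.match("input_excluded_2023-03-01.csv.gz")
print(match.group("id"))  # 2023-03-01
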