generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #81 from opensafely/count_excluded
Count the number excluded by each criteria
- Loading branch information
Showing
3 changed files
with
386 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
import argparse | ||
import pathlib | ||
import re | ||
import glob | ||
import pandas | ||
import itertools | ||
from report.report_utils import round_values | ||
|
||
# Matches extract file names of the form "input_excluded_<id>.csv.gz"; the
# named group "id" captures the text between the prefix and the suffix.
# Both dots of the ".csv.gz" suffix are escaped so they only match a literal
# "." rather than any character.
FNAME_REGEX = re.compile(r"input_excluded_(?P<id>\S+)\.csv\.gz")
|
||
|
||
def _round_table(table, round_to, redact=False, redaction_threshold=5):
    """Round (and optionally redact) the values of ``table``.

    The table is cast to float, every item handed out by ``apply`` is passed
    through ``round_values`` with the given settings, and anything that is
    NaN afterwards is replaced by the string "[REDACTED]".
    """

    def _round(values):
        return round_values(
            values,
            round_to,
            redact=redact,
            redaction_threshold=redaction_threshold,
        )

    as_float = table.astype(float)
    rounded = as_float.apply(_round)
    return rounded.fillna("[REDACTED]")
|
||
|
||
def _join_tables(tables): | ||
return pandas.concat(tables) | ||
|
||
|
||
def get_input_tables(input_files, exclude_files):
    """Yield one table per matching input file, in sorted path order.

    Args:
        input_files: iterable of iterables of paths (one inner iterable per
            ``--input-files`` pattern).
        exclude_files: iterable of iterables of paths that must be skipped.

    Yields:
        The ``pandas.DataFrame`` read from every remaining file whose name
        matches ``FNAME_REGEX``. The ``id`` group captured from the file
        name is stored in ``attrs["id"]`` of the yielded table.
    """
    candidates = set(itertools.chain.from_iterable(input_files))
    candidates -= set(itertools.chain.from_iterable(exclude_files))

    # Set iteration order is arbitrary; sorting keeps the order of the
    # yielded tables (and of anything built from them) reproducible.
    for input_file in sorted(candidates):
        fname_match = FNAME_REGEX.match(input_file.name)
        if fname_match is None:
            # Not an "input_excluded_<id>" extract; ignore it.
            continue

        table = pandas.read_csv(input_file)
        # Keep the identifier parsed from the file name with the table so
        # the caller can label the rows it derives from it.
        table.attrs["id"] = fname_match.group("id")
        yield table
|
||
|
||
def compute_excluded(input_table):
    """Count how many rows are dropped by each inclusion criterion.

    The criteria are applied in sequence (registered, not died, known age
    band, sex recorded as "M" or "F"), so every count only covers rows that
    passed all earlier criteria.

    Args:
        input_table: DataFrame with the columns ``registered``, ``died``,
            ``age_band`` (or, failing that, ``age``), ``sex``, ``included``,
            ``event_clinical_any`` and ``event_medication_any``.

    Returns:
        A ``pandas.Series`` named "count" whose index is named "attribute".
    """
    # "missing" is a category of the age *band*; the study definition
    # extracts ``age`` itself as an integer, which never equals "missing",
    # so testing ``age`` would always report zero unknown ages. Fall back to
    # ``age`` only for tables that carry no ``age_band`` column.
    age_column = "age_band" if "age_band" in input_table.columns else "age"

    registered = input_table[input_table["registered"] == 1]
    alive = registered[registered["died"] == 0]
    known_age = alive[alive[age_column] != "missing"]
    known_sex = known_age[
        (known_age["sex"] == "M") | (known_age["sex"] == "F")
    ]
    excluded = input_table[input_table["included"] == 0]

    counts = pandas.Series(
        {
            "total": len(input_table),
            "not_registered": len(input_table) - len(registered),
            "died": len(registered) - len(alive),
            "unknown_age": len(alive) - len(known_age),
            "unknown_sex": len(known_age) - len(known_sex),
            "total_excluded": len(excluded),
            # Events recorded for rows that were excluded.
            "clinical_any": (excluded["event_clinical_any"] == 1).sum(),
            "medication_any": (excluded["event_medication_any"] == 1).sum(),
        }
    )
    counts.name = "count"
    counts.index.name = "attribute"
    return counts
|
||
|
||
def write_table(measure_table, path, filename):
    """Write ``measure_table`` as CSV to ``path / filename``.

    The directory ``path`` is created first when it does not exist yet. The
    CSV is written with a header row and without the index.
    """
    create_dir(path)
    destination = path / filename
    measure_table.to_csv(destination, header=True, index=False)
|
||
|
||
def create_dir(path):
    """Create ``path`` and any missing parents; an existing directory is fine."""
    directory = pathlib.Path(path)
    directory.mkdir(exist_ok=True, parents=True)
|
||
|
||
def get_path(*args):
    """Join the given path segments and return the resolved path."""
    joined = pathlib.Path(*args)
    return joined.resolve()
|
||
|
||
def match_input(input_list):
    """Return the resolved path for ``input_list`` if it exists, else ``None``."""
    candidate = get_path(input_list)
    return candidate if candidate.exists() else None
|
||
|
||
def match_paths(pattern):
    """Expand the glob ``pattern`` into a list of resolved paths."""
    return list(map(get_path, glob.glob(pattern)))
|
||
|
||
def parse_args(argv=None):
    """Parse the command-line options of the exclusion-count script.

    Args:
        argv: optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, exactly as before;
            passing a list lets callers drive the parser programmatically.

    Returns:
        The ``argparse.Namespace`` with ``input_files``, ``exclude_files``,
        ``output_dir``, ``output_name``, ``round_to``, ``redact`` and
        ``redaction_threshold``.
    """
    parser = argparse.ArgumentParser()
    # Each occurrence of --input-files / --exclude-files is expanded by
    # ``match_paths`` and appended, giving a list of lists of paths.
    parser.add_argument(
        "--input-files",
        required=True,
        type=match_paths,
        action="append",
        help="Glob pattern(s) for matching one or more input files",
    )
    parser.add_argument(
        "--exclude-files",
        required=False,
        type=match_paths,
        action="append",
        default=[],
        help="Glob pattern(s) to exclude one or more input files",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        type=pathlib.Path,
        help="Path to the output directory",
    )
    parser.add_argument(
        "--output-name",
        required=True,
        help="Name for joined measures file",
    )
    parser.add_argument(
        "--round-to",
        required=False,
        default=10,
        type=int,
        help="Round to the nearest",
    )
    parser.add_argument(
        "--redact",
        action="store_true",
        help="Redact values below a threshold",
    )
    parser.add_argument(
        "--redaction-threshold",
        required=False,
        default=5,
        type=int,
        help="Redact values below or equal to this threshold",
    )
    return parser.parse_args(argv)
|
||
|
||
def main():
    """Count exclusions for every input file and write them as one CSV.

    For each table from ``get_input_tables`` the exclusion counts are
    computed, rounded/redacted, turned into a frame, and tagged in a "date"
    column with the id parsed from the file name. The frames are then
    joined and written to the requested output directory and file name.
    """
    args = parse_args()

    per_file_frames = []
    for input_table in get_input_tables(args.input_files, args.exclude_files):
        counts = compute_excluded(input_table)
        rounded = _round_table(
            counts, args.round_to, args.redact, args.redaction_threshold
        )
        frame = rounded.reset_index()
        frame["date"] = input_table.attrs["id"]
        per_file_frames.append(frame)

    joined = _join_tables(per_file_frames)
    write_table(joined, args.output_dir, args.output_name)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
from cohortextractor import ( | ||
StudyDefinition, | ||
patients, | ||
params, | ||
combine_codelists, | ||
) | ||
|
||
from codelists import ( | ||
amoxicillin_codes, | ||
azithromycin_codes, | ||
clarithromycin_codes, | ||
erythromycin_codes, | ||
phenoxymethylpenicillin_codes, | ||
cefalexin_codes, | ||
co_amoxiclav_codes, | ||
flucloxacillin_codes, | ||
scarlet_fever_codes, | ||
invasive_strep_a_codes, | ||
sore_throat_tonsillitis_codes, | ||
) | ||
|
||
# Import so we can inspect metadata logs for correct variable expansion | ||
import logging | ||
import json | ||
|
||
|
||
# Medication codelists keyed by the short name used to build the
# ``event_<name>`` variable names.
medication_codelists = {
    "amoxicillin": amoxicillin_codes,
    "azithromycin": azithromycin_codes,
    "clarithromycin": clarithromycin_codes,
    "erythromycin": erythromycin_codes,
    "phenoxymethylpenicillin": phenoxymethylpenicillin_codes,
    "cefalexin": cefalexin_codes,
    "co_amoxiclav": co_amoxiclav_codes,
    "flucloxacillin": flucloxacillin_codes,
}

# Clinical-event codelists, keyed the same way.
clinical_event_codelists = {
    "scarlet_fever": scarlet_fever_codes,
    "invasive_strep_a": invasive_strep_a_codes,
    "sore_throat_tonsillitis": sore_throat_tonsillitis_codes,
}

# One combined codelist per group.
all_medication_codes = combine_codelists(*medication_codelists.values())
all_clinical_codes = combine_codelists(*clinical_event_codelists.values())
|
||
|
||
# The "frequency" study parameter selects the length of the event window:
# seven days starting at the index date for "weekly", otherwise up to the
# last day of the index date's month.
frequency = params.get("frequency", None)
ENDDATE = (
    "index_date + 6 days"
    if frequency == "weekly"
    else "last_day_of_month(index_date)"
)
|
||
|
||
def generate_all_medications():
    """Build the ``event_medication_any`` variable.

    The variable is a ``patients.satisfying`` expression that ORs together
    the ``event_<name>`` flag of every key in ``medication_codelists``. The
    resulting definition is logged as JSON before it is returned.
    """
    flag_names = [f"event_{name}" for name in medication_codelists]
    var = {
        "event_medication_any": patients.satisfying(" OR ".join(flag_names)),
    }
    logging.info(json.dumps(var, indent=4))
    return var
|
||
|
||
def generate_all_clinical():
    """Build the ``event_clinical_any`` variable.

    The variable is a ``patients.satisfying`` expression that ORs together
    the ``event_<name>`` flag of every key in ``clinical_event_codelists``.
    The resulting definition is logged as JSON before it is returned.
    """
    flag_names = [f"event_{name}" for name in clinical_event_codelists]
    var = {
        "event_clinical_any": patients.satisfying(" OR ".join(flag_names)),
    }
    logging.info(json.dumps(var, indent=4))
    return var
|
||
|
||
# Date range used for the default expectations, chosen by frequency.
start_date, end_date = (
    ("2022-09-01", "2023-02-15")
    if frequency == "weekly"
    else ("2018-01-01", "2022-01-01")
)
|
||
# Demographic variables shared by the study definition: recorded sex and an
# age band derived from the ``age`` variable.
demographics = {
    "sex": patients.sex(
        return_expectations={
            "rate": "universal",
            "category": {"ratios": {"M": 0.49, "F": 0.50, "U": 0.01}},
        }
    ),
    "age_band": (
        patients.categorised_as(
            {
                # "missing" is the default category, used when none of the
                # age ranges below applies.
                "missing": "DEFAULT",
                "0-4": """ age >= 0 AND age < 5""",
                "5-9": """ age >= 5 AND age < 10""",
                "10-14": """ age >= 10 AND age < 15""",
                "15-44": """ age >= 15 AND age < 45""",
                "45-64": """ age >= 45 AND age < 65""",
                "65-74": """ age >= 65 AND age < 75""",
                "75+": """ age >= 75 AND age < 120""",
            },
            return_expectations={
                "rate": "universal",
                "category": {
                    # NOTE(review): there is no ratio for the "65-74" band
                    # here, although it is defined above - confirm that
                    # leaving it out of the expectations is intentional.
                    "ratios": {
                        "missing": 0.05,
                        "0-4": 0.25,
                        "5-9": 0.3,
                        "10-14": 0.1,
                        "15-44": 0.1,
                        "45-64": 0.1,
                        "75+": 0.1,
                    }
                },
            },
        )
    ),
}
|
||
|
||
# One single-entry dict per clinical codelist: a binary flag for an event
# recorded between the index date and ENDDATE.
clinical_events = [
    {
        f"event_{clinical_key}": patients.with_these_clinical_events(
            codelist=clinical_codelist,
            between=["index_date", ENDDATE],
            returning="binary_flag",
            return_expectations={"incidence": 0.1},
        ),
    }
    for clinical_key, clinical_codelist in clinical_event_codelists.items()
]

# The same for every medication codelist.
medication_events = [
    {
        f"event_{medication_key}": patients.with_these_medications(
            codelist=medication_codelist,
            between=["index_date", ENDDATE],
            returning="binary_flag",
            return_expectations={"incidence": 0.1},
        ),
    }
    for medication_key, medication_codelist in medication_codelists.items()
]

# Flatten each list of single-entry dicts into one dict of variables.
medication_variables = dict(
    item for entry in medication_events for item in entry.items()
)
clinical_event_variables = dict(
    item for entry in clinical_events for item in entry.items()
)
|
||
# Study definition that extracts every patient (``patients.all()``) together
# with the flags needed to count who is excluded, and why.
study = StudyDefinition(
    index_date="2019-01-01",
    default_expectations={
        "date": {"earliest": start_date, "latest": end_date},
        "rate": "exponential_increase",
        "incidence": 0.1,
    },
    # No population filter: exclusion is recorded in ``included`` below
    # instead of being applied at extraction time.
    population=patients.all(),
    registered=patients.registered_as_of(
        "index_date",
        return_expectations={"incidence": 0.9},
    ),
    died=patients.died_from_any_cause(
        on_or_before="index_date",
        returning="binary_flag",
        return_expectations={"incidence": 0.1},
    ),
    age=patients.age_as_of(
        "index_date",
        return_expectations={
            "rate": "universal",
            "int": {"distribution": "population_ages"},
        },
    ),
    # sex and age_band
    **demographics,
    # one event_<name> flag per clinical / medication codelist
    **clinical_event_variables,
    **medication_variables,
    # event_medication_any / event_clinical_any: OR of the flags above
    **generate_all_medications(),
    **generate_all_clinical(),
    # Inclusion flag: registered, not died, a known age band, and sex
    # recorded as "M" or "F".
    included=patients.satisfying(
        """
        registered AND
        NOT died AND
        age_band != "missing" AND
        (sex = "M" OR sex = "F")
        """
    ),
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters