generated from opensafely/research-template
/
calculate_measures.py
84 lines (45 loc) · 2.86 KB
/
calculate_measures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import pandas as pd
from utilities import *
# Columns of the input extract used to break each measure down.
demographics = ['region', 'age_band', 'imd', 'sex', 'ethnicity']

# Event columns of the input extract for which measure files are produced.
# Each name must appear only once: a repeated entry makes the script
# recompute and rewrite the same files, and lets the combining step pick up
# its own earlier "combined_measure_*" output for that measure.
sentinel_measures = [
    "qrisk2",
    "asthma",
    "copd",
    "sodium",
    "cholesterol",
    "alt",
    "tsh",
    "rbc",
    'hba1c',
    'systolic_bp',
    'medication_review',
]
# Write one CSV per (input file, demographic, sentinel measure) to
# output/measure_{measure}_{demographic}_{date}.csv.
# OUTPUT_DIR and the helper functions used below are not defined in this
# file; presumably they come from the `utilities` star import.
for file in OUTPUT_DIR.iterdir():
    # Skip anything in OUTPUT_DIR that is not an input extract.
    if not match_input_files(file.name):
        continue

    file_path = OUTPUT_DIR / file.name
    date = get_date_input_file(file.name)
    df = pd.read_feather(file_path)
    # Stamp every row with the date taken from the file name so it can be
    # used as a grouping key and carried into the output.
    df['date'] = pd.to_datetime(date)

    for d in demographics:
        # Every breakdown is stratified by age band; for the age_band
        # breakdown itself the demographic column already is that stratum.
        if d == 'age_band':
            group_keys = [d, 'date']
        else:
            group_keys = ['age_band', d, 'date']

        # Number of rows in each stratum.
        population = df.groupby(by=group_keys).size().reset_index(name='population')

        for measure in sentinel_measures:
            # Sum the measure column within each stratum. Only the measure
            # column is aggregated: 'date' is a grouping key, and asking
            # pandas to sum that datetime column as well raises TypeError
            # on pandas >= 2.0 (older versions silently dropped it).
            event = df.groupby(by=group_keys)[[measure]].sum().reset_index()
            measures_df = population.merge(event, on=group_keys)

            measures_df = measures_df[measures_df["age_band"] != "missing"]
            measures_df = measures_df.replace({True: 1, False: 0})

            # Event and population totals per demographic value, summed over
            # the age-band strata.
            counts = measures_df.groupby(by=[d, "date"])[[measure, "population"]].sum().reset_index()

            if d == "ethnicity":
                # NOTE(review): `counts` above was built before this
                # conversion. If convert_ethnicity changes the values of the
                # 'ethnicity' column, the merge with `counts` below will not
                # line up -- confirm against the helper.
                measures_df = convert_ethnicity(measures_df)

            # Return value is ignored; presumably this adds a 'rate' column
            # to measures_df in place, since the next line reads it.
            calculate_rate(measures_df, measure, 'population')

            # Mean of the per-stratum rates for each demographic value.
            measures_df = measures_df.groupby(by=[d, "date"])["rate"].mean().reset_index()
            measures_df = measures_df.merge(counts, on=[d, "date"], how="outer")

            if d == 'sex':
                # Keep only the 'M' and 'F' categories.
                measures_df = measures_df[measures_df['sex'].isin(['M', 'F'])]

            # Presumably suppresses small counts, with 5 as the threshold --
            # see utilities.redact_small_numbers for the exact rule.
            measures_df = redact_small_numbers(measures_df, 5, measure, 'population', 'rate', 'date')
            measures_df.to_csv(f'output/measure_{measure}_{d}_{date}.csv')
# Concatenate the per-date measure files of every (sentinel measure,
# demographic) pair into output/combined_measure_{measure}_{demographic}.csv.
for sentinel_measure in sentinel_measures:
    for d in demographics:
        # Match on the file-name prefix, not on a substring: the combined
        # files written below live in the same directory and contain
        # "measure_{measure}_{d}" too, so a substring test would fold the
        # output of an earlier pass or run back into the result. The
        # trailing underscore stops one demographic or measure name from
        # matching another that merely starts with it.
        prefix = f'measure_{sentinel_measure}_{d}_'

        # Sorted so the row order of the combined file does not depend on
        # the arbitrary order returned by os.listdir.
        data = [
            pd.read_csv(os.path.join('output', file))
            for file in sorted(os.listdir('output'))
            if file.startswith(prefix)
        ]

        # pd.concat raises ValueError when no file matched the prefix.
        df = pd.concat(data)
        df.to_csv(f'output/combined_measure_{sentinel_measure}_{d}.csv')