generated from opensafely/research-template
/
demographic_change.py
152 lines (93 loc) · 6.03 KB
/
demographic_change.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import pandas as pd
from utilities import *
sentinel_measures = ["qrisk2", "asthma", "copd", "sodium", "cholesterol", "tsh", "alt", "rbc", 'hba1c', 'systolic_bp', 'medication_review']
demographics = ['region', 'age_band', 'imd', 'sex', 'ethnicity']
values_dict = {}
date_lists = [['2019-01-01', '2019-02-01', '2019-03-01'], ['2020-01-01', '2020-02-01', '2020-03-01'], ['2021-01-01','2021-02-01', '2021-03-01']]
dates_for_classification = ['2019-04-01', '2020-04-01', '2021-04-01']
differences_list = []
def classify_changes(values, baseline):
"""Classifies list of values by % changes
Args:
changes: list of values
baseline: baseline value
"""
changes = [((x-baseline) / baseline)*100 for x in values]
#between baseline both time periods
if (-15 <= changes[0] < 15) and (-15 <= changes[1] < 15):
classification = 'no change'
#increase at both time periods
elif (changes[0] >= 15) and (changes[1] >= 15):
classification = 'increase'
#increase at first then return to normal
elif (changes[0] >= 15) and (-15 <= changes[1] < 15):
classification = 'increase then return'
#no change then increase
elif (-15 <= changes[0] < 15) and (changes[1] >= 15):
classification = 'no change then increase'
#no change then decrease
elif (-15 <= changes[0] < 15) and (changes[1] <= -15):
classification = 'no change then decrease'
#drop that stays low
elif (changes[0] <= -15) and (changes[1] <= -15):
classification = 'sustained drop'
#drop that recovers
elif (changes[0] <= -15) and (-15 <= changes[1] < 15) :
classification = 'recovery'
#drop then increase
elif (changes[0] <= -15) and (changes[1] > 15) :
classification = 'decrease then increase'
#increase then drop
elif (changes[0] > 15) and (changes[1] <= -15):
classification = 'increase then decrease'
#anything else
else:
classification = 'none'
return classification
for measure in sentinel_measures:
for d in demographics:
df = pd.read_csv(f'output/combined_measure_{measure}_{d}.csv', parse_dates=['date']).sort_values(['date'])
if d == 'ethnicity':
# replace with strings
ethnicity_codes = {1.0: "White", 2.0: "Mixed", 3.0: "Asian", 4.0: "Black", 5.0:"Other", 0.0: "Missing"}
df = df.replace({"ethnicity": ethnicity_codes})
elif d == 'age_band':
df = df[df['age_band'] != 'missing']
if d != 'age_band':
df['rate'] = df[measure]/(df['population']/1000)
# get population rate
total_population = df.groupby(by=['date'])[['population']].sum().reset_index()
total_events = df.groupby(by=['date'])[[measure]].sum().reset_index()
total_df = total_events.merge(total_population, on='date')
total_df['rate'] = total_df[measure]/(total_df['population']/1000)
#dict containing mean rate for each first quarter each year
totals_dict = {}
for dates in date_lists:
val = total_df[total_df['date'].isin(dates)]['rate'].mean()
totals_dict[dates[1]] = val
for unique_category in df[d].unique():
df_subset = df[df[d] == unique_category]
classification_date_values = {}
#get rate in April each year
for date in dates_for_classification:
classification_val = df_subset.loc[df_subset['date'] == date, 'rate'].values[0]
classification_date_values[date] = classification_val
#make classification
classification = classify_changes([classification_date_values["2020-04-01"], classification_date_values["2021-04-01"]], classification_date_values["2019-04-01"])
# get mean deviation from population rate in first quarter each year
date_values = {}
date_changes = {}
for dates in date_lists:
#subgroup value and population value
val = df_subset[df_subset['date'].isin(dates)]['rate'].mean()
total_val = totals_dict[dates[1]]
#mean deviation from population
difference = round(((val - total_val) / total_val)*100, 2)
date_values[dates[1]]=val
date_changes[dates[1]] = difference
row = [measure, d, unique_category, classification_date_values["2019-04-01"], classification_date_values["2020-04-01"], classification_date_values["2021-04-01"], classification, date_values["2019-02-01"], totals_dict["2019-02-01"],date_changes["2019-02-01"], date_values["2020-02-01"], totals_dict["2020-02-01"], date_changes["2020-02-01"], date_values["2021-02-01"], totals_dict["2021-02-01"], date_changes["2021-02-01"], date_changes["2021-02-01"] - date_changes["2019-02-01"]]
differences_list.append(row)
differences_df =pd.DataFrame(differences_list, columns=['measure', 'demographic', 'demographic_subset', 'april_2019_rate', 'april_2020_rate', 'april_2021_rate', 'activity_classification','2019_q1_rate_per_1000', '2019_q1_population_rate', '2019_q1_percentage_difference_from_population_rate', '2020_q1_rate_per_1000', '2020_q1_population_rate', '2020_q1_percentage_difference_from_population_rate', '2021_q1_rate_per_1000', '2021_q1_population_rate', '2021_q1_percentage_difference_from_population_rate', 'difference_in_percentage_distance_from_population_rate_2019_to_2021'])
differences_df.to_csv('output/demographics_differences.csv')
differences_df_sorted = differences_df.reindex(differences_df['difference_in_percentage_distance_from_population_rate_2019_to_2021'].sort_values(ascending=False).index)
differences_df_sorted.to_csv('output/demographics_differences_sorted.csv')