generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 1
/
02_hhClassif_an_data_checks.do
317 lines (247 loc) · 8.29 KB
/
02_hhClassif_an_data_checks.do
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
/*==============================================================================
DO FILE NAME: 02_hhClassif_an_data_checks
PROJECT: HH COVID risk classification
AUTHOR: K Wing adapted from H Forbes, A Wong, A Schultze, C Rentsch
K Baskharan, E Williamson
DATE: 26 Jan 2021
DESCRIPTION OF FILE: Run sanity checks on all variables
- Check variables take expected ranges
- Cross-check logical relationships
- Explore expected relationships
- Check stsettings
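DATASETS USED: ./output/hhClassif_analysis_dataset<ARG>.dta, where <ARG> is the
first argument passed to this do-file (e.g. MAIN or W2)
DATASETS CREATED: None
OTHER OUTPUT: Log file: ./logs/02_hhClassif_an_data_checks.log when <ARG> is MAIN,
otherwise ./logs/02_hhClassif_an_data_checks_<ARG>.log
Figure: ./output/01_histogram_outcomes.svg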
==============================================================================*/
* Point the PLUS and PERSONAL system directories at the ado files kept in the repository
sysdir set PLUS ./analysis/adofiles
sysdir set PERSONAL ./analysis/adofiles

* The first do-file argument names the dataset variant (main W2)
local dataset `1'

* The log for the MAIN variant carries no suffix; any other variant is suffixed
* with an underscore followed by the argument
if "`dataset'" == "MAIN" {
	local fileextension ""
}
else {
	local fileextension "_`dataset'"
}

* The input dataset name always has the argument appended (MAIN included)
local inputfile "hhClassif_analysis_dataset`dataset'"

* Close any log left open by an earlier run, then start a fresh text log
capture log close
log using ./logs/02_hhClassif_an_data_checks`fileextension', replace text

* Load the analysis dataset for this variant
use ./output/`inputfile', clear
/*
* Open a log file
capture log close
log using ./released_outputs/02_hhClassif_an_data_checks.log, replace t
* Open Stata dataset
use ./output/hhClassif_analysis_dataset.dta, clear
*/
* datacheck is a user-written command: run the ssc install below if it is not on
* the local machine - the server needs the datacheck.ado file
*ssc install datacheck

* Duplicate patient check: within each patient_id only the first row should exist
datacheck _n==1, by(patient_id) nol

/* CHECK INCLUSION AND EXCLUSION CRITERIA=====================================*/

* DATA STRUCTURE: the dataset must hold exactly one row per patient.
* The duplicate count lives in a temporary variable and is removed after the check.
tempvar dupCount
duplicates tag patient_id, generate(`dupCount')
assert `dupCount' == 0
drop `dupCount'

* INCLUSION 1: age is recorded and lies between 18 and 110 (at 1 March 2020)
assert !missing(age)
assert age >= 18
assert age <= 110

* INCLUSION 2: M or F gender at 1 March 2020 (sex coded 1 or 2)
assert inlist(sex, 1, 2)

* EXCLUDE 1: MISSING IMD - only quintiles 1 to 5 or the extended missing code .u are accepted
assert inlist(imd, 1, 2, 3, 4, 5, .u)

* EXCLUDE 2: HH with more than 10 people - household size must be 1 to 10 or .u
datacheck inlist(hh_size, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, .u), nol
/* EXPECTED VALUES============================================================*/
* Each datacheck below reports observations whose value falls outside the listed
* set of permitted codes for that variable.

*HH composition variables
*hhRiskCat (the generic starting variable): must be non-missing and coded 0 to 14
datacheck hhRiskCat<., nol
datacheck inlist(hhRiskCat, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14), nol
*hhRiskCatBROAD: 1 to 3, or missing
datacheck inlist(hhRiskCatBROAD, 1, 2, 3, .), nol
*hhRiskCat67PLUS: 1 to 8, or missing
datacheck inlist(hhRiskCat67PLUS, 1, 2, 3, 4, 5, 6, 7, 8, .), nol
*hhRiskCat33TO66: 1 to 8, or missing
datacheck inlist(hhRiskCat33TO66, 1, 2, 3, 4, 5, 6, 7, 8, .), nol
*hhRiskCat18TO29: 1 to 8, or missing
datacheck inlist(hhRiskCat18TO29, 1, 2, 3, 4, 5, 6, 7, 8, .), nol

* Age
datacheck age<., nol
datacheck inlist(ageCatHHRisk, 0, 1, 2, 3), nol

* Sex
datacheck inlist(sex, 1, 2), nol

* BMI
datacheck inlist(obese4cat, 1, 2, 3, 4), nol
datacheck inlist(bmicat, 1, 2, 3, 4, 5, 6, .u), nol

* IMD
datacheck inlist(imd, 1, 2, 3, 4, 5, .u), nol

* Ethnicity
*eth5: 1 to 5, or missing
datacheck inlist(eth5, 1, 2, 3, 4, 5, .), nol
*eth16: 1 to 16, or missing
* (this check previously tested eth5 a second time, so eth16 was never validated)
datacheck inlist(eth16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, .), nol

* Smoking
datacheck inlist(smoke, 1, 2, 3, 4), nol
datacheck inlist(smoke_nomiss, 1, 2, 3), nol
* Check date ranges for all comorbidities
/*
foreach var of varlist chronic_respiratory_disease ///
asthma ///
chronic_cardiac_disease ///
dm ///
cancer_nonhaemPrevYear ///
cancer_haemPrev5Years ///
chronic_liver_disease ///
stroke_dementia ///
egfr60 ///
organ_transplant ///
asplenia ///
other_immuno ///
{
summ `var'_date, format
}
*/
* Tabulate (including missing values) every term listed in the global macro varlist.
* The first "i." in each term is removed before tabulating, so terms may carry an i. prefix.
* NOTE(review): the global macro varlist is not defined anywhere in this do-file;
* if it is empty at run time this loop does nothing - confirm where it is set.
foreach term in $varlist {
	local plainvar : subinstr local term "i." ""
	safetab `plainvar', m
}
*summarise end dates for each outcome
* The event date of each outcome is held in <outcome>Date; <outcome> itself is the
* 0/1 case indicator, so it is the Date variable that is summarised here.
foreach outcome in covidDeathCase covidHospCase nonCOVIDDeathCase {
	sum `outcome'Date, format
}

* Tabulate the number of events per calendar month for each outcome.
* mofd() converts a daily date to a monthly date (months since Jan 1960), so it
* must be given the event date rather than the case indicator.
* Monthly codes 721 to 729 correspond to Feb 2020 to Oct 2020.
foreach outcome in covidDeathCase covidHospCase nonCOVIDDeathCase {
	gen `outcome'_month = mofd(`outcome'Date)
	lab define `outcome'_month 721 feb 722 mar 723 apr 724 may 725 june 726 jul 727 aug 728 sept 729 oct
	lab val `outcome'_month `outcome'_month
	tab `outcome'_month
	* the month variable is only needed for the tabulation
	drop `outcome'_month
}
*Outcome dates
* Display the numeric daily-date values that are used as x-axis tick positions below
di d(1feb2020)
* 21946
di d(01apr2020)
* 22006
di d(01june2020)
* 22067
di d(01aug2020)
* 22128
di d(01oct2020)
* 22189

* For each outcome: summarise the case indicator, count the cases, and save a
* frequency histogram of the event dates to <outcome>.gph with the case count as title.
* The loop uses the documented "foreach ... in" form for a plain list of names.
foreach outcome in covidDeathCase covidHospCase nonCOVIDDeathCase {
	summ `outcome', format d
	* r(N) from this summarize is the number of rows flagged as a case
	summ patient_id if `outcome'==1
	local total_`outcome'=`r(N)'
	* NOTE(review): yscale requests a 0-3000 range while ylab places ticks up to 6000 - confirm intended axis
	hist `outcome'Date, saving(`outcome', replace) ///
		xlabel(21946 22006 22067 22128 22189,labsize(tiny)) xtitle(, size(vsmall)) ///
		graphregion(color(white)) legend(off) freq ///
		yscale(range(0 3000)) ylab(0 (500) 6000, labsize(vsmall)) ytitle("Number", size(vsmall)) ///
		title("N=`total_`outcome''", size(vsmall))
}
* Combine histograms: put the three saved outcome histograms into a single figure
graph combine covidDeathCase.gph covidHospCase.gph nonCOVIDDeathCase.gph, graphregion(color(white))

* The per-outcome .gph files are intermediate only, so delete them once combined
foreach outcome in covidDeathCase covidHospCase nonCOVIDDeathCase {
	erase `outcome'.gph
}

* Export the combined figure as SVG
graph export ./output/01_histogram_outcomes.svg, as(svg) replace

*censor dates
summ study_end_censor, format
/* LOGICAL RELATIONSHIPS======================================================*/

*HH variables: cross-tabulate each household risk classification against hh_total_cat
foreach hhvar in hhRiskCat hhRiskCat67PLUS hhRiskCat33TO66 hhRiskCat18TO29 {
	safetab `hhvar' hh_total_cat
}

* BMI: summarise bmi within each bmicat level, then compare the two BMI groupings
bysort bmicat: summ bmi
safetab bmicat obese4cat, m

* Age (checks currently disabled)
*bysort ageCatHHRisk: summ age
*safetab ageCatHHRisk age66, m

* Smoking: compare smoking status with and without the missing category
safetab smoke smoke_nomiss, m

* Diabetes (check currently disabled)
*safetab diabcat diabetes, m

* CKD (checks currently disabled)
*safetab reduced egfr_cat, m
*safetab reduced esrd, m

*comorbidities
safetab coMorbCat
/* EXPECTED RELATIONSHIPS=====================================================*/

/* Relationships between demographic/lifestyle variables */
* Each loop cross-tabulates one row variable against a list of column variables,
* reporting row percentages. The shield tabulations are currently disabled.

* Age category against BMI, smoking, ethnicity and IMD
foreach colvar in bmicat smoke ethnicity imd {
	safetab ageCatHHRisk `colvar', row
}
*safetab ageCatHHRisk shield, row

* BMI category against smoking, ethnicity, IMD and hypertension
foreach colvar in smoke ethnicity imd hypertension {
	safetab bmicat `colvar', row
}
*safetab bmicat shield, row

* Smoking against ethnicity, IMD and hypertension
foreach colvar in ethnicity imd hypertension {
	safetab smoke `colvar', row
}
*safetab smoke shield, row

* Ethnicity against IMD
safetab ethnicity imd, row
*safetab shield imd, row
*safetab shield ethnicity, row
* Relationships of each comorbidity with age, sex and smoking
* The comorbidity list is written once and reused for all three row variables.
local comorbidities ///
	chronic_respiratory_disease ///
	asthma_severe ///
	chronic_cardiac_disease ///
	dm ///
	cancer_nonhaemPrevYear ///
	cancer_haemPrev5Years ///
	chronic_liver_disease ///
	stroke_dementia ///
	egfr60 ///
	organ_transplant ///
	asplenia ///
	other_immuno

* Outer loop order (age category, then sex, then smoking) keeps the tabulations
* in the log in the same sequence: all comorbidities by age first, then by sex,
* then by smoking. Row percentages are requested for every table.
foreach rowvar in ageCatHHRisk male smoke {
	foreach var of varlist `comorbidities' {
		safetab `rowvar' `var', row
	}
}
/* SENSE CHECK OUTCOMES=======================================================*/

* Cross-tabulate each pair of outcome indicators with row and column percentages
foreach pair in "covidDeathCase covidHospCase" "covidDeathCase nonCOVIDDeathCase" "nonCOVIDDeathCase covidHospCase" {
	safetab `pair', row col
}

* Count the patients flagged as a case for two outcomes at once
safecount if covidHospCase==1 & covidDeathCase==1
safecount if covidDeathCase==1 & nonCOVIDDeathCase==1
safecount if covidHospCase==1 & nonCOVIDDeathCase==1

* Close log file
log close