* 010_cleaning.do
* Generated from opensafely/research-template
* Load the raw cohort extract and convert every date_* column from text to a Stata daily date.
clear
* Run the project file-path setup script; the global outdir used below is presumably set there.
* Paths are quoted so that directories containing spaces still work.
do "`c(pwd)'/analysis/000_filepaths.do"
import delimited "$outdir/input_part1.csv"
* Study end date, created as text so that the loop below converts it with the other dates
gen date_studyend="2021-09-30"
* Append a day component of 15 to date_birth (presumably supplied as year-month only)
replace date_birth=date_birth+"-15"
* Bring the HbA1c date columns under the date_ prefix so the loop picks them up
rename hba1c_date date_hba1c
rename hba1c_percentage_date date_hba1c_percent
describe date_*, varlist
local datevars `r(varlist)'
foreach myvar of local datevars {
    * A column with no values at all is imported as numeric and would make date() fail
    * with a type mismatch, so turn it into text first (missing becomes ".", which parses to missing)
    capture confirm string variable `myvar'
    if _rc!=0 {
        tostring `myvar', replace
    }
    gen temp=date(`myvar',"YMD")
    drop `myvar'
    rename temp `myvar'
    format `myvar' %td
}
* Move identifiers and key dates to the front of the dataset and sort by patient
order patient_id practice_id date_discharged_covid date_discharged_pneum ///
    date_patient_index has_follow_up date_diabetes_diagnosis date_birth ///
    sex region date_deregistered date_death
sort patient_id
* Index date is the earlier of the two discharge dates (min ignores a missing argument)
replace date_patient_index=min(date_discharged_pneum, date_discharged_covid)
* Diabetes diagnosis date is the earliest of the three GP-recorded diabetes dates
replace date_diabetes_diagnosis=min(date_unknown_diabetes_gp_first, date_t2dm_gp_first, date_t1dm_gp_first)
* Keep patients flagged with follow-up whose earliest deregistration or death date
* falls after the index date; when both dates are missing the comparison is true and the row is kept
keep if has_follow_up==1 & min(date_deregistered, date_death)>date_patient_index
drop has_follow_up
**// Exposure group
* Two yes/no helpers drive the four-level group:
*   grp_covid_first - COVID-19 discharge is on or before the pneumonia discharge
*                     (a missing pneumonia date counts as later, a missing COVID-19 date as not first)
*   grp_diabetic    - GP diabetes diagnosis, or earliest hospital T1DM/T2DM date, on or before index
gen byte grp_covid_first=(date_discharged_covid<=date_discharged_pneum)
gen byte grp_diabetic=(date_diabetes_diagnosis<=date_patient_index) | (min(date_t1dm_hospital_first, date_t2dm_hospital_first)<=date_patient_index)
* 1/2 = COVID-19 with/without diabetes, 3/4 = pneumonia with/without diabetes
gen group=cond(grp_covid_first, 1, 3)+cond(grp_diabetic, 0, 1)
drop grp_covid_first grp_diabetic
label define grouplab 1 "COVID-19 with diabetes" 2 "COVID-19 without diabetes" 3 "Pneumonia with diabetes" 4 "Pneumonia without diabetes"
label values group grouplab
**// Censoring
* Earliest record of COVID-19 (test or hospitalisation)
gen date_first_covid=min(date_covid_hospital, date_covid_test)
* COVID-19 groups (1, 2) are censored at pneumonia admission;
* pneumonia groups (3, 4) are censored at the first COVID-19 record
gen date_censor=cond(inlist(group, 1, 2), date_admitted_pneum, cond(inlist(group, 3, 4), date_first_covid, .))
* Deregistration, death or study end take over when earlier (or when no censor date exists yet)
replace date_censor=min(date_censor, date_deregistered, date_death, date_studyend)
drop date_covid_test date_admitted_pneum date_first_covid date_deregistered
**// Sex
* Code F as 1 and M as 2; rows with any other value are removed from the cohort
gen cat_sex=cond(sex=="F", 1, cond(sex=="M", 2, .))
keep if cat_sex<.
label define cat_sexlab 1 "Female" 2 "Male"
label values cat_sex cat_sexlab
drop sex
**// Age group
* Age in years at the index date, banded into five groups.
* Stata treats missing as larger than any number, so every lower-bound test also
* excludes missing explicitly; otherwise a row with no usable birth date would be
* coded as 80+ instead of being dropped with the under-18s below.
gen age=(date_patient_index-date_birth)/365.25
gen cat_age=.
replace cat_age=1 if age>=18 & age<.
replace cat_age=2 if age>=50 & age<.
replace cat_age=3 if age>=60 & age<.
replace cat_age=4 if age>=70 & age<.
replace cat_age=5 if age>=80 & age<.
* Remove patients younger than 18 or with unknown age
drop if cat_age==.
label define cat_agelab 1 "18-49" 2 "50-59" 3 "60-69" 4 "70-79" 5 "80+"
label values cat_age cat_agelab
drop age
**// Ethnicity
* Start from the GP-recorded ethnicity and fill gaps from the SUS-recorded one.
* Both commands are captured; only the return code of the second decides whether
* the recode, labelling and clean-up below are run.
capture gen cat_ethnic=ethnicity_gp
capture replace cat_ethnic=ethnicity_sus if cat_ethnic==.
if !_rc {
    * Anything still missing becomes the explicit Unknown category
    replace cat_ethnic=6 if cat_ethnic==.
    label define cat_ethniclab 1 "White" 2 "Mixed" 3 "Asian/Asian British" 4 "Black" 5 "Other" 6 "Unknown"
    label values cat_ethnic cat_ethniclab
    drop ethnicity_gp ethnicity_sus
}
**// IMD
* Quintiles of imd, reversed so that the highest imd values are group 1 (least deprived)
* and the lowest are group 5 (most deprived). The value -1 is treated as a missing code.
capture describe imd
if _rc==0 {
    * Leave the -1 missing code out of the cut so it does not shift the quintile boundaries
    egen cat_imd=cut(imd) if imd!=-1, group(5) icodes
    * icodes run 0 to 4 from lowest to highest imd; map them to 5 down to 1
    replace cat_imd=5-cat_imd
    * Unknown uses the extended missing value .u, which is the value the label is attached to.
    * Assigned last because arithmetic on .u would turn it back into plain missing.
    replace cat_imd=.u if missing(cat_imd)
    label define cat_imdlab 1 "1 (least deprived)" 2 "2" 3 "3" 4 "4" 5 "5 (most deprived)" .u "Unknown"
    label values cat_imd cat_imdlab
    drop imd
}
**// Type of diabetes
* temp1-temp3: a type 1, type 2 or unspecified diabetes date exists on or before the index date
* temp4: insulin in the last year without another antidiabetic in the last year
gen temp1=(min(date_t1dm_hospital_first, date_t1dm_gp_first)<=date_patient_index)
gen temp2=(min(date_t2dm_hospital_first, date_t2dm_gp_first)<=date_patient_index)
gen temp3=(date_unknown_diabetes_gp_first<=date_patient_index)
gen temp4=(insulin_lastyear==1 & antidiabetic_lastyear!=1)
* Type 1 when: only type 1 codes; or both types coded and insulin-only treatment;
* or only unspecified codes and insulin-only treatment
gen temp5=(temp1 & (!temp2 | temp4)) | (!temp1 & !temp2 & temp3 & temp4)
* No diabetes code of any kind gives 3 (None); everything else that is not type 1 is type 2
gen cat_diabetes=cond(!(temp1 | temp2 | temp3), 3, cond(temp5, 1, 2))
label define cat_diablab 1 "1" 2 "2" 3 "None"
label values cat_diabetes cat_diablab
drop temp* date_diabetes* date_t1dm* date_t2dm* date_unknown_diabetes* antidiabetic_lastyear insulin_lastyear
**// History of CVD
* Highest of the three source flags, shifted by one so that 1 = No and 2 = Yes.
* The whole section is skipped if any source variable is absent.
capture gen cat_hist_cvd=1+max(hist_cvd_opcs2, hist_cvd_hospital, hist_cvd_gp)
if !_rc {
    * All three sources missing gives the Unknown category
    replace cat_hist_cvd=3 if cat_hist_cvd==.
    label define cat_hist_cvdlab 1 "No" 2 "Yes" 3 "Unknown"
    label values cat_hist_cvd cat_hist_cvdlab
    drop hist_cvd_gp hist_cvd_hospital hist_cvd_opcs2
}
**// History of renal disease
* Builds cat_hist_renal (1 No, 2 Yes, 3 Unknown) from an eGFR-based flag plus the
* ckd_gp, ckd_hospital and hist_rrt flags.
gen gfr_flag=.
* The eGFR part only runs when a creatinine variable is present
capture describe creatinine
if _rc==0 {
* Inputs to the eGFR formula: age in years at index, female indicator, Black ethnicity indicator.
* This relies on cat_sex and cat_ethnic already existing in the dataset.
gen temp_age=(date_patient_index-date_birth)/365.25
gen temp_female=(cat_sex==1)
gen temp_black=(cat_ethnic==4)
* NOTE(review): the coefficients look like the 4-variable MDRD equation, with creatinine/88.4
* presumably converting umol/L to mg/dL - confirm units against the study definition
gen gfr=175*((creatinine/88.4)^-1.154)*(temp_age^-0.203)*(1-(1-0.742)*temp_female)*(1+0.212*temp_black)
* Flag eGFR below 60 for positive, non-missing creatinine; every other row gets 0
replace gfr_flag=(gfr<60 & creatinine>0 & creatinine!=.)
drop creatinine temp_age temp_female temp_black gfr
}
* Without creatinine the flag is still missing here; treat that as not flagged
recode gfr_flag .=0
* Highest of the four flags, shifted by one so that 1 = No and 2 = Yes.
* Skipped entirely if any of ckd_gp, ckd_hospital or hist_rrt is absent.
capture gen cat_hist_renal=max(gfr_flag, ckd_gp, ckd_hospital, hist_rrt)+1
if _rc==0 {
* NOTE(review): gfr_flag is never missing at this point and max ignores missing arguments,
* so the Unknown category cannot actually be reached
recode cat_hist_renal .=3
label define cat_hist_renallab 1 "No" 2 "Yes" 3 "Unknown"
label values cat_hist_renal cat_hist_renallab
drop gfr_flag ckd_gp ckd_hospital hist_rrt
}
**// Required critical care (during hospitalisation)
* 1 = No, 2 = Yes (at least one recorded critical care day).
* Labelling and clean-up sit inside the guard, as in the other optional sections, so the
* script no longer aborts on label values or drop when critical_care_days is absent.
capture describe critical_care_days
if _rc==0 {
    gen cat_critical=1
    * Missing compares as larger than any number, so exclude it explicitly
    replace cat_critical=2 if critical_care_days>0 & critical_care_days!=.
    label define cat_criticallab 1 "No" 2 "Yes"
    label values cat_critical cat_criticallab
    drop critical_care_days
}
**// COVID-19 vaccination status (at baseline)
* 1 = None, 2 = One dose, 3 = Two doses, judged by whether each dose date precedes
* the COVID-19 hospitalisation date. Each step is captured, so an absent dose variable is skipped.
* NOTE(review): when date_covid_hospital is missing, any non-missing dose date compares as
* earlier and counts as vaccinated - confirm this is the intended baseline for every group
gen cat_vaccin=1
forvalues dose=1/2 {
    local code=`dose'+1
    capture replace cat_vaccin=`code' if date_vaccin_gp_`dose'<date_covid_hospital
}
label define cat_vaccinlab 1 "None" 2 "One dose" 3 "Two doses"
label values cat_vaccin cat_vaccinlab
drop date_vaccin_* date_covid_hospital
**// Smoking status
* Only runs when both latest_smoking and ever_smoked are present
capture describe latest_smoking ever_smoked
if !_rc {
    * Default is Unknown; the three rules below are mutually exclusive
    gen cat_smoking=4
    * Never: latest record N and no evidence of ever smoking
    replace cat_smoking=1 if latest_smoking=="N" & (ever_smoked==0 | ever_smoked==.)
    * Ex: latest record E, or latest record N despite an ever-smoked flag
    replace cat_smoking=2 if latest_smoking=="E" | (latest_smoking=="N" & ever_smoked==1)
    * Current: latest record S
    replace cat_smoking=3 if latest_smoking=="S"
    label define cat_smoklab 1 "Never" 2 "Ex" 3 "Current" 4 "Unknown"
    label values cat_smoking cat_smoklab
    drop latest_smoking ever_smoked
}
**// Hazardous alcohol consumption
* Shift the haz_alcohol flag by one so that 1 = No and 2 = Yes; skipped if the flag is absent
capture gen cat_alcohol=1+haz_alcohol
if !_rc {
    * A missing flag becomes the Unknown category
    replace cat_alcohol=3 if cat_alcohol==.
    label define cat_alcohollab 1 "No" 2 "Yes" 3 "Unknown"
    label values cat_alcohol cat_alcohollab
    drop haz_alcohol
}
**// BMI
* Only runs when a bmi variable is present
capture describe bmi
if !_rc {
    * Rows with missing or negative BMI are removed from the cohort
    keep if bmi!=. & bmi>=0
    * NOTE(review): because missing BMI is dropped above, the Unknown category is effectively
    * unreachable, and a BMI of zero is coded as Underweight - confirm both are intended
    gen cat_bmi=cond(bmi<18.5, 1, cond(bmi<25, 2, cond(bmi<30, 3, cond(bmi!=., 4, 5))))
    label define cat_bmilab 1 "Underweight" 2 "Healthy" 3 "Overweight" 4 "Obese" 5 "Unknown"
    label values cat_bmi cat_bmilab
    drop bmi*
}
**// HbA1c
* cat_hba1c: 1 Normal, 2 Prediabetes, 3 Diabetes, 4 Unknown.
* Built from hba1c (cut points 42 and 48, presumably mmol/mol) and, where appropriate,
* from hba1c_percent (cut points 6 and 6.5).
* Every band now has an explicit lower and upper bound, so non-positive readings stay
* Unknown instead of falling through the open-ended test into Prediabetes.
capture describe hba1c
if _rc==0 {
    gen cat_hba1c=4
    replace cat_hba1c=1 if hba1c>0 & hba1c<42
    replace cat_hba1c=2 if hba1c>=42 & hba1c<48
    * Missing compares as larger than any number, so cap the top band below missing
    replace cat_hba1c=3 if hba1c>=48 & hba1c<.
}
capture describe hba1c_percent
if _rc==0 {
    gen cat_hba1c_percent=4
    replace cat_hba1c_percent=1 if hba1c_percent>0 & hba1c_percent<6
    replace cat_hba1c_percent=2 if hba1c_percent>=6 & hba1c_percent<6.5
    replace cat_hba1c_percent=3 if hba1c_percent>=6.5 & hba1c_percent<.
}
* Prefer a known percentage-based category when the other reading is Unknown, or when the
* percentage reading has a date that is later than the date of the other reading.
* The first clause covers rows where the other reading or its date is missing, which a
* plain date comparison would never select.
capture replace cat_hba1c=cat_hba1c_percent if cat_hba1c_percent<4 & (cat_hba1c==4 | (date_hba1c_percent>date_hba1c & date_hba1c_percent!=.))
label define cat_hba1clab 1 "Normal" 2 "Prediabetes" 3 "Diabetes" 4 "Unknown"
capture label values cat_hba1c cat_hba1clab
* Drop each group separately so one absent name does not prevent the others being removed
capture drop hba1c*
capture drop cat_hba1c_percent
capture drop date_hba1c*
**// OUTCOMES
**///////////////
**// Cardiovascular/Cerebrovascular
* Each outcome date is the earliest of its GP, hospital and ONS dates;
* the source columns are dropped once combined.
**// Stroke - Thrombotic/Ischaemic, Haemorrhagic, TIA, In pregnancy or puerperium
* Each pair is: outcome suffix, then the stem used by the source columns
foreach pair in "thrombotic thrombotic" "haemorrhagic haemorr" "tia tia" "pregnancy pregnancy" {
    local out : word 1 of `pair'
    local src : word 2 of `pair'
    gen date_stroke_`out'=min(date_stroke_`src'_gp, date_stroke_`src'_hospital, date_stroke_`src'_ons)
    drop date_stroke_`src'_*
}
**// Stroke - Any
gen date_stroke_any=min(date_stroke_thrombotic, date_stroke_haemorrhagic, date_stroke_tia, date_stroke_pregnancy)
**// Myocardial Infarction (MI)
gen date_mi=min(date_mi_gp, date_mi_hospital, date_mi_ons)
drop date_mi_*
**// Deep Vein Thrombosis (DVT) - Non-pregnancy-related, In pregnancy or puerperium,
**// Cerebral venous thrombosis in pregnancy
* Source columns are dropped by exact name: a wildcard on date_dvt_pregnancy_ would also
* match the date_dvt_pregnancy_cvt columns that are still needed
foreach sub in nopregnancy pregnancy pregnancy_cvt {
    gen date_dvt_`sub'=min(date_dvt_`sub'_gp, date_dvt_`sub'_hospital, date_dvt_`sub'_ons)
    drop date_dvt_`sub'_gp date_dvt_`sub'_hospital date_dvt_`sub'_ons
}
**// Deep Vein Thrombosis (DVT) - Any
gen date_dvt_any=min(date_dvt_nopregnancy, date_dvt_pregnancy, date_dvt_pregnancy_cvt)
**// Pulmonary Embolism (PE) - Non-pregnancy-related, In pregnancy or puerperium
foreach sub in nopregnancy pregnancy {
    gen date_pe_`sub'=min(date_pe_`sub'_gp, date_pe_`sub'_hospital, date_pe_`sub'_ons)
    drop date_pe_`sub'_gp date_pe_`sub'_hospital date_pe_`sub'_ons
}
**// Pulmonary Embolism (PE) - Any
gen date_pe_any=min(date_pe_nopregnancy, date_pe_pregnancy)
**// Heart Failure
gen date_hf=min(date_hf_gp, date_hf_hospital, date_hf_ons)
drop date_hf_*
**// Any Cardiovascular/Cerebrovascular (non-pregnancy-related)
* NOTE(review): the inputs are the Any variables, which include the pregnancy-related
* components - confirm against the non-pregnancy-related wording of this heading
gen date_any_cvd=min(date_stroke_any, date_mi, date_dvt_any, date_pe_any, date_hf)
**// Renal
**// Acute Kidney Injury (AKI) - Non-pregnancy-related, In pregnancy or puerperium
* Earliest of the GP, hospital and ONS dates; source columns are dropped by exact name
foreach sub in nopregnancy pregnancy {
    gen date_aki_`sub'=min(date_aki_`sub'_gp, date_aki_`sub'_hospital, date_aki_`sub'_ons)
    drop date_aki_`sub'_gp date_aki_`sub'_hospital date_aki_`sub'_ons
}
**// Acute Kidney Injury (AKI) - Any
gen date_aki_any=min(date_aki_nopregnancy, date_aki_pregnancy)
**// Hepatic: Liver disease/failure
**// Mental Illness: Anxiety, Depression, Psychosis
* Same earliest-of-three rule; all columns sharing the outcome prefix are dropped afterwards
foreach outcome in liver anxiety depression psychosis {
    gen date_`outcome'=min(date_`outcome'_gp, date_`outcome'_hospital, date_`outcome'_ons)
    drop date_`outcome'_*
}
**// Psychotropic medication (written into the cohort derivation)
**// Antidepressants
**// Anxiolytics
**// Antipsychotics
**// Mood stabilisers
* These dates already exist in the extract; only their position in the dataset changes
order date_antidepressant date_anxiolytic date_antipsychotic date_mood_stabiliser, after(date_psychosis)
**// Symptoms of post-COVID syndrome outcome
**// Insomnia, Hypersomnia, Sleep apnoea, Fatigue
* Each outcome date is the earliest of its GP, hospital and ONS dates;
* all columns sharing the outcome prefix are dropped afterwards
foreach outcome in sleep_insomnia sleep_hypersomnia sleep_apnoea fatigue {
    gen date_`outcome'=min(date_`outcome'_gp, date_`outcome'_hospital, date_`outcome'_ons)
    drop date_`outcome'_*
}
* Newly generated dates are plain numbers until given a daily date format
format date_* %td
* Path is quoted so that an output directory containing spaces still works
save "$outdir/input_part1_clean.dta", replace