-
Notifications
You must be signed in to change notification settings - Fork 0
/
Simulation study
561 lines (373 loc) · 18.1 KB
/
Simulation study
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
***********Simulations (explaining Stata code step by step)
* Create a folder in which the simulated datasets will be saved will be saved
global data = "C:\Users\micha\Desktop\Papers\New\Estimate change in a risk factor\Simulation\Final\Results"
* Outcome models used for methods 1-3 to estimate the Hazard Ratios
global outcome_model_HR1= "ib2.weight_change_cat c.t c.t#c.t c.t#c.t#c.t i.sm_status0 sex age diuretics0 fh_CVD BMI0"
global outcome_model_HR2= "ib2.weight_change_cat c.t c.t#c.t c.t#c.t#c.t sex age diuretics1 fh_CVD BMI0 sm_cess1"
global outcome_model_HR3= "ib2.weight_change_cat c.t c.t#c.t c.t#c.t#c.t i.sm_status0 sex age diuretics0 diuretics1 fh_CVD BMI0 sm_cess1"
* Outcome models used for methods 1-3 to estimate the incidence risk curves
global outcome_model_rc1= "weight_loss weight_loss#c.t weight_loss#c.t#c.t weight_loss#c.t#c.t#c.t weight_gain weight_gain#c.t weight_gain#c.t#c.t weight_gain#c.t#c.t#c.t c.t c.t#c.t c.t#c.t#c.t i.sm_status0 sex age diuretics0 fh_CVD BMI0"
global outcome_model_rc2= "weight_loss weight_loss#c.t weight_loss#c.t#c.t weight_loss#c.t#c.t#c.t weight_gain weight_gain#c.t weight_gain#c.t#c.t weight_gain#c.t#c.t#c.t c.t c.t#c.t c.t#c.t#c.t sex age diuretics1 fh_CVD BMI0 sm_cess1"
global outcome_model_rc3= "weight_loss weight_loss#c.t weight_loss#c.t#c.t weight_loss#c.t#c.t#c.t weight_gain weight_gain#c.t weight_gain#c.t#c.t weight_gain#c.t#c.t#c.t c.t c.t#c.t c.t#c.t#c.t i.sm_status0 sex age diuretics0 diuretics1 fh_CVD BMI0 sm_cess1"
* Before running the code, please make sure that you create two extra folders in your desktop
* 1) $data\Datasets
* 2) $data\Datasets2
* so that all the simulated datasets can be stored from the following script
clear all
global datasetnum = 1
capture program drop sim_weight_change
program sim_weight_change, rclass
clear
set obs 10000
gen id=_n
*simulate smoking status at baseline so that
* 32% non smokers
* 20% former smokers
* 48% current smokers
gen a=rbinomial(1,0.2)
gen b=rbinomial(1,0.6) if a==0
replace b=0 if a==1
gen sm_status0=a+2*b
count if sm_status0==0
return scalar sm_status0_0=r(N)
count if sm_status0==1
return scalar sm_status0_1=r(N)
count if sm_status0==2
return scalar sm_status0_2=r(N)
*simulate sex
gen sex=rbinomial(1,0.5)
count if sex==0
return scalar sex0_0=r(N)
*simulate age
gen age=rnormal(50,2.5)
sum age, d
return scalar age_m=r(mean)
return scalar age_sd=r(sd)
*simulate family history of CVD
gen fh_CVD=uniform()<+0.12+rnormal(0,0.05)
sum fh_CVD, d
return scalar fh_CVD_m=r(mean)
*simulate diuretics
gen diuretics0=uniform()<-1.4+0.03*age+0.1*sex+rnormal(0,0.04)
sum diuretics0, d
return scalar diuretics0_m=r(mean)
*simulate initial BMI at baseline
gen BMI0=25.8+0.06*(age-18)+0.1*sex+0.5*fh_CVD-0.1*sm_status0-0.2*diuretics0+rnormal(0,0.56)
sum BMI0, d
return scalar BMI0_m=r(mean)
return scalar BMI0_sd=r(sd)
*simulate smoking cessation between baseline and the 1st follow-up
gen sm_cess1=0 if b==0
replace sm_cess1=uniform()<-1.8+0.041*age-0.01*sex+rnormal(0,0.04)+0.2*diuretics0 if b==1
sum sm_cess1, d
return scalar sm_cess1_m=r(mean)
*generate smoking status 1st follow-up, given smoking status at baseline and smoking cessation
gen sm_status1=sm_status0
replace sm_status1=1 if sm_status0==2 & sm_cess1==1
*simulate diuretics at 1st follow-up
gen diuretics1=uniform()<-1.4+0.03*age+0.1*sex+0.1*diuretics0+rnormal(0,0.04)
sum diuretics1, d
return scalar diuretics1_m=r(mean)
*simulate CVD (outcome) at 1st follow-up
gen CVD1=uniform()<-1.9+0.035*age+0.03*sex+rnormal(0,0.03) +0.3*fh_CVD+0.1*diuretics0
sum CVD1, d
return scalar CVD1_m=r(mean)
*simulate BMI at 1st follow-up
gen BMI1=BMI0-0.9*CVD1+ 0.1* sm_status0 +2.5*sm_cess1-1.8*diuretics0-0.9*diuretics1+0.005*age+rnormal(0,1.4)
* simulate height
gen height=rnormal(1.63,0.03) if sex==0
replace height=rnormal(1.80,0.03) if sex==1
sum height if sex==0, d
return scalar height_f_m=r(mean)
return scalar height_f_sd=r(sd)
sum height if sex==1, d
return scalar height_m_m=r(mean)
return scalar height_m_sd=r(sd)
* generate weight
gen weight0=BMI0*(height)^2
gen weight1=BMI1*(height)^2
sum weight0 if sex==0, d
return scalar weight0_f_m=r(mean)
return scalar weight0_f_sd=r(sd)
sum weight0 if sex==1, d
return scalar weight0_m_m=r(mean)
return scalar weight0_m_sd=r(sd)
*generate BMI change and BMI change categories
gen weight_change=weight1-weight0
gen weight_change_cat=1 if weight_change/weight0 < - 0.05
replace weight_change_cat=2 if weight_change/weight0 >= - 0.05 & weight_change/weight0 <= 0.05
replace weight_change_cat=3 if weight_change/weight0 > 0.05
count if weight_change_cat==1
return scalar weight_change_cat_1=r(N)
count if weight_change_cat==2
return scalar weight_change_cat_2=r(N)
count if weight_change_cat==3
return scalar weight_change_cat_3=r(N)
*generate BMI loss, maintenance and gain from BMI change categories
gen weight_loss=0
replace weight_loss=1 if weight_change_cat==1
gen weight_maint=0
replace weight_maint=1 if weight_change_cat==2
gen weight_gain=0
replace weight_gain=1 if weight_change_cat==3
*save dataset with everybody
save "$data\Datasets\All (without stime)_$datasetnum.dta", replace
* keep only those with CVD cases that happened during the baseline period
keep if CVD1==1
save "$data\Datasets\CVD at baseline period_$datasetnum.dta", replace
*use dataset with everybody
use "$data\Datasets\All (without stime)_$datasetnum.dta", replace
* keep only those without CVD cases that happened during the baseline period
keep if CVD1==0
*simulate survival time assuming no effect of BMI loss vs maintenance (lambda=0.0000001 and gamma=1.6 from the Weibull distribution)
survsim stime , lambdas(0.0001) gammas(1.1) covariates(a 1 b 1.5 sex 0.2 age 0.03 diuretics0 0.6 diuretics1 0.2 fh_CVD 0.2 BMI0 0.02 sm_cess1 -0.4 weight_gain 0.3)
*the follow-up time of the modelling period is 18 years
generate CVD = stime <= 18
replace stime = 18 if CVD == 0
*save dataset without CVD during baseline period
save "$data\Datasets\No CVD at baseline period (with stime)_$datasetnum.dta", replace
*append data with individuals with CVD during baseline period
append using "$data\Datasets\CVD at baseline period_$datasetnum.dta"
*replace stime to 0, if it’s missing (for those individuals with CVD during the baseline period)
replace stime=0 if stime==.
replace CVD=1 if CVD1==1
*generate new survival time, equal to stime+2 (to add the baseline period)
gen surv_time=stime+2
*save final dataset
save "$data\Datasets\Final dataset_$datasetnum.dta", replace
**************************************************
* create a ready-to-analyse dataset in a long format, so that pooled logistic regression can be applied directly
use "$data\Datasets\Final dataset_$datasetnum.dta", replace
expand 9
sort id
by id: gen obs_n=_n
sort id obs_n
order id obs_n
* create the outcome variable CVD_pooled to be used in the pooled logistic regression
gen CVD_pooled=0
replace CVD_pooled=1 if CVD==1 & surv_time<=(obs_n+1)*2
*drop time point in which the outcome has already been observed in the previous time points
drop if CVD_pooled==1 & CVD_pooled[_n-1]==1 & id==id[_n-1]
* create variable for time, time^2 and time^3
gen t=2*(obs_n-1)
gen t2=t^2
gen t3=t^3
* save the dataset in a long format, so that pooled logistic regression can be applied
save "$data\Datasets\Final dataset (pooled)_$datasetnum.dta", replace
************
* Method 1 *
************
*** Calculation of the (log-)hazard ratios
use "$data\Datasets\Final dataset (pooled)_$datasetnum.dta", replace
logistic CVD_pooled $outcome_model_HR1, cluster(id)
return scalar m1_b_weight_loss=_b[1.weight_change_cat]
return scalar m1_se_weight_loss=_se[1.weight_change_cat]
return scalar m1_b_weight_gain=_b[3.weight_change_cat]
return scalar m1_se_weight_gain=_se[3.weight_change_cat]
*** Calculation of the risk curves
logistic CVD_pooled $outcome_model_rc1, cluster(id)
* Create a dataset with all time points for each treatment
* Re-expand data for all treatments
keep if t==0
expand 9 if t==0
bysort id: replace t=2*(_n-1)
* Create 3 copies of each subjects, one for each treatment
expand 3
sort id t
by id t: gen intervention=_n
replace weight_loss=0
replace weight_loss=1 if intervention==1
replace weight_gain=0
replace weight_gain=1 if intervention==3
sort id intervention t
* Generate the probability of developing the event of interest "p_event_k" and the probability of remaining disease free "p_survival_k", which is 1-"p_event_k"
predict p_event_k, pr
gen p_survival_k=1-p_event_k
* The probability of remaining disease-free at time k is the product of the probabilities of remaining disease-free up to time t
sort id intervention t
gen p_survival=p_survival_k if t==0
bysort id intervention: replace p_survival=p_survival_k*p_survival[_n-1] if t>0
* The probability of the developing the event at time k is one minus the probability of remaining disease-free at time k
gen p_risk=1-p_survival
by intervention, sort: summarize p_risk if t==16
sort intervention t id
by intervention t: egen p_mean_risk=mean(p_risk)
* We want to generate the graph of standardised risk curves over time, undr the 3 interventions
* We need to average the adjusted risk curves across all values
expand 3 if t==0, generate(newtime)
recode newtime 1=-1
sort id intervention t newtime
replace p_mean_risk=0 if newtime==-1 & t==0
gen time2=t+4
replace time2=0 if newtime==-1 & t==0
replace time2=2 if time2[_n-1]==0
replace p_mean_risk=p_mean_risk*100
separate p_mean_risk, by(intervention)
keep if _n<=3*11
keep p_mean_risk1 p_mean_risk2 p_mean_risk3 time2
forvalues i=0(2)20{
forvalues j=1(1)3{
sum p_mean_risk`j' if time2==`i'
return scalar m1_risk`j'_t`i'=r(mean)
}
}
save "$data\Datasets2\Results incidence curves (method 1)_$datasetnum.dta", replace
************
* Method 2 *
************
*** Calculation of the (log-)hazard ratios
use "$data\Datasets\Final dataset (pooled)_$datasetnum.dta", replace
drop if CVD1==1
logistic CVD_pooled $outcome_model_HR2, cluster(id)
return scalar m2_b_weight_loss=_b[1.weight_change_cat]
return scalar m2_se_weight_loss=_se[1.weight_change_cat]
return scalar m2_b_weight_gain=_b[3.weight_change_cat]
return scalar m2_se_weight_gain=_se[3.weight_change_cat]
*** Calculation of the risk curves
logistic CVD_pooled $outcome_model_rc2, cluster(id)
* Create a dataset with all time points for each treatment
* Re-expand data for all treatments
keep if t==0
expand 9 if t==0
bysort id: replace t=2*(_n-1)
* Create 3 copies of each subjects, one for each treatment
expand 3
sort id t
by id t: gen intervention=_n
replace weight_loss=0
replace weight_loss=1 if intervention==1
replace weight_gain=0
replace weight_gain=1 if intervention==3
sort id intervention t
* Generate the probability of developing the event of interest "p_event_k" and the probability of remaining disease free "p_survival_k", which is 1-"p_event_k"
predict p_event_k, pr
gen p_survival_k=1-p_event_k
* The probability of remaining disease-free at time k is the product of the probabilities of remaining disease-free up to time t
sort id intervention t
gen p_survival=p_survival_k if t==0
bysort id intervention: replace p_survival=p_survival_k*p_survival[_n-1] if t>0
* The probability of the developing the event at time k is one minus the probability of remaining disease-free at time k
gen p_risk=1-p_survival
by intervention, sort: summarize p_risk if t==16
sort intervention t id
by intervention t: egen p_mean_risk=mean(p_risk)
* We want to generate the graph of standardised risk curves over time, undr the 3 interventions
* We need to average the adjusted risk curves across all values
expand 2 if t==0, generate(newtime)
sort id intervention t newtime
replace p_mean_risk=0 if newtime==0 & t==0
gen time2=t+2
replace time2=0 if newtime==0 & t==0
replace p_mean_risk=p_mean_risk*100
separate p_mean_risk, by(intervention)
keep if _n<=3*10
keep p_mean_risk1 p_mean_risk2 p_mean_risk3 time2
forvalues i=0(2)18{
forvalues j=1(1)3{
sum p_mean_risk`j' if time2==`i'
return scalar m2_risk`j'_t`i'=r(mean)
}
}
save "$data\Datasets2\Results incidence curves (method 2)_$datasetnum.dta", replace
************
* Method 3 *
************
*** Calculation of the (log-)hazard ratios
use "$data\Datasets\Final dataset (pooled)_$datasetnum.dta", replace
drop if CVD1==1
logistic CVD_pooled $outcome_model_HR3, cluster(id)
return scalar m3_b_weight_loss=_b[1.weight_change_cat]
return scalar m3_se_weight_loss=_se[1.weight_change_cat]
return scalar m3_b_weight_gain=_b[3.weight_change_cat]
return scalar m3_se_weight_gain=_se[3.weight_change_cat]
*** Calculation of the risk curves
logistic CVD_pooled $outcome_model_rc3, cluster(id)
keep if t==0
* Create a dataset with all time points for each treatment
* Re-expand data for all treatments
expand 9 if t==0
bysort id: replace t=2*(_n-1)
* Create 3 copies of each subjects, one for each treatment
expand 3
sort id t
by id t: gen intervention=_n
replace weight_loss=0
replace weight_loss=1 if intervention==1
replace weight_gain=0
replace weight_gain=1 if intervention==3
sort id intervention t
* Generate the probability of developing the event of interest "p_event_k" and the probability of remaining disease free "p_survival_k", which is 1-"p_event_k"
predict p_event_k, pr
gen p_survival_k=1-p_event_k
* The probability of remaining disease-free at time k is the product of the probabilities of remaining disease-free up to time t
sort id intervention t
gen p_survival=p_survival_k if t==0
bysort id intervention: replace p_survival=p_survival_k*p_survival[_n-1] if t>0
* The probability of the developing the event at time k is one minus the probability of remaining disease-free at time k
gen p_risk=1-p_survival
by intervention, sort: summarize p_risk if t==16
sort intervention t id
by intervention t: egen p_mean_risk=mean(p_risk)
* We want to generate the graph of standardised risk curves over time, undr the 3 interventions
* We need to average the adjusted risk curves across all values
* Important note: We add two time points (time=0 & time=1), so that the risk curves begin with p_risk(time==0)=0 & p_risk(time==1)=0
expand 3 if t==0, generate(newtime)
recode newtime 1=-1
sort id intervention t newtime
replace p_mean_risk=0 if newtime==-1 & t==0
gen time2=t+4
replace time2=0 if newtime==-1 & t==0
replace time2=2 if time2[_n-1]==0
replace p_mean_risk=p_mean_risk*100
separate p_mean_risk, by(intervention)
keep if _n<=3*11
keep p_mean_risk1 p_mean_risk2 p_mean_risk3 time2
forvalues i=0(2)20{
forvalues j=1(1)3{
sum p_mean_risk`j' if time2==`i'
return scalar m3_risk`j'_t`i'=r(mean)
}
}
save "$data\Datasets2\Results incidence curves (method 3)_$datasetnum.dta", replace
global datasetnum = $datasetnum + 1
end
simulate sm_st0_0=r(sm_status0_0) sm_st0_1=r(sm_status0_1) sm_st0_2=r(sm_status0_2) ///
sex_c=r(sex0_0) age_mean=r(age_m) age_sd=r(age_sd) fh_CVD_mean=r(fh_CVD_m) ///
diuretics0_mean=r(diuretics0_m) BMI0_mean=r(BMI0_m) BMI0_sd=r(BMI0_sd) sm_cess1_mean=r(sm_cess1_m) ///
diuretics1_mean=r(diuretics1_m) CVD1_mean=r(CVD1_m) height_f_mean=r(height_f_m) height_f_sd=r(height_f_sd) ///
height_m_mean=r(height_m_m) height_m_sd=r(height_m_sd) weight0_f_mean=r(weight0_f_m) weight0_f_sd=r(weight0_f_sd) weight0_m_mean=r(weight0_m_m) weight0_m_sd=r(weight0_m_sd) ///
wc_cat_1_c=r(weight_change_cat_1) wc_cat_2_c=r(weight_change_cat_2) wc_cat_3_c=r(weight_change_cat_3) ///
m1_logHR_wl=r(m1_b_weight_loss) se_m1_logHR_wl=r(m1_se_weight_loss) m2_logHR_wl=r(m2_b_weight_loss) se_m2_logHR_wl=r(m2_se_weight_loss) m3_logHR_wl=r(m3_b_weight_loss) se_m3_logHR_wl=r(m3_se_weight_loss) ///
m1_logHR_wg=r(m1_b_weight_gain) se_m1_logHR_wg=r(m1_se_weight_gain) m2_logHR_wg=r(m2_b_weight_gain) se_m2_logHR_wg=r(m2_se_weight_gain) m3_logHR_wg=r(m3_b_weight_gain) se_m3_logHR_wg=r(m3_se_weight_gain) ///
m1_risk_wl=r(m1_risk1_t20) m1_risk_wm=r(m1_risk2_t20) m1_risk_wg=r(m1_risk3_t20) ///
m2_risk_wl=r(m2_risk1_t18) m2_risk_wm=r(m2_risk2_t18) m2_risk_wg=r(m2_risk3_t18) ///
m3_risk_wl=r(m3_risk1_t20) m3_risk_wm=r(m3_risk2_t20) m3_risk_wg=r(m3_risk3_t20) ///
, dots(1) reps(1000) seed(11051982): sim_weight_change
* save the results
save "$data/Final Results (simulation study).dta", replace
generate rmse_wl_1 = (m1_logHR_wl-0)^2
generate bias_wl_1 = (m1_logHR_wl-0)
generate coverage_wl_1 = (m1_logHR_wl + invnorm(0.975)*se_m1_logHR_wl>0 & m1_logHR_wl - invnorm(0.975)*se_m1_logHR_wl<0)
generate rmse_wl_2 = (m2_logHR_wl-0)^2
generate bias_wl_2 = (m2_logHR_wl-0)
generate coverage_wl_2 = (m2_logHR_wl + invnorm(0.975)*se_m2_logHR_wl>0 & m2_logHR_wl - invnorm(0.975)*se_m2_logHR_wl<0)
generate rmse_wl_3 = (m3_logHR_wl-0)^2
generate bias_wl_3 = (m3_logHR_wl-0)
generate coverage_wl_3 = (m3_logHR_wl + invnorm(0.975)*se_m3_logHR_wl>0 & m3_logHR_wl - invnorm(0.975)*se_m3_logHR_wl<0)
********************************************
generate rmse_wg_1 = (m1_logHR_wg-0.3)^2
generate bias_wg_1 = (m1_logHR_wg-0.3)
generate coverage_wg_1 = (m1_logHR_wg + invnorm(0.975)*se_m1_logHR_wg>0.3 & m1_logHR_wg - invnorm(0.975)*se_m1_logHR_wg<0.3)
generate rmse_wg_2 = (m2_logHR_wg-0.3)^2
generate bias_wg_2 = (m2_logHR_wg-0.3)
generate coverage_wg_2 = (m2_logHR_wg + invnorm(0.975)*se_m2_logHR_wg>0.3 & m2_logHR_wg - invnorm(0.975)*se_m2_logHR_wg<0.3)
generate rmse_wg_3 = (m3_logHR_wg-0.3)^2
generate bias_wg_3 = (m3_logHR_wg-0.3)
generate coverage_wg_3 = (m3_logHR_wg + invnorm(0.975)*se_m3_logHR_wg>0.3 & m3_logHR_wg - invnorm(0.975)*se_m3_logHR_wg<0.3)
* check results from the simulation
summarize m1_logHR_wl bias_wl_1 coverage_wl_1 m2_logHR_wl bias_wl_2 coverage_wl_2 m3_logHR_wl bias_wl_3 coverage_wl_3
summarize m1_logHR_wg bias_wg_1 coverage_wg_1 m2_logHR_wg bias_wg_2 coverage_wg_2 m3_logHR_wg bias_wg_3 coverage_wg_3
save "$data/Final Results.dta", replace
*********************************************************************************************************************************************
*********************************************************************************************************************************************
*********************************************************************************************************************************************