-
Notifications
You must be signed in to change notification settings - Fork 1
/
lesson20.sas
371 lines (326 loc) · 9.97 KB
/
lesson20.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
/* Point the reserved libref LIBRARY at the project folder and build the
   value formats used to label the coded variables of the helpmkh dataset.
   PROC FORMAT writes them to the FORMATS catalog of that library. */
libname library 'C:\MyGithub\N736Fall2017_HELPdataset\' ;

proc format library = library ;

  /* ---- two-level indicator variables ---- */
  value TREAT
    0 = 'usual care'
    1 = 'HELP clinic' ;
  value FEMALE
    0 = 'Male'
    1 = 'Female' ;
  value HOMELESS
    0 = 'no'
    1 = 'yes' ;
  value G1B
    0 = 'no'
    1 = 'yes' ;

  /* ---- items F1A through F1T: all share the same four-level
          "how many days" response labels ---- */
  value F1A
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1B
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1C
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1D
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1E
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1F
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1G
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1H
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1I
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1J
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1K
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1L
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1M
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1N
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1O
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1P
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1Q
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1R
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1S
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;
  value F1T
    0 = 'Not at all or less than 1 day'
    1 = '1-2 days'
    2 = '3-4 days'
    3 = '5-7 days or nearly every day for 2 weeks' ;

  /* ---- two-level status variables ---- */
  value SATREAT
    0 = 'no'
    1 = 'yes' ;
  value DRINKSTATUS
    0 = 'no'
    1 = 'yes' ;
  value ANYSUBSTATUS
    0 = 'no'
    1 = 'yes' ;
  value LINKSTATUS
    0 = 'no'
    1 = 'yes' ;
run;
/* Modify LIBRARY.HELPMKH in place: declare its encoding and attach each
   value format to the variable of the same name. All variable/format
   pairs are listed in a single FORMAT statement. */
proc datasets library = library;
  modify helpmkh / correctencoding="WLATIN1";
    format
      treat        TREAT.
      female       FEMALE.
      homeless     HOMELESS.
      g1b          G1B.
      f1a          F1A.
      f1b          F1B.
      f1c          F1C.
      f1d          F1D.
      f1e          F1E.
      f1f          F1F.
      f1g          F1G.
      f1h          F1H.
      f1i          F1I.
      f1j          F1J.
      f1k          F1K.
      f1l          F1L.
      f1m          F1M.
      f1n          F1N.
      f1o          F1O.
      f1p          F1P.
      f1q          F1Q.
      f1r          F1R.
      f1s          F1S.
      f1t          F1T.
      satreat      SATREAT.
      drinkstatus  DRINKSTATUS.
      anysubstatus ANYSUBSTATUS.
      linkstatus   LINKSTATUS.
    ;
quit;
/* Make a working copy of LIBRARY.HELPMKH under the one-level name helpmkh;
   every later step reads this copy rather than the permanent dataset. */
data helpmkh;
  set library.helpmkh;
run;
/* ============================================
   LESSON 20 - Poisson and
   Negative Binomial Regression
   for count data

   Melinda Higgins, PhD
   dated 11/5/2017
   ============================================
   For this lesson we will use the helpmkh dataset.

   Focus on d1:
   how many times hospitalized for
   medical problems (lifetime)
   ============================================ */
/* ============================================
   Look at the distribution of d1.
   Pay attention to the mean and the variance:
   a Poisson-distributed variable has a mean
   that is equal to its variance (not its standard deviation).
   If the variance is larger than the mean,
   then the variable shows overdispersion.
   Often in the presence of overdispersion you should
   also try fitting a negative binomial model,
   and there are also zero-inflated versions of both
   of these distributions.

   This is written as a block comment on purpose: a statement
   comment that starts with an asterisk and ends at a semicolon
   must not contain an unmatched apostrophe, because SAS then
   reads it as the start of a quoted string and swallows the
   steps that follow.
   ============================================ */
proc univariate data=helpmkh plots;
  var d1;
  /* histogram of d1 with a fitted normal curve and a kernel density overlay */
  histogram d1 / normal kernel;
run;
/* Poisson generalized linear model: number of times hospitalized for
   medical problems (d1) modelled on pcs. Type 3 tests are requested and
   the fitted means are written to poisson_pred as pred1. */
proc genmod data=helpmkh;
  model d1 = pcs / dist=poisson type3;
  output out=poisson_pred predicted=pred1;
run;
/* Order the scored rows by predicted value, then draw the fitted Poisson
   mean against pcs. SERIES joins points in dataset order, which is why
   the rows are sorted first. */
proc sort data=poisson_pred;
  by pred1;
run;

proc sgplot data=poisson_pred;
  series y=pred1 x=pcs;
run;
/* Observed d1 against pcs with a loess smoother; y-axis ticks run from
   0 to 14 in steps of 2. */
proc sgplot data=helpmkh;
  scatter y=d1 x=pcs;
  loess   y=d1 x=pcs;
  yaxis values=(0 to 14 by 2);
run;
/* Negative binomial generalized linear model: number of times hospitalized
   for medical problems (d1) modelled on pcs. Type 3 tests are requested
   and the fitted means are written to nb_pred as pred1. */
proc genmod data=helpmkh;
  model d1 = pcs / dist=nb type3;
  output out=nb_pred predicted=pred1;
run;
/* Order the scored rows by predicted value, then draw the fitted negative
   binomial mean against pcs. SERIES joins points in dataset order, which
   is why the rows are sorted first. */
proc sort data=nb_pred;
  by pred1;
run;

proc sgplot data=nb_pred;
  series y=pred1 x=pcs;
run;
/* Observed d1 against pcs with a loess smoother (same reference plot as
   shown after the Poisson fit); y-axis ticks run from 0 to 14 by 2. */
proc sgplot data=helpmkh;
  scatter y=d1 x=pcs;
  loess   y=d1 x=pcs;
  yaxis values=(0 to 14 by 2);
run;
* ===================================================
see http://sasnrd.com/fit-discrete-distribution/
===================================================
Code adapted from the SAS code provided here
* ===================================================;
/*****************************************************************************************************************
SAS file name: Discrete_Dist_Fit.sas
File location:
_________________________________________________________________________________________________________________
Purpose: Fit discrete distributions to univariate data using PROC GENMOD
Author: Peter Clemmensen
Creation Date: 12/07/2017
This program supports the blog post "Fit Discrete Distributions to Univariate Data" on SASnrd.com
*****************************************************************************************************************/
/* Tabulate d1: CountMinMax gets one row per distinct d1 value, with the
   frequency in COUNT and the percentage in PERCENT. */
proc freq data=helpmkh noprint;
  tables d1 / out=CountMinMax;
run;

/* Store the smallest and largest observed d1 VALUE in macro variables
   minCount and maxCount.
   The rows come out of PROC FREQ in ascending d1 order, so the first and
   last rows carry the minimum and maximum. The variable read here must be
   d1: COUNT is the frequency of a level, not the level itself, so using
   it would report how often the extreme values occur instead of what
   they are.
   The WHERE= filter skips a missing-d1 row in case PROC FREQ wrote one. */
data _null_;
  set CountMinMax(where=(not missing(d1))) end=eof;
  if _N_ = 1 then call symputx('minCount', d1);
  if eof     then call symputx('maxCount', d1);
run;
%put min=&minCount max=&maxCount;
/* Bar chart of how often each d1 value occurs; both axis labels are
   suppressed. The title is cleared again after the plot. */
title 'Frequency Plot of Count Dataset';
proc sgplot data=helpmkh;
  vbar d1;
  yaxis display=(nolabel);
  xaxis display=(nolabel);
run;
title;
/* Intercept-only Poisson fit: with no covariates on the MODEL statement
   only the mean is estimated, and it is written to PoissonFit as lambda. */
proc genmod data=helpmkh;
  model d1 = / dist=Poisson;
  output out=PoissonFit p=lambda;
run;

/* The first row of PoissonFit is enough to pick up the estimate;
   copy it into macro variable lambda. */
data _null_;
  set PoissonFit(obs=1);
  call symputx('lambda', lambda);
run;
/* Theoretical Poisson probabilities for d1 = 0, 1, ..., 15, evaluated at
   the fitted mean held in macro variable lambda. One row per d1 value;
   the probability is stored in po. */
data TheoreticalPoisson;
  do d1 = 0 to 15;
    po = pdf('Poisson', d1, &lambda);
    output;
  end;
run;
/* Negative binomial example.
   Intercept-only fit: with no covariates on the MODEL statement only the
   mean (plus the dispersion) is estimated. The parameter estimates table
   is captured through ODS into NegBinParameters. */
proc genmod data=helpmkh;
  model d1 = / dist=NegBin;
  ods output parameterestimates=NegBinParameters;
run;

/* Reshape the estimates from one row per parameter to one column per
   parameter (columns are named after the Parameter values). The result
   replaces NegBinParameters. */
proc transpose data=NegBinParameters out=NegBinParameters;
  id parameter;
  var estimate;
run;
/* Derive the negative binomial parameters k and p from the fitted
   intercept and dispersion, add them to NegBinParameters, and publish
   them as macro variables k and p - all in one pass over the data. */
data NegBinParameters;
  set NegBinParameters;
  k = 1/dispersion;
  p = 1/(1+exp(intercept)*dispersion);
  call symputx('k', k);
  call symputx('p', p);
run;
/* Theoretical negative binomial probabilities for d1 = 0, 1, ..., 15,
   using the fitted p and k held in macro variables. One row per d1 value;
   the probability is stored in NegBin. */
data TheoreticalNegBin;
  do d1 = 0 to 15;
    NegBin = pdf('NegBinomial', d1, &p, &k);
    output;
  end;
run;
/* Combine the two theoretical distributions with the observed frequency
   table, matching rows on d1. The observed PERCENT is rescaled to a
   proportion (freq) so it is comparable with the probabilities. */
data PlotData(keep=d1 freq po negbin);
  merge TheoreticalPoisson
        TheoreticalNegBin
        CountMinMax;
  by d1;
  freq = PERCENT/100;
run;
/* Overlay the fitted Poisson and negative binomial probabilities on the
   observed proportions: bars for the data, a red line with filled circles
   for Poisson, a green line with filled squares for negative binomial.
   The legend is placed inside the plot area at the top right. */
title 'Count data overlaid with fitted distributions';
proc sgplot data=PlotData noautolegend;
  vbarparm category=d1 response=freq / legendlabel='Count Data';
  series x=d1 y=po /
    markers
    markerattrs=(symbol=circlefilled color=red)
    lineattrs=(color=red)
    legendlabel='Fitted Poisson PMF';
  series x=d1 y=NegBin /
    markers
    markerattrs=(symbol=squarefilled color=green)
    lineattrs=(color=green)
    legendlabel='Fitted Negative Binomial PMF';
  xaxis display=(nolabel);
  yaxis display=(nolabel);
  keylegend / location=inside position=NE across=1;
run;
title;