# Statistics using SAS

* proc freq
* proc means 
* proc corr
* proc sgplot 
* proc sgscatter 
* proc anova
* proc univariate 
* proc ttest
* proc reg
* proc glm
* proc glmselect
* proc plm
* ods graphics on/off

        PROC REG < options > ;
        < label: > MODEL dependents=<regressors></ options > ;
        BY variables ;
        FREQ variable ;
        ID variables ;
        VAR variables ;
        WEIGHT variable ;
        ADD variables ;
        DELETE variables ;
        < label: > MTEST <equation, ..., equation> </ options > ;
        OUTPUT < OUT=SAS-data-set > keyword=names
        < ... keyword=names > ;
        PAINT <condition | ALLOBS>
        < / options > | < STATUS | UNDO> ;
        PLOT <yvariable*xvariable> <=symbol>
        < ... yvariable*xvariable> <=symbol> </ options > ;
        PRINT < options > < ANOVA > < MODELDATA > ;
        REFIT;
        RESTRICT equation, ..., equation ;
        REWEIGHT <condition | ALLOBS>
        < / options > | < STATUS | UNDO> ;
        < label: > TEST equation <, ..., equation> </ option > ;

In [1]:
/* Folder that the STATDATA libref points to; edit this path for your environment. */
%let path = /folders/myfolders/ECST131;

/* Double quotes are required so that &path is resolved before the libref is assigned. */
libname statdata "&path";

### Proc Corr

Standard error of the sample correlation coefficient $r$ (with $n$ observations):

$$SE_{r} = \sqrt{\frac{1-r^{2}}{n-2}}$$

The NOSIMPLE option tells the procedure
that you do not want the default output of means and standard deviations for each of the
variables in the VAR and WITH lists. The RANK option says to order the correlations
from largest to smallest (by their absolute values). 

Two types of plots are available when you use ODS Graphics with PROC CORR, a panel
or matrix of scatter plots (as the one above) or individual scatter plots. 

The ONLY option says that you want only separate bivariate plots for each variable pair,
rather than the scatter plot matrix that is produced by default. By default, the maximum
number of individual plots is also set at five. 

As with the matrix plot, you can use the NVAR= option to request additional scatter plots.

The RANK option says to order the correlations
from largest to smallest (by their absolute values).

In [2]:
/* List the observations of the fitness data set (all variables). */
proc print data = statdata.fitness;
run;

Obs,Name,Gender,RunTime,Age,Weight,Oxygen_Consumption,Run_Pulse,Rest_Pulse,Maximum_Pulse,Performance
1,Donna,F,8.17,42,68.15,59.57,166,40,172,90
2,Gracie,F,8.63,38,81.87,60.06,170,48,186,94
3,Luanne,F,8.65,43,85.84,54.3,156,45,168,83
4,Mimi,F,8.92,50,70.87,54.63,146,48,155,67
5,Chris,M,8.95,49,81.42,49.16,180,44,185,72
6,Allen,M,9.22,38,89.02,49.87,178,55,180,92
7,Nancy,F,9.4,49,76.32,48.67,186,56,188,64
8,Patty,F,9.63,52,76.32,45.44,164,48,166,56
9,Suzanne,F,9.93,57,59.08,50.55,148,49,155,43
10,Teresa,F,10.0,51,77.91,46.67,162,48,168,54


In [3]:
/* Pearson and Spearman correlations of each predictor with Oxygen_Consumption.
     rank                  order the correlations from largest to smallest (absolute value)
     plots(only)=scatter   individual scatter plots instead of the default matrix
     nvar=all              plot every VAR variable
     ellipse=none          draw no ellipse on the scatter plots                          */
title "Correlations and Scatter Plots with Oxygen_Consumption";
proc corr data=statdata.fitness
          pearson spearman rank
          plots(only)=scatter(nvar=all ellipse=none);
   with Oxygen_Consumption;
   var RunTime Age Weight Run_Pulse Rest_Pulse Maximum_Pulse Performance;
run;
title;   /* reset the title for later steps */

0,1
1 With Variables:,Oxygen_Consumption
7 Variables:,RunTime Age Weight Run_Pulse Rest_Pulse Maximum_Pulse Performance

Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics
Variable,N,Mean,Std Dev,Median,Minimum,Maximum
Oxygen_Consumption,31,47.37581,5.32777,46.77,37.39,60.06
RunTime,31,10.58613,1.38741,10.47,8.17,14.03
Age,31,47.67742,5.26236,48.0,38.0,57.0
Weight,31,77.44452,8.32857,77.45,59.08,91.63
Run_Pulse,31,169.64516,10.25199,170.0,146.0,186.0
Rest_Pulse,31,53.45161,7.61944,52.0,40.0,70.0
Maximum_Pulse,31,173.77419,9.1641,172.0,155.0,192.0
Performance,31,56.64516,18.32584,56.0,20.0,94.0

"Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0"
Oxygen_Consumption,RunTime -0.86219 <.0001,Performance 0.77890 <.0001,Rest_Pulse -0.39935 0.0260,Run_Pulse -0.39808 0.0266,Age -0.31162 0.0879,Maximum_Pulse -0.23677 0.1997,Weight -0.16289 0.3813

"Spearman Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0"
Oxygen_Consumption,RunTime -0.80806 <.0001,Performance 0.65503 <.0001,Run_Pulse -0.43748 0.0138,Rest_Pulse -0.38028 0.0348,Maximum_Pulse -0.32239 0.0769,Age -0.19327 0.2975,Weight -0.09318 0.6181


The IMAGEMAP=ON option, specified after a slash in the ODS GRAPHICS statement, enables the tooltip feature.

In [4]:
/* Turn on ODS Graphics with image maps so the plots carry tooltips. */
ods graphics on / imagemap=on;

/* Pearson correlations among the predictors, plus a scatter plot matrix
   of all variables (nvar=all) with histograms requested for the matrix. */
title "Correlation Matrix and Scatter Plot Matrix of Fitness Predictors";
proc corr data=statdata.fitness
          plots=matrix(histogram nvar=all);
   var RunTime Age Weight Run_Pulse Rest_Pulse Maximum_Pulse Performance;
   id name;
run;
title;   /* reset the title for later steps */

0,1
7 Variables:,RunTime Age Weight Run_Pulse Rest_Pulse Maximum_Pulse Performance

Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics
Variable,N,Mean,Std Dev,Sum,Minimum,Maximum
RunTime,31,10.58613,1.38741,328.17,8.17,14.03
Age,31,47.67742,5.26236,1478.0,38.0,57.0
Weight,31,77.44452,8.32857,2401.0,59.08,91.63
Run_Pulse,31,169.64516,10.25199,5259.0,146.0,186.0
Rest_Pulse,31,53.45161,7.61944,1657.0,40.0,70.0
Maximum_Pulse,31,173.77419,9.1641,5387.0,155.0,192.0
Performance,31,56.64516,18.32584,1756.0,20.0,94.0

"Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0"
Unnamed: 0_level_1,RunTime,Age,Weight,Run_Pulse,Rest_Pulse,Maximum_Pulse,Performance
RunTime,1.00000,0.19523 0.2926,0.14351 0.4412,0.31365 0.0858,0.45038 0.0110,0.22610 0.2213,-0.82049 <.0001
Age,0.19523 0.2926,1.00000,-0.24050 0.1925,-0.31607 0.0832,-0.15087 0.4178,-0.41490 0.0203,-0.71257 <.0001
Weight,0.14351 0.4412,-0.24050 0.1925,1.00000,0.18152 0.3284,0.04397 0.8143,0.24938 0.1761,0.08974 0.6312
Run_Pulse,0.31365 0.0858,-0.31607 0.0832,0.18152 0.3284,1.00000,0.35246 0.0518,0.92975 <.0001,-0.02943 0.8751
Rest_Pulse,0.45038 0.0110,-0.15087 0.4178,0.04397 0.8143,0.35246 0.0518,1.00000,0.30512 0.0951,-0.22560 0.2224
Maximum_Pulse,0.22610 0.2213,-0.41490 0.0203,0.24938 0.1761,0.92975 <.0001,0.30512 0.0951,1.00000,0.09002 0.6301
Performance,-0.82049 <.0001,-0.71257 <.0001,0.08974 0.6312,-0.02943 0.8751,-0.22560 0.2224,0.09002 0.6301,1.00000


In [5]:
/* Keep ODS Graphics on but switch the image-map (tooltip) feature off. */
ods graphics on / imagemap=off;

/* Same correlation analysis, requesting both the scatter plot matrix and
   the individual scatter plots with their default settings. */
title "Correlation Matrix and Scatter Plot Matrix of Fitness Predictors";
proc corr data=statdata.fitness
          plots=(matrix scatter);
   var RunTime Age Weight Run_Pulse Rest_Pulse Maximum_Pulse Performance;
   id name;
run;
title;   /* reset the title for later steps */

0,1
7 Variables:,RunTime Age Weight Run_Pulse Rest_Pulse Maximum_Pulse Performance

Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics
Variable,N,Mean,Std Dev,Sum,Minimum,Maximum
RunTime,31,10.58613,1.38741,328.17,8.17,14.03
Age,31,47.67742,5.26236,1478.0,38.0,57.0
Weight,31,77.44452,8.32857,2401.0,59.08,91.63
Run_Pulse,31,169.64516,10.25199,5259.0,146.0,186.0
Rest_Pulse,31,53.45161,7.61944,1657.0,40.0,70.0
Maximum_Pulse,31,173.77419,9.1641,5387.0,155.0,192.0
Performance,31,56.64516,18.32584,1756.0,20.0,94.0

"Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 31 Prob > |r| under H0: Rho=0"
Unnamed: 0_level_1,RunTime,Age,Weight,Run_Pulse,Rest_Pulse,Maximum_Pulse,Performance
RunTime,1.00000,0.19523 0.2926,0.14351 0.4412,0.31365 0.0858,0.45038 0.0110,0.22610 0.2213,-0.82049 <.0001
Age,0.19523 0.2926,1.00000,-0.24050 0.1925,-0.31607 0.0832,-0.15087 0.4178,-0.41490 0.0203,-0.71257 <.0001
Weight,0.14351 0.4412,-0.24050 0.1925,1.00000,0.18152 0.3284,0.04397 0.8143,0.24938 0.1761,0.08974 0.6312
Run_Pulse,0.31365 0.0858,-0.31607 0.0832,0.18152 0.3284,1.00000,0.35246 0.0518,0.92975 <.0001,-0.02943 0.8751
Rest_Pulse,0.45038 0.0110,-0.15087 0.4178,0.04397 0.8143,0.35246 0.0518,1.00000,0.30512 0.0951,-0.22560 0.2224
Maximum_Pulse,0.22610 0.2213,-0.41490 0.0203,0.24938 0.1761,0.92975 <.0001,0.30512 0.0951,1.00000,0.09002 0.6301
Performance,-0.82049 <.0001,-0.71257 <.0001,0.08974 0.6312,-0.02943 0.8751,-0.22560 0.2224,0.09002 0.6301,1.00000


In [6]:
ods graphics on;
title "Computing Pearson Correlation Coefficients";

/* Correlations of Pushups with the pulse measurements and Age.
     nosimple   suppress the default simple-statistics table
     rank       order the correlations from largest to smallest (absolute value)
   NOTE(review): EXERCISE is a one-level data set name that is not created
   anywhere in the visible code -- confirm it exists before running.          */
proc corr data=exercise nosimple rank
          /* Alternative plot requests (swap one in if preferred):
               plots = matrix(nvar=all)
               plots(only) = scatter(ellipse = none)               */
          plots(only)=scatter(ellipse=confidence);
   var Rest_Pulse Max_Pulse Run_Pulse Age;
   with Pushups;
run;

title;              /* clear the title so it does not carry into later steps */
ods graphics off;

## Simple Regression Model 


See Ramon Littell, Walter Stroup, and Rudolf Freund, *SAS for Linear Models*, Fourth Edition, SAS Publishing (2002), p. 11.

The CLM option yields a confidence interval for the subpopulation mean, and the CLI
option yields a prediction interval for a value to be drawn at random from the subpopulation. The
CLI limits are always wider than the CLM limits, because the CLM limits accommodate only
variability in $\widehat{y}$, whereas the CLI limits accommodate variability in $\widehat{y}$ and variability in the future
value of y. This is true even though $\widehat{y}$ is used as an estimate of the subpopulation mean as well as a
predictor of the future value.

NOINT is the MODEL statement option that specifies that no intercept be included. In other words, the fitted
regression plane is forced to pass through the origin.

Corresponding complications arise regarding the R-square statistic with no-intercept models.
Note that R-Square=0.9829 for the no-intercept model in Output 2.10 is greater than
R-Square=0.9373 for the model in Output 2.6, although the latter has two more parameters than
the former. This seems contrary to the general phenomenon that adding terms to a model causes
the R-square to increase. This seeming contradiction occurs because the denominator of the R-square
is the Uncorrected Total SS when the NOINT option is used. This is the reason for the
message that R-square is redefined at the top of Output 2.10. It is, therefore, not meaningful to
compare an R-square for a model that contains an intercept with an R-square for a model that does
not contain an intercept.

In [7]:
/* Simple linear regression of Oxygen_Consumption on RunTime.
   MODEL options requested:
     p, r        predicted values and residual analysis
     cli, clm    95% limits for an individual prediction and for the mean
     influence   influence diagnostics (RStudent, Hat Diag H, DFFITS, DFBETAS)
     xpx, i      crossproducts matrix and its inverse
   ID adds Name and RunTime to the Output Statistics table.                   */
title 'Predicting Oxygen_Consumption from RunTime';
proc reg data=statdata.fitness;
   model Oxygen_Consumption = RunTime
         / p cli clm influence r xpx i;
   id name RunTime;
run;
quit;    /* REG is interactive; QUIT ends the step */
title;   /* reset the title for later steps */


Model Crossproducts X'X X'Y Y'Y,Model Crossproducts X'X X'Y Y'Y,Model Crossproducts X'X X'Y Y'Y,Model Crossproducts X'X X'Y Y'Y
Variable,Intercept,RunTime,Oxygen_Consumption
Intercept,31.0,328.17,1468.65
RunTime,328.17,3531.7975,15356.1247
Oxygen_Consumption,1468.65,15356.1247,70430.0327

0,1
Number of Observations Read,31
Number of Observations Used,31

"X'X Inverse, Parameter Estimates, and SSE","X'X Inverse, Parameter Estimates, and SSE","X'X Inverse, Parameter Estimates, and SSE","X'X Inverse, Parameter Estimates, and SSE"
Variable,Intercept,RunTime,Oxygen_Consumption
Intercept,1.9728798928,-0.183317417,82.424942238
RunTime,-0.183317417,0.0173167563,-3.310854768
Oxygen_Consumption,82.424942238,-3.310854768,218.53997081

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,1,633.01458,633.01458,84.0,<.0001
Error,29,218.53997,7.53586,,
Corrected Total,30,851.55455,,,

0,1,2,3
Root MSE,2.74515,R-Square,0.7434
Dependent Mean,47.37581,Adj R-Sq,0.7345
Coeff Var,5.79442,,

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Variable,DF,Parameter Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,82.42494,3.85582,21.38,<.0001
RunTime,1,-3.31085,0.36124,-9.17,<.0001

Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics,Output Statistics
Obs,Name,RunTime,Dependent Variable,Predicted Value,Std Error Mean Predict,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Residual,Std Error Residual,Student Residual,Cook's D,RStudent,Hat Diag H,Cov Ratio,DFFITS,DFBETAS,DFBETAS
Obs,Name,RunTime,Dependent Variable,Predicted Value,Std Error Mean Predict,95% CL Mean,95% CL Mean,95% CL Predict,95% CL Predict,Residual,Std Error Residual,Student Residual,Cook's D,RStudent,Hat Diag H,Cov Ratio,DFFITS,Intercept,RunTime
1,Donna,8.17,59.6,55.3753,1.0024,53.325,57.4255,49.3982,61.3524,4.1947,2.556,1.641,0.207,1.6934,0.1333,1.0185,0.6643,0.6154,-0.5784
2,Gracie,8.63,60.1,53.8523,0.8616,52.09,55.6145,47.9677,59.7368,6.2077,2.606,2.382,0.31,2.6094,0.0985,0.77,0.8626,0.7647,-0.7074
3,Luanne,8.65,54.3,53.786,0.8557,52.0359,55.5362,47.9051,59.667,0.514,2.608,0.197,0.002,0.1937,0.0972,1.185,0.0636,0.0562,-0.052
4,Mimi,8.92,54.6,52.8921,0.778,51.3008,54.4834,47.0565,58.7277,1.7379,2.633,0.66,0.019,0.6536,0.0803,1.1316,0.1932,0.1639,-0.1494
5,Chris,8.95,49.2,52.7928,0.7697,51.2186,54.367,46.9618,58.6238,-3.6328,2.635,-1.379,0.081,-1.4014,0.0786,1.0166,-0.4093,-0.3453,0.3143
6,Allen,9.22,49.9,51.8989,0.6976,50.4721,53.3256,46.1059,57.6918,-2.0289,2.655,-0.764,0.02,-0.7585,0.0646,1.101,-0.1993,-0.1578,0.141
7,Nancy,9.4,48.7,51.3029,0.6532,49.9669,52.6389,45.5317,57.0741,-2.6329,2.666,-0.987,0.029,-0.987,0.0566,1.0619,-0.2418,-0.1807,0.1586
8,Patty,9.63,45.4,50.5414,0.602,49.3102,51.7726,44.7935,56.2893,-5.1014,2.678,-1.905,0.092,-2.0009,0.0481,0.8626,-0.4497,-0.303,0.258
9,Suzanne,9.93,50.6,49.5482,0.5471,48.4293,50.667,43.8233,55.273,1.0018,2.69,0.372,0.003,0.3668,0.0397,1.1064,0.0746,0.0407,-0.0323
10,Teresa,10.0,46.7,49.3164,0.5366,48.219,50.4138,43.5957,55.0371,-2.6464,2.692,-0.983,0.019,-0.9824,0.0382,1.0422,-0.1958,-0.0996,0.0773

0,1
Sum of Residuals,0.0
Sum of Squared Residuals,218.53997
Predicted Residual SS (PRESS),250.97516


The Model Sum of Squares is 633.01. This is the amount of variability that the model explains.

The Error Sum of Squares is 218.54. This is the amount of variability that the model does not explain.

The Total Sum of Squares is 851.55, which is the total amount of variability in the response.

The Mean Square column shows each sum of squares divided by its degrees of freedom. The mean square model is 633.01. This is calculated by dividing the model sum of squares by the model DF, which gives us the average sum of squares for the model. The mean square error is 7.54, which is an estimate of the population variance. This is calculated by dividing the error sum of squares by the error DF, which gives us the average sum of squares for
the error.

The Root MSE is 2.75. This is the square root of the mean square error in the Analysis of Variance table. 
The Root MSE is a measure of the standard deviation of Oxygen_Consumption at each value of RunTime.

The Dependent Mean is 47.38, which is the average of Oxygen_Consumption for all 31 subjects.

The Coefficient of Variation is 5.79. This is the size of the standard deviation relative to the mean.

The R-square value is .743, which is calculated by dividing the model sum of squares by the total 
sum of squares (633.01 / 851.55). The R-square value is between 0 and 1 and measures the proportion of variation 
observed in the response that the regression line explains.

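Mean Square Between (the model mean square) and Mean Square Within (the error mean square) are used to calculate the F-ratio:

$$F = \frac{MS_{Model}}{MS_{Error}} = \frac{633.01}{7.54} \approx 84.0$$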

If you create a 95% prediction interval, the interpretation is that you are 95% confident that 
your interval contains the new observation. 

For a given set of data, why is a prediction interval wider than a confidence interval? 
A prediction interval is wider than a confidence interval because single observations have 
more variability than sample means.

**The difference between a prediction interval and a confidence interval is the standard error.**

The standard error for a confidence interval on the mean takes into account the uncertainty 
due to sampling. The line you computed from your sample will be different from the line that 
would have been computed if you had the entire population; the standard error takes this 
uncertainty into account.

The standard error for a prediction interval on an individual observation takes into account 
the uncertainty due to sampling like above, but also takes into account the variability of the 
individuals around the predicted mean. The standard error for the prediction interval will be 
larger than for the confidence interval and hence the prediction interval will be wider than 
the confidence interval.

### Storing Parameter Estimates and Scoring

First, create a data set containing the values of the independent variable for which you want to make predictions. Concatenate the new data set with the original data set. Fit a simple linear regression model to the new data set and specify the P option in the MODEL statement. Because the concatenated observations contain missing values for the response variable, PROC REG doesn't include these observations when fitting the regression model. However, PROC REG does produce predicted values for these observations.

When you use a model to predict future values of the response variable given certain values of the predictor variable, you must stay within the range of values for the predictor variable used to create the model. For example, in the original Fitness data set, values of RunTime range from a little over 8 minutes to a little over 14 minutes. Based on that data, you shouldn't try to predict what Oxygen_Consumption would be for a RunTime value outside that range. The relationship between the predictor variable and the response variable might be different beyond the range of the data.


    PROC SCORE DATA=SAS-data-set 
       SCORE=SAS-data-set 
       OUT=SAS-data-set 
       TYPE=name 
       <options>;
    VAR variable(s);
    RUN;
    QUIT;
    
In the PROC SCORE statement, the DATA= option specifies the data set containing the observations to score, which is Need_Predictions. The SCORE= option specifies the data set containing the parameter estimates, which is Estimates. The OUT= option specifies the data set that PROC SCORE creates. Let's call this data set Scored. Finally, the TYPE= option tells PROC SCORE what type of data the SCORE= data set contains. In this case, specifying TYPE=PARMS tells SAS to use the parameter estimates in the Estimates data set. The VAR statement specifies the numeric variables to use in computing scores. These variables must appear in both the DATA= and SCORE= input data sets. If you don't specify a VAR statement, PROC SCORE uses all the numeric variables in the SCORE= data set. So it's important to specify a VAR statement with PROC SCORE, because you rarely use all the numeric variables in your data set to compute scores. We'll use RunTime. Next, let's see this process in action.

    

In [8]:
/* Step 1: RunTime values for which predictions are wanted.
   NOTE(review): 15 appears to lie above the largest RunTime in the fitness
   data (about 14), so that prediction is an extrapolation. */
data need_predictions;
   input RunTime @@;   /* trailing @@ reads several values from one data line */
   datalines;
9 10 11 12 13 14 15
;
run;

/* Step 2: fit the regression without printed output and store the
   parameter estimates in the ESTIMATES data set. */
proc reg data=statdata.fitness outest=estimates noprint;
   model Oxygen_Consumption = RunTime;
run;
quit;

/* Step 3: display the stored estimates. */
title "OUTEST= Data Set from PROC REG";
proc print data=estimates;
run;
title;

/* Step 4: display the observations to be scored. */
title "need_predictions Data Set";
proc print data=need_predictions;
run;

/* Step 5: apply the stored estimates to the new observations.
   The VAR variables must appear in both the DATA= and SCORE= data sets. */
proc score data=need_predictions   /* data set to score */
           score=estimates         /* data set containing the parameter estimates */
           type=parms              /* kind of rows to use from the SCORE= data set */
           out=scored;             /* output data set */
   var RunTime;
run;

/* Step 6: display the scored observations. */
title "Scored New Observations";
proc print data=Scored;
run;
title;

Obs,_MODEL_,_TYPE_,_DEPVAR_,_RMSE_,Intercept,RunTime,Oxygen_Consumption
1,MODEL1,PARMS,Oxygen_Consumption,2.74515,82.4249,-3.31085,-1

Obs,RunTime
1,9
2,10
3,11
4,12
5,13
6,14
7,15

Obs,RunTime,MODEL1
1,9,52.6272
2,10,49.3164
3,11,46.0055
4,12,42.6947
5,13,39.3838
6,14,36.073
7,15,32.7621


In [9]:
/* Fit percent body fat on weight; OUTEST= saves the parameter estimates
   so they can be applied to new observations below. */
proc reg data=statdata.bodyfat2 outest=estimates;
   model PctBodyFat2=Weight;
   title "Regression of % Body Fat on Weight";
run;
quit;   /* REG is interactive: end it explicitly instead of relying on the next step */

/* Weights for which a predicted percent body fat is wanted. */
data toscore;
   input Weight @@;   /* trailing @@ reads several values from one data line */
   datalines;
125 150 175 200 225
;
run;

/* Apply the stored estimates (TYPE=PARMS) to the new weights. */
proc score data=toscore score=estimates
     out=scored type=parms;
   var Weight;
run;

proc print data=scored;
   title "Predicted % Body Fat from Weight 125 150 175 200 225";
run;
title;   /* reset the title for later steps */

0,1
Number of Observations Read,252
Number of Observations Used,252

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,1,6593.01614,6593.01614,150.03,<.0001
Error,250,10986.0,43.94389,,
Corrected Total,251,17579.0,,,

0,1,2,3
Root MSE,6.62902,R-Square,0.3751
Dependent Mean,19.15079,Adj R-Sq,0.3726
Coeff Var,34.61485,,

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Variable,DF,Parameter Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,-12.05158,2.58139,-4.67,<.0001
Weight,1,0.17439,0.01424,12.25,<.0001

Obs,Weight,MODEL1
1,125,9.747
2,150,14.1067
3,175,18.4664
4,200,22.8261
5,225,27.1859


### The PLM Procedure

The PLM procedure performs post-fitting statistical analyses and plotting for the contents of a SAS item store that were previously created with the STORE statement in some other SAS/STAT procedure. 

The statements that are available in the PLM procedure are designed to reveal the contents of the source item store via the Output Delivery System (ODS) and to perform post-fitting tasks.

The use of item stores and PROC PLM enables you to separate common post-processing tasks, such as testing for treatment differences and predicting new observations under a fitted model, from the process of model building and fitting. A numerically expensive model fitting technique can be applied once to produce a source item store. The PLM procedure can then be called multiple times, and the results of the fitted model are analyzed without incurring the model fitting expenditure again.

Selected PROC PLM option:

* RESTORE specifies the source item store for processing.

Selected PROC PLM procedure statements:

* EFFECTPLOT produces a display of the fitted model and provides options for changing and enhancing the displays.
* LSMEANS computes and compares least squares means (LS-means) of fixed effects.
* LSMESTIMATE provides custom hypothesis tests among least squares means.
* SHOW uses the Output Delivery System to display contents of the item store. This statement is useful for verifying that the contents of the item store apply to the analysis and for generating ODS tables.
* SLICE provides a general mechanism for performing a partitioned analysis of the LS-means for an interaction. This analysis is also known as an analysis of simple effects. The SLICE statement uses the same options as the LSMEANS statement.
* WHERE is used in the PLM procedure when the item store contains BY-variable information and you want to apply the PROC PLM statements to only a subset of the BY groups.


    PROC PLM RESTORE=item-store-specification <options>;
        EFFECTPLOT <plot-type <(plot-definition options)>>
              </ options>;
        LSMEANS <model-effects> </ options>;
        LSMESTIMATE model-effect <'label'> values
              <divisor=n> <, ... <'label'> values
              <divisor=n>> </ options>;
        SHOW options;
        SLICE model-effect </ options>;
        WHERE expression ;
    RUN;

## Multiple Regression Model


In [10]:
/* Preview the data: OUTOBS= limits the query result to 20 rows.
   PROC SQL executes each statement immediately and is ended with QUIT;
   a RUN statement has no effect in PROC SQL. */
proc sql outobs = 20;
select *
from statdata.ameshousing3 ;
quit;

/* Univariate summaries for SalePrice, Basement_Area and Lot_Area. */
proc univariate data = statdata.ameshousing3;
   var SalePrice
       Basement_Area
       Lot_Area;
run;

PID,Lot size in square feet,Style of dwelling,Overall material and finish of the house,Overall condition of the house,Original construction year,Heating quality and condition,Presence of central air conditioning,Above grade (ground) living area square feet,Bedrooms above grade,Number of fireplaces,Size of garage in square feet,Month Sold (MM),Year Sold (YYYY),Sale price in dollars,Basement area in square feet,Number of full bathrooms,Number of half bathrooms,Total number of bathrooms (half bathrooms counted 10%),Total area of decks and porches in square feet,"Age of house when sold, in years",Season when house sold,Garage attached or detached,Foundation Type,Masonry veneer or not,Regular or irregular lot shape,Style of dwelling.1,Overall material and finish of the house.1,Overall condition of the house.1,Natural log of the sale price,"Sale Price > $175,000",score
527127150,4920,1Story,8,5,2001,Ex,Y,1338,2,0,582,4,2010,213500,1338,3,0,3.0,0,9,2,Attached,Concrete/Slab,N,Regular,1Story,6,5,12.271392112,1,.
527145080,5005,1Story,8,5,1992,Ex,Y,1280,2,0,506,1,2010,191500,1280,2,0,2.0,226,18,1,Attached,Concrete/Slab,N,Irregular,1Story,6,5,12.162643088,1,.
527425090,10500,1Story,4,5,1971,TA,Y,864,3,1,0,4,2010,115000,864,1,0,1.0,0,39,2,,Cinder Block,N,Regular,1Story,4,5,11.652687407,0,.
528228285,3203,1Story,7,5,2006,Ex,Y,1145,2,0,437,1,2010,160000,1145,2,0,2.0,216,4,1,Attached,Concrete/Slab,Y,Regular,1Story,6,5,11.982929094,0,.
528250100,7750,SLvl,7,5,2000,Ex,Y,1430,3,1,400,4,2010,180000,384,2,1,2.1,180,10,2,Attached,Concrete/Slab,N,Irregular,SLvl,6,5,12.10071213,1,.
531452050,7175,1Story,6,5,1984,TA,Y,752,2,0,264,2,2010,125000,744,2,0,2.0,443,26,1,Attached,Cinder Block,N,Regular,1Story,6,5,11.736069016,0,.
533253210,3880,1Story,8,6,1978,TA,Y,1226,1,1,484,1,2010,206000,1226,2,0,2.0,301,32,1,Attached,Cinder Block,N,Irregular,1Story,6,6,12.235631448,1,.
534401110,9900,1Story,5,5,1966,Gd,Y,1209,3,0,504,4,2010,159000,1209,2,0,2.0,0,44,2,Attached,Concrete/Slab,N,Regular,1Story,5,5,11.976659481,0,.
534403410,14112,SLvl,5,7,1964,TA,Y,1152,3,1,484,4,2010,180500,1152,2,0,2.0,227,46,2,Attached,Concrete/Slab,Y,Irregular,SLvl,5,6,12.103486057,1,.
534430080,9717,1Story,5,6,1950,Gd,Y,1078,2,0,240,4,2010,142125,1078,2,0,2.0,366,60,2,Attached,Cinder Block,N,Regular,1Story,5,6,11.864462231,0,.

Moments,Moments.1,Moments.2,Moments.3
N,300.0,Sum Weights,300.0
Mean,137524.867,Sum Observations,41257460.0
Std Deviation,37622.6431,Variance,1415463276.0
Skewness,0.29726388,Kurtosis,0.72287774
Uncorrected SS,6097150000000.0,Corrected SS,423224000000.0
Coeff Variation,27.3569748,Std Error Mean,2172.14431

Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures
Location,Location.1,Variability,Variability.1
Mean,137524.9,Std Deviation,37623
Median,135000.0,Variance,1415463276
Mode,110000.0,Range,255000
,,Interquartile Range,45475

Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0
Test,Statistic,Statistic.1,p Value,p Value.1
Student's t,t,63.31295,Pr > |t|,<.0001
Sign,M,150.0,Pr >= |M|,<.0001
Signed Rank,S,22575.0,Pr >= |S|,<.0001

Quantiles (Definition 5),Quantiles (Definition 5)
Level,Quantile
100% Max,290000
99%,227500
95%,207000
90%,187300
75% Q3,159475
50% Median,135000
25% Q1,114000
10%,91150
5%,80000
1%,48500

Extreme Observations,Extreme Observations,Extreme Observations,Extreme Observations
Lowest,Lowest,Highest,Highest
Value,Obs,Value,Obs
35000,294,218000,184
39300,190,220000,106
45000,77,235000,151
52000,130,245000,54
59000,70,290000,123

Moments,Moments.1,Moments.2,Moments.3
N,300.0,Sum Weights,300.0
Mean,882.31,Sum Observations,264693.0
Std Deviation,359.783966,Variance,129444.502
Skewness,-0.5476589,Kurtosis,0.13741949
Uncorrected SS,272245187.0,Corrected SS,38703906.2
Coeff Variation,40.7775007,Std Error Mean,20.772137

Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures
Location,Location.1,Variability,Variability.1
Mean,882.31,Std Deviation,359.78397
Median,912.0,Variance,129445.0
Mode,0.0,Range,1645.0
,,Interquartile Range,471.5

Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0
Test,Statistic,Statistic.1,p Value,p Value.1
Student's t,t,42.47565,Pr > |t|,<.0001
Sign,M,142.0,Pr >= |M|,<.0001
Signed Rank,S,20235.0,Pr >= |S|,<.0001

Quantiles (Definition 5),Quantiles (Definition 5)
Level,Quantile
100% Max,1645.0
99%,1488.0
95%,1430.5
90%,1337.5
75% Q3,1143.5
50% Median,912.0
25% Q1,672.0
10%,406.0
5%,0.0
1%,0.0

Extreme Observations,Extreme Observations,Extreme Observations,Extreme Observations
Lowest,Lowest,Highest,Highest
Value,Obs,Value,Obs
0,285,1486,95
0,269,1487,249
0,268,1489,222
0,233,1602,151
0,219,1645,105

Moments,Moments.1,Moments.2,Moments.3
N,300.0,Sum Weights,300.0
Mean,8294.13667,Sum Observations,2488241.0
Std Deviation,3323.78787,Variance,11047565.8
Skewness,1.00934511,Kurtosis,4.57577642
Uncorrected SS,23941000000.0,Corrected SS,3303222171.0
Coeff Variation,40.0739462,Std Error Mean,191.898982

Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures
Location,Location.1,Variability,Variability.1
Mean,8294.137,Std Deviation,3324
Median,8265.0,Variance,11047566
Mode,7200.0,Range,24647
,,Interquartile Range,3816

Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0
Test,Statistic,Statistic.1,p Value,p Value.1
Student's t,t,43.22137,Pr > |t|,<.0001
Sign,M,150.0,Pr >= |M|,<.0001
Signed Rank,S,22575.0,Pr >= |S|,<.0001

Quantiles (Definition 5),Quantiles (Definition 5)
Level,Quantile
100% Max,26142.0
99%,18631.5
95%,13109.0
90%,12036.0
75% Q3,10110.0
50% Median,8265.0
25% Q1,6294.5
10%,4252.0
5%,2956.5
1%,1638.0

Extreme Observations,Extreme Observations,Extreme Observations,Extreme Observations
Lowest,Lowest,Highest,Highest
Value,Obs,Value,Obs
1495,241,16285,91
1533,110,17755,292
1596,173,19508,120
1680,252,25339,218
1680,251,26142,38


Run the same model in PROC GLM. When you run a linear regression model with only two predictor variables, the output includes a contour fit plot by default. We specify CONTOURFIT to tell SAS to overlay the contour plot with a scatter plot of the observed data.

Here is the contour fit plot with the overlaid scatter plot that we requested. You can use this plot to see how well the model predicts observed values. The plot shows predicted values of SalePrice as gradations of the background color from blue, representing low values, to red, representing high values. The dots, which are similarly colored, represent the actual data. Observations that are perfectly fit would show the same color within the circle as outside the circle. The lines on the graph help you read the actual predictions at even intervals.

For example, this point near the upper-right represents an observation with a basement area of about 1,500 square feet, a lot size of about 17,000 square feet, and a predicted value of over \$180,000 for sale price. However, the dot’s color shows that its observed sale price is actually closer to about \$160,000.

In [11]:
ods graphics on;

/* Two-predictor regression of SalePrice on basement area and lot area (PROC REG). */
proc reg data=statdata.ameshousing3 ;
    model SalePrice=Basement_Area Lot_Area;
    title "Model with Basement Area and Lot Area";
run;
quit;

/* Same model in PROC GLM; request only the contour fit plot, add custom
   hypothesis tests/estimates, and save the fit to an item store for PROC PLM. */
proc glm data=statdata.ameshousing3 
         plots(only)=(contourfit);
    model SalePrice=Basement_Area Lot_Area;
    contrast 'Basement_Area=0' Basement_Area 1; 
    contrast 'Basement_Area=Lot_Area' Basement_Area 1 Lot_Area -1;
    /* Two rows separated by a comma form one joint 2-DF test. */
    contrast 'Basement_Area=Lot_Area=0' Basement_Area 1,  Lot_Area 1;
    
    /*CONTRAST statements can be used to test hypotheses about
    any linear combination of parameters in the model.*/
    
   estimate 'Basement_Area=0' Basement_Area 1; 
   estimate 'Basement_Area=Lot_Area' Basement_Area 1 Lot_Area -1; 
    
    /*The ESTIMATE statement is used in essentially the same way as the CONTRAST statement. 
But instead of F-tests for linear combinations, you get estimates of them along with standard errors.
However, the ESTIMATE statement can estimate only one linear combination at a time, whereas the
CONTRAST statement could be used to test two or more linear combinations simultaneously */
    
    /* Save the fitted model as item store WORK.MULTIPLE for post-processing. */
    store out=multiple;
    /* Title names the predictors actually in the MODEL statement (Lot_Area, not gross living area). */
    title "Model with Basement Area and Lot Area";
run;
quit;

/* Post-fit graphics from the stored model, without refitting. */
proc plm restore=multiple plots=all;
    effectplot contour (y=Basement_Area x=Lot_Area);
    effectplot slicefit(x=Lot_Area sliceby=Basement_Area=250 to 1000 by 250);
run; 

/* Clear the title so it does not carry over to later output. */
title;

0,1
Number of Observations Read,300
Number of Observations Used,300

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,2,203220600000.0,101610300000.0,137.17,<.0001
Error,297,220002900000.0,740750509.0,,
Corrected Total,299,423223500000.0,,,

0,1,2,3
Root MSE,27217.0,R-Square,0.4802
Dependent Mean,137525.0,Adj R-Sq,0.4767
Coeff Var,19.79041,,

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Variable,Label,DF,Parameter Estimate,Standard Error,t Value,Pr > |t|
Intercept,Intercept,1,69016.0,5129.52179,13.45,<.0001
Basement_Area,Basement area in square feet,1,70.0868,4.54618,15.42,<.0001
Lot_Area,Lot size in square feet,1,0.8043,0.4921,1.63,0.1032

0,1
Number of Observations Read,300
Number of Observations Used,300

Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,2,203220618262,101610309131.0,137.17,<.0001
Error,297,220002901249,740750509.26,,
Corrected Total,299,423223519511,,,

R-Square,Coeff Var,Root MSE,SalePrice Mean
0.480173,19.79041,27216.73,137524.9

Source,DF,Type I SS,Mean Square,F Value,Pr > F
Basement_Area,1,201241844480.0,201241844480.0,271.67,<.0001
Lot_Area,1,1978773781.7,1978773781.7,2.67,0.1032

Source,DF,Type III SS,Mean Square,F Value,Pr > F
Basement_Area,1,176055907089.0,176055907089.0,237.67,<.0001
Lot_Area,1,1978773781.7,1978773781.7,2.67,0.1032

Contrast,DF,Contrast SS,Mean Square,F Value,Pr > F
Basement_Area=0,1,176055907089,176055907089,237.67,<.0001
Basement_Area=Lot_Area,1,160693644810,160693644810,216.93,<.0001
Basement_Area=Lot_Area=0,2,203220618262,101610309131,137.17,<.0001

Parameter,Estimate,Standard Error,t Value,Pr > |t|
Basement_Area=0,70.0868031,4.54618316,15.42,<.0001
Basement_Area=Lot_Area,69.2825041,4.70392301,14.73,<.0001

Parameter,Estimate,Standard Error,t Value,Pr > |t|
Intercept,69015.6136,5129.52179,13.45,<.0001
Basement_Area,70.0868,4.546183,15.42,<.0001
Lot_Area,0.8043,0.492102,1.63,0.1032

Store Information,Store Information.1
Item Store,WORK.MULTIPLE
Data Set Created From,STATDATA.AMESHOUSING3
Created By,PROC GLM
Date Created,03JUL17:23:17:52
Response Variable,SalePrice
Model Effects,Intercept Basement_Area Lot_Area


## Model Selection 

### Automatic Model Selection

In the MODEL statement, following a forward slash, you add the SELECTION= option to specify the method used to select the model. The default is NONE, which in this case would calculate the full regression model, because you specified all the variables in the MODEL statement. To calculate the all-possible regression model instead, you specify the CP, RSQUARE, or ADJRSQ statistics as the SELECTION= value. Here all three are specified. The first statistic that you list here determines the sorting order in the output. 

Here's a question: For this PROC REG step, how are the models sorted? Specifying CP as the first statistic sorts the models by the value of CP. To produce only a specific number of models, you can specify the BEST= option in the MODEL statement. For example, BEST=20 displays the 20 best models based on your sorting statistic, which in this case is CP. 

Finally, you can add an optional label to the MODEL statement to label your output. For this all-possible regression model, let's add the label ALL_REG. Notice that the label must end in a colon.

Each star in the Cp plot represents the best model for a given number of parameters.




In [12]:
/* All-possible-regressions search on the fitness data.
   CP is listed first in SELECTION=, so models are ordered by C(p);
   BEST=20 limits the listing to the 20 best models; only the Cp plot is drawn. */
ods graphics / imagemap=on;

title 'Best Models Using All-Regression Option';

proc reg data=statdata.fitness plots(only)=(cp);
   ALL_REG: model Oxygen_Consumption = Performance RunTime Age Weight
                                       Run_Pulse Rest_Pulse Maximum_Pulse
            / selection=cp rsquare adjrsq
              best=20;
run;
quit;

/* Reset the title for subsequent output. */
title;

0,1
Number of Observations Read,31
Number of Observations Used,31

Model Index,Number in Model,C(p),R-Square,Adjusted R-Square,Variables in Model
1,4,4.0004,0.8355,0.8102,RunTime Age Run_Pulse Maximum_Pulse
2,5,4.2598,0.8469,0.8163,RunTime Age Weight Run_Pulse Maximum_Pulse
3,5,4.7158,0.8439,0.8127,Performance RunTime Weight Run_Pulse Maximum_Pulse
4,5,4.7168,0.8439,0.8127,Performance RunTime Age Run_Pulse Maximum_Pulse
5,4,4.9567,0.8292,0.8029,Performance RunTime Run_Pulse Maximum_Pulse
6,3,5.857,0.8101,0.789,RunTime Run_Pulse Maximum_Pulse
7,3,5.9367,0.8096,0.7884,RunTime Age Run_Pulse
8,5,5.9783,0.8356,0.8027,RunTime Age Run_Pulse Rest_Pulse Maximum_Pulse
9,5,5.9856,0.8356,0.8027,Performance Age Weight Run_Pulse Maximum_Pulse
10,6,6.0492,0.8483,0.8104,Performance RunTime Age Weight Run_Pulse Maximum_Pulse


In [13]:
/* Fit the two leading candidates from the all-possible-regressions listing
   in a single PROC REG step, one labeled MODEL statement each. */
title 'Check "Best" Two Candidate Models';

proc reg data=statdata.fitness;
   /* Four-predictor candidate. */
   PREDICT_mpc: model Oxygen_Consumption = RunTime Age Run_Pulse Maximum_Pulse;

   /* Five-predictor candidate: same terms plus Weight. */
   EXPLAIN_hcp: model Oxygen_Consumption = RunTime Age Weight Run_Pulse Maximum_Pulse;
run;
quit;

/* Reset the title for subsequent output. */
title;

0,1
Number of Observations Read,31
Number of Observations Used,31

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,4,711.45087,177.86272,33.01,<.0001
Error,26,140.10368,5.3886,,
Corrected Total,30,851.55455,,,

0,1,2,3
Root MSE,2.32134,R-Square,0.8355
Dependent Mean,47.37581,Adj R-Sq,0.8102
Coeff Var,4.89984,,

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Variable,DF,Parameter Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,97.16952,11.65703,8.34,<.0001
RunTime,1,-2.77576,0.34159,-8.13,<.0001
Age,1,-0.18903,0.09439,-2.0,0.0557
Run_Pulse,1,-0.34568,0.1182,-2.92,0.0071
Maximum_Pulse,1,0.27188,0.13438,2.02,0.0534

0,1
Number of Observations Read,31
Number of Observations Used,31

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,5,721.20532,144.24106,27.66,<.0001
Error,25,130.34923,5.21397,,
Corrected Total,30,851.55455,,,

0,1,2,3
Root MSE,2.28341,R-Square,0.8469
Dependent Mean,47.37581,Adj R-Sq,0.8163
Coeff Var,4.81978,,

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Variable,DF,Parameter Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,101.33835,11.86474,8.54,<.0001
RunTime,1,-2.68846,0.34202,-7.86,<.0001
Age,1,-0.21217,0.09437,-2.25,0.0336
Weight,1,-0.07332,0.0536,-1.37,0.1836
Run_Pulse,1,-0.37071,0.1177,-3.15,0.0042
Maximum_Pulse,1,0.30603,0.13452,2.28,0.0317


### Stepwise selection methods

Stepwise selection methods include forward, backward, and stepwise approaches. In this course, you use these methods to select variables based on their p-values, and we will discuss other methods as well. Let's look at each of these three methods in detail.

Forward selection starts with no predictor variables in the model. It selects the best one-variable model (the most significant variable when run by itself). Then it selects the best two-variable model that includes the variable in the first model. So, after a variable is added to the model, it stays in, even if it becomes insignificant later. Forward selection keeps adding variables, one at a time, until no significant terms are left to add. 

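Backward selection, also called backward elimination, starts with all predictor variables in the model. It removes variables one at a time, starting with the most non-significant variable. After a variable is removed from the model, it cannot reenter. Backward selection stops when only significant terms are left in the model. 

Stepwise selection combines aspects of both forward and backward selection. Like forward selection, it starts with no predictor variables in the model and adds them one at a time based on significance (the SLENTRY= level). Unlike forward selection, however, a variable that has entered does not necessarily stay: after each addition, the variables already in the model are re-evaluated, and any variable that no longer meets the stay criterion (the SLSTAY= level) is removed. Stepwise selection stops when no variable outside the model meets the entry criterion and every variable in the model meets the stay criterion.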


Using automated model selection results in biases in parameter estimates, predictions, and standard errors, incorrect calculation of degrees of freedom, and p-values that tend to err on the side of overestimating significance. 

So, how can you avoid these issues? One way is to hold out some of your data in order to perform an honest assessment of how well your model performs on a different sample of data than you used to develop the model. You split your data into two data sets: the training data and the holdout data, which is also called the validation data. You use the training data to build your model, and you use the holdout data to assess and compare potential models.

Other honest assessment approaches include cross-validation or bootstrapping. You might choose to perform cross-validation if your data set isn’t large enough to split and hold out some data for validation. Alternatively, you can use a bootstrapping method to obtain correct standard errors and p-values. Bootstrapping is a resampling method that tries to approximate the distribution of the parameter estimates to estimate the standard error.

One last thing to keep in mind is that the stepwise techniques don’t take any collinearity
in your model into account. Collinearity means that predictor variables in the same model are 
highly correlated. If collinearity is present in your model, you might want to consider first 
reducing the collinearity as much as possible and then running stepwise methods on the remaining 
variables.

In [14]:
/* Candidate interval predictors for the SalePrice model. */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area 
              Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom ;

ods graphics on;

/* Stepwise selection driven by significance level (SELECT=SL):
   an effect must have p < 0.05 to enter (SLENTRY) and to stay (SLSTAY).
   DETAILS=STEPS prints the fit and entry candidates at each step;
   SHOWPVALUES (previously misspelled "showpvales") adds p-values to the tables. */
proc glmselect data=statdata.ameshousing3 plots=all;
   STEPWISE: model SalePrice=&interval / selection=stepwise showpvalues
                   details=steps select=SL slstay=0.05 slentry=0.05;
   title "Stepwise Model Selection for SalePrice - SL 0.05";
run;

/*Optional code that will execute forward and backward selection, each with slentry and slstay = 0.05.
proc glmselect data=statdata.ameshousing3 plots=all;
   FORWARD: model SalePrice=&interval / selection=forward details=steps select=SL slentry=0.05;
   title "Forward Model Selection for SalePrice - SL 0.05";
run;

proc glmselect data=statdata.ameshousing3 plots=all;
   BACKWARD: model SalePrice=&interval / selection=backward details=steps select=SL slstay=0.05;
   title "Backward Model Selection for SalePrice - SL 0.05";
run;
*/

0,1
Data Set,STATDATA.AMESHOUSING3
Dependent Variable,SalePrice
Selection Method,Stepwise
Select Criterion,Significance Level
Stop Criterion,Significance Level
Entry Significance Level (SLE),0.05
Stay Significance Level (SLS),0.05
Effect Hierarchy Enforced,

0,1
Number of Observations Read,300
Number of Observations Used,300

Dimensions,Dimensions.1
Number of Effects,9
Number of Parameters,9

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,0,0.0,.,.,.
Error,299,423223500000.0,1415463276,,
Corrected Total,299,423223500000.0,,,

0,1
Root MSE,37623.0
Dependent Mean,137525.0
R-Square,0.0
Adj R-Sq,0.0
AIC,6624.21515
AICC,6624.25555
SBC,6325.91893

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,137525,2172.144314,63.31,<.0001

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,1,201241800000.0,201241800000.0,270.16,<.0001
Error,298,221981700000.0,744904950.0,,
Corrected Total,299,423223500000.0,,,

0,1
Root MSE,27293.0
Dependent Mean,137525.0
R-Square,0.4755
Adj R-Sq,0.4737
AIC,6432.62346
AICC,6432.70454
SBC,6138.03102

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,73904.0,4179.19378,17.68,<.0001
Basement_Area,1,72.107717,4.387055,16.44,<.0001

Entry Candidates,Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,Log pValue,Pr > F
1,Basement_Area,-98.8577,<.0001
2,Gr_Liv_Area,-84.6132,<.0001
3,Age_Sold,-73.5219,<.0001
4,Total_Bathroom,-69.188,<.0001
5,Garage_Area,-63.3558,<.0001
6,Deck_Porch_Area,-34.3105,<.0001
7,Lot_Area,-11.6303,<.0001
8,Bedroom_AbvGr,-5.5339,0.0040

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,2,264483000000.0,132241500000.0,247.42,<.0001
Error,297,158740500000.0,534479711.0,,
Corrected Total,299,423223500000.0,,,

0,1
Root MSE,23119.0
Dependent Mean,137525.0
R-Square,0.6249
Adj R-Sq,0.6224
AIC,6334.0262
AICC,6334.16179
SBC,6043.13755

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,12664.0,6650.339855,1.9,0.0578
Gr_Liv_Area,1,69.606974,6.399091,10.88,<.0001
Basement_Area,1,52.309702,4.137885,12.64,<.0001

Entry Candidates,Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,Log pValue,Pr > F
1,Gr_Liv_Area,-52.2496,<.0001
2,Age_Sold,-48.2636,<.0001
3,Garage_Area,-43.6174,<.0001
4,Total_Bathroom,-31.0375,<.0001
5,Deck_Porch_Area,-16.3568,<.0001
6,Lot_Area,-2.2708,0.1032
7,Bedroom_AbvGr,-0.757,0.4691

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,3,320714800000.0,106904900000.0,308.69,<.0001
Error,296,102508700000.0,346313132.0,,
Corrected Total,299,423223500000.0,,,

0,1
Root MSE,18609.0
Dependent Mean,137525.0
R-Square,0.7578
Adj R-Sq,0.7553
AIC,6204.82927
AICC,6205.03335
SBC,5917.6444

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,53400.0,6235.076995,8.56,<.0001
Gr_Liv_Area,1,68.106646,5.152294,13.22,<.0001
Basement_Area,1,36.32912,3.559067,10.21,<.0001
Age_Sold,1,-543.493346,42.65184,-12.74,<.0001

Entry Candidates,Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,Log pValue,Pr > F
1,Age_Sold,-67.2828,<.0001
2,Garage_Area,-37.5122,<.0001
3,Total_Bathroom,-21.6266,<.0001
4,Deck_Porch_Area,-12.5097,<.0001
5,Bedroom_AbvGr,-12.4446,<.0001
6,Lot_Area,-0.4524,0.6361

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,4,333571000000.0,83392754480.0,274.4,<.0001
Error,295,89652501590.0,303906785.0,,
Corrected Total,299,423223500000.0,,,

0,1
Root MSE,17433.0
Dependent Mean,137525.0
R-Square,0.7882
Adj R-Sq,0.7853
AIC,6166.62734
AICC,6166.91403
SBC,5883.14625

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,43815.0,6023.907004,7.27,<.0001
Gr_Liv_Area,1,61.238136,4.940722,12.39,<.0001
Basement_Area,1,33.430181,3.363709,9.94,<.0001
Garage_Area,1,42.984492,6.608851,6.5,<.0001
Age_Sold,1,-455.704354,42.173481,-10.81,<.0001

Entry Candidates,Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,Log pValue,Pr > F
1,Garage_Area,-21.8203,<.0001
2,Deck_Porch_Area,-12.9294,<.0001
3,Bedroom_AbvGr,-7.8057,0.0004
4,Total_Bathroom,-3.8856,0.0205
5,Lot_Area,-3.698,0.0248

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,5,339278800000.0,67855752389.0,237.65,<.0001
Error,294,83944757568.0,285526386.0,,
Corrected Total,299,423223500000.0,,,

0,1
Root MSE,16898.0
Dependent Mean,137525.0
R-Square,0.8017
Adj R-Sq,0.7983
AIC,6148.89269
AICC,6149.27625
SBC,5869.11538

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,46009.0,5859.485517,7.85,<.0001
Gr_Liv_Area,1,58.386514,4.831268,12.09,<.0001
Basement_Area,1,30.55424,3.323249,9.19,<.0001
Garage_Area,1,40.158112,6.436997,6.24,<.0001
Deck_Porch_Area,1,35.720258,7.98924,4.47,<.0001
Age_Sold,1,-447.25404,40.921927,-10.93,<.0001

Entry Candidates,Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,Log pValue,Pr > F
1,Deck_Porch_Area,-11.406,<.0001
2,Bedroom_AbvGr,-6.2737,0.0019
3,Total_Bathroom,-3.1045,0.0448
4,Lot_Area,-1.9476,0.1426

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,6,341074900000.0,56845818595.0,202.75,<.0001
Error,293,82148607939.0,280370676.0,,
Corrected Total,299,423223500000.0,,,

0,1
Root MSE,16744.0
Dependent Mean,137525.0
R-Square,0.8059
Adj R-Sq,0.8019
AIC,6144.40398
AICC,6144.89882
SBC,5868.33046

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,48620.0,5897.324643,8.24,<.0001
Gr_Liv_Area,1,65.097413,5.472624,11.9,<.0001
Basement_Area,1,31.279351,3.305546,9.46,<.0001
Garage_Area,1,38.728785,6.403565,6.05,<.0001
Deck_Porch_Area,1,32.487956,8.019119,4.05,<.0001
Age_Sold,1,-434.199118,40.877494,-10.62,<.0001
Bedroom_AbvGr,1,-4189.095026,1655.065743,-2.53,0.0119

Entry Candidates,Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,Log pValue,Pr > F
1,Bedroom_AbvGr,-4.4317,0.0119
2,Total_Bathroom,-2.5664,0.0768
3,Lot_Area,-2.1476,0.1168

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,7,342450800000.0,48921543221.0,176.86,<.0001
Error,292,80772716963.0,276618894.0,,
Corrected Total,299,423223500000.0,,,

0,1
Root MSE,16632.0
Dependent Mean,137525.0
R-Square,0.8091
Adj R-Sq,0.8046
AIC,6141.33678
AICC,6141.95747
SBC,5868.96704

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,47463.0,5880.674041,8.07,<.0001
Gr_Liv_Area,1,65.303724,5.436672,12.01,<.0001
Basement_Area,1,29.849078,3.3454,8.92,<.0001
Garage_Area,1,36.309606,6.452405,5.63,<.0001
Deck_Porch_Area,1,32.052554,7.967677,4.02,<.0001
Lot_Area,1,0.708127,0.317512,2.23,0.0265
Age_Sold,1,-447.198682,41.019314,-10.9,<.0001
Bedroom_AbvGr,1,-5042.766498,1687.928168,-2.99,0.0031

Entry Candidates,Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,Log pValue,Pr > F
1,Lot_Area,-3.6309,0.0265
2,Total_Bathroom,-2.2036,0.1104

Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary
Step,Effect Entered,Effect Removed,Number Effects In,F Value,Pr > F
0,Intercept,,1,0.0,1.0000
1,Basement_Area,,2,270.16,<.0001
2,Gr_Liv_Area,,3,118.32,<.0001
3,Age_Sold,,4,162.37,<.0001
4,Garage_Area,,5,42.3,<.0001
5,Deck_Porch_Area,,6,19.99,<.0001
6,Bedroom_AbvGr,,7,6.41,0.0119
7,Lot_Area,,8,4.97,0.0265

0
Selection stopped because the candidate for entry has SLE > 0.05 and the candidate for removal has SLS < 0.05.

Stop Details,Stop Details,Stop Details,Stop Details,Stop Details,Stop Details
Candidate For,Effect,Candidate Significance,Unnamed: 3_level_1,Compare Significance,Unnamed: 5_level_1
Entry,Total_Bathroom,0.1167,>,0.05,(SLE)
Removal,Lot_Area,0.0265,<,0.05,(SLS)

0,1
Effects:,Intercept Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area Lot_Area Age_Sold Bedroom_AbvGr

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,7,342450800000.0,48921543221.0,176.86,<.0001
Error,292,80772716963.0,276618894.0,,
Corrected Total,299,423223500000.0,,,

0,1
Root MSE,16632.0
Dependent Mean,137525.0
R-Square,0.8091
Adj R-Sq,0.8046
AIC,6141.33678
AICC,6141.95747
SBC,5868.96704

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,47463.0,5880.674041,8.07,<.0001
Gr_Liv_Area,1,65.303724,5.436672,12.01,<.0001
Basement_Area,1,29.849078,3.3454,8.92,<.0001
Garage_Area,1,36.309606,6.452405,5.63,<.0001
Deck_Porch_Area,1,32.052554,7.967677,4.02,<.0001
Lot_Area,1,0.708127,0.317512,2.23,0.0265
Age_Sold,1,-447.198682,41.019314,-10.9,<.0001
Bedroom_AbvGr,1,-5042.766498,1687.928168,-2.99,0.0031


In [15]:
/* Candidate interval predictors for the SalePrice model. */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area 
              Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom ;

ods graphics on;

/* Helper: run one stepwise PROC GLMSELECT step using the given information
   criterion as SELECT=. The criterion name is also appended to the model
   label (STEPWISE<criterion>) and to the title. */
%macro stepwise_by(criterion);
proc glmselect data=statdata.ameshousing3 plots=all;
   STEPWISE&criterion: model SalePrice = &interval / selection=stepwise details=steps select=&criterion;
   title "Stepwise Model Selection for SalePrice - &criterion";
run;
%mend stepwise_by;

/* Same four criteria, in the same order, as separate steps. */
%stepwise_by(AIC)
%stepwise_by(BIC)
%stepwise_by(AICC)
%stepwise_by(SBC)

0,1
Data Set,STATDATA.AMESHOUSING3
Dependent Variable,SalePrice
Selection Method,Stepwise
Select Criterion,AIC
Stop Criterion,AIC
Effect Hierarchy Enforced,

0,1
Number of Observations Read,300
Number of Observations Used,300

Dimensions,Dimensions.1
Number of Effects,9
Number of Parameters,9

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,0,0.0,.,.
Error,299,423223500000.0,1415463276,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,37623.0
Dependent Mean,137525.0
R-Square,0.0
Adj R-Sq,0.0
AIC,6624.21515
AICC,6624.25555
SBC,6325.91893

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,137525,2172.144314,63.31

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,1,201241800000.0,201241800000.0,270.16
Error,298,221981700000.0,744904950.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,27293.0
Dependent Mean,137525.0
R-Square,0.4755
Adj R-Sq,0.4737
AIC,6432.62346
AICC,6432.70454
SBC,6138.03102

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,73904.0,4179.19378,17.68
Basement_Area,1,72.107717,4.387055,16.44

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AIC
1,Basement_Area,6432.6235
2,Gr_Liv_Area,6461.1877
3,Age_Sold,6483.4097
4,Total_Bathroom,6492.0868
5,Garage_Area,6503.7574
6,Deck_Porch_Area,6561.6989
7,Lot_Area,6606.3138
8,Bedroom_AbvGr,6617.8389

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,2,264483000000.0,132241500000.0,247.42
Error,297,158740500000.0,534479711.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,23119.0
Dependent Mean,137525.0
R-Square,0.6249
Adj R-Sq,0.6224
AIC,6334.0262
AICC,6334.16179
SBC,6043.13755

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,12664.0,6650.339855,1.9
Gr_Liv_Area,1,69.606974,6.399091,10.88
Basement_Area,1,52.309702,4.137885,12.64

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AIC
1,Gr_Liv_Area,6334.0262
2,Age_Sold,6342.0095
3,Garage_Area,6351.3061
4,Total_Bathroom,6376.4084
5,Deck_Porch_Area,6405.4472
6,Lot_Area,6431.9372
7,Bedroom_AbvGr,6434.0931

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,3,320714800000.0,106904900000.0,308.69
Error,296,102508700000.0,346313132.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,18609.0
Dependent Mean,137525.0
R-Square,0.7578
Adj R-Sq,0.7553
AIC,6204.82927
AICC,6205.03335
SBC,5917.6444

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,53400.0,6235.076995,8.56
Gr_Liv_Area,1,68.106646,5.152294,13.22
Basement_Area,1,36.32912,3.559067,10.21
Age_Sold,1,-543.493346,42.65184,-12.74

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AIC
1,Age_Sold,6204.8293
2,Garage_Area,6264.6656
3,Total_Bathroom,6296.3441
4,Deck_Porch_Area,6314.2811
5,Bedroom_AbvGr,6314.4078
6,Lot_Area,6335.7989

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,4,333571000000.0,83392754480.0,274.4
Error,295,89652501590.0,303906785.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,17433.0
Dependent Mean,137525.0
R-Square,0.7882
Adj R-Sq,0.7853
AIC,6166.62734
AICC,6166.91403
SBC,5883.14625

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,43815.0,6023.907004,7.27
Gr_Liv_Area,1,61.238136,4.940722,12.39
Basement_Area,1,33.430181,3.363709,9.94
Garage_Area,1,42.984492,6.608851,6.5
Age_Sold,1,-455.704354,42.173481,-10.81

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AIC
1,Garage_Area,6166.6273
2,Deck_Porch_Area,6184.19
3,Bedroom_AbvGr,6194.098
4,Total_Bathroom,6201.3633
5,Lot_Area,6201.6954

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,5,339278800000.0,67855752389.0,237.65
Error,294,83944757568.0,285526386.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16898.0
Dependent Mean,137525.0
R-Square,0.8017
Adj R-Sq,0.7983
AIC,6148.89269
AICC,6149.27625
SBC,5869.11538

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,46009.0,5859.485517,7.85
Gr_Liv_Area,1,58.386514,4.831268,12.09
Basement_Area,1,30.55424,3.323249,9.19
Garage_Area,1,40.158112,6.436997,6.24
Deck_Porch_Area,1,35.720258,7.98924,4.47
Age_Sold,1,-447.25404,40.921927,-10.93

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AIC
1,Deck_Porch_Area,6148.8927
2,Bedroom_AbvGr,6158.7554
3,Total_Bathroom,6164.5138
4,Lot_Area,6166.4302

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,6,341074900000.0,56845818595.0,202.75
Error,293,82148607939.0,280370676.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16744.0
Dependent Mean,137525.0
R-Square,0.8059
Adj R-Sq,0.8019
AIC,6144.40398
AICC,6144.89882
SBC,5868.33046

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,48620.0,5897.324643,8.24
Gr_Liv_Area,1,65.097413,5.472624,11.9
Basement_Area,1,31.279351,3.305546,9.46
Garage_Area,1,38.728785,6.403565,6.05
Deck_Porch_Area,1,32.487956,8.019119,4.05
Age_Sold,1,-434.199118,40.877494,-10.62
Bedroom_AbvGr,1,-4189.095026,1655.065743,-2.53

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AIC
1,Bedroom_AbvGr,6144.404
2,Total_Bathroom,6147.6813
3,Lot_Area,6148.3694

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,7,342450800000.0,48921543221.0,176.86
Error,292,80772716963.0,276618894.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16632.0
Dependent Mean,137525.0
R-Square,0.8091
Adj R-Sq,0.8046
AIC,6141.33678
AICC,6141.95747
SBC,5868.96704

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,47463.0,5880.674041,8.07
Gr_Liv_Area,1,65.303724,5.436672,12.01
Basement_Area,1,29.849078,3.3454,8.92
Garage_Area,1,36.309606,6.452405,5.63
Deck_Porch_Area,1,32.052554,7.967677,4.02
Lot_Area,1,0.708127,0.317512,2.23
Age_Sold,1,-447.198682,41.019314,-10.9
Bedroom_AbvGr,1,-5042.766498,1687.928168,-2.99

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AIC
1,Lot_Area,6141.3368
2,Total_Bathroom,6143.7813

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,8,343132100000.0,42891512314.0,155.84
Error,291,80091420996.0,275228251.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16590.0
Dependent Mean,137525.0
R-Square,0.8108
Adj R-Sq,0.8056
AIC,6140.79563
AICC,6141.55688
SBC,5872.12967

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,44347.0,6191.271944,7.16
Gr_Liv_Area,1,63.197764,5.585739,11.31
Basement_Area,1,28.692184,3.417034,8.4
Garage_Area,1,35.754191,6.44584,5.55
Deck_Porch_Area,1,31.370539,7.959436,3.94
Lot_Area,1,0.699495,0.316761,2.21
Age_Sold,1,-420.815037,44.219144,-9.52
Bedroom_AbvGr,1,-4834.848748,1688.858227,-2.86
Total_Bathroom,1,3022.124723,1920.839066,1.57

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AIC
1,Total_Bathroom,6140.7956

Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary
Step,Effect Entered,Effect Removed,Number Effects In,AIC
0,Intercept,,1,6624.2151
1,Basement_Area,,2,6432.6235
2,Gr_Liv_Area,,3,6334.0262
3,Age_Sold,,4,6204.8293
4,Garage_Area,,5,6166.6273
5,Deck_Porch_Area,,6,6148.8927
6,Bedroom_AbvGr,,7,6144.4040
7,Lot_Area,,8,6141.3368
8,Total_Bathroom,,9,6140.7956*
* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion

0
Selection stopped because all effects are in the final model.

0,1
Effects:,Intercept Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,8,343132100000.0,42891512314.0,155.84
Error,291,80091420996.0,275228251.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16590.0
Dependent Mean,137525.0
R-Square,0.8108
Adj R-Sq,0.8056
AIC,6140.79563
AICC,6141.55688
SBC,5872.12967

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,44347.0,6191.271944,7.16
Gr_Liv_Area,1,63.197764,5.585739,11.31
Basement_Area,1,28.692184,3.417034,8.4
Garage_Area,1,35.754191,6.44584,5.55
Deck_Porch_Area,1,31.370539,7.959436,3.94
Lot_Area,1,0.699495,0.316761,2.21
Age_Sold,1,-420.815037,44.219144,-9.52
Bedroom_AbvGr,1,-4834.848748,1688.858227,-2.86
Total_Bathroom,1,3022.124723,1920.839066,1.57

0,1
Data Set,STATDATA.AMESHOUSING3
Dependent Variable,SalePrice
Selection Method,Stepwise
Select Criterion,BIC
Stop Criterion,BIC
Effect Hierarchy Enforced,

0,1
Number of Observations Read,300
Number of Observations Used,300

Dimensions,Dimensions.1
Number of Effects,9
Number of Parameters,9

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,0,0.0,.,.
Error,299,423223500000.0,1415463276,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,37623.0
Dependent Mean,137525.0
R-Square,0.0
Adj R-Sq,0.0
AIC,6624.21515
AICC,6624.25555
BIC,6321.30959
C(p),1239.71831
SBC,6325.91893

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,137525,2172.144314,63.31

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,1,201241800000.0,201241800000.0,270.16
Error,298,221981700000.0,744904950.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,27293.0
Dependent Mean,137525.0
R-Square,0.4755
Adj R-Sq,0.4737
AIC,6432.62346
AICC,6432.70454
BIC,6129.32244
C(p),510.53666
SBC,6138.03102

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,73904.0,4179.19378,17.68
Basement_Area,1,72.107717,4.387055,16.44

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,BIC
1,Basement_Area,6129.3224
2,Gr_Liv_Area,6157.6644
3,Age_Sold,6179.7247
4,Total_Bathroom,6188.3413
5,Garage_Area,6199.9327
6,Deck_Porch_Area,6257.517
7,Lot_Area,6301.8947
8,Bedroom_AbvGr,6313.3634

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,2,264483000000.0,132241500000.0,247.42
Error,297,158740500000.0,534479711.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,23119.0
Dependent Mean,137525.0
R-Square,0.6249
Adj R-Sq,0.6224
AIC,6334.0262
AICC,6334.16179
BIC,6030.68657
C(p),282.75938
SBC,6043.13755

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,12664.0,6650.339855,1.9
Gr_Liv_Area,1,69.606974,6.399091,10.88
Basement_Area,1,52.309702,4.137885,12.64

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,BIC
1,Gr_Liv_Area,6030.6866
2,Age_Sold,6038.5613
3,Garage_Area,6047.7342
4,Total_Bathroom,6072.5166
5,Deck_Porch_Area,6101.2106
6,Lot_Area,6127.4086
7,Bedroom_AbvGr,6129.5416

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,3,320714800000.0,106904900000.0,308.69
Error,296,102508700000.0,346313132.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,18609.0
Dependent Mean,137525.0
R-Square,0.7578
Adj R-Sq,0.7553
AIC,6204.82927
AICC,6205.03335
BIC,5903.19742
C(p),80.44973
SBC,5917.6444

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,53400.0,6235.076995,8.56
Gr_Liv_Area,1,68.106646,5.152294,13.22
Basement_Area,1,36.32912,3.559067,10.21
Age_Sold,1,-543.493346,42.65184,-12.74

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,BIC
1,Age_Sold,5903.1974
2,Garage_Area,5961.7128
3,Total_Bathroom,5992.7636
4,Deck_Porch_Area,6010.3666
5,Bedroom_AbvGr,6010.491
6,Lot_Area,6031.5035

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,4,333571000000.0,83392754480.0,274.4
Error,295,89652501590.0,303906785.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,17433.0
Dependent Mean,137525.0
R-Square,0.7882
Adj R-Sq,0.7853
AIC,6166.62734
AICC,6166.91403
BIC,5865.82469
C(p),35.73873
SBC,5883.14625

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,43815.0,6023.907004,7.27
Gr_Liv_Area,1,61.238136,4.940722,12.39
Basement_Area,1,33.430181,3.363709,9.94
Garage_Area,1,42.984492,6.608851,6.5
Age_Sold,1,-455.704354,42.173481,-10.81

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,BIC
1,Garage_Area,5865.8247
2,Deck_Porch_Area,5882.8416
3,Bedroom_AbvGr,5892.451
4,Total_Bathroom,5899.5016
5,Lot_Area,5899.824

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,5,339278800000.0,67855752389.0,237.65
Error,294,83944757568.0,285526386.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16898.0
Dependent Mean,137525.0
R-Square,0.8017
Adj R-Sq,0.7983
AIC,6148.89269
AICC,6149.27625
BIC,5848.69541
C(p),17.00051
SBC,5869.11538

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,46009.0,5859.485517,7.85
Gr_Liv_Area,1,58.386514,4.831268,12.09
Basement_Area,1,30.55424,3.323249,9.19
Garage_Area,1,40.158112,6.436997,6.24
Deck_Porch_Area,1,35.720258,7.98924,4.47
Age_Sold,1,-447.25404,40.921927,-10.93

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,BIC
1,Deck_Porch_Area,5848.6954
2,Bedroom_AbvGr,5858.1723
3,Total_Bathroom,5863.7094
4,Lot_Area,5865.5528

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,6,341074900000.0,56845818595.0,202.75
Error,293,82148607939.0,280370676.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16744.0
Dependent Mean,137525.0
R-Square,0.8059
Adj R-Sq,0.8019
AIC,6144.40398
AICC,6144.89882
BIC,5844.47548
C(p),12.47448
SBC,5868.33046

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,48620.0,5897.324643,8.24
Gr_Liv_Area,1,65.097413,5.472624,11.9
Basement_Area,1,31.279351,3.305546,9.46
Garage_Area,1,38.728785,6.403565,6.05
Deck_Porch_Area,1,32.487956,8.019119,4.05
Age_Sold,1,-434.199118,40.877494,-10.62
Bedroom_AbvGr,1,-4189.095026,1655.065743,-2.53

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,BIC
1,Bedroom_AbvGr,5844.4755
2,Total_Bathroom,5847.5999
3,Lot_Area,5848.2561

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,7,342450800000.0,48921543221.0,176.86
Error,292,80772716963.0,276618894.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16632.0
Dependent Mean,137525.0
R-Square,0.8091
Adj R-Sq,0.8046
AIC,6141.33678
AICC,6141.95747
BIC,5841.69151
C(p),9.47539
SBC,5868.96704

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,47463.0,5880.674041,8.07
Gr_Liv_Area,1,65.303724,5.436672,12.01
Basement_Area,1,29.849078,3.3454,8.92
Garage_Area,1,36.309606,6.452405,5.63
Deck_Porch_Area,1,32.052554,7.967677,4.02
Lot_Area,1,0.708127,0.317512,2.23
Age_Sold,1,-447.198682,41.019314,-10.9
Bedroom_AbvGr,1,-5042.766498,1687.928168,-2.99

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,BIC
1,Lot_Area,5841.6915
2,Total_Bathroom,5844.0039

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,8,343132100000.0,42891512314.0,155.84
Error,291,80091420996.0,275228251.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16590.0
Dependent Mean,137525.0
R-Square,0.8108
Adj R-Sq,0.8056
AIC,6140.79563
AICC,6141.55688
BIC,5841.35042
C(p),9.0
SBC,5872.12967

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,44347.0,6191.271944,7.16
Gr_Liv_Area,1,63.197764,5.585739,11.31
Basement_Area,1,28.692184,3.417034,8.4
Garage_Area,1,35.754191,6.44584,5.55
Deck_Porch_Area,1,31.370539,7.959436,3.94
Lot_Area,1,0.699495,0.316761,2.21
Age_Sold,1,-420.815037,44.219144,-9.52
Bedroom_AbvGr,1,-4834.848748,1688.858227,-2.86
Total_Bathroom,1,3022.124723,1920.839066,1.57

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,BIC
1,Total_Bathroom,5841.3504

Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary
Step,Effect Entered,Effect Removed,Number Effects In,BIC
0,Intercept,,1,6321.3096
1,Basement_Area,,2,6129.3224
2,Gr_Liv_Area,,3,6030.6866
3,Age_Sold,,4,5903.1974
4,Garage_Area,,5,5865.8247
5,Deck_Porch_Area,,6,5848.6954
6,Bedroom_AbvGr,,7,5844.4755
7,Lot_Area,,8,5841.6915
8,Total_Bathroom,,9,5841.3504*
* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion

0
Selection stopped because all effects are in the final model.

0,1
Effects:,Intercept Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,8,343132100000.0,42891512314.0,155.84
Error,291,80091420996.0,275228251.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16590.0
Dependent Mean,137525.0
R-Square,0.8108
Adj R-Sq,0.8056
AIC,6140.79563
AICC,6141.55688
BIC,5841.35042
C(p),9.0
SBC,5872.12967

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,44347.0,6191.271944,7.16
Gr_Liv_Area,1,63.197764,5.585739,11.31
Basement_Area,1,28.692184,3.417034,8.4
Garage_Area,1,35.754191,6.44584,5.55
Deck_Porch_Area,1,31.370539,7.959436,3.94
Lot_Area,1,0.699495,0.316761,2.21
Age_Sold,1,-420.815037,44.219144,-9.52
Bedroom_AbvGr,1,-4834.848748,1688.858227,-2.86
Total_Bathroom,1,3022.124723,1920.839066,1.57

0,1
Data Set,STATDATA.AMESHOUSING3
Dependent Variable,SalePrice
Selection Method,Stepwise
Select Criterion,AICC
Stop Criterion,AICC
Effect Hierarchy Enforced,

0,1
Number of Observations Read,300
Number of Observations Used,300

Dimensions,Dimensions.1
Number of Effects,9
Number of Parameters,9

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,0,0.0,.,.
Error,299,423223500000.0,1415463276,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,37623.0
Dependent Mean,137525.0
R-Square,0.0
Adj R-Sq,0.0
AIC,6624.21515
AICC,6624.25555
SBC,6325.91893

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,137525,2172.144314,63.31

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,1,201241800000.0,201241800000.0,270.16
Error,298,221981700000.0,744904950.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,27293.0
Dependent Mean,137525.0
R-Square,0.4755
Adj R-Sq,0.4737
AIC,6432.62346
AICC,6432.70454
SBC,6138.03102

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,73904.0,4179.19378,17.68
Basement_Area,1,72.107717,4.387055,16.44

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AICC
1,Basement_Area,6432.7045
2,Gr_Liv_Area,6461.2688
3,Age_Sold,6483.4907
4,Total_Bathroom,6492.1679
5,Garage_Area,6503.8385
6,Deck_Porch_Area,6561.7799
7,Lot_Area,6606.3949
8,Bedroom_AbvGr,6617.92

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,2,264483000000.0,132241500000.0,247.42
Error,297,158740500000.0,534479711.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,23119.0
Dependent Mean,137525.0
R-Square,0.6249
Adj R-Sq,0.6224
AIC,6334.0262
AICC,6334.16179
SBC,6043.13755

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,12664.0,6650.339855,1.9
Gr_Liv_Area,1,69.606974,6.399091,10.88
Basement_Area,1,52.309702,4.137885,12.64

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AICC
1,Gr_Liv_Area,6334.1618
2,Age_Sold,6342.1451
3,Garage_Area,6351.4417
4,Total_Bathroom,6376.544
5,Deck_Porch_Area,6405.5828
6,Lot_Area,6432.0728
7,Bedroom_AbvGr,6434.2287

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,3,320714800000.0,106904900000.0,308.69
Error,296,102508700000.0,346313132.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,18609.0
Dependent Mean,137525.0
R-Square,0.7578
Adj R-Sq,0.7553
AIC,6204.82927
AICC,6205.03335
SBC,5917.6444

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,53400.0,6235.076995,8.56
Gr_Liv_Area,1,68.106646,5.152294,13.22
Basement_Area,1,36.32912,3.559067,10.21
Age_Sold,1,-543.493346,42.65184,-12.74

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AICC
1,Age_Sold,6205.0334
2,Garage_Area,6264.8697
3,Total_Bathroom,6296.5482
4,Deck_Porch_Area,6314.4852
5,Bedroom_AbvGr,6314.6119
6,Lot_Area,6336.003

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,4,333571000000.0,83392754480.0,274.4
Error,295,89652501590.0,303906785.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,17433.0
Dependent Mean,137525.0
R-Square,0.7882
Adj R-Sq,0.7853
AIC,6166.62734
AICC,6166.91403
SBC,5883.14625

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,43815.0,6023.907004,7.27
Gr_Liv_Area,1,61.238136,4.940722,12.39
Basement_Area,1,33.430181,3.363709,9.94
Garage_Area,1,42.984492,6.608851,6.5
Age_Sold,1,-455.704354,42.173481,-10.81

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AICC
1,Garage_Area,6166.914
2,Deck_Porch_Area,6184.4767
3,Bedroom_AbvGr,6194.3847
4,Total_Bathroom,6201.65
5,Lot_Area,6201.9821

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,5,339278800000.0,67855752389.0,237.65
Error,294,83944757568.0,285526386.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16898.0
Dependent Mean,137525.0
R-Square,0.8017
Adj R-Sq,0.7983
AIC,6148.89269
AICC,6149.27625
SBC,5869.11538

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,46009.0,5859.485517,7.85
Gr_Liv_Area,1,58.386514,4.831268,12.09
Basement_Area,1,30.55424,3.323249,9.19
Garage_Area,1,40.158112,6.436997,6.24
Deck_Porch_Area,1,35.720258,7.98924,4.47
Age_Sold,1,-447.25404,40.921927,-10.93

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AICC
1,Deck_Porch_Area,6149.2763
2,Bedroom_AbvGr,6159.139
3,Total_Bathroom,6164.8974
4,Lot_Area,6166.8138

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,6,341074900000.0,56845818595.0,202.75
Error,293,82148607939.0,280370676.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16744.0
Dependent Mean,137525.0
R-Square,0.8059
Adj R-Sq,0.8019
AIC,6144.40398
AICC,6144.89882
SBC,5868.33046

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,48620.0,5897.324643,8.24
Gr_Liv_Area,1,65.097413,5.472624,11.9
Basement_Area,1,31.279351,3.305546,9.46
Garage_Area,1,38.728785,6.403565,6.05
Deck_Porch_Area,1,32.487956,8.019119,4.05
Age_Sold,1,-434.199118,40.877494,-10.62
Bedroom_AbvGr,1,-4189.095026,1655.065743,-2.53

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AICC
1,Bedroom_AbvGr,6144.8988
2,Total_Bathroom,6148.1761
3,Lot_Area,6148.8642

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,7,342450800000.0,48921543221.0,176.86
Error,292,80772716963.0,276618894.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16632.0
Dependent Mean,137525.0
R-Square,0.8091
Adj R-Sq,0.8046
AIC,6141.33678
AICC,6141.95747
SBC,5868.96704

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,47463.0,5880.674041,8.07
Gr_Liv_Area,1,65.303724,5.436672,12.01
Basement_Area,1,29.849078,3.3454,8.92
Garage_Area,1,36.309606,6.452405,5.63
Deck_Porch_Area,1,32.052554,7.967677,4.02
Lot_Area,1,0.708127,0.317512,2.23
Age_Sold,1,-447.198682,41.019314,-10.9
Bedroom_AbvGr,1,-5042.766498,1687.928168,-2.99

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AICC
1,Lot_Area,6141.9575
2,Total_Bathroom,6144.402

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,8,343132100000.0,42891512314.0,155.84
Error,291,80091420996.0,275228251.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16590.0
Dependent Mean,137525.0
R-Square,0.8108
Adj R-Sq,0.8056
AIC,6140.79563
AICC,6141.55688
SBC,5872.12967

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,44347.0,6191.271944,7.16
Gr_Liv_Area,1,63.197764,5.585739,11.31
Basement_Area,1,28.692184,3.417034,8.4
Garage_Area,1,35.754191,6.44584,5.55
Deck_Porch_Area,1,31.370539,7.959436,3.94
Lot_Area,1,0.699495,0.316761,2.21
Age_Sold,1,-420.815037,44.219144,-9.52
Bedroom_AbvGr,1,-4834.848748,1688.858227,-2.86
Total_Bathroom,1,3022.124723,1920.839066,1.57

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,AICC
1,Total_Bathroom,6141.5569

Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary
Step,Effect Entered,Effect Removed,Number Effects In,AICC
0,Intercept,,1,6624.2555
1,Basement_Area,,2,6432.7045
2,Gr_Liv_Area,,3,6334.1618
3,Age_Sold,,4,6205.0334
4,Garage_Area,,5,6166.9140
5,Deck_Porch_Area,,6,6149.2763
6,Bedroom_AbvGr,,7,6144.8988
7,Lot_Area,,8,6141.9575
8,Total_Bathroom,,9,6141.5569*
* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion

0
Selection stopped because all effects are in the final model.

0,1
Effects:,Intercept Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,8,343132100000.0,42891512314.0,155.84
Error,291,80091420996.0,275228251.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16590.0
Dependent Mean,137525.0
R-Square,0.8108
Adj R-Sq,0.8056
AIC,6140.79563
AICC,6141.55688
SBC,5872.12967

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,44347.0,6191.271944,7.16
Gr_Liv_Area,1,63.197764,5.585739,11.31
Basement_Area,1,28.692184,3.417034,8.4
Garage_Area,1,35.754191,6.44584,5.55
Deck_Porch_Area,1,31.370539,7.959436,3.94
Lot_Area,1,0.699495,0.316761,2.21
Age_Sold,1,-420.815037,44.219144,-9.52
Bedroom_AbvGr,1,-4834.848748,1688.858227,-2.86
Total_Bathroom,1,3022.124723,1920.839066,1.57

0,1
Data Set,STATDATA.AMESHOUSING3
Dependent Variable,SalePrice
Selection Method,Stepwise
Select Criterion,SBC
Stop Criterion,SBC
Effect Hierarchy Enforced,

0,1
Number of Observations Read,300
Number of Observations Used,300

Dimensions,Dimensions.1
Number of Effects,9
Number of Parameters,9

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,0,0.0,.,.
Error,299,423223500000.0,1415463276,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,37623.0
Dependent Mean,137525.0
R-Square,0.0
Adj R-Sq,0.0
AIC,6624.21515
AICC,6624.25555
SBC,6325.91893

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,137525,2172.144314,63.31

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,1,201241800000.0,201241800000.0,270.16
Error,298,221981700000.0,744904950.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,27293.0
Dependent Mean,137525.0
R-Square,0.4755
Adj R-Sq,0.4737
AIC,6432.62346
AICC,6432.70454
SBC,6138.03102

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,73904.0,4179.19378,17.68
Basement_Area,1,72.107717,4.387055,16.44

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,SBC
1,Basement_Area,6138.031
2,Gr_Liv_Area,6166.5953
3,Age_Sold,6188.8172
4,Total_Bathroom,6197.4944
5,Garage_Area,6209.1649
6,Deck_Porch_Area,6267.1064
7,Lot_Area,6311.7214
8,Bedroom_AbvGr,6323.2465

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,2,264483000000.0,132241500000.0,247.42
Error,297,158740500000.0,534479711.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,23119.0
Dependent Mean,137525.0
R-Square,0.6249
Adj R-Sq,0.6224
AIC,6334.0262
AICC,6334.16179
SBC,6043.13755

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,12664.0,6650.339855,1.9
Gr_Liv_Area,1,69.606974,6.399091,10.88
Basement_Area,1,52.309702,4.137885,12.64

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,SBC
1,Gr_Liv_Area,6043.1375
2,Age_Sold,6051.1208
3,Garage_Area,6060.4174
4,Total_Bathroom,6085.5197
5,Deck_Porch_Area,6114.5586
6,Lot_Area,6141.0486
7,Bedroom_AbvGr,6143.2044

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,3,320714800000.0,106904900000.0,308.69
Error,296,102508700000.0,346313132.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,18609.0
Dependent Mean,137525.0
R-Square,0.7578
Adj R-Sq,0.7553
AIC,6204.82927
AICC,6205.03335
SBC,5917.6444

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,53400.0,6235.076995,8.56
Gr_Liv_Area,1,68.106646,5.152294,13.22
Basement_Area,1,36.32912,3.559067,10.21
Age_Sold,1,-543.493346,42.65184,-12.74

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,SBC
1,Age_Sold,5917.6444
2,Garage_Area,5977.4808
3,Total_Bathroom,6009.1592
4,Deck_Porch_Area,6027.0962
5,Bedroom_AbvGr,6027.223
6,Lot_Area,6048.6141

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,4,333571000000.0,83392754480.0,274.4
Error,295,89652501590.0,303906785.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,17433.0
Dependent Mean,137525.0
R-Square,0.7882
Adj R-Sq,0.7853
AIC,6166.62734
AICC,6166.91403
SBC,5883.14625

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,43815.0,6023.907004,7.27
Gr_Liv_Area,1,61.238136,4.940722,12.39
Basement_Area,1,33.430181,3.363709,9.94
Garage_Area,1,42.984492,6.608851,6.5
Age_Sold,1,-455.704354,42.173481,-10.81

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,SBC
1,Garage_Area,5883.1463
2,Deck_Porch_Area,5900.7089
3,Bedroom_AbvGr,5910.6169
4,Total_Bathroom,5917.8822
5,Lot_Area,5918.2143

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,5,339278800000.0,67855752389.0,237.65
Error,294,83944757568.0,285526386.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16898.0
Dependent Mean,137525.0
R-Square,0.8017
Adj R-Sq,0.7983
AIC,6148.89269
AICC,6149.27625
SBC,5869.11538

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,46009.0,5859.485517,7.85
Gr_Liv_Area,1,58.386514,4.831268,12.09
Basement_Area,1,30.55424,3.323249,9.19
Garage_Area,1,40.158112,6.436997,6.24
Deck_Porch_Area,1,35.720258,7.98924,4.47
Age_Sold,1,-447.25404,40.921927,-10.93

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,SBC
1,Deck_Porch_Area,5869.1154
2,Bedroom_AbvGr,5878.9781
3,Total_Bathroom,5884.7365
4,Lot_Area,5886.6529

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,6,341074900000.0,56845818595.0,202.75
Error,293,82148607939.0,280370676.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16744.0
Dependent Mean,137525.0
R-Square,0.8059
Adj R-Sq,0.8019
AIC,6144.40398
AICC,6144.89882
SBC,5868.33046

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,48620.0,5897.324643,8.24
Gr_Liv_Area,1,65.097413,5.472624,11.9
Basement_Area,1,31.279351,3.305546,9.46
Garage_Area,1,38.728785,6.403565,6.05
Deck_Porch_Area,1,32.487956,8.019119,4.05
Age_Sold,1,-434.199118,40.877494,-10.62
Bedroom_AbvGr,1,-4189.095026,1655.065743,-2.53

Entry Candidates,Entry Candidates,Entry Candidates
Rank,Effect,SBC
1,Bedroom_AbvGr,5868.3305
2,Total_Bathroom,5871.6077
3,Lot_Area,5872.2959

Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary,Stepwise Selection Summary
Step,Effect Entered,Effect Removed,Number Effects In,SBC
0,Intercept,,1,6325.9189
1,Basement_Area,,2,6138.0310
2,Gr_Liv_Area,,3,6043.1375
3,Age_Sold,,4,5917.6444
4,Garage_Area,,5,5883.1463
5,Deck_Porch_Area,,6,5869.1154
6,Bedroom_AbvGr,,7,5868.3305*
* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion,* Optimal Value of Criterion

0
Selection stopped at a local minimum of the SBC criterion.

Stop Details,Stop Details,Stop Details,Stop Details,Stop Details
Candidate For,Effect,Candidate SBC,Unnamed: 3_level_1,Compare SBC
Entry,Lot_Area,5868.967,>,5868.3305
Removal,Bedroom_AbvGr,5869.1154,>,5868.3305

0,1
Effects:,Intercept Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area Age_Sold Bedroom_AbvGr

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value
Model,6,341074900000.0,56845818595.0,202.75
Error,293,82148607939.0,280370676.0,
Corrected Total,299,423223500000.0,,

0,1
Root MSE,16744.0
Dependent Mean,137525.0
R-Square,0.8059
Adj R-Sq,0.8019
AIC,6144.40398
AICC,6144.89882
SBC,5868.33046

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Parameter,DF,Estimate,Standard Error,t Value
Intercept,1,48620.0,5897.324643,8.24
Gr_Liv_Area,1,65.097413,5.472624,11.9
Basement_Area,1,31.279351,3.305546,9.46
Garage_Area,1,38.728785,6.403565,6.05
Deck_Porch_Area,1,32.487956,8.019119,4.05
Age_Sold,1,-434.199118,40.877494,-10.62
Bedroom_AbvGr,1,-4189.095026,1655.065743,-2.53


## Include= option

In [16]:
/* Stepwise regression of Pushups. INCLUDE=1 forces the first regressor */
/* listed in the MODEL statement (Max_Pulse) to stay in every model.    */
proc reg data=exercise;
   title "Forcing Variables into a Stepwise Model";
   model Pushups = Max_Pulse Age Rest_Pulse Run_Pulse
         / include=1 selection=stepwise;
run;
quit;

## Influential Observations

The INFLUENCE option gives you statistics that show you how much each observation
changes aspects of the regression depending on whether that observation is included. The
R option gives you more details about the residuals, as well as the value of the Cook’s D
statistic.

Plot Name | Description
---- | ----
Cooksd | Cook’s D statistic (the effect on the predicted value)
Rstudentbypredicted | Externally Studentized residuals by predicted value
DFFITS | The difference in the overall effect on the betas
DFBETAS | The difference on each beta (one computed for each variable)

The **Cook's D statistic** measures the distance between the set of parameter estimates with that observation deleted from your regression analysis and the set of parameter estimates with all the observations in your regression analysis. If any observation has a Cook's D statistic greater than 4 divided by n, where n is the sample size, that observation is influential. The Cook's D statistic is most useful for identifying influential observations when the purpose of your model is **parameter estimation**.

**STUDENT residuals** are calculated by dividing the residuals by their standard errors, so you can think of each STUDENT residual as roughly equivalent to a z-score. Typically, people consider z-scores large if their absolute value is greater than 2. So, for a relatively small sample size, a cutoff value of plus or minus 2 is reasonable for STUDENT residuals. However, with a large sample, it's very likely that even more STUDENT residuals greater than plus or minus 2 will occur just by chance. So, for larger data sets, you should typically use a larger cutoff value, the absolute value of 3.

SAS computes the **RStudent** value by computing the residual between each data point and a
regression line that was computed with that data point removed, and then dividing by the
standard error. Why is this computation necessary? If you have a very influential data
point, it will pull the line (or surface) closer to the point. Then, when you compute the
residual, you get a smaller value than if you had computed the regression with the data
point omitted. Various texts refer to the RStudent residuals as deleted residuals or
externally standardized residuals. You can use two rules of thumb to evaluate RSTUDENT residuals. First, if the RSTUDENT residual is different from the STUDENT residual, the observation is probably influential. Second, if the absolute value of the RSTUDENT residuals is greater than 2 or 3, you've probably detected an influential observation.

**DFFITS** measures the impact that each observation has on its own predicted value. For each observation, DFFITS is calculated using two predicted values. The first predicted value is calculated from a model using the entire data set to estimate model parameters. The second predicted value is calculated from a model using the data set with that particular observation removed to estimate model parameters. The difference between the two predicted values is divided by the standard error of the predicted value, without the observation. If the standardized difference between these predicted values is large, that particular observation has a large effect on the model fit. The rule of thumb for DFFITS has two versions. The general cutoff value is 2. The more precise cutoff is 2 times the square root of p divided by n, where p is the number of terms in the model, including the intercept, and n is the sample size. If the absolute value of DFFITS for any observation is greater than this cutoff value, you've detected an influential observation. DFFITS is most useful for **predictive models**.

**DFBETAS** measure the change in each parameter estimate. One DFBETA is calculated per predictor variable per observation. Each DFBETA is calculated by taking the estimated coefficient for that particular predictor variable, using all the data, and subtracting the estimated coefficient for that particular predictor variable with the current observation removed. This difference in the betas is divided by its standard error. This calculation is repeated for all predictor variables and all observations. Large DFBETAS indicate observations that are influential in estimating a given parameter. For DFBETAS, you use the same two rules of thumb as for DFFITS. The general cutoff value is 2. The more precise cutoff is $2{\sqrt{1/n}}$, where n is the sample size. 

The DFBETAS plot is a panel plot. It contains one plot for each parameter. In this case, because we have so many parameters, SAS created two panels.

**You can use STUDENT residuals to detect outliers. To detect influential observations, you can use RSTUDENT residuals and the DFFITS and Cook's D statistics.**

What to do with influential observations?

First, recheck for data entry errors.

Second, if the data appears to be valid, consider whether you have an adequate model. A different model might fit the data better. Here's one rule of thumb: Divide the number of influential observations you detect by the number of observations in your data set. If the result is greater than 5%, you probably have the wrong model. You might need a model that uses higher order terms. 

Third, determine whether the influential observation is valid but just unusual. If you had a larger sample size there might be more observations similar to the unusual one. You might have to collect more data to confirm the relationship suggested by the influential observation.

In [17]:
/* Candidate interval predictors offered to the stepwise selection below. */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area 
              Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom ;

/* Run the selection silently: ODS SELECT NONE suppresses its displayed   */
/* output, and ODS SELECT ALL restores normal output afterwards.          */
ods select none;
proc glmselect data=statdata.ameshousing3 plots=all;
   /* Stepwise selection driven by significance levels: 0.05 to enter and 0.05 to stay. */
   STEPWISE: model SalePrice = &interval / selection=stepwise
                   details=steps select=SL slentry=0.05 slstay=0.05;
   title "Stepwise Model Selection for SalePrice - SL 0.05";
run;
quit;
ods select all;

/* Capture the data behind each diagnostic plot as data sets      */
/* (Rstud, Cook, Dffits, Dfbs) so later cells can inspect them.   */
/* ODS OUTPUT must come before the PROC REG step it applies to.   */
ods graphics on;
ods output RSTUDENTBYPREDICTED=Rstud 
           COOKSDPLOT=Cook
           DFFITSPLOT=Dffits 
           DFBETASPANEL=Dfbs;
/* Request only the four influence plots, each as a separate labelled graph. */
proc reg data=statdata.ameshousing3 
         plots(unpack only label)=
              (RSTUDENTBYPREDICTED 
               COOKSD 
               DFFITS 
               DFBETAS);
   SigLimit: model SalePrice = &_GLSIND; /* _GLSIND: macro variable holding the effects selected by the PROC GLMSELECT step above */
   title 'SigLimit Model - Plots of Diagnostic Statistics';
run;
quit;

0,1
Number of Observations Read,300
Number of Observations Used,300

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,7,342450800000.0,48921543221.0,176.86,<.0001
Error,292,80772716963.0,276618894.0,,
Corrected Total,299,423223500000.0,,,

0,1,2,3
Root MSE,16632.0,R-Square,0.8091
Dependent Mean,137525.0,Adj R-Sq,0.8046
Coeff Var,12.09371,,

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Variable,Label,DF,Parameter Estimate,Standard Error,t Value,Pr > |t|
Intercept,Intercept,1,47463.0,5880.67404,8.07,<.0001
Gr_Liv_Area,Above grade (ground) living area square feet,1,65.30372,5.43667,12.01,<.0001
Basement_Area,Basement area in square feet,1,29.84908,3.3454,8.92,<.0001
Garage_Area,Size of garage in square feet,1,36.30961,6.45241,5.63,<.0001
Deck_Porch_Area,Total area of decks and porches in square feet,1,32.05255,7.96768,4.02,<.0001
Lot_Area,Lot size in square feet,1,0.70813,0.31751,2.23,0.0265
Age_Sold,"Age of house when sold, in years",1,-447.19868,41.01931,-10.9,<.0001
Bedroom_AbvGr,Bedrooms above grade,1,-5042.7665,1687.92817,-2.99,0.0031


Now let’s look at the Dffits data set. We see DFFITS influence statistics in the DFFITS column. But notice that for some observations like 7, 21, and 22 there are missing values in the DFFITS column. For observations that are flagged as influential by DFFITS, the statistics are in the DFFITSOUT column rather than the DFFITS column. Because the DFFITS values are not all in the same column, if we want to change the cutoff or ask questions about the DFFITS values we’ll have to do a little extra work.

Go back to the DFBETAS panel plot. Here we see the order of the variables in the _GLSIND macro variable: above grade living area, basement area, garage area, deck/porch area, lot area, age sold, and bedroom above grade. So in the Dfbs data set, _DFBETAS1 and _DFBETASOUT1 are for the intercept, _DFBETAS2 and _DFBETASOUT2 are for above grade living area, _DFBETAS3 and _DFBETASOUT3 are for basement area, and so forth, ending with _DFBETAS8 and _DFBETASOUT8 for the last predictor variable, bedroom above grade. 



In [18]:
 /* Prerequisite: first run the code from the previous demo,          */
 /* "Looking for Influential Observations, Part 1", and run both      */
 /* programs in the same SAS session. This cell prints the Rstud,     */
 /* Cook and Dffits data sets, which that demo creates via ODS OUTPUT. */

title;

/* RStudent by predicted value: check the outLevLabel column */
proc print data=Rstud noobs;
run;
/* Cook's D: check the CooksDLabel column */
proc print data=Cook noobs;
run;
/* DFFITS: check the DFFITSOUT column */
proc print data=Dffits noobs;
run;

Model,Dependent,RStudent,PredictedValue,outLevLabel,Observation
SigLimit,SalePrice,1.73092,185283.46,.,1
SigLimit,SalePrice,0.67964,180284.34,.,2
SigLimit,SalePrice,0.63948,104541.46,.,3
SigLimit,SalePrice,-0.58261,169597.56,.,4
SigLimit,SalePrice,1.32153,158490.53,.,5
SigLimit,SalePrice,-0.05738,125932.4,.,6
SigLimit,SalePrice,1.92473,174736.56,.,7
SigLimit,SalePrice,0.36232,153008.35,.,8
SigLimit,SalePrice,1.47568,156222.61,.,9
SigLimit,SalePrice,0.10202,140446.83,.,10

Model,Dependent,CooksD,Observation,CooksDLabel
SigLimit,SalePrice,0.0126,1,.
SigLimit,SalePrice,0.00102,2,.
SigLimit,SalePrice,0.00186,3,.
SigLimit,SalePrice,0.00092,4,.
SigLimit,SalePrice,0.00904,5,.
SigLimit,SalePrice,2e-05,6,.
SigLimit,SalePrice,0.01782,7,7
SigLimit,SalePrice,0.00024,8,.
SigLimit,SalePrice,0.00486,9,.
SigLimit,SalePrice,3e-05,10,.

Model,Dependent,Observation,DFFITS,DFFITSOUT
SigLimit,SalePrice,1,0.31861,.
SigLimit,SalePrice,2,0.09029,.
SigLimit,SalePrice,3,0.12177,.
SigLimit,SalePrice,4,-0.08573,.
SigLimit,SalePrice,5,0.26928,.
SigLimit,SalePrice,6,-0.01301,.
SigLimit,SalePrice,7,.,0.37928
SigLimit,SalePrice,8,0.04366,.
SigLimit,SalePrice,9,0.19752,.
SigLimit,SalePrice,10,0.01636,.


In [19]:
/* List the Dfbs data set to inspect its rows and columns. */
proc print data=Dfbs; run;

First, we’ll use a DATA step to create a data set named Dfbs01 from the first 300 observations of the Dfbs data set. In the next DATA step, we’ll create a data set named Dfbs02 starting with observation 301. Then we’ll combine the two new data sets by using this UPDATE statement in a DATA step, combining by observation. Let’s run these three DATA steps and take a look at the new data sets in the temporary Work library.


In [20]:
/* Split Dfbs in one pass: rows 1-300 go to Dfbs01, rows 301 onward to Dfbs02. */
data Dfbs01 Dfbs02;
   set Dfbs;
   if _n_ <= 300 then output Dfbs01;
   else output Dfbs02;
run;

/* Recombine the two pieces by Observation. UPDATE fills values from   */
/* Dfbs02 into the matching Dfbs01 row, ignoring missing values.       */
data Dfbs2;
   update Dfbs01 Dfbs02;
   by Observation;
run;

In [21]:

/* List the Dfbs2 data set. */
proc print data = Dfbs2;
run;

/* Build Dfbs3 by joining the two halves of Dfbs on Observation:        */
/* DFBETAS columns 1-6 come from Dfbs01 and columns 7-8 from Dfbs02.    */
/* PROC SQL runs each statement immediately and ignores RUN, so the     */
/* step is ended with QUIT.                                             */
proc sql number;
   create table Dfbs3 as
   select o.Model, o.Dependent, o.Observation,
          o._DFBETAS1, o._DFBETASOUT1,
          o._DFBETAS2, o._DFBETASOUT2,
          o._DFBETAS3, o._DFBETASOUT3,
          o._DFBETAS4, o._DFBETASOUT4,
          o._DFBETAS5, o._DFBETASOUT5,
          o._DFBETAS6, o._DFBETASOUT6,
          t._DFBETAS7, t._DFBETASOUT7,
          t._DFBETAS8, t._DFBETASOUT8
   from Dfbs01 as o inner join Dfbs02 as t
   on o.observation = t.observation;

   /* Display the joined table. */
   select * from Dfbs3;
quit;

In [22]:
data influential;
   /* Bring the four diagnostic data sets together, one row per observation. */
   merge Rstud Cook Dffits Dfbs2;
   by observation;

   /* Raise the flag when RStudent, Cook's D or DFFITS exceeded its cutpoint. */
   if (abs(Rstudent) > 3) or (Cooksdlabel ne ' ') or Dffitsout then flag = 1;

   /* Also raise it when any _dfbetasout value is set; the scan starts at */
   /* the second array element, so the first one is skipped.              */
   array outbeta{*} _dfbetasout: ;
   do _k = 2 to dim(outbeta);
      if outbeta{_k} then flag = 1;
   end;

   /* Blank out the statistics that stayed within their cutpoints. */
   if not (abs(Rstudent) > 3) then call missing(RStudent);
   if Cooksdlabel eq ' ' then call missing(CooksD);

   /* Keep flagged observations only; the helper variables are not stored. */
   if flag = 1;
   drop _k flag;
run;

/* Clear any title, then list the flagged observations. */
title;
proc print data=influential;
   id observation;
   var Rstudent CooksD Dffitsout _dfbetasout:;
run;

In [23]:
/* Simple regression of Pushups on Rest_Pulse. Only the Cook's D and      */
/* RStudent-by-predicted plots are requested, with points labelled by ID. */
proc reg data=exercise
         plots(only)=(cooksd(label) rstudentbypredicted(label));
   title "Displaying Influential Observations";
   id Subj;
   model Pushups = Rest_Pulse / r influence;
run;
quit;


In [24]:
/* Multiple regression of Pushups with the four influence plots         */
/* (Cook's D, RStudent by predicted, DFFITS, DFBETAS), labelled by Subj. */
ods graphics on;
proc reg data=exercise
         plots(only label)=(cooksd rstudentbypredicted dffits dfbetas);
   title "Detecting Influential Observations in Multiple Regression";
   id Subj;
   model Pushups = Age Max_Pulse Run_Pulse / influence;
run;
quit;
ods graphics off;

## Creating Dummy Variables 

In [25]:
/* Build 0/1 dummy variables for Gender and Region from the Store data set. */
data Dummy;
    set Store;

    /* Gender dummy: Male = 1 or 0; stays missing for any other Gender value. */
    if Gender = 'Male' then Male = 1;
    else if Gender = 'Female' then Male = 0;

    /* Region dummies, with West as the reference level (all three = 0).    */
    /* For a missing or unrecognised Region all three dummies are missing.  */
    /* The assignments sit inside the DO group so that East and South are   */
    /* not reset to 0 after CALL MISSING, as the unconditional IF/ELSE      */
    /* statements used to do.                                               */
    if Region in ('North' 'East' 'South' 'West') then do;
        North = (Region = 'North');
        East  = (Region = 'East');
        South = (Region = 'South');
    end;
    else call missing(North, East, South);
run;

/* Show the first ten rows to check the dummy coding. */
title "Creating and Using Dummy variables";
proc print data=Dummy(obs=10) noobs;
    var Region Gender Male North East South;
run;

In [26]:
/* Regress Music_Sales on Total_Sales and the Male, North, East and South dummy variables. */
proc reg data=Dummy;
   title "Running a Multiple Regression with Dummy Variables";
   model Music_Sales = Total_Sales
                       Male
                       North East South;
run;
quit;

## Detecting Collinearity via Variance Inflation Factor
(pay attention when VIF is between 5 and 10)

In [27]:
/* The VIF option adds a variance inflation factor for each regressor */
/* to the parameter estimates table.                                  */
proc reg data=exercise;
   title "Using the VIF to Detect Collinearity";
   model Pushups = Age Rest_Pulse Max_Pulse Run_Pulse
         / vif;
run;
quit;

In the PLOTS= option, the global plot option ONLY suppresses the default plots. QQ requests a residual quantile-quantile plot to assess the normality of the residual error, and RESIDUALBYPREDICTED requests a plot of residuals by predicted values. RESIDUALS requests a panel of plots of residuals by the predictor variables in the model.

In [28]:
/* Residual diagnostics for the PREDICT model: Q-Q plot, residuals by      */
/* predicted value, and the panel of residuals against each regressor.     */
ods graphics / imagemap=on width=800;

proc reg data=statdata.fitness
         plots(only)=(QQ RESIDUALBYPREDICTED RESIDUALS);
   title 'PREDICT Model - Plots of Diagnostic Statistics';
   PREDICT: model Oxygen_Consumption = RunTime Age Run_Pulse Maximum_Pulse;
   id Name;
run;
quit;

/* Clear the title so it does not carry over to later output. */
title;

0,1
Number of Observations Read,31
Number of Observations Used,31

Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance,Analysis of Variance
Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,4,711.45087,177.86272,33.01,<.0001
Error,26,140.10368,5.3886,,
Corrected Total,30,851.55455,,,

0,1,2,3
Root MSE,2.32134,R-Square,0.8355
Dependent Mean,47.37581,Adj R-Sq,0.8102
Coeff Var,4.89984,,

Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates,Parameter Estimates
Variable,DF,Parameter Estimate,Standard Error,t Value,Pr > |t|
Intercept,1,97.16952,11.65703,8.34,<.0001
RunTime,1,-2.77576,0.34159,-8.13,<.0001
Age,1,-0.18903,0.09439,-2.0,0.0557
Run_Pulse,1,-0.34568,0.1182,-2.92,0.0071
Maximum_Pulse,1,0.27188,0.13438,2.02,0.0534


#### Code for SAS Statistics by Example

In [29]:
*variables

Region
Advertising
Gender
Book_Sales
Music_Sales
Electronics_Sales
Total_Sales
;

/* YESNO format: displays 1 as 'Yes' and 0 as 'No' (applied to Advertising below). */
proc format;
   value yesno 1 = 'Yes'
               0 = 'No';
/* Simulate 200 store transactions. The fixed STREAMINIT seed makes the    */
/* stream repeatable; the order of the RAND calls must not change, or the  */
/* generated data will differ.                                             */
data Store;
   length Region $ 5;
   call streaminit(57676);
   do Transaction = 1 to 200;
      /* R is an integer from 1 to 10; only 1-4 assign a Region.              */
      /* NOTE(review): OTHERWISE is empty and the whole loop runs inside a    */
      /* single DATA step iteration, so for R = 5-10 Region keeps the value   */
      /* from the previous transaction (blank until the first assignment).    */
      /* Confirm this carry-over is intended.                                 */
      R = ceil(rand('uniform')*10);
      select(R);
         when(1) Region = 'East';
         when(2) Region = 'West';
         when(3) Region = 'North';
         when(4) Region = 'South';
         otherwise;
      end;
      /* NOTE(review): 'bernouli' is a misspelling of 'bernoulli'; confirm  */
      /* that RAND accepts it on the target SAS release.                    */
      Advertising = rand('bernouli',.6);
      if rand('uniform') lt .6 then Gender = 'Female';
         else Gender = 'Male';
      /* Comparisons such as (Gender = 'Female') evaluate to 1 or 0, so each */
      /* term below adds or subtracts a fixed amount for that group.         */
      /* Book sales: rounded to a multiple of 10. */
      Book_Sales = abs(round(rand('normal',250,50) + 30*(Gender = 'Female')
                    + 30*Advertising,10)) ;
      /* Music sales: rounded to a multiple of 5 before the advertising term is added. */
      Music_Sales = abs(round(rand('uniform')*40 + rand('normal',50,5)
         + 30*(Region = 'East' and Gender = 'Male')
         - 20*(Region = 'West' and Gender = 'Female'),5) + 10*Advertising);
      /* Electronics sales: rounded to a multiple of 10. */
      Electronics_Sales = abs(round(rand('normal',300,60) + 70*(Gender = 'Male')
       + 55*Advertising + 50*(Region = 'East') - 20*(Region = 'South') 
       + 75*(Region = 'West'),10));
      Total_Sales = sum(Book_Sales,Music_Sales,Electronics_Sales);
   /* Write one observation per transaction. */
   output;
   end;
   /* R is only a helper for choosing the region. */
   drop R;
   format Book_Sales Music_Sales Electronics_Sales Total_Sales dollar9.
          Advertising yesno.;
run;
 
/*title "Listing of Store";*/
/*proc print data=store heading=h;*/
/*run;*/

/*proc univariate data=store;*/
/*   var Book_Sales -- Total_Sales;*/
/*   histogram;*/
/*run;*/
/**/
/*title "Scatter Matrix for Store Variables";*/
/*proc sgscatter data=store;*/
/*   matrix Book_Sales -- Total_Sales / group = Gender;*/
/*run;*/
/**/
/*proc sgplot data=store;*/
/*   scatter x=Book_Sales y=Total_Sales / group=Gender;*/
/*run;*/

/* Split Total_Sales into two rank groups (GROUPS=2): Sales_Group is 0 */
/* for the lower group and 1 for the upper group.                      */
proc rank data=store groups=2 out=median_sales;
   var Total_Sales;
   ranks Sales_Group;
run;

/* SALES format: labels for the two Sales_Group values. */
proc format;
   value sales
      0 = 'Low'
      1 = 'High';
run;

/*proc logistic data=median_sales order=formatted;*/
/*   class Gender(param=ref ref='Male');*/
/*   model Sales_Group = Gender;*/
/*   format Sales_Group sales.;*/
/*quit;*/
/**/
/*proc logistic data=median_sales order=formatted;*/
/*   class Gender(param=ref ref='Male')*/
/*         Advertising (param=ref ref='No');*/
/*   model Sales_Group = Gender Advertising;*/
/*   format Sales_Group sales.;*/
/*quit;*/

*Create test data set;
/* NOTE(review): this is a Windows path, while the start of the file sets */
/* path=/folders/myfolders/ECST131 - adjust it for the environment in use. */
libname example 'c:\books\statistics by example';
/* Simulate blood pressure for 3 treatment groups of 20 subjects each    */
/* (60 observations). The seed is fixed, so keep the RAND call order.    */
data example.Blood_Pressure;
   call streaminit(37373);
   do Drug = 'Placebo','Drug A','Drug B';
      do i = 1 to 20;
         /* Sum statement: Subj is retained and increases by 1 per subject. */
         Subj + 1;
         /* Odd subject numbers are coded 'M', even ones 'F'. */
         if mod(Subj,2) then Gender = 'M';
         else Gender = 'F';
         /* Systolic pressure: +7 for Placebo, -6 for Drug B, relative to Drug A. */
         SBP = rand('normal',130,10) +
               7*(Drug eq 'Placebo') - 6*(Drug eq 'Drug B');
         /* Round to the nearest multiple of 2. */
         SBP = round(SBP,2);
         /* Diastolic pressure: +3 for Placebo, -2 for Drug B, relative to Drug A. */
         DBP = rand('normal',80,5) +
               3*(Drug eq 'Placebo') - 2*(Drug eq 'Drug B');
         DBP = round(DBP,2);
         /* Set a few values to missing on purpose. */
         if Subj in (5,15,25,55) then call missing(SBP, DBP);
         if Subj in (4,18) then call missing(Gender);
         output;
      end;
   end;
   /* The loop counter is not kept in the data set. */
   drop i;
run;

/*title "Listing of the first 25 observations from Blood_Pressure";*/
/*proc print data=example.Blood_Pressure(obs=25) noobs;*/
/*   var Subj Drug SBP DBP;*/
/*run;*/

/* Simulate 50 subjects for the regression examples. The seed is fixed,  */
/* so keep the RAND call order. Each variable is built from the ones     */
/* above it, which makes the pulse variables correlated.                 */
data exercise;
   call streaminit(7657657);
   do Subj = 1 to 50;
      Age = round(rand('normal',50,15));
      /* Pushups fall by 0.30 per year of age; INT truncates, ABS keeps the count non-negative. */
      Pushups = abs(int(rand('normal',40,10) - .30*age));
      /* Resting pulse rises by 0.35 per year of age. */
      Rest_Pulse = round(rand('normal',50,8) + .35*age);
      /* Maximum pulse is built from the resting pulse. */
      Max_Pulse = round(rest_pulse + rand('normal',50,5) - .05*age);
      /* Running pulse is the maximum pulse minus a normal(3,3) draw. */
      Run_Pulse = round(max_pulse - rand('normal',3,3));
      output;
   end;
run;

*Data set for a paired t-test example;
/* Eight subjects, each with a Before and an After value. The trailing @@ */
/* holds the input line so several observations are read from each line.  */
data reading;
   input Subj Before After @@;
datalines;
1 100 110  2 120 121  3 130 140  4 90 110  5 85 92
6 133 137  7 210 209  8 155 179
;

/*title "Listing of Data Set READING";*/
/*proc print data=reading noobs;*/
/*run;*/

*Data set that violates assumptions for a t-test;
/* 50 subjects x 2 genders = 100 observations. The seed is fixed, so keep */
/* the RAND call order.                                                   */
data salary;
   call streaminit(57575);
   do Subj = 1 to 50;
      do Gender = 'M','F';
         /* Income: 20000 times an exponential draw, plus a uniform amount */
         /* of up to 7000 that is added only when Gender = 'M'.            */
         Income = round(20000*rand('exponential') + rand('uniform')*7000*(Gender = 'M'));
         output;
      end;
   end;
run;
/*proc univariate data=salary;*/
/*   class Gender;*/
/*   id Subj;*/
/*   var Income;*/
/*   histogram Income;*/
/*run;*/

*Data set risk for logistic regression example;
/* YESNO format: displays 1 as 'Yes' and 0 as 'No' (applied to Heart_Attack below). */
proc format;
   value yesno 1 = 'Yes'
               0 = 'No';
run;

/* Simulate 250 x 2 genders = 500 observations. The seed is fixed, so */
/* keep the RAND call order.                                          */
data Risk;
   call streaminit(13579);
   length Age_Group $ 7;
   do i = 1 to 250;
      do Gender = 'F','M';
         /* Age: 50 plus a uniform draw scaled to 0-30, rounded to a whole number. */
         Age = round(rand('uniform')*30 + 50);
         /* Age bands; the leading digit keeps the labels in age order.        */
         /* NOTE(review): Age is always computed just above, so the missing()  */
         /* branch looks unreachable here.                                     */
         if missing(Age) then Age_Group = ' ';
         else if Age lt 60 then Age_Group = '1:< 60';
         else if Age le 70 then Age_Group = '2:60-70';
         else Age_Group = '3:71+';
         /* Cholesterol: normal(200,30), plus a uniform amount of up to 8 when Gender = 'M'. */
         Chol = rand('normal',200,30) + rand('uniform')*8*(Gender='M');
         Chol = round(Chol);
         /* Score combines cholesterol, age and gender. */
         Score = .3*chol + age + 8*(Gender eq 'M');
         /* Heart_Attack is 1 only when Score is above 130 AND a uniform draw is below .2; otherwise 0. */
         Heart_Attack = (Score gt 130)*(rand('uniform') lt .2);
         output;
       end;
   end;
   /* Only these variables are stored; i and Score are not kept. */
   keep Gender Age Age_Group chol Heart_Attack;
   format Heart_Attack yesno.;
run;

/*title "Listing of first 100 observations from RISK";*/
/*proc print data=risk(obs=100);*/
/*run;*/