Skip to content
master
Go to file
Code

Latest commit

 

Git stats

Files

Permalink
Failed to load latest commit information.
Type
Name
Latest commit message
Commit time
pkg
 
 
 
 
 
 

README.md

lmSubsets

Overview

Exact and approximation algorithms for variable-subset selection in ordinary linear regression models. Either compute all submodels with the lowest residual sum of squares, or determine the single-best submodel according to a pre-determined statistical criterion. See Hofmann et al. (2020) for details.

Installation

# Install package from CRAN:
install.packages("lmSubsets")

Usage

# load package
library("lmSubsets")

# load some data
data("AirPollution", package = "lmSubsets")

head(AirPollution)

##   precipitation temperature1 temperature7  age household education housing
## 1            36           27           71  8.1      3.34      11.4    81.5
## 2            35           23           72 11.1      3.14      11.0    78.8
## 3            44           29           74 10.4      3.21       9.8    81.6
## 4            47           45           79  6.5      3.41      11.1    77.5
## 5            43           35           77  7.6      3.44       9.6    84.6
## 6            53           45           80  7.7      3.45      10.2    66.8
##   population noncauc whitecollar income hydrocarbon nox so2 humidity mortality
## 1       3243     8.8        42.6   11.7          21  15  59       59   921.870
## 2       4281     3.5        50.7   14.4           8  10  39       57   997.875
## 3       4260     0.8        39.4   12.4           6   6  33       54   962.354
## 4       3125    27.1        50.2   20.6          18   8  24       56   982.291
## 5       6441    24.4        43.7   14.3          43  38 206       55  1071.289
## 6       3325    38.5        43.1   25.5          30  32  72       54  1030.380

All-subsets regression

# compute 3 best submodels for every subset size
lm_all <- lmSubsets(mortality ~ ., data = AirPollution, nbest = 3)
lm_all

## Call:
##   lmSubsets(formula = mortality ~ ., data = AirPollution, nbest = 3)
## 
## Deviance:
##   [best, size (tolerance)] = RSS
##       2 (0)    3 (0)     4 (0)    5 (0)    6 (0)    7 (0)    8 (0)    9 (0)   
##   1st 133694.5  99841.07 82388.53 69154.11 64633.79 60538.76 58385.72 57379.21
##   2nd 168695.5 103859.31 83335.14 72250.33 65659.86 62288.70 58870.48 57617.43
##   3rd 169041.4 109202.60 85241.98 74666.42 66554.64 62953.77 60057.48 57748.66
##       10 (0)   11 (0)   12 (0)   13 (0)   14 (0)   15 (0)   16 (0)  
##   1st 55358.05 54221.58 53921.82 53712.66 53696.00 53683.31 53680.02
##   2nd 56185.55 54718.93 54146.37 53874.74 53696.65 53690.20         
##   3rd 56550.95 55260.67 54186.59 53900.78 53709.86 53695.48         
## 
## Subset:
##   [variable, best] = size
##                 1st       2nd          3rd             
##   +(Intercept)  2-16      2-15         2-15            
##   precipitation 5-16      4,6-15       2,6-15          
##   temperature1  4-16      3,5-15       5-15            
##   temperature7  7-16      6,8-15       8-15            
##   age           9-16      11-15        11-15           
##   household     8-16      7,9-15       5,10-15         
##   education     3-4,6-16  2,5,7-15     4-5,7,9-15      
##   housing       13-16     12,14-15     8,14-15         
##   population    11-16     8-10,12-15   6-9,12-15       
##   noncauc       2-16      3-15         3-15            
##   whitecollar   14-16     15                           
##   income        16        13,15        11,14-15        
##   hydrocarbon   10-16     10-15        9-15            
##   nox           10-16     10-15        9-15            
##   so2           5-9,12-16 4-9,11,13-15 3-4,6-8,10,13-15
##   humidity      15-16     14           12-13,15

# plot deviances and BICs
plot(lm_all)

# plot heatmap, highlighting the 10 best submodels (BIC)
image(lm_all, best = 1:3, hilite = 1:10, hilite_penalty = "BIC")

# evaluate summary statistics
summary(lm_all)

## Call:
##   lmSubsets(formula = mortality ~ ., data = AirPollution, nbest = 3)
## 
## Statistics:
##    SIZE BEST sigma    R2        R2adj     pval       Cp        AIC      BIC     
##     2   1    48.01123 0.4144106 0.4043142 2.8849e-08 53.585642 638.8107 645.0938
##         2    53.93092 0.2611043 0.2483647 3.0221e-05 82.274971 652.7630 659.0460
##         3    53.98617 0.2595895 0.2468238 3.2154e-05 82.558453 652.8859 659.1689
##     3   1    41.85209 0.5626906 0.5473464 5.7874e-11 27.836910 623.2920 631.6694
##         2    42.68598 0.5450905 0.5291288 1.7818e-10 31.130547 625.6595 634.0369
##         3    43.77025 0.5216866 0.5049037 7.4440e-10 35.510292 628.6696 637.0469
##     4   1    38.35653 0.6391337 0.6198016 1.9699e-12 15.531554 613.7640 624.2358
##         2    38.57626 0.6349875 0.6154333 2.7041e-12 16.307465 614.4495 624.9212
##         3    39.01510 0.6266355 0.6066338 5.0630e-12 17.870447 615.8069 626.2786
##     5   1    35.45910 0.6971012 0.6750722 1.0975e-13  6.683675 605.2575 617.8236
##         2    36.24421 0.6835396 0.6605243 3.5924e-13  9.221560 607.8855 620.4516
##         3    36.84523 0.6729570 0.6491721 8.7460e-13 11.201957 609.8591 622.4252
##     6   1    34.59656 0.7169005 0.6906875 1.1157e-13  4.978493 603.2015 617.8619
##         2    34.87009 0.7124062 0.6857771 1.6918e-13  5.819540 604.1465 618.8069
##         3    35.10688 0.7084870 0.6814951 2.4190e-13  6.552961 604.9587 619.6191
##     7   1    33.79705 0.7348369 0.7048185 1.1482e-13  3.621912 601.2743 618.0290
##         2    34.28204 0.7271721 0.6962859 2.3951e-13  5.056290 602.9841 619.7388
##         3    34.46458 0.7242590 0.6930431 3.1495e-13  5.601433 603.6213 620.3761
##     8   1    33.50824 0.7442674 0.7098418 2.4093e-13  3.857124 601.1015 619.9506
##         2    33.64706 0.7421441 0.7074327 2.9669e-13  4.254470 601.5977 620.4468
##         3    33.98458 0.7369449 0.7015337 4.9030e-13  5.227424 602.7954 621.6445
##     9   1    33.54225 0.7486759 0.7092525 7.6686e-13  5.032120 602.0582 623.0016
##         2    33.61181 0.7476325 0.7080454 8.4893e-13  5.227381 602.3068 623.2502
##         3    33.65006 0.7470577 0.7073805 8.9767e-13  5.334944 602.4433 623.3867
##    10   1    33.27403 0.7575287 0.7138839 1.4806e-12  5.375433 601.9066 624.9444
##         2    33.52180 0.7539042 0.7096070 2.1117e-12  6.053710 602.7968 625.8346
##         3    33.63063 0.7523037 0.7077184 2.4657e-12  6.353221 603.1858 626.2236
##    11   1    33.26504 0.7625065 0.7140385 3.9595e-12  6.443899 602.6620 627.7941
##         2    33.41725 0.7603281 0.7114155 4.8987e-12  6.851561 603.2098 628.3420
##         3    33.58227 0.7579552 0.7085584 6.1626e-12  7.295611 603.8010 628.9331
##    12   1    33.51673 0.7638195 0.7096948 1.4379e-11  8.198194 604.3294 631.5559
##         2    33.58645 0.7628359 0.7084858 1.5800e-11  8.382254 604.5787 631.8052
##         3    33.59892 0.7626598 0.7082693 1.6068e-11  8.415219 604.6233 631.8498
##    13   1    33.80566 0.7647356 0.7046681 5.1381e-11 10.026756 606.0962 635.4170
##         2    33.85663 0.7640257 0.7037769 5.4909e-11 10.159607 606.2770 635.5978
##         3    33.86481 0.7639116 0.7036338 5.5497e-11 10.180949 606.3060 635.6268
##    14   1    34.16584 0.7648086 0.6983414 1.8846e-10 12.013101 608.0776 639.4928
##         2    34.16604 0.7648057 0.6983378 1.8850e-10 12.013627 608.0783 639.4935
##         3    34.17025 0.7647479 0.6982636 1.8950e-10 12.024458 608.0931 639.5082
##    15   1    34.53929 0.7648641 0.6917108 6.5757e-10 14.002698 610.0634 643.5729
##         2    34.54150 0.7648340 0.6916712 6.5932e-10 14.008341 610.0711 643.5806
##         3    34.54320 0.7648108 0.6916409 6.6067e-10 14.012672 610.0770 643.5865
##    16   1    34.92851 0.7648786 0.6847235 2.1933e-09 16.000000 612.0597 647.6636

Best-subset regression

# compute 20 best (BIC) submodels
lm_best <- lmSelect(mortality ~ ., data = AirPollution, nbest = 20)
lm_best

## Call:
##   lmSelect(formula = mortality ~ ., data = AirPollution, nbest = 20)
## 
## Criterion:
##   [best] = BIC
##        1st      2nd      3rd      4th      5th      6th      7th      8th 
##   617.8236 617.8619 618.0290 618.8069 619.6191 619.7388 619.8733 619.9506 
##        9th     10th     11th     12th     13th     14th     15th     16th 
##   620.3761 620.4269 620.4468 620.4516 620.5732 620.6156 620.8005 620.9322 
##       17th     18th     19th     20th 
##   621.0169 621.0900 621.1458 621.1550 
## 
## Subset:
##                 best                 
##   +(Intercept)  1-20                 
##   precipitation 1-11,14-19           
##   temperature1  1-20                 
##   temperature7  3-4,8,10-11,17-18    
##   age           13,16,19             
##   household     6,8,20               
##   education     2-3,6,8-9,11-14,19-20
##   housing       15,18                
##   population    5,9-11               
##   noncauc       1-20                 
##   whitecollar   7,17                 
##   income                             
##   hydrocarbon                        
##   nox                                
##   so2           1-20                 
##   humidity      14

# plot deviances and BIC
plot(lm_best)

# show heatmap
image(lm_best, hilite = 1)

# evaluate summary statistics
summary(lm_best)

## Call:
##   lmSelect(formula = mortality ~ ., data = AirPollution, nbest = 20)
## 
## Statistics:
##    SIZE BEST sigma    R2        R2adj     pval       Cp       AIC      BIC     
##    5     1   35.45910 0.6971012 0.6750722 1.0975e-13 6.683675 605.2575 617.8236
##    6     2   34.59656 0.7169005 0.6906875 1.1157e-13 4.978493 603.2015 617.8619
##    7     3   33.79705 0.7348369 0.7048185 1.1482e-13 3.621912 601.2743 618.0290
##    6     4   34.87009 0.7124062 0.6857771 1.6918e-13 5.819540 604.1465 618.8069
##    6     5   35.10688 0.7084870 0.6814951 2.4190e-13 6.552961 604.9587 619.6191
##    7     6   34.28204 0.7271721 0.6962859 2.3951e-13 5.056290 602.9841 619.7388
##    6     7   35.18135 0.7072491 0.6801425 2.7055e-13 6.784624 605.2129 619.8733
##    8     8   33.50824 0.7442674 0.7098418 2.4093e-13 3.857124 601.1015 619.9506
##    7     9   34.46458 0.7242590 0.6930431 3.1495e-13 5.601433 603.6213 620.3761
##    7    10   34.47918 0.7240254 0.6927829 3.2190e-13 5.645161 603.6721 620.4269
##    8    11   33.64706 0.7421441 0.7074327 2.9669e-13 4.254470 601.5977 620.4468
##    5    12   36.24421 0.6835396 0.6605243 3.5924e-13 9.221560 607.8855 620.4516
##    6    13   35.38715 0.7038141 0.6763895 3.6815e-13 7.427446 605.9128 620.5732
##    7    14   34.53343 0.7231562 0.6918154 3.4908e-13 5.807818 603.8608 620.6156
##    6    15   35.45422 0.7026903 0.6751616 4.0686e-13 7.637752 606.1401 620.8005
##    6    16   35.49316 0.7020369 0.6744477 4.3113e-13 7.760027 606.2718 620.9322
##    7    17   34.64911 0.7212983 0.6897472 4.1474e-13 6.155493 604.2621 621.0169
##    7    18   34.67023 0.7209585 0.6893689 4.2797e-13 6.219089 604.3352 621.0900
##    7    19   34.68636 0.7206987 0.6890797 4.3835e-13 6.267703 604.3911 621.1458
##    6    20   35.55912 0.7009283 0.6732365 4.7553e-13 7.967486 606.4946 621.1550

Authors

Marc Hofmann (maintainer), Cristian Gatu, Erricos J. Kontoghiorghes, Ana Colubi, Achim Zeileis

About

Source code for R package 'lmSubsets'.

Resources

Packages

No packages published
You can’t perform that action at this time.