In [4]:
data.raw = read.table("crabs.dat.txt", header = TRUE)

In [5]:
data.raw$color = factor(data.raw$color)
data.raw$y = factor(data.raw$y)

In [6]:
library(MASS)
model.all_variables = glm.nb(spine ~ sat + y + weight + width + color, data = data.raw)
summary(model.all_variables)


Call:
glm.nb(formula = spine ~ sat + y + weight + width + color, data = data.raw, 
    init.theta = 182421.4628, link = log)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2842  -0.2786   0.1515   0.3177   1.3208  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)   
(Intercept) -0.009537   0.978631  -0.010  0.99222   
sat         -0.007227   0.022171  -0.326  0.74444   
y1           0.091759   0.147522   0.622  0.53394   
weight      -0.132128   0.180488  -0.732  0.46413   
width        0.022427   0.049338   0.455  0.64942   
color2       0.581651   0.259188   2.244  0.02482 * 
color3       0.716702   0.268607   2.668  0.00763 **
color4       0.766032   0.284859   2.689  0.00716 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(182421.5) family taken to be 1)

    Null deviance: 56.156  on 172  degrees of freedom
Residual deviance: 44.604  on 165  degrees of freedom
AIC: 537.83

Nu

In [7]:
options(warn=-1)
model.all_variables.backwards = stepAIC(model.all_variables, direction = "backward")

Start:  AIC=535.83
spine ~ sat + y + weight + width + color

         Df    AIC
- sat     1 533.93
- width   1 534.03
- y       1 534.21
- weight  1 534.36
<none>      535.83
- color   3 539.62

Step:  AIC=533.93
spine ~ y + weight + width + color

         Df    AIC
- width   1 532.16
- y       1 532.22
- weight  1 532.55
<none>      533.93
- color   3 537.86

Step:  AIC=532.16
spine ~ y + weight + color

         Df    AIC
- y       1 530.52
- weight  1 530.69
<none>      532.16
- color   3 535.98

Step:  AIC=530.52
spine ~ weight + color

         Df    AIC
- weight  1 528.82
<none>      530.52
- color   3 534.05

Step:  AIC=528.82
spine ~ color

        Df    AIC
<none>     528.82
- color  3 533.38


In [8]:
model.color = glm.nb(spine ~ color, data = data.raw)
summary(model.color)


Call:
glm.nb(formula = spine ~ color, data = data.raw, init.theta = 183942.6653, 
    link = log)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2740  -0.2725   0.1347   0.3656   1.2378  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)   0.2877     0.2500   1.151  0.24985   
color2        0.5922     0.2586   2.290  0.02202 * 
color3        0.7321     0.2659   2.754  0.00589 **
color4        0.7644     0.2800   2.730  0.00632 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(183942.7) family taken to be 1)

    Null deviance: 56.156  on 172  degrees of freedom
Residual deviance: 45.603  on 169  degrees of freedom
AIC: 530.82

Number of Fisher Scoring iterations: 1


              Theta:  183943 
          Std. Err.:  2342618 

 2 x log-likelihood:  -520.825 

Dispersion parameter equals to one!!! This is just equivalent to the Poisson Regression model!

In [15]:
model.weight.weightcolor = glm.nb(spine ~ weight + weight:color, data = data.raw)
model.color.weight = glm.nb(spine ~ color + weight, data = data.raw)
model.color.width = glm.nb(spine ~ color + width, data = data.raw)
model.color.y = glm.nb(spine ~ color + y, data = data.raw)
model.color.sat = glm.nb(spine ~ color + sat, data = data.raw)

In [19]:
require(formula.tools)
require(boot)
options(warn=-1)

crossValidation <- function(mod) {
    set.seed(12345)
    rawError = signif(cv.glm(data.raw[, 2:7], mod, K = 10)$delta[1], 4)
    adjustedError = signif(cv.glm(data.raw[, 2:7], mod, K = 10)$delta[2], 4)
    c(rawCVError=rawError, adjustedCVError=adjustedError)
}

models = list(color = model.color, 
              all_variables = model.all_variables,
              color.weight = model.color.weight,
              color.width = model.color.width,
              color.y = model.color.y,
              color.sat = model.color.sat,
              weight.weightcolor = model.weight.weightcolor
             )

print(sapply(models, crossValidation, simplify = FALSE))

$color
     rawCVError adjustedCVError 
         0.5716          0.5682 

$all_variables
     rawCVError adjustedCVError 
         0.6002          0.5875 

$color.weight
     rawCVError adjustedCVError 
         0.5848          0.5773 

$color.width
     rawCVError adjustedCVError 
         0.5892          0.5772 

$color.y
     rawCVError adjustedCVError 
         0.5728          0.5706 

$color.sat
     rawCVError adjustedCVError 
         0.5836          0.5723 

$weight.weightcolor
     rawCVError adjustedCVError 
         0.5854          0.5808 

