Replicating Some Results from An Introduction to Statistical Learning (2nd Edition) - Chapter 5: Resampling Methods

Nishan Mudalige

2022-10-01

Source: An Introduction to Statistical Learning (2nd Edition) by Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani

knitr::opts_chunk$set(echo = TRUE)

Cross-Validation

# Required libraries for data analysis

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.3
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3

## 
## Attaching package: 'ISLR2'

## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
library(boot)
library(bestglm)
## Warning: package 'bestglm' was built under R version 4.1.3

## Loading required package: leaps

## Warning: package 'leaps' was built under R version 4.1.3
library(npreg)
## Warning: package 'npreg' was built under R version 4.1.3

## 
## Attaching package: 'npreg'

## The following object is masked from 'package:boot':
## 
##     boot

Validation Set Approach

Replicate the results in Figure 5.2 (a single validation split, followed by ten repeated random splits)

data(Auto)

set.seed(6)

n = nrow(Auto)
MSE = NULL

training_set_indices = sample(n, n/2, replace = F)
validation_set_indices = (1:n)[!(1:n %in% training_set_indices)]

training_set = Auto[training_set_indices, ]
validation_set = Auto[validation_set_indices, ]

# Fit polynomials of degree 1 to 10 on the training half and compute the
# validation error for each degree
for(i in 1:10){
  
  n_valid = nrow(validation_set)

  model = lm(mpg ~ poly(horsepower, i), data = training_set)

  fitted_values = predict(model, validation_set)

  # Note: the sum of squared errors is divided by (n_valid - number of model
  # coefficients) rather than by n_valid, i.e. a degrees-of-freedom adjusted
  # version of the validation MSE
  MSE[i] = (sum((validation_set$mpg - fitted_values)^2)) / (n_valid - length(model$coefficients))
}

plot(MSE, type = "b", pch = 16, ylab = "MSE")
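
For reference, ISLR defines the validation-set error as the plain mean of the squared prediction errors, without the degrees-of-freedom adjustment used above. A minimal sketch of that version, reusing the training/validation split created above (MSE_plain is a new name introduced here):

# Plain validation MSE (mean squared prediction error), as defined in ISLR
MSE_plain = rep(NA, 10)

for(i in 1:10){
  model = lm(mpg ~ poly(horsepower, i), data = training_set)
  fitted_values = predict(model, validation_set)
  MSE_plain[i] = mean((validation_set$mpg - fitted_values)^2)
}

plot(MSE_plain, type = "b", pch = 16, xlab = "Degree of polynomial", ylab = "MSE")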

set.seed(1)

MSE_list = NULL

# Repeat the validation-set approach over ten different random splits of the
# data (right-hand panel of Figure 5.2)
for(i in 1:10){
  
  training_set_indices = sample(n, n/2, replace = F)
  validation_set_indices = (1:n)[!(1:n %in% training_set_indices)]
  
  training_set = Auto[training_set_indices, ]
  validation_set = Auto[validation_set_indices, ]  
  
  for(j in 1:10){
    
    n_valid = nrow(validation_set)
    
    model = lm(mpg ~ poly(horsepower, j), data = training_set)
    
    fitted_values = predict(model, validation_set)
    
    MSE[j] = (sum((validation_set$mpg - fitted_values)^2)) / (n_valid - length(model$coefficients))
  }
  
  MSE_list[[i]] = MSE
}


colour_vec = adjustcolor(rainbow(10), alpha = 0.65)
plot(MSE_list[[1]], type = "b", pch = 16, col = colour_vec[1], ylim=c(16,25), ylab = "MSE")

# Overlay the validation-error curves from the remaining nine splits
for(i in 2:10){
  lines(MSE_list[[i]], type="b", pch = 16, col = colour_vec[i])  
}

Leave One Out Cross Validation (LOOCV)

Replicate the result in the left panel of Figure 5.4
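
The quantity estimated here is the LOOCV test-error estimate from ISLR (Eq. 5.1),

$$\mathrm{CV}_{(n)} = \frac{1}{n}\sum_{i=1}^{n} \mathrm{MSE}_i, \qquad \mathrm{MSE}_i = \bigl(y_i - \hat{y}_i^{(-i)}\bigr)^2,$$

where $\hat{y}_i^{(-i)}$ is the prediction for observation $i$ from the model fit with that observation left out.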

set.seed(9)

X = Auto$horsepower
Y = Auto$mpg

loocv_list = NULL
cv_data = NULL

# Build up the design matrix one polynomial term at a time; LOOCV() is the
# leave-one-out cross-validation helper from the bestglm package loaded above
for(i in 1:10){
  cv_data = cbind(cv_data, X^i)
  loocv_list[[i]] = LOOCV(cv_data, Y)  
}

loocv_df = do.call(rbind.data.frame, loocv_list)
names(loocv_df) = c("MSE", "SD of MSE")

plot(loocv_df$MSE, type = "b", pch = 16, ylab = "MSE")
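
As a cross-check (not part of the original figure), the same LOOCV curve should be reproducible either with the least-squares shortcut from ISLR Eq. 5.2 or with boot::cv.glm(), whose default K equals the number of observations and therefore performs leave-one-out CV, as in the ISLR lab. A sketch, with loocv_shortcut and loocv_cvglm as names introduced here:

loocv_shortcut = rep(NA, 10)
loocv_cvglm = rep(NA, 10)

for(i in 1:10){
  # Leverage-based shortcut (ISLR Eq. 5.2): mean(((y - yhat) / (1 - h))^2)
  lm_fit = lm(mpg ~ poly(horsepower, i), data = Auto)
  loocv_shortcut[i] = mean((residuals(lm_fit) / (1 - hatvalues(lm_fit)))^2)
  
  # cv.glm() with its default K = n refits the model n times (slower)
  glm_fit = glm(mpg ~ poly(horsepower, i), data = Auto)
  loocv_cvglm[i] = cv.glm(Auto, glm_fit)$delta[1]
}

# Both columns should agree closely with loocv_df$MSE computed above
round(cbind(loocv_df$MSE, loocv_shortcut, loocv_cvglm), 3)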

K-Fold Cross Validation

Replicate the result in the right panel of Figure 5.4 using repeated 10-fold cross-validation.

set.seed(12)

cv.error.list = NULL

# Repeat 10-fold cross-validation ten times; each repetition re-draws the
# folds and evaluates polynomial degrees 1 to 10
for(i in 1:10){
  
  cv.error.10 <- rep(0, 10)
  
  for (j in 1:10) {
    glm.fit <- glm(mpg ~ poly(horsepower, j), data = Auto)
    cv.error.10[j] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]
  }
  cv.error.list[[i]] = cv.error.10
}

# cv.error.list

colour_vec = adjustcolor(rainbow(10), alpha = 0.65)

plot(cv.error.list[[1]], type = "b", pch = 16, 
     col = colour_vec[1], ylim=c(18, 25),
     ylab = "MSE", xlab = "Degree of polynomial")

# Overlay the CV curves from the remaining repetitions
for(i in 2:10){
  lines(cv.error.list[[i]], type = "b", pch = 16, col = colour_vec[i])
}
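
One optional way (not in the textbook figure) to summarize the ten repeated runs is to average the CV curves and report the degree with the smallest average error; cv_matrix and cv_mean below are names introduced here:

# Stack the ten CV curves into a 10 x 10 matrix (runs x polynomial degree)
cv_matrix = do.call(rbind, cv.error.list)
cv_mean = colMeans(cv_matrix)

plot(cv_mean, type = "b", pch = 16,
     xlab = "Degree of polynomial", ylab = "Average 10-fold CV error")

# Degree with the smallest average CV error
which.min(cv_mean)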

Attempting to recreate a figure similar to Figure 2.9 (simulated data with a linear regression fit and two smoothing spline fits)

library(matlib)
## Warning: package 'matlib' was built under R version 4.1.3
# Visually estimating 4 points from Figure 2.9

x1 = 32;  y1 = 6
x2 = 40;  y2 = 7.1
x3 = 47;  y3 = 7.95
x4 = 70;  y4 = 9.25


A = rbind(c(1, x1, x1^2, x1^3),
          c(1, x2, x2^2, x2^3),
          c(1, x3, x3^2, x3^3),
          c(1, x4, x4^2, x4^3))

b = c(y1, y2, y3, y4)

ge_solve = gaussianElimination(A, b)

# Cubic polynomial evaluated using the coefficients from the Gaussian
# elimination solution (last column of ge_solve)
f = function(x){
  ge_solve[1,5]  +  ge_solve[2,5]*x  +  ge_solve[3,5]*x^2  +  ge_solve[4,5]*x^3
}
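
A quick optional sanity check that the interpolating cubic passes through the four visually estimated points:

# Should return (approximately) 6.00, 7.10, 7.95, 9.25
round(f(c(x1, x2, x3, x4)), 2)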

x = seq(0, 100, by=0.75)
y = sapply(x, f)

# plot(x, y, type = "l")

# Add some noise
sim_y = y + rnorm(length(y), 0, 0.25)
sim_x = x + rnorm(length(x), 0, 2)

# Create linear model
linear_model = lm(sim_y ~ sim_x)


# Create smoothing splines with a small and a larger number of knots
model_spline_A <- ss(sim_x, sim_y, nknots = 3)
model_spline_B <- ss(sim_x, sim_y, nknots = 10)

# Plot the underlying curve, the noisy data, and the three fitted models
plot(x, y, xlab = "X", ylab = "Y")
points(sim_x, sim_y)
lines(sim_x, predict(linear_model, data.frame(sim_x)), col = "orange", lwd=2)
lines(model_spline_A$x, model_spline_A$y, col = "lightblue", lwd = 2)
lines(model_spline_B$x, model_spline_B$y, col = "springgreen3", lwd = 2)
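
Optionally, a legend makes the three fits easier to tell apart; the labels below are just descriptive and the colours mirror the lines() calls above:

legend("topleft",
       legend = c("Linear regression", "Smoothing spline (3 knots)", "Smoothing spline (10 knots)"),
       col = c("orange", "lightblue", "springgreen3"), lwd = 2, bty = "n")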
