Replicating Some Results from An Introduction to Statistical Learning (2nd Edition) - Chapter 5: Resampling Methods
Nishan Mudalige
2022-10-01
Source: An Introduction to Statistical Learning (2nd Edition) by Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani
knitr::opts_chunk$set(echo = TRUE)
# Required libraries for data analysis
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.3
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(boot)
library(bestglm)
## Warning: package 'bestglm' was built under R version 4.1.3
## Loading required package: leaps
## Warning: package 'leaps' was built under R version 4.1.3
library(npreg)
## Warning: package 'npreg' was built under R version 4.1.3
##
## Attaching package: 'npreg'
## The following object is masked from 'package:boot':
##
## boot
Replicate the results in Figure 5.2
data(Auto)
set.seed(6)
n = nrow(Auto)
MSE = NULL
training_set_indices = sample(n, n/2, replace = F)
validation_set_indices = (1:n)[!(1:n %in% training_set_indices)]
training_set = Auto[training_set_indices, ]
validation_set = Auto[validation_set_indices, ]
for(i in 1:10){
n_valid = nrow(validation_set)
model = lm(mpg ~ poly(horsepower, i), data = training_set)
fitted_values = predict(model, validation_set)
MSE[i] = (sum((validation_set$mpg - fitted_values)^2)) / (n_valid - length(model$coefficients))
}
plot(MSE, type = "b", pch = 16, ylab = "MSE")
set.seed(1)
MSE_list = NULL
for(i in 1:10){
training_set_indices = sample(n, n/2, replace = F)
validation_set_indices = (1:n)[!(1:n %in% training_set_indices)]
training_set = Auto[training_set_indices, ]
validation_set = Auto[validation_set_indices, ]
for(j in 1:10){
n_valid = nrow(validation_set)
model = lm(mpg ~ poly(horsepower, j), data = training_set)
fitted_values = predict(model, validation_set)
MSE[j] = (sum((validation_set$mpg - fitted_values)^2)) / (n_valid - length(model$coefficients))
}
MSE_list[[i]] = MSE
}
colour_vec = adjustcolor(rainbow(10), alpha = 0.65)
plot(MSE_list[[1]], type = "b", pch = 16, col = colour_vec[1], ylim=c(16,25), ylab = "MSE")
for(i in 2:9){
lines(MSE_list[[i]], type="b", pch = 16, col = colour_vec[i])
}
Replicate the result in the right left panel in Figure 5.4
set.seed(9)
X = Auto$horsepower
Y = Auto$mpg
loocv_list = NULL
cv_data = NULL
for(i in 1:10){
cv_data = cbind(cv_data, X^i)
loocv_list[[i]] = LOOCV(cv_data, Y)
}
loocv_df = do.call(rbind.data.frame, loocv_list)
names(loocv_df) = c("MSE", "SD of MSE")
plot(loocv_df$MSE, type = "b", pch = 16, ylab = "MSE")
set.seed(12)
cv.error.list = NULL
for(i in 1:10){
cv.error.10 <- rep(0, 10)
for (j in 1:10) {
glm.fit <- glm(mpg ~ poly(horsepower, j), data = Auto)
cv.error.10[j] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]
}
cv.error.list[[i]] = cv.error.10
}
# cv.error.list
colour_vec = adjustcolor(rainbow(10), alpha = 0.65)
plot(cv.error.list[[1]], type = "b", pch = 16,
col = colour_vec[1], ylim=c(18, 25),
ylab = "MSE", xlab = "Degree of polynomial")
for(i in 2:9){
lines(cv.error.list[[i]], type = "b", pch = 16, col = colour_vec[i])
}
library(matlib)
## Warning: package 'matlib' was built under R version 4.1.3
# Visually estimating 4 points from
x1 = 32; y1 = 6
x2 = 40; y2 = 7.1
x3 = 47; y3 = 7.95
x4 = 70; y4 = 9.25
A = rbind(c(1, x1, x1^2, x1^3),
c(1, x2, x2^2, x2^3),
c(1, x3, x3^2, x3^3),
c(1, x4, x4^2, x4^3))
b = c(y1, y2, y3, y4)
ge_solve = gaussianElimination(A, b)
# Function to obtain coefficients of cubic polynomial
f = function(x){
ge_solve[1,5] + ge_solve[2,5]*x + ge_solve[3,5]*x^2 + ge_solve[4,5]*x^3
}
x = seq(0, 100, by=0.75)
y = sapply(x, f)
# plot(x, y, type = "l")
# Add some noise
sim_y = y + rnorm(length(y), 0, 0.25)
sim_x = x + rnorm(length(x), 0, 2)
# Create linear model
linear_model = lm(sim_y ~ sim_x)
# Create smoothing Spline
model_spline_A <- ss(sim_x, sim_y, nknots = 3)
model_spline_B <- ss(sim_x, sim_y, nknots = 10)
plot(x, y, xlab = "X", ylab = "Y")
points(sim_x, sim_y)
lines(sim_x, predict(linear_model, data.frame(sim_x)), col = "orange", lwd=2)
lines(x, model_spline_A$y, col = "lightblue", lwd = 2)
lines(x, model_spline_B$y, col = "springgreen3", lwd = 2)