# 5.3 Lab: Cross-Validation and the Bootstrap

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import rpy2.robjects as robjects

import statsmodels.formula.api as smf

import sklearn.preprocessing
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.linear_model import LinearRegression

%load_ext rpy2.ipython

## 5.3.1 The Validation Set Approach

In [2]:
%%R
library(ISLR2)
set.seed(1)
train <- sort(sample(392, 196))

In [3]:
data = robjects.r("""
library(ISLR2)
set.seed(1)
train <- sample(392, 196)
""")

train_idx = np.array(data)
train_idx = np.sort(train_idx)

In [4]:
# Load the Auto data set; '?' entries in the CSV are parsed as missing values (NaN)
auto_df = pd.read_csv("../../../datasets/Auto.csv", na_values='?')

# Reset index labels to start at 1 to match R's behavior
auto_df = auto_df.set_index(keys=np.arange(1, len(auto_df) + 1))

# Drop rows that contain missing values (the '?' entries that were read in as NaN above)
auto_df = auto_df.dropna()

In [5]:
## Relabel the remaining rows 1..n with no gaps so that the integer positions sampled in R can be
## matched directly against the index labels.  A boolean mask over those labels marks the training
## rows, and negating that mask gives the test rows.

no_gap_labels = np.arange(1, auto_df.shape[0] + 1)
auto_df_no_gaps = auto_df.copy(deep=True).set_index(no_gap_labels)

auto_train_mask_no_gaps = auto_df_no_gaps.index.isin(train_idx)
auto_test_mask_no_gaps = np.logical_not(auto_train_mask_no_gaps)

In [6]:
%%R
lm.fit <- lm(mpg ~ horsepower, data = Auto, subset = train)

In [7]:
%%R
summary(lm.fit)


Call:
lm(formula = mpg ~ horsepower, data = Auto, subset = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.3177 -3.5428 -0.5591  2.3910 14.6836 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 41.283548   1.044352   39.53   <2e-16 ***
horsepower  -0.169659   0.009556  -17.75   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.032 on 194 degrees of freedom
Multiple R-squared:  0.619,	Adjusted R-squared:  0.6171 
F-statistic: 315.2 on 1 and 194 DF,  p-value: < 2.2e-16



In [8]:
lm_model = smf.ols(formula = 'mpg ~ horsepower', data = auto_df_no_gaps, subset=train_idx)
lm_fit = lm_model.fit()

lm_fit.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.619
Model:,OLS,Adj. R-squared:,0.617
Method:,Least Squares,F-statistic:,315.2
Date:,"Mon, 23 Jan 2023",Prob (F-statistic):,1.61e-42
Time:,23:21:05,Log-Likelihood:,-593.8
No. Observations:,196,AIC:,1192.0
Df Residuals:,194,BIC:,1198.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,41.2835,1.044,39.530,0.000,39.224,43.343
horsepower,-0.1697,0.010,-17.754,0.000,-0.189,-0.151

0,1,2,3
Omnibus:,13.451,Durbin-Watson:,1.171
Prob(Omnibus):,0.001,Jarque-Bera (JB):,14.488
Skew:,0.662,Prob(JB):,0.000714
Kurtosis:,3.149,Cond. No.,318.0


In [9]:
%%R
attach(Auto)
mean((mpg - predict(lm.fit, Auto))[-train]^2)

[1] 23.26601


In [10]:
pred = lm_fit.predict(auto_df_no_gaps[auto_test_mask_no_gaps]['horsepower'])
((auto_df_no_gaps[auto_test_mask_no_gaps]['mpg'] - pred)**2).mean()

23.2660086465003

### Polynomial Fits

#### 2nd Degree Polynomial

In [11]:
%%R
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train)
summary(lm.fit2)


Call:
lm(formula = mpg ~ poly(horsepower, 2), data = Auto, subset = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.8711  -2.6655  -0.0096   2.0806  16.1063 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)            23.5496     0.3175  74.182  < 2e-16 ***
poly(horsepower, 2)1 -123.5881     6.4587 -19.135  < 2e-16 ***
poly(horsepower, 2)2   47.7189     6.3613   7.501 2.25e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.439 on 193 degrees of freedom
Multiple R-squared:  0.705,	Adjusted R-squared:  0.702 
F-statistic: 230.6 on 2 and 193 DF,  p-value: < 2.2e-16



In [12]:
## include_bias=False so that an intercept column is not returned
polynomial_features = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)

In [13]:
## include_bias=False so that an intercept column is not returned
polynomial_features = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)

lm_model2_poly_feats  = smf.ols(formula = 'mpg ~ polynomial_features.fit_transform(np.array(horsepower).reshape(-1,1))', data = auto_df_no_gaps, subset=train_idx)
lm_fit2_poly_feats = lm_model2_poly_feats.fit()

lm_fit2_poly_feats.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.705
Model:,OLS,Adj. R-squared:,0.702
Method:,Least Squares,F-statistic:,230.6
Date:,"Mon, 23 Jan 2023",Prob (F-statistic):,6.829999999999999e-52
Time:,23:21:05,Log-Likelihood:,-568.72
No. Observations:,196,AIC:,1143.0
Df Residuals:,193,BIC:,1153.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,58.8738,2.519,23.368,0.000,53.905,63.843
"polynomial_features.fit_transform(np.array(horsepower).reshape(-1, 1))[0]",-0.4961,0.044,-11.192,0.000,-0.584,-0.409
"polynomial_features.fit_transform(np.array(horsepower).reshape(-1, 1))[1]",0.0013,0.000,7.501,0.000,0.001,0.002

0,1,2,3
Omnibus:,11.485,Durbin-Watson:,1.232
Prob(Omnibus):,0.003,Jarque-Bera (JB):,16.996
Skew:,0.354,Prob(JB):,0.000204
Kurtosis:,4.257,Cond. No.,121000.0


**Using PolynomialFeatures with smf.ols doesn't produce the same coefficients as poly and lm in R.  The models look different because PolynomialFeatures returns an array whose columns aren't orthogonalized, whereas poly in R returns orthogonalized columns.  When the columns aren't orthogonalized, the inputs are much larger, which changes the coefficients of the model, hence the difference.**

**If I generate the model in R using poly(horsepower, 2, raw=TRUE), then poly doesn't return an orthogonalized array and the coefficients of the model in R and Python match.  If I were able to orthogonalize the array from PolynomialFeatures in Python, then I think the models would generate the same coefficients.  I may look into this later, but it's a little unnecessary, so I plan to proceed ahead for now.**

In [14]:
%%R
mean((mpg - predict(lm.fit2, Auto))[-train]^2)

[1] 18.71646


In [15]:
pred2_poly_feats = lm_fit2_poly_feats.predict(auto_df_no_gaps[auto_test_mask_no_gaps]['horsepower'])

((pred2_poly_feats - auto_df_no_gaps[auto_test_mask_no_gaps]['mpg'])**2).mean()

18.716459493382548

**The MSE from using PolynomialFeatures with smf.ols matches in Python and R, despite the models between Python and R not entirely matching.  This is due to the difference in inputs where poly returns orthogonalized vectors while PolynomialFeatures doesn't.**

#### 3rd Degree Polynomial

##### Comparing PolynomialFeatures and smf.ols against poly and lm in R

In [16]:
%%R
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data=Auto, subset=train)
summary(lm.fit3)


Call:
lm(formula = mpg ~ poly(horsepower, 3), data = Auto, subset = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.6625  -2.7108   0.0805   2.0724  16.1378 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)            23.5527     0.3185  73.946  < 2e-16 ***
poly(horsepower, 3)1 -123.6143     6.4755 -19.089  < 2e-16 ***
poly(horsepower, 3)2   47.8284     6.3935   7.481 2.58e-12 ***
poly(horsepower, 3)3    1.3825     5.8107   0.238    0.812    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.45 on 192 degrees of freedom
Multiple R-squared:  0.7051,	Adjusted R-squared:  0.7005 
F-statistic:   153 on 3 and 192 DF,  p-value: < 2.2e-16



In [17]:
## include_bias=False so that an intercept column is not returned
polynomial_features = sklearn.preprocessing.PolynomialFeatures(3, include_bias=False)

lm_model3_poly_feats  = smf.ols(formula = 'mpg ~ polynomial_features.fit_transform(np.array(horsepower).reshape(-1,1))', data = auto_df_no_gaps, subset=train_idx)
lm_fit3_poly_feats = lm_model3_poly_feats.fit(method='qr')

lm_fit3_poly_feats.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.705
Model:,OLS,Adj. R-squared:,0.701
Method:,Least Squares,F-statistic:,153.0
Date:,"Mon, 23 Jan 2023",Prob (F-statistic):,1.14e-50
Time:,23:21:05,Log-Likelihood:,-568.69
No. Observations:,196,AIC:,1145.0
Df Residuals:,192,BIC:,1158.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,57.5977,5.928,9.715,0.000,45.904,69.291
"polynomial_features.fit_transform(np.array(horsepower).reshape(-1, 1))[0]",-0.4610,0.154,-2.989,0.003,-0.765,-0.157
"polynomial_features.fit_transform(np.array(horsepower).reshape(-1, 1))[1]",0.0010,0.001,0.831,0.407,-0.001,0.004
"polynomial_features.fit_transform(np.array(horsepower).reshape(-1, 1))[2]",7.515e-07,3.16e-06,0.238,0.812,-5.48e-06,6.98e-06

0,1,2,3
Omnibus:,11.392,Durbin-Watson:,1.228
Prob(Omnibus):,0.003,Jarque-Bera (JB):,16.312
Skew:,0.365,Prob(JB):,0.000287
Kurtosis:,4.211,Cond. No.,47100000.0


In [18]:
%%R
mean((mpg - predict(lm.fit3, Auto))[-train]^2)

[1] 18.79401


In [19]:
pred3_poly_feats = lm_fit3_poly_feats.predict(auto_df_no_gaps[~auto_train_mask_no_gaps])

((pred3_poly_feats - auto_df_no_gaps[~auto_train_mask_no_gaps]['mpg'])**2).mean()

18.794006797394548

**The MSE from using PolynomialFeatures with smf.ols matches in Python and R, despite the models between Python and R not entirely matching.  This is due to the difference in inputs where poly returns orthogonalized vectors while PolynomialFeatures doesn't.**

### Generating new training indices and re-evaluating MSE

In [20]:
%%R
set.seed(2)
train <- sample(392, 196)
lm.fit <- lm(mpg ~ horsepower, subset = train)
mean((mpg - predict(lm.fit, Auto))[-train]^2)

[1] 25.72651


In [21]:
data = robjects.r("""
library(ISLR2)
set.seed(2)
train <- sample(392, 196)
""")

new_train_idx = np.array(data)
new_train_idx = np.sort(new_train_idx)

new_auto_train_mask_no_gaps = auto_df_no_gaps.index.isin(new_train_idx)
new_auto_test_mask_no_gaps = ~new_auto_train_mask_no_gaps

In [22]:
lm_model = smf.ols(formula='mpg ~ horsepower', data = auto_df_no_gaps, subset = new_train_idx)

lm_fit = lm_model.fit()

pred = lm_fit.predict(auto_df_no_gaps[~new_auto_train_mask_no_gaps]['horsepower'])

((pred - auto_df_no_gaps[~new_auto_train_mask_no_gaps]['mpg'])**2).mean()

25.726510644813906

#### Polynomial Fits

##### 2nd Degree Polynomial

In [23]:
%%R
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train)
mean((mpg - predict(lm.fit2, Auto))[-train]^2)

[1] 20.43036


In [24]:
## include_bias=False so that an intercept column is not returned
polynomial_features = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)

new_lm_model2_poly_feats  = smf.ols(formula = 'mpg ~ polynomial_features.fit_transform(np.array(horsepower).reshape(-1,1))', data = auto_df_no_gaps, subset=new_train_idx)
new_lm_fit2_poly_feats = new_lm_model2_poly_feats.fit()

new_pred2_poly_feats = new_lm_fit2_poly_feats.predict(auto_df_no_gaps[new_auto_test_mask_no_gaps]['horsepower'])

((new_pred2_poly_feats - auto_df_no_gaps[new_auto_test_mask_no_gaps]['mpg'])**2).mean()

20.430364274145607

**Same MSE in R and Python despite models being slightly different due to input vectors not being orthogonalized in Python.**

##### 3rd Degree Polynomial

In [25]:
%%R
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train)
mean((mpg - predict(lm.fit3, Auto))[-train]^2)

[1] 20.38533


In [26]:
## include_bias=False so that an intercept column is not returned
polynomial_features = sklearn.preprocessing.PolynomialFeatures(3, include_bias=False)

new_lm_model3_poly_feats  = smf.ols(formula = 'mpg ~ polynomial_features.fit_transform(np.array(horsepower).reshape(-1,1))', data = auto_df_no_gaps, subset=new_train_idx)
new_lm_fit3_poly_feats = new_lm_model3_poly_feats.fit(method='qr')

new_pred3_poly_feats = new_lm_fit3_poly_feats.predict(auto_df_no_gaps[new_auto_test_mask_no_gaps]['horsepower'])

((new_pred3_poly_feats - auto_df_no_gaps[new_auto_test_mask_no_gaps]['mpg'])**2).mean()

20.385326863877566

**Same MSE in R and Python despite models being slightly different due to input vectors not being orthogonalized in Python.**

## 5.3.2 Leave-One-Out Cross-Validation

In [27]:
%%R
glm.fit <- glm(mpg ~ horsepower, data = Auto)
coef(glm.fit)

(Intercept)  horsepower 
 39.9358610  -0.1578447 


In [28]:
glm_model = smf.glm(formula='mpg ~ horsepower', data=auto_df_no_gaps)
glm_fit = glm_model.fit()
glm_fit.params

Intercept     39.935861
horsepower    -0.157845
dtype: float64

In [29]:
%%R
lm.fit <- lm(mpg ~ horsepower, data = Auto)
coef(lm.fit)

(Intercept)  horsepower 
 39.9358610  -0.1578447 


In [30]:
lm_model = smf.ols(formula = 'mpg ~ horsepower', data=auto_df_no_gaps)
lm_fit = lm_model.fit()
lm_fit.params

Intercept     39.935861
horsepower    -0.157845
dtype: float64

In [31]:
%%R
library(boot)
glm.fit <- glm(mpg ~ horsepower, data = Auto)
cv.err <- cv.glm(Auto, glm.fit)
cv.err$delta

[1] 24.23151 24.23114


In [32]:
# Predictor and response as 2-D column arrays
X = auto_df_no_gaps['horsepower'].to_numpy().reshape(-1, 1)
y = auto_df_no_gaps['mpg'].to_numpy().reshape(-1, 1)

# Leave-one-out splitter and the linear model that is refit on every split
loo = LeaveOneOut()
skl_lm_model = LinearRegression()

# The scorer returns negated MSE values, so take absolute values before averaging
scores = cross_val_score(skl_lm_model, X, y, scoring='neg_mean_squared_error', cv=loo)

np.abs(scores).mean()

24.231513517929226

### Polynomial Fits

In [33]:
%%R
cv.error <- rep(0, 10)
for (i in 1:10) {
    glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
    cv.error[i] <- cv.glm(Auto, glm.fit)$delta[1]
}
cv.error

 [1] 24.23151 19.24821 19.33498 19.42443 19.03321 18.97864 18.83305 18.96115
 [9] 19.06863 19.49093


In [34]:
# LOOCV mean squared error for polynomial fits of degree 1 through 10
cv_error = []

for degree in range(1, 11):
    # include_bias=False so that an intercept column is not returned
    polynomial_features = sklearn.preprocessing.PolynomialFeatures(degree, include_bias=False)
    X_poly = polynomial_features.fit_transform(X)

    # Scores are negated MSE values; average their absolute values
    scores = cross_val_score(skl_lm_model, X_poly, y, scoring='neg_mean_squared_error', cv=loo)
    cv_error.append(np.abs(scores).mean())

cv_error

[24.231513517929226,
 19.24821312448967,
 19.33498406402931,
 19.42443031024277,
 19.03321248615882,
 18.97863406819667,
 19.129480449254846,
 19.224150660848743,
 19.133322843461364,
 18.93976572079586]

**The MSEs match closely in Python and R.  All but the last 4 entries match exactly.**

## 5.3.3 k-Fold Cross-Validation

In [35]:
%%R
set.seed(17)
cv.error.10 <- rep(0, 10)
for (i in 1:10) {
    glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
    cv.error.10[i] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]
}
cv.error.10

 [1] 24.27207 19.26909 19.34805 19.29496 19.03198 18.89781 19.12061 19.14666
 [9] 18.87013 20.95520


In [36]:
# 10-fold CV mean squared error for polynomial fits of degree 1 through 10.
# Scores are negated MSE values, so their absolute values are averaged per degree.
cv_error_10 = [
    np.abs(
        cross_val_score(
            skl_lm_model,
            # include_bias=False so that an intercept column is not returned
            sklearn.preprocessing.PolynomialFeatures(degree, include_bias=False).fit_transform(X),
            y,
            scoring='neg_mean_squared_error',
            cv=10,
        )
    ).mean()
    for degree in range(1, 11)
]

cv_error_10

[27.439933652339874,
 21.235840055802225,
 21.33660618322788,
 21.35388698195473,
 20.905641650770082,
 20.779180086179668,
 20.990939391569672,
 21.077615424551695,
 21.036905853431772,
 20.977623972381583]

**cv.glm in R assigns observations to folds at random, whereas cross_val_score with cv=10 splits the rows into consecutive, unshuffled folds, so the folds generated in R and in Python are different.  We therefore shouldn't expect the cross-validation MSEs to match, but we'd expect them to be similar.**

## 5.3.4 The Bootstrap

### Estimating the Accuracy of a Statistic of Interest

In [37]:
%%R
alpha.fn <- function(data, index) {
    X <- data$X[index]
    Y <- data$Y[index]
    (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
}

In [38]:
%%R
alpha.fn(Portfolio, 1:100)

[1] 0.5758321


In [39]:
pfolio_df = pd.read_csv("../../../datasets/Portfolio.csv")
pfolio_df = pfolio_df.set_index(np.arange(1, pfolio_df.shape[0] + 1))

In [40]:
def alpha_fn(data, index):
    """Compute the alpha statistic from the rows of ``data`` selected by ``index``.

    alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y)), using sample
    (ddof=1) variances and covariance, mirroring the R function ``alpha.fn``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with columns 'X' and 'Y'.
    index : array-like
        Index *labels* of the rows to use; labels may repeat (bootstrap sample).

    Returns
    -------
    float
        The alpha estimate for the selected rows.
    """
    # Select the rows once, restricted to the two columns that are needed,
    # instead of two chained whole-row lookups.
    sample = data.loc[index, ['X', 'Y']]
    X = sample['X']
    Y = sample['Y']

    var_x = np.var(X, ddof=1)
    var_y = np.var(Y, ddof=1)
    cov_x_y = np.cov(X, Y, ddof=1)[0, 1]

    return (var_y - cov_x_y) / (var_x + var_y - 2 * cov_x_y)

In [41]:
alpha_fn(pfolio_df, np.arange(1,101))

0.5758320745928298

In [42]:
%%R
set.seed(7)
alpha.fn(Portfolio, sample(100, 100, replace=T))

[1] 0.5385326


In [43]:
data = robjects.r("""
set.seed(7)
train <- sample(100, 100, replace=T)
""")

bstrap_idx = np.array(data)
bstrap_idx = np.sort(bstrap_idx)

In [44]:
alpha_fn(pfolio_df, bstrap_idx)

0.5385325919467925

In [45]:
%%R
boot(Portfolio, alpha.fn, R = 1000)


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Portfolio, statistic = alpha.fn, R = 1000)


Bootstrap Statistics :
     original       bias    std. error
t1* 0.5758321 0.0007959475  0.08969074


In [46]:
## Note: the boot package from R automatically creates bootstrap samples of length n, where n = the number of observations in the dataset you pass to boot.  In order to mimic that behavior in Python, we need np.random.choice to choose n numbers to ensure the bootstrap sample has the same number of observations as in the dataset.

alphas = []
n = pfolio_df.shape[0]
row_labels = np.arange(1, n + 1)

## Use a seeded generator so the bootstrap estimates are reproducible after a kernel restart
rng = np.random.default_rng(1)

for _ in range(1000):
    ## Draw n row labels with replacement: one bootstrap sample of the same size as the data
    idx = rng.choice(row_labels, size=n, replace=True)
    alphas.append(alpha_fn(pfolio_df, idx))

original_alpha = alpha_fn(pfolio_df, row_labels)
alpha_bstrap_mean = np.mean(alphas)
alpha_bstrap_std = np.std(alphas, ddof=1) ## When calculating the std dev, np.std divides by N-ddof.  To find the sample std dev, we set ddof=1

## Bias = mean of the bootstrap replicates of the statistic - the statistic computed on the original data
alpha_bias = alpha_bstrap_mean - original_alpha

print(f'Original Alpha: {original_alpha}')
print(f'Alpha Bias: {alpha_bias}')
print(f'Alpha Std: {alpha_bstrap_std}')

Original Alpha: 0.5758320745928298
Alpha Bias: 0.0018467314323676876
Alpha Std: 0.09286469104959781


### Estimating the Accuracy of a Linear Regression Model

In [47]:
%%R
boot.fn <- function(data, index)
    coef(lm(mpg ~ horsepower, data = data, subset = index))
boot.fn(Auto, 1:392)

(Intercept)  horsepower 
 39.9358610  -0.1578447 


In [48]:
def boot_fn(data, index):
    """Fit ``mpg ~ horsepower`` on the rows selected by ``index`` and return the coefficients.

    Uses ordinary least squares (``smf.ols``), the counterpart this notebook pairs with
    R's ``lm``, which is what the R version of ``boot.fn`` calls.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'mpg' and 'horsepower' columns.
    index : array-like
        Passed as ``subset`` to the formula API to choose the rows to fit on;
        entries may repeat (bootstrap sample).

    Returns
    -------
    pandas.Series
        Fitted parameters (intercept first, then the horsepower slope).
    """
    model = smf.ols(formula='mpg ~ horsepower', data=data, subset=index)

    return model.fit().params

In [49]:
boot_fn(auto_df_no_gaps, np.arange(1, 393))

Intercept     39.935861
horsepower    -0.157845
dtype: float64

In [50]:
%%R
set.seed(1)
boot.fn(Auto, sample(392, 392, replace = T))

(Intercept)  horsepower 
 40.3404517  -0.1634868 


In [51]:
data = robjects.r("""
set.seed(1)
samp <- sample(392, 392, replace=T)
""")

bstrap_idx = np.array(data)
bstrap_idx = np.sort(bstrap_idx)

In [52]:
boot_fn(auto_df_no_gaps, bstrap_idx)

Intercept     40.340452
horsepower    -0.163487
dtype: float64

In [53]:
%%R
boot.fn(Auto, sample(392, 392, replace = T))

(Intercept)  horsepower 
 40.1186906  -0.1577063 


In [54]:
idx = np.random.choice(np.arange(1,393), 392)
boot_fn(auto_df_no_gaps, idx)

Intercept     40.227766
horsepower    -0.162126
dtype: float64

In [55]:
%%R
boot(Auto, boot.fn, 1000)


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Auto, statistic = boot.fn, R = 1000)


Bootstrap Statistics :
      original        bias    std. error
t1* 39.9358610  0.0544513229 0.841289790
t2* -0.1578447 -0.0006170901 0.007343073


In [56]:
intercepts = []
slopes = []

n = auto_df_no_gaps.shape[0]
row_labels = np.arange(1, n + 1)

## Use a seeded generator so the bootstrap estimates are reproducible after a kernel restart
rng = np.random.default_rng(1)

for _ in range(1000):
    ## Draw n row labels with replacement: one bootstrap sample of the same size as the data
    idx = rng.choice(row_labels, size=n, replace=True)
    param = boot_fn(auto_df_no_gaps, idx)
    ## The params Series is labelled by term name, so select by position explicitly with .iloc
    intercepts.append(param.iloc[0])
    slopes.append(param.iloc[1])

## Fit once on the full data set and read both coefficients from that single fit
original_params = boot_fn(auto_df_no_gaps, row_labels)
original_intercept = original_params.iloc[0]
original_slope = original_params.iloc[1]

intercept_bstrap_mean = np.mean(intercepts)
intercept_bstrap_std = np.std(intercepts, ddof=1) ## When calculating the std dev, np.std divides by N-ddof.  To find the sample std dev, we set ddof=1
slope_bstrap_mean = np.mean(slopes)
slope_bstrap_std = np.std(slopes, ddof=1)

## Bias = mean of the bootstrap replicates of the statistic - the statistic computed on the original data
## bias for intercepts
intercept_bias = intercept_bstrap_mean - original_intercept

## bias for slopes
slope_bias = slope_bstrap_mean - original_slope

print(f'Original Intercept: {original_intercept}')
print(f'Bstrap Intercept Bias: {intercept_bias}')
print(f'Bstrap Intercept Std: {intercept_bstrap_std}')
print()
print(f'Original Slope: {original_slope}')
print(f'Bstrap Slope Bias: {slope_bias}')
print(f'Bstrap Slope Std: {slope_bstrap_std}')

Original Intercept: 39.93586102117048
Bstrap Intercept Bias: -0.00867467112543352
Bstrap Intercept Std: 0.892766093263005

Original Slope: -0.15784473335365357
Bstrap Slope Bias: 8.926628730326902e-05
Bstrap Slope Std: 0.00762841271663687


In [57]:
%%R
summary(lm(mpg ~ horsepower, data = Auto))$coef

              Estimate  Std. Error   t value      Pr(>|t|)
(Intercept) 39.9358610 0.717498656  55.65984 1.220362e-187
horsepower  -0.1578447 0.006445501 -24.48914  7.031989e-81


In [58]:
## https://stackoverflow.com/questions/51734180/converting-statsmodels-summary-object-to-pandas-dataframe

## Coefficient table (second table of the statsmodels summary) for the full-data linear fit
results = smf.ols(formula = 'mpg ~ horsepower', data = auto_df_no_gaps).fit().summary().tables[1]

## Passing literal HTML to pd.read_html is deprecated in recent pandas; wrap it in a file-like object
results_as_html = results.as_html()
results_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]

results_df[['coef', 'std err', 't', 'P>|t|']]

Unnamed: 0,coef,std err,t,P>|t|
Intercept,39.9359,0.717,55.66,0.0
horsepower,-0.1578,0.006,-24.489,0.0


In [59]:
%%R
boot.fn <- function(data, index)
    coef(
      lm(mpg ~ horsepower + I(horsepower^2),
        data = data, subset = index)
    )

set.seed(1)
boot(Auto, boot.fn, 1000)


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Auto, statistic = boot.fn, R = 1000)


Bootstrap Statistics :
        original        bias     std. error
t1* 56.900099702  3.511640e-02 2.0300222526
t2* -0.466189630 -7.080834e-04 0.0324241984
t3*  0.001230536  2.840324e-06 0.0001172164


In [60]:
## What does I() do in the formula: https://stackoverflow.com/questions/24192428/what-does-the-capital-letter-i-in-r-linear-regression-formula-mean

def boot_fn(data, index):
    """Fit the quadratic model ``mpg ~ horsepower + horsepower**2`` and return its coefficients.

    ``index`` is handed to the formula API as ``subset`` to choose the rows of ``data``
    the model is fitted on.  Returns the fitted parameters of the OLS model.
    """
    quadratic_model = smf.ols(
        formula='mpg ~ horsepower + I(horsepower**2)',
        data=data,
        subset=index,
    )

    return quadratic_model.fit().params

In [61]:
intercepts = []
hp_1s = []
hp_2s = []

n = auto_df_no_gaps.shape[0]
row_labels = np.arange(1, n + 1)

## Use a seeded generator so the bootstrap estimates are reproducible after a kernel restart
rng = np.random.default_rng(1)

for _ in range(1000):
    ## Draw n row labels with replacement: one bootstrap sample of the same size as the data
    idx = rng.choice(row_labels, size=n, replace=True)
    param = boot_fn(auto_df_no_gaps, idx)
    ## The params Series is labelled by term name, so select by position explicitly with .iloc
    intercepts.append(param.iloc[0])
    hp_1s.append(param.iloc[1])
    hp_2s.append(param.iloc[2])

## Fit once on the full data set and read all three coefficients from that single fit
original_params = boot_fn(auto_df_no_gaps, row_labels)
original_intercept = original_params.iloc[0]
original_hp_1 = original_params.iloc[1]
original_hp_2 = original_params.iloc[2]

intercept_bstrap_mean = np.mean(intercepts)
intercept_bstrap_std = np.std(intercepts, ddof=1) ## When calculating the std dev, np.std divides by N-ddof.  To find the sample std dev, we set ddof=1

hp_1_bstrap_mean = np.mean(hp_1s)
hp_1_bstrap_std = np.std(hp_1s, ddof=1)

hp_2_bstrap_mean = np.mean(hp_2s)
hp_2_bstrap_std = np.std(hp_2s, ddof=1)

## Bias = mean of the bootstrap replicates of the statistic - the statistic computed on the original data
## bias for intercepts
intercept_bias = intercept_bstrap_mean - original_intercept

## bias for horsepower
hp_1_bias = hp_1_bstrap_mean - original_hp_1

## bias for horsepower**2
hp_2_bias = hp_2_bstrap_mean - original_hp_2

In [62]:
## Report, for each coefficient of the quadratic fit, the full-data estimate followed by
## its bootstrap bias and bootstrap standard deviation.
## NOTE(review): the 'Original Slope' label is reused for both the horsepower and the
## horsepower**2 coefficients; the Bias/Std lines underneath identify which term is meant.
print(f'Original Intercept: {original_intercept}')
print(f'Bstrap Intercept Bias: {intercept_bias}')
print(f'Bstrap Intercept Std: {intercept_bstrap_std}')
print()
print(f'Original Slope: {original_hp_1}')
print(f'Bstrap horsepower Bias: {hp_1_bias}')
print(f'Bstrap horsepower Std: {hp_1_bstrap_std}')
print()
print(f'Original Slope: {original_hp_2}')
print(f'Bstrap horsepower**2 Bias: {hp_2_bias}')
print(f'Bstrap horsepower**2 Std: {hp_2_bstrap_std}')

Original Intercept: 56.90009970211517
Bstrap Intercept Bias: 0.021694888257798084
Bstrap Intercept Std: 2.1200659809299682

Original Slope: -0.46618962994736257
Bstrap horsepower Bias: -0.000384803575593784
Bstrap horsepower Std: 0.03388658575935158

Original Slope: 0.0012305361007737656
Bstrap horsepower**2 Bias: 1.2778057296998057e-06
Bstrap horsepower**2 Std: 0.00012272733089463907


In [63]:
%%R
summary(
    lm(mpg ~ horsepower + I(horsepower^2), data = Auto)
)$coef

                    Estimate   Std. Error   t value      Pr(>|t|)
(Intercept)     56.900099702 1.8004268063  31.60367 1.740911e-109
horsepower      -0.466189630 0.0311246171 -14.97816  2.289429e-40
I(horsepower^2)  0.001230536 0.0001220759  10.08009  2.196340e-21


In [64]:
## Full-data quadratic fit, for comparing formula-based standard errors with the bootstrap ones
model = smf.ols(formula = 'mpg ~ horsepower + I(horsepower**2)',
                data = auto_df_no_gaps)
fit = model.fit()

## Coefficient table is the second table of the statsmodels summary
results = fit.summary().tables[1]

## Passing literal HTML to pd.read_html is deprecated in recent pandas; wrap it in a file-like object
results_as_html = results.as_html()
results_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]

results_df[['coef', 'std err', 't', 'P>|t|']]

Unnamed: 0,coef,std err,t,P>|t|
Intercept,56.9001,1.8,31.604,0.0
horsepower,-0.4662,0.031,-14.978,0.0
I(horsepower ** 2),0.0012,0.0,10.08,0.0


# The End