In [88]:
import numpy as np
import pandas as pd


# Bootstrap example. First, a Monte Carlo baseline: draw a fresh sample every iteration.
n = 100  # observations in each simulated sample
iters = 500  # how many independent samples are drawn to measure the spread

# Every iteration draws a brand-new (n, 1) array from np.random.rand (uniform
# on [0, 1)) and stores its average, which is the estimator of the mean.
mu_vec = np.array([np.mean(np.random.rand(n, 1)) for _ in range(iters)])

empirical_bias = np.mean(mu_vec) - 0.5  # 0.5 is the true mean of U[0, 1)
empirical_std = np.std(mu_vec)
theoretical_std = 1.0 / np.sqrt(12.0 * n)  # sqrt(Var/n) with Var(U[0, 1)) = 1/12

print("Bias: " + str(empirical_bias))
print("Std: " + str(empirical_std) + ", Theoretical std: " + str(theoretical_std))





Bias: -0.0017937751453583894
Std: 0.029046680150334965, Theoretical std: 0.028867513459481284


In [97]:
# Bootstrap function
def Bootstrap(x, n=-1):
    """Return a bootstrap resample of ``x`` taken along its first axis.

    Parameters
    ----------
    x : array-like exposing ``.shape`` and integer-array indexing
        The observed sample; rows (first-axis entries) are the sampling units.
    n : int, optional
        Number of rows to draw. The default ``-1`` is a sentinel meaning
        "draw as many rows as ``x`` has".

    Returns
    -------
    The rows of ``x`` selected by the random indices; rows may repeat.
    """
    n_rows = x.shape[0]
    draw_count = n_rows if n == -1 else n
    # np.random.choice samples with replacement by default, which is what
    # makes this a bootstrap draw rather than a permutation.
    picked_rows = np.random.choice(n_rows, draw_count)
    return x[picked_rows,]

v = np.arange(1, 5)  # toy sample [1, 2, 3, 4]; the printed resample varies per run
print(Bootstrap(v))

[4 2 1 1]


In [102]:

# An estimator of mu from a sample
def mean_estimator(x):
    """Return the sample mean of ``x``, used as an estimate of the mean.

    Thin wrapper around ``np.mean`` called without an ``axis`` argument.
    """
    return np.mean(x)
    
# Compute bootstrap standard deviation
def Bootstrap_std(x, estimator, bootstrap_iters):
    """Estimate the standard deviation of ``estimator`` via the bootstrap.

    Parameters
    ----------
    x : array-like
        The observed sample, passed unchanged to ``Bootstrap`` each round.
    estimator : callable
        Maps a resampled array to a single numeric value.
    bootstrap_iters : int
        Number of bootstrap resamples to draw.

    Returns
    -------
    ``np.std`` (default ``ddof``) of the ``bootstrap_iters`` replicate values.
    """
    # Preallocate a float buffer, then fill it with one estimate per resample.
    replicates = np.zeros(bootstrap_iters)
    replicates[:] = [estimator(Bootstrap(x)) for _ in range(bootstrap_iters)]
    return np.std(replicates)


# Compute standard deviation using the bootstrap 
v = np.random.rand(n, 1)  # the single observed sample; `n` is inherited from an earlier cell
bootstrap_iters = 500
bootstrap_std = Bootstrap_std(v, mean_estimator, bootstrap_iters)
theoretical_std = 1.0 / np.sqrt(12.0 * n)  # exact std of the mean of n U[0, 1) draws
print("Bootstrap Std: " + str(bootstrap_std) + ", Theoretical std: " + str(theoretical_std))


Bootstrap Std: 0.027886338774107157, Theoretical std: 0.028867513459481284


In [103]:
# Train test example 
# load Boston housing dataset
from sklearn.linear_model import LinearRegression

# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so an
# unconditional import breaks this cell on current releases. Use the bundled
# loader when it is available; otherwise rebuild the same arrays from the
# original StatLib file, following the recipe in scikit-learn's deprecation
# notice (requires network access).
try:
    from sklearn.datasets import load_boston
except ImportError:
    # Raw file layout: 22 lines of header text, then every record is split
    # across two physical lines (11 values, then 2 more features + the target).
    boston_url = "http://lib.stat.cmu.edu/datasets/boston"
    boston_raw = pd.read_csv(boston_url, sep=r"\s+", skiprows=22, header=None)
    X = np.hstack([boston_raw.values[::2, :], boston_raw.values[1::2, :2]])
    y = boston_raw.values[1::2, 2]
else:
    X, y = load_boston(return_X_y=True)

print(X)
print(y)
print(X.shape)
print(y.shape)



[[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]
[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.

In [104]:
# Split the observations into train/test sets:
test_prop = 0.2  # fraction of the rows held out for testing
n = X.shape[0]  # NOTE: rebinds the global `n` to the number of rows in X
n_test = round(test_prop * n)

# Draw the held-out row indices without replacement; every row that was not
# drawn goes to the training set.
test_ind = np.random.choice(n, n_test, replace=False)
all_rows = set(range(n))
train_ind = list(all_rows.difference(test_ind))
print(test_ind)
print(train_ind)



[213 481 364 386  53 252 290 263 440 337 143 114 346  51 429 374 460 282
 192  30 323  76 241 232  14 142 348 216 102 365 111 163 452 235 266  89
 396 505 403 439  83  36 456 268 121 117  85 115 393 196 492 497 335 279
 184 137 488 428 495 357 416  70 147 443 463 283 410 243 466 284  29 209
 320  22 303 436 381 392 453 368 325 496 399   7 430  48  88 486  38 269
 240 206 356  32 427   1 159 457 190 401 198]
[0, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 31, 33, 34, 35, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 52, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 84, 86, 87, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 116, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 144, 145, 146, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 160, 161, 162, 164, 165

In [113]:
split_iters = 100  # number of random train/test partitions to average over

# Running sums of the per-split mean squared errors.
train_error = 0
test_error = 0
for i in range(split_iters):
    # Fresh random partition on every pass.
    test_ind = np.random.choice(n, round(test_prop * n), replace=False)
    train_ind = list(set(range(n)).difference(test_ind))
    X_train, y_train = X[train_ind,], y[train_ind]
    X_test, y_test = X[test_ind,], y[test_ind]

    # Fit on the training rows only.
    reg = LinearRegression().fit(X_train, y_train)

    # In-sample mean squared error.
    y_train_hat = reg.predict(X_train)
    train_error += np.mean((y_train_hat - y_train) ** 2)

    # Out-of-sample mean squared error on the held-out rows.
    y_test_hat = reg.predict(X_test)
    test_error += np.mean((y_test_hat - y_test) ** 2)

avg_train_error = train_error / split_iters
avg_test_error = test_error / split_iters
print("Average Train Error: " + str(avg_train_error))
print("Average Test Error: " + str(avg_test_error))





Average Train Error: 21.84131276000921
Average Test Error: 23.33229583498381
