In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn
import sklearn

from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm
import sklearn.discriminant_analysis
import sklearn.neighbors

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn import cross_validation
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale

%matplotlib inline



In [11]:
boston = pd.read_csv('Boston.csv')
boston.head()
boston.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [8]:
mu_hat = np.mean(boston['medv'])
mu_hat
# The population mean for 'medv' is approximately 22.532806324110677.

22.532806324110677

In [9]:
std_err_hat = np.std(boston.iloc[:, 13]) / np.sqrt(len(boston.index))
std_err_hat

0.40845693469728661

In [33]:
realizations = np.zeros((1000, 506, 14))
np.random.seed(seed = 50)

for i in range(1000) :
    indices = np.random.choice(np.arange(0, 506, 1), size = 506, replace = True)
    for j in range(506) :
        realizations[i, j, :] = boston.iloc[indices[j], :]

bootstrap_means = np.zeros((1000, 1))
for num in range(1000) :
    bootstrap_means[num, 0] = np.mean(realizations[num, :, 13])

0.39511693250398283

In [41]:
# Now we have 1000 different 'realizations of history'. We can calculate the standard error
# of the population mean for 'medv' as follows:
std_err_bs_hat = 0

for n in range(1000) :
    std_err_bs_hat = std_err_bs_hat + (bootstrap_means[n, 0] - np.mean(bootstrap_means[:, 0]))**2
    
std_err_bs_hat = np.sqrt((1 / 999) * std_err_bs_hat)
std_err_bs_hat

0.39531463926266647

In [42]:
mu_bs_hat = np.mean(bootstrap_means[:, 0])
lower_bound = mu_bs_hat - 2 * std_err_bs_hat
upper_bound = mu_bs_hat + 2 * std_err_bs_hat
print(lower_bound, upper_bound)
# Based on bootstrap, the 95% confidence interval for population mean is [21.7253628163, 23.3066213734].

21.7253628163 23.3066213734


In [17]:
med_hat = np.median(boston['medv'])
med_hat
# The population median for 'medv' is approximately 21.199999999999999.

21.199999999999999

In [40]:
bootstrap_medians = np.zeros((1000, 1))
for num1 in range(1000) :
    bootstrap_medians[num1, 0] = np.median(realizations[num1, :, 13])

In [37]:
# Now we have 1000 different 'realizations of history'. We can calculate the standard error
# of the population mean for 'medv' as follows:
std_err_bs_hat1 = 0

for n in range(1000) :
    std_err_bs_hat1 = std_err_bs_hat1 + (bootstrap_medians[n, 0] - np.mean(bootstrap_medians[:, 0]))**2
    
std_err_bs_hat1 = np.sqrt((1 / 999) * std_err_bs_hat1)
std_err_bs_hat1

0.37751326634472882

In [43]:
med_bs_hat1 = np.mean(bootstrap_medians[:, 0])
lower_bound1 = med_bs_hat1 - 2 * std_err_bs_hat1
upper_bound1 = med_bs_hat1 + 2 * std_err_bs_hat1
print(med_bs_hat1)
print(lower_bound1, upper_bound1)
# Based on bootstrap, the 95% confidence interval for the median is [20.4274734673, 21.9375265327].
# The value calculated for the population median of 'medv' falls inside of the 95% CI found with bootstrap.
# Not only does the median fall inside of the CI, it is almost in the center of the interval.
# Also, the standard error for the median of 'medv' is not too large.

21.16615
20.4111234673 21.9211765327
