In [1]:
# Suppressing warnings
import warnings
warnings.filterwarnings(action = "ignore")

import quadratic2SLS as q2sls
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Plotting preferences
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns

In [6]:
## ~~~~~ Functions ~~~~~ ##

## Coefficient Results Table ##
def results_df(results_obj, true_coeffs):
    '''Returns summary of coefficient results compared to true coeff.s'''
    
    coeff_estms = pd.DataFrame([results_obj.beta_hat_boots.mean(axis=0), results_obj.result2.params]).transpose()
    coeff_estms.columns = ['Bootstrapped', 'Full_Sample']
    coeff_estms['True_Coeffs'] = true_coeffs
    return coeff_estms

def results_df2(big_bootstrap_res, small_bootstrap_res, true_coeffs):
    '''Returns summary of coefficient results compared to true coeff.s'''
    
    coeff_estms = pd.DataFrame([true_coeffs, list(big_bootstrap_res.mean(axis=0)), list(big_bootstrap_res.mean(axis=0))]).transpose()
    coeff_estms.columns = ['True_Coeffs', 'Full_Sample_Bootstrap', 'Bootstrap_Sample_Bootstrap']
    coeff_estms = coeff_estms.set_index(['const', 'endog_hat', 'endog_sq_hat', 'X2', 'X3'])
    return coeff_estms

## Coefficient Estimate Distributions ##
def custom_dist_plots(results_obj, true_coeffs):
    coeff_estms = results_df(results_obj, true_coeffs)
    
    plt.figure(figsize=(16, 16))
    sns.set(style="whitegrid", palette="muted", color_codes=True)

    plt.subplots_adjust(top=0.92)
    plt.suptitle('Bootstrapped Coefficient Estimates Compared to Full Sample Estimate\nBlue: Bootstrapped Estimate\nRed: Full Sample Estimate\nBlack: True Coefficient')

    sns.despine()
    for column_index, column in enumerate(results_obj.X_hat.columns.values.tolist()):
        plt.subplot(3, 2, column_index + 1)
        sns.kdeplot(results_obj.beta_hat_boots[column])
        #plt.plot([results_obj.result2.params[column_index], results_obj.result2.params[column_index]], 
        #         [0, 500], 
        #         linewidth=1,
        #        color = 'red')
        plt.plot([coeff_estms['Full_Sample'][column_index], coeff_estms['Full_Sample'][column_index]], 
                 [0, 500], 
                 linewidth=1,
                color = 'red')
        plt.plot([coeff_estms['True_Coeffs'][column_index], coeff_estms['True_Coeffs'][column_index]], 
                 [0, 500], 
                 linewidth=1,
                color = 'black')
        plt.plot([coeff_estms['Bootstrapped'][column_index], coeff_estms['Bootstrapped'][column_index]], 
                 [0, 500], 
                 linewidth=1,
                color = 'blue')

## Coefficient Paths ##
def coeff_path_plot(results_obj, var_name, n_boot):
    cumsum_beta_hat_boots = np.cumsum(results_obj.beta_hat_boots[var_name])
    plt.title('Running Average from Bootstrap: `' + str(var_name) +'`')
    sns.tsplot(cumsum_beta_hat_boots/np.arange(n_boot));

# Strong Instruments - Double Bootstrap

In [5]:
np.random.seed(49067)

n = 1500
n_small_bootstrap = 100
n_big_bootstrap = 100

## DGP ##
# X1 is endogenous because we are going to leave X4 out as an ommitted var and cov(X1, X4) =/= 0
# Z's are the instruments we will use for X1
var_names = ['X1', 'X2', 'X3', 'X4', 'Z1', 'Z2']
means = [3, -1.5, 1.1, 2.3, -1, 3]
cov = [[1, 0, 0, 0.75, 0.8, 0.6],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0.75, 0, 0, 1, 0, 0],
       [0.8, 0, 0, 0, 1, 0],
       [0.6, 0, 0, 0, 0, 1]]



### Big Bootstrap ###
full_sample_btsp_coeff  = np.zeros((n_big_bootstrap, 5))
small_btsp_avg_coeff = np.zeros((n_big_bootstrap, 5))

for BB_iter in range(0, n_big_bootstrap):
    ## Data Generation ##
    X1, X2, X3, X4, Z1, Z2 = np.random.multivariate_normal(means, cov, n).T

    # Normal, mean zero errors
    epsilon = np.random.normal(0, 1, n)

    # True model:
    Y = 1.5 + 2.5*X1 - 0.7*(X1)**2 + 2*X2 + 3*X3 + 2*X4 + epsilon

    ## Constructing the data sets ##
    # Note: we leave out X4 as the omitted variable from which endogeneity is arising
    exogenous = pd.DataFrame({'X2' : X2, 'X3' : X3})
    exogenous = sm.add_constant(exogenous)
    endogenous = pd.DataFrame({'X1' : X1})
    instruments = pd.DataFrame({'Z1' : Z1, 'Z2' : Z2})
    Y = pd.DataFrame({'Y' : Y})

    ## Estimating the models ##
    model1 = q2sls.Quadratic2SLS(Y, exogenous, endogenous, instruments)
    result1 = model1.fit(cov_type='Bootstrap', n_iter = n_small_bootstrap)

    ## Results ##
    out_df = results_df(result1, true_coeffs = [1.5, 2.5, -0.7, 2, 3])
    full_sample_btsp_coeff[BB_iter] = out_df['Full_Sample']
    small_btsp_avg_coeff[BB_iter] = out_df['Bootstrapped']

full_sample_btsp_coeff

100%|██████████| 100/100 [00:00<00:00, 112.50it/s]
100%|██████████| 100/100 [00:00<00:00, 124.17it/s]
100%|██████████| 100/100 [00:00<00:00, 116.68it/s]
100%|██████████| 100/100 [00:00<00:00, 120.90it/s]
100%|██████████| 100/100 [00:00<00:00, 123.24it/s]
100%|██████████| 100/100 [00:00<00:00, 120.79it/s]
100%|██████████| 100/100 [00:00<00:00, 121.59it/s]
100%|██████████| 100/100 [00:00<00:00, 121.85it/s]
100%|██████████| 100/100 [00:00<00:00, 122.38it/s]
100%|██████████| 100/100 [00:00<00:00, 120.23it/s]
100%|██████████| 100/100 [00:00<00:00, 120.56it/s]
100%|██████████| 100/100 [00:00<00:00, 119.94it/s]
100%|██████████| 100/100 [00:00<00:00, 102.17it/s]
100%|██████████| 100/100 [00:00<00:00, 107.49it/s]
100%|██████████| 100/100 [00:00<00:00, 117.55it/s]
100%|██████████| 100/100 [00:00<00:00, 121.26it/s]
100%|██████████| 100/100 [00:00<00:00, 122.10it/s]
100%|██████████| 100/100 [00:00<00:00, 120.00it/s]
100%|██████████| 100/100 [00:00<00:00, 120.81it/s]
100%|██████████| 100/100 [00:01

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 5.54846566,  2.5446211 , -0.65250629,  2.01818442,  2.98746244],
       [ 4.38076152,  3.38305302, -0.77283514,  1.9995534 ,  2.94400659],
       [ 6.32816843,  1.73456513, -0.52288903,  1.86708278,  2.98909542],
       [ 5.93741119,  2.05677884, -0.56401487,  1.99084829,  3.08189188],
       [ 6.10096111,  2.10652559, -0.58764336,  1.98114748,  2.98530492],
       [ 6.02498399,  2.17514871, -0.59813683,  1.97519486,  2.97538541],
       [ 4.3929119 ,  3.46663491, -0.79766039,  2.02562386,  2.92132533],
       [ 5.03517027,  2.95490643, -0.73384958,  1.90863867,  2.96773935],
       [ 6.99095726,  1.19915985, -0.43748573,  1.87917127,  3.09865907],
       [ 7.32744623,  1.39796104, -0.49044247,  2.01303336,  3.01073585],
       [ 3.1529927 ,  4.28396281, -0.94069967,  1.94406787,  3.01252558],
       [ 6.69980212,  1.66782372, -0.51418007,  2.03345818,  2.96285796],
       [ 5.57655967,  2.66310515, -0.6

In [7]:
results_df2(full_sample_btsp_coeff, small_btsp_avg_coeff, true_coeffs = [1.5, 2.5, -0.7, 2, 3])

TypeError: Expected list, got numpy.ndarray

In [11]:
list(full_sample_btsp_coeff.mean(axis=0))

[5.248361883266482,
 2.6806748188152336,
 -0.676987126104186,
 1.9695749781522967,
 2.970758927425448]