In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from multiprocessing.pool import ThreadPool
from multiprocessing import cpu_count
max_cpu = cpu_count()

In [109]:
from sklearn.linear_model import LinearRegression, LassoCV

def np_standardize(arr):
    return (arr - np.mean(arr,axis=0))/np.std(arr,axis=0)

def postlassoIV_est(y,d,w,Z):
    '''
    Post Lasso IV Estimator based on Section 2.4 of BCCH 2015. Expects numpy arrays as arguments.
    For our setting, we only have 1 endogenous regression X which enters as a column vector.
    
    Parameters:
        y - outcome vector
        d - endogenous regressor
        w - exogenous regressors (excluded instruments)
        Z - instruments for D (included instruments)
    '''
    Z_standard = np_standardize(Z)
    
    # constructing optimal instrument \hat{D}
    ## selecting instruments
    lasso = LassoCV()
    lasso.fit(np.concatenate((Z_standard,w), axis=1),d)
    var_select = [i for i, coef in enumerate(lasso.coef_.flatten()[:Z.shape[1]]) if coef != 0]
    Z_select = Z[:,var_select]
    
    ## getting \hat{D} using fitted values of OLS of d on Z_select
    lr = LinearRegression()
    lr.fit(np.concatenate((Z_select,w), axis=1),d)
    D_hat = lr.predict(np.concatenate((Z_select,w), axis=1)).T
    
    # calculating IV estimator (since we are in the just identified case due to having one endogenous regressor, the
    # standard covariance formula suffices
    #alpha = np.linalg.inv(D_hat @ d.reshape(d.shape[0],1)) @ D_hat.T @ y
    alpha = np.cov(D_hat,y)[0,1] / np.cov(D_hat,d)[0,1]
    
    return alpha

def partialw(u,w):
    u_tilde = u - w @ np.linalg.inv(w.T @ w) @ (w.T @ u)
    return u_tilde

def postlassoIV_robusttest(y,d,w,Z,a):
    '''
    Sup-score statistic for testing the hypothesis \alpha_1 = a. Robust to weak instruments.
    Section 4.2 of BCCH 2015. Modified for scalar a.
    '''
    
    # partialling out w
    y_tilde = partialw(y,w)
    d_tilde = partialw(d,w)
    Z_tilde = partialw(Z,w)
    
    # normalizing instruments
    Z_tilde_norm = np_standardize(Z_tilde)
    
    # making test statistic
    n = y_tilde.shape[0]
    temp_stats = np.array([])
    for j in range(0,Z.shape[1]):
        num = np.abs(n * np.mean((y_tilde - d_tilde * a) @ Z_tilde_norm[:,j]))
        denom = np.sqrt(np.mean(np.square(y_tilde - d_tilde * a) @ np.square(Z_tilde_norm[:,j])))
        temp_stats = np.append(temp_stats,num / denom)
    
    return temp_stats.max()

In [106]:
df = pd.read_csv('data/final_data.csv')

In [107]:
y = df['lsales'].to_numpy()
d = df['lprice'].to_numpy()
w = df.filter(regex='^cntyfe_|^timefe_').to_numpy()
Z = df.filter(regex='^aftexpl\\.dist_to_ref\\d+').to_numpy()

In [103]:
alpha_1 = postlassoIV_est(y,d,w,Z)
alpha_1

-0.6128438965704044

In [110]:
a = 0
postlassoIV_robusttest(y,d,w,Z,a)

35338.9134585225