In [1]:
import numpy as np
import numpy.linalg as la
import pandas as pd
from sklearn.linear_model import Lasso
from scipy.stats import norm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

# autoreload
%load_ext autoreload
%autoreload 2

# import data and code
# NOTE(review): the star import is presumably what supplies `dat` as well as the helpers and
# variables used further down that are not defined in the visible cells (e.g. BRT, BCCH, lasso,
# standardize, beta, var_double, standard_errors2, q, vv_outcome, vv_key) — confirm against Project_2.
from Project_2 import *
print(f'The data contains {dat.shape[0]} rows (countries) and {dat.shape[1]} columns (variables).')

The data contains 214 rows (countries) and 85 columns (variables).


### Background code

In [None]:
def var_single(x, y, coeffs):
    """Estimate the OLS covariance matrix used for single post Lasso.

    Computes ``sigma2 * inv(x.T @ x)`` where ``sigma2 = SSR / (N - K)``, ``SSR`` is the
    sum of squared residuals ``y - x @ coeffs``, ``N`` is the number of rows of ``x``
    and ``K`` its number of columns.

    Parameters
    ----------
    x : array-like of shape (N, K)
        Regressor matrix.
    y : array-like with N elements
        Outcome, either 1-D or an (N, 1) column.
    coeffs : array-like with K elements
        Coefficient estimates, either 1-D or a (K, 1) column.

    Returns
    -------
    numpy.ndarray of shape (K, K)
        Estimated covariance matrix of the coefficients.
    """
    x = np.asarray(x, dtype=float)
    # Flatten both vectors: mixing a 1-D y with a column-shaped coeffs (or the reverse)
    # would otherwise broadcast the residual to (N, N) instead of raising an error.
    y = np.asarray(y, dtype=float).reshape(-1)
    coeffs = np.asarray(coeffs, dtype=float).reshape(-1)

    N, K = x.shape
    res = y - x @ coeffs
    SSR = res @ res
    sigma2 = SSR / (N - K)

    return sigma2 * la.inv(x.T @ x)

def standard_errors1(var):
    """Return the SPL standard error of the coefficient at position 1.

    Takes the square root of the diagonal of the covariance matrix ``var`` and returns
    the second entry (index 1) as a scalar. In the regressions built in this notebook
    that position holds y0, which is stacked right after the constant column.
    """
    diag_se = np.sqrt(np.diagonal(var))
    # Same column-vector indexing as before, collapsed into a single lookup.
    return diag_se.reshape(-1, 1)[1, 0]

### Using first dataset

**Single Lasso using BRT**

In [3]:
# Step 1: Calculate BRT penalty
# NOTE(review): the saved output of this cell is "NameError: name 'X_tilde' is not defined".
# X_tilde, Z, g, y0, N, q and X_names are not defined in the visible cells and presumably have
# to come from Project_2 — confirm before relying on a fresh-kernel run.
penalty_BRT = BRT(X_tilde, g)

# Step 2: Lasso g using y0 and Z (i.e. g using X)
fit_BRTgx, coeff_BRTgx, intercept_BRTgx = lasso(X_tilde, g, penalty_BRT)

# Implied estimates and selection of non-zero coefficient variables for further analysis.
# The first coefficient is skipped when selecting columns of Z (presumably it belongs to y0).
selected_Z_BRTgx = coeff_BRTgx[1:] != 0
Z_J_BRTgx = Z[:, selected_Z_BRTgx] # Note: We use Z and not Z_tilde
selected_variables_BRTgx = (coeff_BRTgx != 0)

# Display number of variables in Z_J
if Z_J_BRTgx.size > 0: # if Z_J is non-empty and variables have been selected
    print("The number of variables in Z_J is {}".format(Z_J_BRTgx.shape[1]))
    print('Selected varriables: ', X_names[selected_variables_BRTgx].to_list())
else: # if Z_J is empty
    print("The number of variables in Z_J is 0, no variables selected.")

# Step 3: Regress g using y0 and Z_J (selected variables) using OLS
# Add a constant to X
xx = np.column_stack((np.ones(N), y0, Z_J_BRTgx))
yy = np.array(g).reshape(-1, 1)

# Calculate OLS estimate
coefs_BRT_SPL = la.inv(xx.T @ xx) @ xx.T @ yy
beta_y0_BRTSPL = coefs_BRT_SPL[1][0]

# Print the coefficients. The row labels must cover the constant, y0 and every selected Z
# column; a fixed two-label index fails as soon as the Lasso selects a control. The table is
# printed explicitly because a bare expression in the middle of a cell is not displayed.
coef_names_BRT_SPL = ['constant', 'lgdp_initial'] + X_names[1:][selected_Z_BRTgx].to_list()
print(pd.DataFrame(coefs_BRT_SPL, index=coef_names_BRT_SPL, columns=['gdp_growth']))

# Step 4: Calculate variance, standard errors and confidence interval
# Estimate variance
var_BRT_SPL = var_single(xx, yy, coefs_BRT_SPL)

# Calculate standard errors (entry 1 = y0, right after the constant)
se_BRT_SPL = standard_errors1(var_BRT_SPL)
print("se_BRT_PSL = ",se_BRT_SPL.round(5))

# Calculate confidence interval
CI_low_BRT_SPL  = beta_y0_BRTSPL - q * se_BRT_SPL
CI_high_BRT_SPL = beta_y0_BRTSPL + q * se_BRT_SPL

# Display confidence interval
print("Confidence interval for β_y0 (BRT_SPL) = ",(CI_low_BRT_SPL.round(5),CI_high_BRT_SPL.round(5)))

NameError: name 'X_tilde' is not defined

**Single Lasso using BCCH**

In [None]:
# Run the Single-post Lasso with BCCH penalty
# Step 1: Calculate the BCCH penalty
penalty_BCCH = BCCH(X_tilde, g)

# Step 2: Lasso g using X; implied estimates and selection
fit_BCCHgx, coeff_BCCHgx, intercept_BCCHgx = lasso(X_tilde, g, penalty_BCCH)

# The first coefficient is skipped when selecting columns of Z (presumably it belongs to y0).
selected_Z_BCCHgx = coeff_BCCHgx[1:] != 0
Z_J_BCCHgx = Z[:, selected_Z_BCCHgx] # Note: We use Z and not Z_stan
selected_variables_BCCHgx = (coeff_BCCHgx != 0)

# Display number of variables in Z_J
if Z_J_BCCHgx.size > 0: # if Z_J is non-empty and variables have been selected
    print("The number of variables in Z_J is {}".format(Z_J_BCCHgx.shape[1]))
    print('Selected varriables: ', X_names[selected_variables_BCCHgx].to_list())
else:
    print("The number of variables in Z_J is 0, no variables selected.")

# Step 3: Regress g using y0 and Z_J (selected variables) using OLS
# Add a constant to X
xx = np.column_stack((np.ones(N), y0, Z_J_BCCHgx))
yy = np.array(g).reshape(-1, 1)

# Calculate OLS estimate.
# The point estimate must be read from the BCCH coefficients; taking it from coefs_BRT_SPL
# would centre the BCCH confidence interval on the BRT estimate.
coefs_BCCH_SPL = la.inv(xx.T @ xx) @ xx.T @ yy
beta_y0_BCCHSPL = coefs_BCCH_SPL[1][0]

# Print the coefficients. The row labels cover the constant, y0 and every selected Z column;
# printed explicitly because a bare expression in the middle of a cell is not displayed.
coef_names_BCCH_SPL = ['constant', 'lgdp_initial'] + X_names[1:][selected_Z_BCCHgx].to_list()
print(pd.DataFrame(coefs_BCCH_SPL, index=coef_names_BCCH_SPL, columns=['gdp_growth']))

# Step 4: Calculate variance, standard errors and confidence interval
# Estimate variance
var_BCCH_SPL = var_single(xx, yy, coefs_BCCH_SPL)

# Calculate standard errors (entry 1 = y0, right after the constant)
se_BCCH_SPL = standard_errors1(var_BCCH_SPL)
print("se_BCCH_PSL = ",se_BCCH_SPL.round(5))

# Calculate confidence interval
CI_low_BCCH_SPL  = beta_y0_BCCHSPL - q * se_BCCH_SPL
CI_high_BCCH_SPL = beta_y0_BCCHSPL + q * se_BCCH_SPL

# Display confidence interval (label corrected: this is the BCCH interval, not the BRT one)
print("Confidence interval for β_y0 (BCCH_SPL) = ",(CI_low_BCCH_SPL.round(5),CI_high_BCCH_SPL.round(5)))

### Using second dataset

In [None]:
# Rows are kept only when the outcome, the key regressor and investment_rate are all observed.
required_cols2 = vv_outcome + vv_key + ['investment_rate']
included_rows2 = dat[required_cols2].notnull().all(axis=1)

# Among the kept rows, discard every column that still has at least one missing value.
data2 = dat.loc[included_rows2].dropna(axis=1)

print(f'The number of observations left in data2 is {data2.shape[0]}.')
print(f'The number of variables with no missing values is {data2.shape[1]}.')
print(f'The remaining variables are: {data2.columns.to_list()}')

In [None]:
# Outcome and key regressor, squeezed from single-column selections.
# NOTE(review): g, y0 and N are re-bound here, so cells for the first dataset that are run after
# this one would pick up the second-dataset values.
g = data2[vv_outcome].squeeze() #*100 to get it in percentage and not decimals
y0 = data2[vv_key].squeeze()

# Columns removed from the control set
dropped_cols2 = [
    "gdp_growth", "lgdp_initial",  # outcome and key explanatory variable
    "code", "constant", "gdp_initial", "lpop_initial", "pother", "europe",  # irrelevant/perfectly correlated/reference variables
]
Z_basic = data2.drop(columns=dropped_cols2)

# Degree-1 expansion without a bias column, then y0 is placed in front of the controls.
Z2 = PolynomialFeatures(degree=1, include_bias=False).fit_transform(Z_basic)
X2 = np.column_stack([y0, Z2])
N = X2.shape[0]

# Standardize data
X2_tilde, Z2_tilde, y0_tilde = (standardize(block) for block in (X2, Z2, y0))

print(f'The number of variables in Z is {Z2.shape[1]}.')

# Variable names: the control names, and the same list with the y0 name inserted first
Z2_names = Z_basic.columns
X2_names = Z2_names.insert(0, y0.name)
print(f'The first five rows are: {X2_names[:5]}')

**Single Lasso BRT**

In [None]:
# Step 1: Calculate BRT penalty
penalty_BRT2 = BRT(X2_tilde, g)

# Step 2: Lasso g using y0 and Z (i.e. g using X)
# Implied estimates and selection
fit_BRTgx2, coeff_BRTgx2, intercept_BRTgx2 = lasso(X2_tilde, g, penalty_BRT2)

# coeff_BRTgx2[0] belongs to y0 (first column of X2), so only the remaining entries select
# columns of Z2.
selected_Z_BRTgx2 = coeff_BRTgx2[1:] != 0
Z_J_BRTgx2 = Z2[:, selected_Z_BRTgx2] # Note: We use Z and not Z_stan
selected_variables_BRTgx2 = (coeff_BRTgx2 != 0)

# Display number of variables in Z_J
if Z_J_BRTgx2.size > 0: # if Z_J is non-empty and variables have been selected
    print("The number of variables in Z_J is {}".format(Z_J_BRTgx2.shape[1]))
    print('Selected varriables: ', X2_names[selected_variables_BRTgx2].to_list())
else:
    print("The number of variables in Z_J is 0, no variables selected.")

# Step 3: Regress g using y0 and Z_J (selected variables) using OLS
# Add a constant to X
xx2 = np.column_stack((np.ones(N), y0, Z_J_BRTgx2))
yy = np.array(g).reshape(-1, 1)

# Calculate OLS estimate
coefs_BRT_SPL2 = la.inv(xx2.T @ xx2) @ xx2.T @ yy
beta_y0_BRTSPL2 = coefs_BRT_SPL2[1][0]

# Print the coefficients. The labels for the controls come from Z2_names with the Z-only mask:
# using the full X2 mask would add 'lgdp_initial' a second time whenever the Lasso keeps y0,
# making the index longer than the coefficient vector. A flat list keeps a plain Index.
coef_names_BRT_SPL2 = ['constant', 'lgdp_initial'] + Z2_names[selected_Z_BRTgx2].to_list()
print(pd.DataFrame(coefs_BRT_SPL2, index=coef_names_BRT_SPL2, columns=['gdp_growth']))

# Step 4: calculate confidence interval for β_y0
# Estimate variance
var_BRT_SPL2 = var_single(xx2, yy, coefs_BRT_SPL2)

# Calculate standard errors (entry 1 = y0, right after the constant)
se_BRT_SPL2 = standard_errors1(var_BRT_SPL2)
print("se_BRT_SPL = ",se_BRT_SPL2.round(5))

# Calculate confidence interval
CI_low_BRT_SPL2  = beta_y0_BRTSPL2 - q * se_BRT_SPL2
CI_high_BRT_SPL2 = beta_y0_BRTSPL2 + q * se_BRT_SPL2

# Display confidence interval
print("Confidence interval for β_y0 (BRT_PSL) = ",(CI_low_BRT_SPL2.round(5),CI_high_BRT_SPL2.round(5)))

**Double Lasso BRT**

In [None]:
# Steps 1 and 2 are shared with PSL: the BRT penalty and the Lasso of g on X2_tilde
# (fit_BRTgx2 / coeff_BRTgx2 are reused below).

# Step 3: Lasso y0 using Z_tilde
penalty_BRTy0z2 = BRT(Z2_tilde, y0)
fit_BRTy0z2, coeff_BRTy0z2, intercept_BRTy0z2 = lasso(Z2_tilde, y0, penalty_BRTy0z2)

selected_variables_BRTy0z2 = coeff_BRTy0z2 != 0
print('Selected varriables: ', Z2_names[selected_variables_BRTy0z2].to_list())

# Step 4: Calculate β_y0 and its confidence interval
# Residuals of y0 on Z, of g on X, and the g-on-X residual with the y0 term added back
res_BRTy0z2 = y0 - fit_BRTy0z2.predict(Z2_tilde)
res_BRTgx2 = g - fit_BRTgx2.predict(X2_tilde)
res_BRTgxz2 = res_BRTgx2 + y0_tilde * coeff_BRTgx2[0]

# Point estimate
beta_y0_BRTPDL2 = beta(y0, res_BRTy0z2, res_BRTgxz2)
print("Coefficient for β_y0 (BRT_PDL2) = ",beta_y0_BRTPDL2.round(5))

# Variance and standard error
sigma2_BRT_PDL2 = var_double(X2_tilde, res_BRTy0z2, res_BRTgx2)
se_BRT_PDL2 = standard_errors2(X2_tilde, sigma2_BRT_PDL2)
print("se_BRT_PDL = ",se_BRT_PDL2.round(5))

# Confidence interval: point estimate plus/minus q standard errors
half_width_BRT_PDL2 = q * se_BRT_PDL2
CI_low_BRT_PDL2 = beta_y0_BRTPDL2 - half_width_BRT_PDL2
CI_high_BRT_PDL2 = beta_y0_BRTPDL2 + half_width_BRT_PDL2
print("Confidence interval for β_y0 (BRT_PDL) = ",(CI_low_BRT_PDL2.round(5),CI_high_BRT_PDL2.round(5)))

**Single Lasso BCCH**

In [None]:
# BCCH penalty for the Lasso of g on the standardized second-dataset regressors
penalty_BCCH2 = BCCH(X2_tilde, g)

# Implied estimates and selection
fit_BCCHgx2, coeff_BCCHgx2, intercept_BCCHgx2 = lasso(X2_tilde, g, penalty_BCCH2)

# NOTE(review): these two names carry no "2" suffix, so they overwrite the variables of the
# same name assigned in the first-dataset BCCH cell.
selected_variables_BCCHgx = (coeff_BCCHgx2 != 0)
Z_J_BCCHgx = Z2[:, selected_variables_BCCHgx[1:]] # Note: We use Z and not Z_stan

# Display number of variables in Z_J
if Z_J_BCCHgx.size == 0: # nothing was selected
    print("The number of variables in Z_J is 0, no variables selected.")
else: # Z_J is non-empty and variables have been selected
    print("The number of variables in Z_J is {}".format(Z_J_BCCHgx.shape[1]))
    print('Selected varriables: ', X2_names[selected_variables_BCCHgx].to_list())

**Double Lasso BCCH**

In [4]:
# NOTE(review): the saved output of this cell is "NameError: name 'Z2_tilde' is not defined";
# Z2_tilde is assigned in the second-dataset preparation cell, so this cell was presumably run
# before that one. It also needs fit_BCCHgx2 / coeff_BCCHgx2 from the Single Lasso BCCH cell.

# Calculate penalty rule
penalty_BCCHy0z2 = BCCH(Z2_tilde, y0)

# Run Lasso of y0 on Z
fit_BCCHy0z2, coeff_BCCHy0z2, intercept_BCCHy0z2 = lasso(Z2_tilde, y0, penalty_BCCHy0z2)

selected_variables_BCCHy0z = (coeff_BCCHy0z2 != 0)
print('Selected varriables: ', Z2_names[selected_variables_BCCHy0z].to_list())

# Calculate residuals
res_BCCHgx2 = g - fit_BCCHgx2.predict(X2_tilde)
res_BCCHgxz2 = res_BCCHgx2 + y0_tilde * coeff_BCCHgx2[0]
res_BCCHy0z2 = y0 - fit_BCCHy0z2.predict(Z2_tilde)

# Calculate and display beta_y0
beta_y0_BCCHPDL2 = beta(y0, res_BCCHy0z2, res_BCCHgxz2)
print("Coefficient for β_y0 (BCCH_PDL) = ",beta_y0_BCCHPDL2.round(5))

# Calculate variance
sigma2_BCCH_PDL2 = var_double(X2_tilde, res_BCCHy0z2, res_BCCHgx2)

# Calculate and display standard error.
# The value printed must be the one just computed (se_BCCH_PDL2); the un-suffixed
# se_BCCH_PDL is not assigned anywhere in the visible cells.
se_BCCH_PDL2 = standard_errors2(X2_tilde, sigma2_BCCH_PDL2)
print("se_BCCH_PDL = ",se_BCCH_PDL2.round(5))

# Calculate and display confidence interval
CI_low_BCCH_PDL2  = beta_y0_BCCHPDL2 - q * se_BCCH_PDL2
CI_high_BCCH_PDL2 = beta_y0_BCCHPDL2 + q * se_BCCH_PDL2
print("Confidence interval for β_y0 (BCCH_PDL) = ",(CI_low_BCCH_PDL2.round(5),CI_high_BCCH_PDL2.round(5)))

NameError: name 'Z2_tilde' is not defined

In [None]:
# Post-single-Lasso estimate under the BCCH penalty for the second dataset.
# The Single Lasso BCCH cell only performs the selection, so the OLS step is done here:
# regress g on a constant, y0 and the Z2 columns whose BCCH Lasso coefficient is non-zero
# (coeff_BCCHgx2[0] belongs to y0, the first column of X2, and is therefore skipped).
xx_BCCH2 = np.column_stack((np.ones(Z2.shape[0]), y0, Z2[:, coeff_BCCHgx2[1:] != 0]))
yy_BCCH2 = np.array(g).reshape(-1, 1)
coefs_BCCH_SPL2 = la.inv(xx_BCCH2.T @ xx_BCCH2) @ xx_BCCH2.T @ yy_BCCH2
beta_y0_BCCHSPL2 = coefs_BCCH_SPL2[1][0]
se_BCCH_SPL2 = standard_errors1(var_single(xx_BCCH2, yy_BCCH2, coefs_BCCH_SPL2))
CI_low_BCCH_SPL2  = beta_y0_BCCHSPL2 - q * se_BCCH_SPL2
CI_high_BCCH_SPL2 = beta_y0_BCCHSPL2 + q * se_BCCH_SPL2

# One row per estimator. The BCCH post-single-Lasso row now reports its own estimates instead
# of repeating the BRT ones.
BRT_PSL_results2={'β_y0': beta_y0_BRTSPL2, 'SE': se_BRT_SPL2, 'CI_low': CI_low_BRT_SPL2, 'CI_high': CI_high_BRT_SPL2}
BRT_PDL_results2={'β_y0': beta_y0_BRTPDL2, 'SE': se_BRT_PDL2, 'CI_low': CI_low_BRT_PDL2, 'CI_high': CI_high_BRT_PDL2}
BCCH_PSL_results2={'β_y0': beta_y0_BCCHSPL2, 'SE': se_BCCH_SPL2, 'CI_low': CI_low_BCCH_SPL2, 'CI_high': CI_high_BCCH_SPL2}
BCCH_PDL_results2={'β_y0': beta_y0_BCCHPDL2, 'SE': se_BCCH_PDL2, 'CI_low': CI_low_BCCH_PDL2, 'CI_high': CI_high_BCCH_PDL2}

all_results_2=pd.DataFrame([BRT_PSL_results2, BRT_PDL_results2, BCCH_PSL_results2, BCCH_PDL_results2], index=['PSL (BRT)', 'PDL (BRT)', 'PSL (BCCH)', 'PDL (BCCH)'])
# NOTE(review): index=False (same as the previous index=0) leaves the estimator labels out of
# the LaTeX table — confirm that this is intended.
print(all_results_2.to_latex(index=False))

# Penalty levels used for the Lasso of g on X2_tilde
penalty_2=pd.DataFrame([penalty_BRT2.round(4), penalty_BCCH2.round(4)], index=['BRT', 'BCCH'], columns=['Penalty'])
print(penalty_2.to_latex())