In [139]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [140]:
# Load the two Stata files this notebook works from.
# NOTE(review): `df` is not referenced by any cell visible here; the
# regressions all read `tab5`. Confirm whether this load is still needed.
df = pd.read_stata("aerdat4.dta")
tab5 = pd.read_stata("tab5v1.dta")

In [141]:
# Show the column index of tab5 so the outcome, treatment and control names
# used in the regressions can be checked against it.
tab5.columns

Index(['id', 'vouch0', 'sex', 'svy', 'age', 'hsvisit', 'strata1', 'strata2',
       'strata3', 'strata4', 'strata5', 'strata6', 'mom_sch', 'dad_sch',
       'dad_miss', 'mom_miss', 'math', 'reading', 'writing', 't_site',
       'totalpts', 'tsite1', 'tsite2', 'tsite3', 'sex_name', 'bog95smp',
       'bog95asd', 'bog97smp', 'bog97asd', 'jam93smp', 'jam93asd', 'test_tak',
       'dbogota', 'djamundi', 'd1995', 'd1993', 'd1997', 'phone', 'scyfnsh',
       'darea1', 'darea2', 'darea3', 'darea4', 'darea5', 'darea6', 'darea7',
       'darea8', 'darea9', 'darea10', 'darea11', 'darea12', 'darea13',
       'darea14', 'darea15', 'darea16', 'darea17', 'darea18', 'darea19',
       'dmonth1', 'dmonth2', 'dmonth3', 'dmonth4', 'dmonth5', 'dmonth6',
       'dmonth7', 'dmonth8', 'dmonth9', 'dmonth10', 'dmonth11', 'dmonth12',
       'bog95', 'bog97', 'mom_age', 'mom_mw', 'dad_age', 'dad_mw', 'sex2',
       'stratams', 'age2'],
      dtype='object')

In [142]:
# Panel A, column 1: inputs shared by the column 1 regressions.

# Column of tab5 whose coefficient is reported.
treatment = "vouch0"
# The tsite1..tsite3 columns, included as controls in every column 1 fit.
controls = [f"tsite{site}" for site in range(1, 4)]
# Outcomes regressed one at a time for Panel A.
ols_vars_panelA = [
    "totalpts",
    "math",
    "reading",
    "writing",
]

def tab5_panelA_col1(vars, treatment, controls):
    """Fit one OLS per outcome on the full sample and print the treatment estimate.

    For each name in ``vars`` the rows of the global ``tab5`` frame with no
    missing value in the outcome, ``treatment`` or any column of ``controls``
    are kept. The outcome is then regressed on ``treatment`` and ``controls``
    (design matrix passed through ``sm.add_constant``).

    Parameters
    ----------
    vars : iterable of str
        Outcome column names; one regression and one printed line per entry.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Additional regressor columns.

    Returns
    -------
    None
        Results are printed: coefficient and standard error rounded to four
        decimals, plus the number of rows used.
    """
    for var in vars:
        rhs = [treatment] + controls

        # Listwise deletion over exactly the columns this fit uses.
        sample = tab5[[treatment, var] + controls].dropna()

        fit = sm.OLS(sample[var], sm.add_constant(sample[rhs])).fit()

        coef, std_err = fit.params[treatment], fit.bse[treatment]
        n = len(sample)

        print(f"{var}: coef={round(coef, 4)}, std_err={round(std_err, 4)}, n={n}")

In [143]:
# Panel A, column 1: one printed line per outcome in ols_vars_panelA.
tab5_panelA_col1(vars=ols_vars_panelA, treatment=treatment, controls=controls)

totalpts: coef=0.2168, std_err=0.1194, n=282
math: coef=0.1776, std_err=0.119, n=282
reading: coef=0.2036, std_err=0.1201, n=283
writing: coef=0.1259, std_err=0.1209, n=283


In [144]:
# Panel B, column 1.

# Outcomes used for the sex2 subsample panels (B and C).
ols_vars_panelBandC = [
    "totalpts",
    "math",
    "reading",
]

def tab5_panelB_col1(vars, treatment, controls):
    """Fit one OLS per outcome on the ``sex2 == 0`` rows and print the treatment estimate.

    For each name in ``vars`` the rows of the global ``tab5`` frame with
    ``sex2 == 0`` and no missing value in the outcome, ``treatment`` or
    ``controls`` are kept. The outcome is then regressed on ``treatment`` and
    ``controls`` (design matrix passed through ``sm.add_constant``).

    Parameters
    ----------
    vars : iterable of str
        Outcome column names; one regression and one printed line per entry.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Additional regressor columns.

    Returns
    -------
    None
        Results are printed: coefficient and standard error rounded to four
        decimals, plus the number of rows used.
    """
    for var in vars:
        needed_cols = [treatment, var] + controls

        # Subsample first, then listwise-delete on the columns actually used.
        subsample = tab5.loc[tab5['sex2'] == 0, needed_cols].dropna()

        design = sm.add_constant(subsample[[treatment] + controls])
        fit = sm.OLS(subsample[var], design).fit()

        coef = fit.params[treatment]
        std_err = fit.bse[treatment]
        n = len(subsample.index)

        print(f"{var}: coef={round(coef, 4)}, std_err={round(std_err, 4)}, n={n}")

In [145]:
# Panel B, column 1: one printed line per outcome in ols_vars_panelBandC.
tab5_panelB_col1(vars=ols_vars_panelBandC, treatment=treatment, controls=controls)

totalpts: coef=0.2412, std_err=0.1534, n=147
math: coef=0.3281, std_err=0.1447, n=147
reading: coef=0.1449, std_err=0.1567, n=148


In [146]:
# Panel C Column 1

def tab5_panelC_col1(vars, treatment, controls):
    """Fit one OLS per outcome on the ``sex2 == 1`` rows and print the treatment estimate.

    For each name in ``vars`` the rows of the global ``tab5`` frame with
    ``sex2 == 1`` and no missing value in the outcome, ``treatment`` or
    ``controls`` are kept. The outcome is then regressed on ``treatment`` and
    ``controls`` (design matrix passed through ``sm.add_constant``).

    Parameters
    ----------
    vars : iterable of str
        Outcome column names; one regression and one printed line per entry.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Additional regressor columns.

    Returns
    -------
    None
        Results are printed: coefficient and standard error rounded to four
        decimals, plus the number of rows used.
    """
    for var in vars:
        in_panel = tab5['sex2'] == 1
        complete = tab5[in_panel][[treatment, var] + controls].dropna()

        regressors = sm.add_constant(complete[[treatment] + controls])
        outcome = complete[var]
        result = sm.OLS(outcome, regressors).fit()

        coef = result.params[treatment]
        std_err = result.bse[treatment]
        n, _ = complete.shape

        print(f"{var}: coef={round(coef, 4)}, std_err={round(std_err, 4)}, n={n}")

In [147]:
# Panel C, column 1: one printed line per outcome in ols_vars_panelBandC.
tab5_panelC_col1(vars=ols_vars_panelBandC, treatment=treatment, controls=controls)

totalpts: coef=0.179, std_err=0.1834, n=135
math: coef=-0.0083, std_err=0.1884, n=135
reading: coef=0.2549, std_err=0.1841, n=135


In [148]:
# Panel A, column 2: the tsite columns plus additional covariates.

treatment = "vouch0"
# Order matches the original flat list: tsite1-3, survey/demographic columns,
# strata1-6, then the parental schooling columns and their missing flags.
controls_w_covariates = (
    [f"tsite{site}" for site in range(1, 4)]
    + ["svy", "hsvisit", "age", "sex", "mom_sch"]
    + [f"strata{stratum}" for stratum in range(1, 7)]
    + ["dad_sch", "mom_miss", "dad_miss"]
)

def tab5_panelA_col2(vars, treatment, controls):
    """Print the Panel A, column 2 treatment estimates (full sample).

    The estimation is exactly the one ``tab5_panelA_col1`` performs: per-outcome
    OLS on ``treatment`` and ``controls`` over the complete rows of the global
    ``tab5``. Only the control list the caller passes differs between the two
    columns, so this delegates instead of repeating the body. The cell that
    defines ``tab5_panelA_col1`` must have been run first.

    Parameters
    ----------
    vars : iterable of str
        Outcome column names; one regression and one printed line per entry.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Regressor columns added alongside ``treatment``.

    Returns
    -------
    None
        Results are printed by ``tab5_panelA_col1``.
    """
    tab5_panelA_col1(vars, treatment, controls)

In [149]:
# Panel A, column 2: same outcomes as column 1, with controls_w_covariates.
tab5_panelA_col2(vars=ols_vars_panelA, treatment=treatment, controls=controls_w_covariates)

totalpts: coef=0.2237, std_err=0.1091, n=282
math: coef=0.1763, std_err=0.1139, n=282
reading: coef=0.2113, std_err=0.1147, n=283
writing: coef=0.1391, std_err=0.1132, n=283


In [150]:
# Panel B Column 2

def tab5_panelB_col2(vars, treatment, controls):
    """Print the Panel B, column 2 treatment estimates (``sex2 == 0`` rows).

    The estimation is exactly the one ``tab5_panelB_col1`` performs: per-outcome
    OLS on ``treatment`` and ``controls`` over the complete ``sex2 == 0`` rows
    of the global ``tab5``. Only the control list the caller passes differs
    between the two columns, so this delegates instead of repeating the body.
    The cell that defines ``tab5_panelB_col1`` must have been run first.

    Parameters
    ----------
    vars : iterable of str
        Outcome column names; one regression and one printed line per entry.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Regressor columns added alongside ``treatment``.

    Returns
    -------
    None
        Results are printed by ``tab5_panelB_col1``.
    """
    tab5_panelB_col1(vars, treatment, controls)

In [151]:
# Panel B, column 2: same outcomes as column 1, with controls_w_covariates.
tab5_panelB_col2(vars=ols_vars_panelBandC, treatment=treatment, controls=controls_w_covariates)

totalpts: coef=0.3046, std_err=0.1324, n=147
math: coef=0.3686, std_err=0.1361, n=147
reading: coef=0.1775, std_err=0.1494, n=148


In [152]:
# Panel C Column 2

def tab5_panelC_col2(vars, treatment, controls):
    """Print the Panel C, column 2 treatment estimates (``sex2 == 1`` rows).

    The estimation is exactly the one ``tab5_panelC_col1`` performs: per-outcome
    OLS on ``treatment`` and ``controls`` over the complete ``sex2 == 1`` rows
    of the global ``tab5``. Only the control list the caller passes differs
    between the two columns, so this delegates instead of repeating the body.
    The cell that defines ``tab5_panelC_col1`` must have been run first.

    Parameters
    ----------
    vars : iterable of str
        Outcome column names; one regression and one printed line per entry.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Regressor columns added alongside ``treatment``.

    Returns
    -------
    None
        Results are printed by ``tab5_panelC_col1``.
    """
    tab5_panelC_col1(vars, treatment, controls)

In [153]:
# Panel C, column 2: same outcomes as column 1, with controls_w_covariates.
tab5_panelC_col2(vars=ols_vars_panelBandC, treatment=treatment, controls=controls_w_covariates)

totalpts: coef=0.1497, std_err=0.1858, n=135
math: coef=-0.0134, std_err=0.1926, n=135
reading: coef=0.2186, std_err=0.1864, n=135


In [154]:
# Panel A Column 3 & 4 (Pooled)

# NOTE: this rebinds `controls_w_covariates`, the name the column 2 cells also
# pass as their control list. This version has no strata columns and uses
# 'sex2' instead of 'sex'. Re-running a column 2 cell after this one will
# silently use this list.
controls_w_covariates = [
    "tsite1", "tsite2", "tsite3",
    "svy", "hsvisit", "age", "sex2",
    "mom_sch", "dad_sch", "mom_miss", "dad_miss",
]

def tab5_panelA_col3or4_pooled(vars, treatment, controls, with_covariates=True):
    """Print the stacked math/reading/writing treatment estimate for the full sample.

    The three score columns of the global ``tab5`` are melted into a single
    ``score`` column (one row per original row and subject) and rows with a
    missing score are dropped. ``score`` is regressed by OLS on ``treatment``,
    subject dummies (first level dropped) and, optionally, ``controls``.
    Standard errors are clustered on ``id``.

    Parameters
    ----------
    vars : iterable of str
        Row labels only. One line is printed per label; the regression does
        not depend on them, so it is fitted once and reused.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Columns carried through the reshape. They enter the regression only
        when ``with_covariates`` is true.
    with_covariates : bool, default True
        If false, the regressors are ``treatment`` and the subject dummies
        only, and the line is labelled ``col3``; otherwise ``col4``.
        NOTE(review): the false branch also drops the tsite columns in
        ``controls``, whereas the column 1 functions always include them.
        Confirm that this is intended.

    Returns
    -------
    None
        Results are printed.
    """
    labels = list(vars)
    if not labels:
        # Nothing to print, so skip the fit as the original loop did.
        return

    subjects = ['math', 'reading', 'writing']
    id_cols = ['id', treatment] + controls

    long = tab5[id_cols + subjects].melt(
        id_vars=id_cols, value_vars=subjects, var_name='subject', value_name='score'
    ).dropna(subset=['score'])

    subject = pd.get_dummies(long['subject'], drop_first=True)
    base_cols = [treatment] + (controls if with_covariates else [])
    # get_dummies may return non-float columns; cast the whole design matrix.
    X = sm.add_constant(pd.concat([long[base_cols], subject], axis=1)).astype(float)
    y = long['score'].astype(float)

    # Each id contributes up to one row per subject, hence clustering on id.
    model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': long['id']})

    coef = model.params[treatment]
    std_err = model.bse[treatment]
    n = len(y)
    col_num = 4 if with_covariates else 3

    for var in labels:
        print(f"{var:9s}  col{col_num}  coef={coef: .3f},  se={std_err: .3f},  N={n}")

# Label printed at the start of each result line.
test_vars = ['pooled_test_scores']

print("-- column 3 panel A (no extra covariates) --")
tab5_panelA_col3or4_pooled(
    vars=test_vars,
    treatment=treatment,
    controls=controls_w_covariates,
    with_covariates=False,
)

print("\n-- column 4 panel A (with covariates) --")
tab5_panelA_col3or4_pooled(
    vars=test_vars,
    treatment=treatment,
    controls=controls_w_covariates,
    with_covariates=True,
)

-- column 3 panel A (no extra covariates) --
pooled_test_scores  col3  coef= 0.145,  se= 0.097,  N=848

-- column 4 panel A (with covariates) --
pooled_test_scores  col4  coef= 0.155,  se= 0.089,  N=848


In [155]:
# Panel A Column 3 & 4 (Math and Reading Scores)

def tab5_panelA_col3or4_MandR(vars, treatment, controls, with_covariates=True):
    """Print the stacked math/reading treatment estimate for the full sample.

    The ``math`` and ``reading`` columns of the global ``tab5`` are melted
    into a single ``score`` column (one row per original row and subject) and
    rows with a missing score are dropped. ``score`` is regressed by OLS on
    ``treatment``, a subject dummy and, optionally, ``controls``. Standard
    errors are clustered on ``id``.

    Parameters
    ----------
    vars : iterable of str
        Row labels only. One line is printed per label; the regression does
        not depend on them, so it is fitted once and reused.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Columns carried through the reshape. They enter the regression only
        when ``with_covariates`` is true.
    with_covariates : bool, default True
        If false, the regressors are ``treatment`` and the subject dummy
        only, and the line is labelled ``col3``; otherwise ``col4``.
        NOTE(review): the false branch also drops the tsite columns in
        ``controls``, whereas the column 1 functions always include them.
        Confirm that this is intended.

    Returns
    -------
    None
        Results are printed.
    """
    labels = list(vars)
    if not labels:
        # Nothing to print, so skip the fit as the original loop did.
        return

    subjects = ['math', 'reading']
    id_cols = ['id', treatment] + controls

    long = tab5[id_cols + subjects].melt(
        id_vars=id_cols, value_vars=subjects, var_name='subject', value_name='score'
    ).dropna(subset=['score'])

    subject = pd.get_dummies(long['subject'], drop_first=True)
    base_cols = [treatment] + (controls if with_covariates else [])
    # get_dummies may return non-float columns; cast the whole design matrix.
    X = sm.add_constant(pd.concat([long[base_cols], subject], axis=1)).astype(float)
    y = long['score'].astype(float)

    # Each id contributes up to one row per subject, hence clustering on id.
    model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': long['id']})

    coef = model.params[treatment]
    std_err = model.bse[treatment]
    n = len(y)
    col_num = 4 if with_covariates else 3

    for var in labels:
        print(f"{var:9s}  col{col_num}  coef={coef: .3f},  se={std_err: .3f},  N={n}")

# Label printed at the start of each result line.
test_vars = ['math_and_reading']

print("-- column 3 panel A (no extra covariates) --")
tab5_panelA_col3or4_MandR(
    vars=test_vars,
    treatment=treatment,
    controls=controls_w_covariates,
    with_covariates=False,
)

print("\n-- column 4 panel A (with covariates) --")
tab5_panelA_col3or4_MandR(
    vars=test_vars,
    treatment=treatment,
    controls=controls_w_covariates,
    with_covariates=True,
)

-- column 3 panel A (no extra covariates) --
math_and_reading  col3  coef= 0.157,  se= 0.103,  N=565

-- column 4 panel A (with covariates) --
math_and_reading  col4  coef= 0.170,  se= 0.096,  N=565


In [156]:
# Panel B Column 3 & 4 (Math and Reading Scores)

def tab5_panelB_col3or4_MandR(vars, treatment, controls, with_covariates=True):
    """Print the stacked math/reading treatment estimate for the ``sex2 == 0`` rows.

    The ``sex2 == 0`` rows of the global ``tab5`` are selected, and their
    ``math`` and ``reading`` columns are melted into a single ``score`` column
    (one row per original row and subject). Rows with a missing score are
    dropped. ``score`` is regressed by OLS on ``treatment``, a subject dummy
    and, optionally, ``controls``. Standard errors are clustered on ``id``.

    The subsample filter is applied to ``tab5`` before reshaping, so it works
    whether or not 'sex2' is listed in ``controls``. Filtering the melted
    frame instead raised KeyError when it was not.

    Parameters
    ----------
    vars : iterable of str
        Row labels only. One line is printed per label; the regression does
        not depend on them, so it is fitted once and reused.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Columns carried through the reshape. They enter the regression only
        when ``with_covariates`` is true.
        NOTE(review): 'sex2' is constant within this subsample, so listing it
        here adds a redundant regressor. Confirm that this is intended.
    with_covariates : bool, default True
        If false, the regressors are ``treatment`` and the subject dummy
        only, and the line is labelled ``col3``; otherwise ``col4``.
        NOTE(review): the false branch also drops the tsite columns in
        ``controls``, whereas the column 1 functions always include them.
        Confirm that this is intended.

    Returns
    -------
    None
        Results are printed.
    """
    labels = list(vars)
    if not labels:
        # Nothing to print, so skip the fit as the original loop did.
        return

    subjects = ['math', 'reading']
    id_cols = ['id', treatment] + controls

    # Select the subsample on the wide frame; row order after melting is the
    # same as filtering afterwards.
    wide = tab5.loc[tab5['sex2'] == 0, id_cols + subjects]
    long = wide.melt(
        id_vars=id_cols, value_vars=subjects, var_name='subject', value_name='score'
    ).dropna(subset=['score'])

    subject = pd.get_dummies(long['subject'], drop_first=True)
    base_cols = [treatment] + (controls if with_covariates else [])
    # get_dummies may return non-float columns; cast the whole design matrix.
    X = sm.add_constant(pd.concat([long[base_cols], subject], axis=1)).astype(float)
    y = long['score'].astype(float)

    # Each id contributes up to one row per subject, hence clustering on id.
    model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': long['id']})

    coef = model.params[treatment]
    std_err = model.bse[treatment]
    n = len(y)
    col_num = 4 if with_covariates else 3

    for var in labels:
        print(f"{var:9s}  col{col_num}  coef={coef: .3f},  se={std_err: .3f},  N={n}")

# Label printed at the start of each result line.
test_vars = ['math_and_reading']

print("-- column 3 panel B (no extra covariates) --")
tab5_panelB_col3or4_MandR(
    vars=test_vars,
    treatment=treatment,
    controls=controls_w_covariates,
    with_covariates=False,
)

print("\n-- column 4 panel B (with covariates) --")
tab5_panelB_col3or4_MandR(
    vars=test_vars,
    treatment=treatment,
    controls=controls_w_covariates,
    with_covariates=True,
)

-- column 3 panel B (no extra covariates) --
math_and_reading  col3  coef= 0.173,  se= 0.128,  N=295

-- column 4 panel B (with covariates) --
math_and_reading  col4  coef= 0.262,  se= 0.112,  N=295


In [157]:
# Panel C Column 3 & 4 (Math and Reading Scores)

def tab5_panelC_col3or4_MandR(vars, treatment, controls, with_covariates=True):
    """Print the stacked math/reading treatment estimate for the ``sex2 == 1`` rows.

    The ``sex2 == 1`` rows of the global ``tab5`` are selected, and their
    ``math`` and ``reading`` columns are melted into a single ``score`` column
    (one row per original row and subject). Rows with a missing score are
    dropped. ``score`` is regressed by OLS on ``treatment``, a subject dummy
    and, optionally, ``controls``. Standard errors are clustered on ``id``.

    The subsample filter is applied to ``tab5`` before reshaping, so it works
    whether or not 'sex2' is listed in ``controls``. Filtering the melted
    frame instead raised KeyError when it was not.

    Parameters
    ----------
    vars : iterable of str
        Row labels only. One line is printed per label; the regression does
        not depend on them, so it is fitted once and reused.
    treatment : str
        Column whose coefficient and standard error are reported.
    controls : list of str
        Columns carried through the reshape. They enter the regression only
        when ``with_covariates`` is true.
        NOTE(review): 'sex2' is constant within this subsample, so listing it
        here adds a redundant regressor. Confirm that this is intended.
    with_covariates : bool, default True
        If false, the regressors are ``treatment`` and the subject dummy
        only, and the line is labelled ``col3``; otherwise ``col4``.
        NOTE(review): the false branch also drops the tsite columns in
        ``controls``, whereas the column 1 functions always include them.
        Confirm that this is intended.

    Returns
    -------
    None
        Results are printed.
    """
    labels = list(vars)
    if not labels:
        # Nothing to print, so skip the fit as the original loop did.
        return

    subjects = ['math', 'reading']
    id_cols = ['id', treatment] + controls

    # Select the subsample on the wide frame; row order after melting is the
    # same as filtering afterwards.
    wide = tab5.loc[tab5['sex2'] == 1, id_cols + subjects]
    long = wide.melt(
        id_vars=id_cols, value_vars=subjects, var_name='subject', value_name='score'
    ).dropna(subset=['score'])

    subject = pd.get_dummies(long['subject'], drop_first=True)
    base_cols = [treatment] + (controls if with_covariates else [])
    # get_dummies may return non-float columns; cast the whole design matrix.
    X = sm.add_constant(pd.concat([long[base_cols], subject], axis=1)).astype(float)
    y = long['score'].astype(float)

    # Each id contributes up to one row per subject, hence clustering on id.
    model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': long['id']})

    coef = model.params[treatment]
    std_err = model.bse[treatment]
    n = len(y)
    col_num = 4 if with_covariates else 3

    for var in labels:
        print(f"{var:9s}  col{col_num}  coef={coef: .3f},  se={std_err: .3f},  N={n}")

# Label printed at the start of each result line.
test_vars = ['math_and_reading']

print("-- column 3 panel C (no extra covariates) --")
tab5_panelC_col3or4_MandR(
    vars=test_vars,
    treatment=treatment,
    controls=controls_w_covariates,
    with_covariates=False,
)

print("\n-- column 4 panel C (with covariates) --")
tab5_panelC_col3or4_MandR(
    vars=test_vars,
    treatment=treatment,
    controls=controls_w_covariates,
    with_covariates=True,
)

-- column 3 panel C (no extra covariates) --
math_and_reading  col3  coef= 0.114,  se= 0.163,  N=270

-- column 4 panel C (with covariates) --
math_and_reading  col4  coef= 0.066,  se= 0.154,  N=270
