In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.sandwich_covariance import cov_hc1

In [2]:
data_path = "aerdat4.dta"
voucher = pd.read_stata(data_path)

# Keep only rows that are flagged (== 1) for at least one of the three
# samples (bog95smp, bog97smp, jam93smp) AND flagged for tab3smpl.
keep_rows = (
    (
        (voucher['bog95smp'] == 1)
        | (voucher['bog97smp'] == 1)
        | (voucher['jam93smp'] == 1)
    )
    & (voucher['tab3smpl'] == 1)
)
# .copy() so that `cohort` is independent of the raw `voucher` frame
cohort = voucher.loc[keep_rows].copy()

# Columns that are not needed downstream; everything else is kept.
drop_col = [
    'sex', 'bog95asd', 'bog97asd', 'jam93asd', 'response', 'test_tak',
    'sex_name', 'bog95', 'bog97',
    'mom_sch', 'mom_age', 'mom_mw',
    'dad_sch', 'dad_age', 'dad_mw',
    'haschild', 'married', 'working',
    'hoursum', 'tab3smpl', 'working3',
]
filtered_cohort = cohort.drop(columns=drop_col)

#### Bogota 1995: male and female loser means

In [3]:
# Dependent (outcome) variables reported in the tables
dep_var = [
    'prscha_1', 'prscha_2', 'prsch_c', 'scyfnsh', 'inschl',
    'finish6', 'finish7', 'finish8',
    'rept6', 'rept', 'nrept', 'totscyrs',
]

# Bogota 1995 sample, restricted to losers (vouch0 == 0)
bog95_loser = filtered_cohort.loc[
    (filtered_cohort['bog95smp'] == 1) & (filtered_cohort['vouch0'] == 0)
]

# Mean / std / count of every column, computed separately for each sex2 value
bog95_loser_sex = bog95_loser.groupby('sex2').agg(['mean', 'std', 'count'])

# Restrict the summary to the outcome variables
bog95_loser_means = bog95_loser_sex[dep_var]

# One row per (variable, statistic), one column per sex;
# sex2 codes are relabelled 0 -> female, 1 -> male.
bog95_loser_tbl = bog95_loser_means.T.rename(columns={0: 'female', 1: 'male'})
print(bog95_loser_tbl)

sex2                female        male
prscha_1 mean     0.897163    0.857143
         std      0.304286    0.350554
         count  282.000000  280.000000
prscha_2 mean     0.698582    0.646429
         std      0.459690    0.478934
         count  282.000000  280.000000
prsch_c  mean     0.535461    0.542857
         std      0.499628    0.499052
         count  282.000000  280.000000
scyfnsh  mean     7.570922    7.400000
         std      0.933904    0.989551
         count  282.000000  280.000000
inschl   mean     0.819149    0.842857
         std      0.385579    0.364587
         count  282.000000  280.000000
finish6  mean     0.953901    0.932143
         std      0.210073    0.251951
         count  282.000000  280.000000
finish7  mean     0.868794    0.825000
         std      0.338225    0.380647
         count  282.000000  280.000000
finish8  mean     0.673759    0.589286
         std      0.469670    0.492844
         count  282.000000  280.000000
rept6    mean     0.15957

#### Bogota 1995: Female and Male Basic Controls

In [4]:
# Independent variables: the voucher indicator (vouch0) plus the basic controls.
x_var = ['vouch0', 'svy', 'hsvisit', 'djamundi', 'phone', 'age', 'sex2',
         'strata1', 'strata2', 'strata3', 'strata4', 'strata5', 'strata6', 'stratams',
         'dbogota', 'd1993', 'd1995', 'd1997',
         'dmonth1', 'dmonth2', 'dmonth3', 'dmonth4', 'dmonth5', 'dmonth6',
         'dmonth7', 'dmonth8', 'dmonth9', 'dmonth10', 'dmonth11', 'dmonth12',
         'sex_miss']

# vouch0 coefficients within the Bogota 1995 sample, one table per sex.
# The loop runs sex2 == 1 first and sex2 == 0 second, so the tables print in
# the order MALE, then FEMALE (sex2 coding: 0 = female, 1 = male, matching the
# relabelling used for the loser-means table).
for sex in [1, 0]:
    df = filtered_cohort[(filtered_cohort['sex2'] == sex) & (filtered_cohort['bog95smp'] == 1)]
    # Design matrix with an intercept.
    # NOTE(review): 'sex2' is constant inside each subsample because it is the
    # split variable, so it adds no information here; left in place so the
    # specification matches the original cell.
    X = sm.add_constant(df[x_var])
    # One OLS fit per outcome; only the vouch0 estimate and its SE are kept.
    coeff = {}
    sd = {}  # holds standard errors (results.bse), despite the 'sd' label
    for y in dep_var:
        model = sm.OLS(df[y], X)
        # HC1 = heteroskedasticity-consistent (robust) covariance.
        # The original author describes this as the SAS /acov equivalent -- TODO confirm.
        results = model.fit(cov_type='HC1')
        coeff[y] = results.params['vouch0']
        sd[y] = results.bse['vouch0']
    bog95_control = pd.DataFrame({'coeff': coeff, 'sd': sd})
    # Legacy alias: the table was originally (mis)named "bog96" although the
    # sample is bog95smp; kept so any later reference still resolves.
    bog96_control = bog95_control
    print(bog95_control)
    print({'count': len(df)})

             coeff        sd
prscha_1  0.090172  0.026306
prscha_2  0.192240  0.036229
prsch_c   0.136347  0.039494
scyfnsh   0.123496  0.077273
inschl   -0.019509  0.029418
finish6   0.014449  0.018518
finish7   0.026428  0.029756
finish8   0.094981  0.040236
rept6    -0.086613  0.037790
rept     -0.083024  0.034306
nrept    -0.101475  0.042915
totscyrs -0.028629  0.077949
{'count': 575}
             coeff        sd
prscha_1  0.022859  0.022313
prscha_2  0.143959  0.034651
prsch_c   0.171052  0.039543
scyfnsh   0.140357  0.066422
inschl    0.034716  0.027702
finish6   0.031746  0.013066
finish7   0.041145  0.025046
finish8   0.104731  0.036784
rept6    -0.036167  0.030390
rept     -0.028960  0.031614
nrept    -0.031336  0.033774
totscyrs  0.090922  0.063950
{'count': 572}


#### Combined: Female and Male Basic Controls

In [5]:
# Independent variables: voucher indicator, basic controls, strata dummies
# (strata1..strata6), site/year dummies and month dummies (dmonth1..dmonth12).
x_var = (
    ['vouch0', 'svy', 'hsvisit', 'djamundi', 'phone', 'age', 'sex2']
    + [f'strata{i}' for i in range(1, 7)]
    + ['stratams', 'dbogota', 'd1993', 'd1995', 'd1997']
    + [f'dmonth{i}' for i in range(1, 13)]
    + ['sex_miss']
)

# vouch0 coefficients on the combined sample, split by sex.
# sex2 == 1 is processed first, then sex2 == 0 (male, then female).
for sex in [1, 0]:
    df = filtered_cohort.loc[filtered_cohort['sex2'] == sex]
    # Regressors plus an intercept column
    X = sm.add_constant(df[x_var])
    # Fit every outcome in turn and keep only the vouch0 estimate / robust SE
    coeff, sd = {}, {}
    for y in dep_var:
        model = sm.OLS(df[y], X)
        results = model.fit(cov_type='HC1')
        coeff[y], sd[y] = results.params['vouch0'], results.bse['vouch0']
    combined_tbl = pd.DataFrame({'coeff': coeff, 'sd': sd})
    print(combined_tbl)
    print({'count': df.shape[0]})

             coeff        sd
prscha_1  0.062864  0.023682
prscha_2  0.168773  0.031446
prsch_c   0.123528  0.033524
scyfnsh   0.055993  0.063111
inschl   -0.026311  0.023896
finish6   0.002598  0.017397
finish7  -0.003332  0.024286
finish8   0.065651  0.030854
rept6    -0.069601  0.031504
rept     -0.075807  0.028607
nrept    -0.079082  0.035143
totscyrs -0.040871  0.067597
{'count': 779}
             coeff        sd
prscha_1  0.072180  0.021715
prscha_2  0.175611  0.029953
prsch_c   0.182161  0.032973
scyfnsh   0.121720  0.053158
inschl    0.029170  0.022708
finish6   0.027212  0.012260
finish7   0.022232  0.020372
finish8   0.078306  0.027680
rept6    -0.032620  0.023773
rept     -0.034877  0.025300
nrept    -0.037013  0.026675
totscyrs  0.081021  0.055901
{'count': 798}
