In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.sandwich_covariance import cov_hc1

In [2]:
# Read the Stata file into a DataFrame.
data_path = "aerdat4.dta"
voucher = pd.read_stata(data_path)

# Columns to discard; every other column of `voucher` is kept.
drop_col = [
    'inschl', 'prsch_c', 'prscha_1', 'prscha_2', 'response',
    'rept6', 'totscyrs', 'haschild', 'married', 'working',
    'rept', 'nrept', 'finish6', 'finish7', 'finish8',
    'sex_miss', 'usngsch', 'hoursum', 'tab3smpl', 'working3',
]

# `df` is the frame the sample definitions in the later cells filter on.
df = voucher.drop(columns=drop_col)

#### A: loser means 

In [3]:
# Conditions shared by all four panel-A samples: age2 in [9, 25] and vouch0 == 0.
in_age_range = df['age2'].between(9, 25)
is_loser = df['vouch0'] == 0
A_losers = df[in_age_range & is_loser]

# Each sample narrows the shared rows with its own cohort condition.
A_lose_bog95 = A_losers[A_losers['id'] <= 4044]
A_lose_bog97 = A_losers[(A_losers['dbogota'] == 1) & (A_losers['d1997'] == 1)]
A_lose_jum93 = A_losers[A_losers['djamundi'] == 1]
A_lose_combined = A_losers[(A_losers['dbogota'] == 1) | (A_losers['djamundi'] == 1)]

# Mean, standard deviation and non-missing count per column, printed in the
# order bog95, bog97, jum93, combined.
A_loser_mean = [A_lose_bog95, A_lose_bog97, A_lose_jum93, A_lose_combined]
summary_cols = ['phone', 'age2', 'sex_name']
for sample in A_loser_mean:
    result = sample[summary_cols].agg(['mean', 'std', 'count'])
    print(result)

             phone         age2     sex_name
mean      0.881501    12.739302     0.492667
std       0.323305     1.326919     0.500113
count  1519.000000  1519.000000  1500.000000
            phone        age2    sex_name
mean     0.828125   12.738281    0.484127
std      0.378011    1.530686    0.500743
count  256.000000  256.000000  252.000000
            phone        age2    sex_name
mean     0.301205   12.692771    0.386076
std      0.460170    1.504199    0.488396
count  166.000000  166.000000  158.000000
             phone         age2     sex_name
mean      0.824833    12.735188     0.482723
std       0.380208     1.370508     0.499832
count  1941.000000  1941.000000  1910.000000


#### B: loser means

In [4]:
# Conditions shared by all four panel-B samples: age2 in [9, 25] and vouch0 == 0.
in_age_range = df['age2'].between(9, 25)
is_loser = df['vouch0'] == 0
B_losers = df[in_age_range & is_loser]

# Each sample is selected by its own *asd indicator column.
B_lose_bog95 = B_losers[B_losers['bog95asd'] == 1]
B_lose_bog97 = B_losers[B_losers['bog97asd'] == 1]
B_lose_jum93 = B_losers[B_losers['jam93asd'] == 1]
B_lose_combined = B_losers[
    (B_losers['bog95asd'] == 1)
    | (B_losers['jam93asd'] == 1)
    | (B_losers['bog97asd'] == 1)
]

# Mean, standard deviation and non-missing count per column, printed in the
# order bog95, bog97, jum93, combined.
B_loser_mean = [B_lose_bog95, B_lose_bog97, B_lose_jum93, B_lose_combined]
summary_cols = ['phone', 'age2', 'sex_name']
for sample in B_loser_mean:
    result = sample[summary_cols].agg(['mean', 'std', 'count'])
    print(result)

        phone         age2     sex_name
mean      1.0    12.767150     0.500489
std       0.0     1.340805     0.500244
count  1035.0  1035.000000  1023.000000
       phone        age2    sex_name
mean     1.0   12.646226    0.488038
std      0.0    1.521515    0.501057
count  212.0  212.000000  209.000000
            phone        age2    sex_name
mean     0.370370   12.777778    0.372093
std      0.484702    1.572462    0.485247
count  135.000000  135.000000  129.000000
             phone         age2     sex_name
mean      0.938495    12.749638     0.486407
std       0.240341     1.393420     0.499999
count  1382.000000  1382.000000  1361.000000


#### C: loser means

In [5]:
# Panel-C samples: rows with vouch0 == 0 that are flagged by a *smp indicator.
C_losers = df[df['vouch0'] == 0]
C_lose_bog95 = C_losers[C_losers['bog95smp'] == 1]
C_lose_bog97 = C_losers[C_losers['bog97smp'] == 1]
C_lose_jum93 = C_losers[C_losers['jam93smp'] == 1]
C_lose_combined = C_losers[
    (C_losers['bog95smp'] == 1)
    | (C_losers['bog97smp'] == 1)
    | (C_losers['jam93smp'] == 1)
]

# Mean, standard deviation and non-missing count per column, printed in the
# order bog95, bog97, jum93, combined.
C_loser_mean = [C_lose_bog95, C_lose_bog97, C_lose_jum93, C_lose_combined]
summary_cols = ['age', 'sex2', 'mom_sch', 'dad_sch', 'mom_age', 'dad_age', 'dad_mw']
for sample in C_loser_mean:
    result = sample[summary_cols].agg(['mean', 'std', 'count'])
    print(result)

              age        sex2     mom_sch     dad_sch     mom_age     dad_age  \
mean    15.036207    0.500858    5.892157    5.890244   40.745887   44.443231   
std      1.350647    0.500429    2.668697    2.941176    7.326463    8.140590   
count  580.000000  583.000000  510.000000  410.000000  547.000000  458.000000   

           dad_mw  
mean     0.099762  
std      0.300040  
count  421.000000  
              age        sex2     mom_sch    dad_sch     mom_age     dad_age  \
mean    13.238462    0.526718    5.887931   5.541667   38.716667   41.882353   
std      1.440305    0.501202    2.746608   2.495962    6.600463    7.293418   
count  130.000000  131.000000  116.000000  96.000000  120.000000  102.000000   

          dad_mw  
mean    0.087912  
std     0.284736  
count  91.000000  
             age       sex2    mom_sch    dad_sch    mom_age    dad_age  \
mean   17.189189   0.364865   4.385965   5.244444  43.559322  45.468085   
std     1.391517   0.484678   2.717367   2.93223

#### A: won voucher

In [6]:
# Panel-A regression samples: age2 in [9, 25] plus a cohort condition.
# There is no restriction on vouch0 here; vouch0 is the regressor of interest.
in_age_range = df['age2'].between(9, 25)
A_won_bog95 = df[in_age_range & (df['id'] <= 4044)]
A_won_bog97 = df[in_age_range & (df['dbogota'] == 1) & (df['d1997'] == 1)]
A_won_jam93 = df[in_age_range & (df['djamundi'] == 1)]
A_won_combine = df[in_age_range & ((df['dbogota'] == 1) | (df['djamundi'] == 1))]
A_won_coeff = [A_won_bog95, A_won_bog97, A_won_jam93, A_won_combine]

# Dependent variables (one regression each) and the regressors used in all of them.
y_var = ['phone', 'age2', 'sex_name']
x_var = ['vouch0', 'dbogota', 'djamundi', 'd1993', 'd1995', 'd1997']

# For every dependent variable, fit one OLS per sample (order: bog95, bog97,
# jam93, combined) and print the vouch0 coefficient with its HC1 standard error.
for y in y_var:
    for sample in A_won_coeff:
        # Drop rows with a missing dependent variable for EVERY y, not only
        # 'sex_name'. sm.OLS is called without a `missing=` option, so missing
        # values would otherwise be handed to the fit unfiltered.
        tbl = sample[sample[y].notna()]
        # Add a constant to the regressors.
        X = sm.add_constant(tbl[x_var])
        model = sm.OLS(tbl[y], X)
        results = model.fit(cov_type='HC1')
        coeff = {y: results.params['vouch0']}
        sd = {y: results.bse['vouch0']}
        A_won = pd.DataFrame({'coeff': coeff, 'sd': sd})
        print(A_won)

# NOTE: these are the sample sizes BEFORE rows with a missing dependent variable
# are dropped, so they can exceed the number of rows used in a given regression.
print('count:')
print({'bog95':len(A_won_bog95),'bog97':len(A_won_bog97),'jam93':len(A_won_jam93),'combined':len(A_won_combine)})

          coeff        sd
phone  0.008788  0.010697
          coeff        sd
phone  0.028632  0.025291
          coeff        sd
phone  0.067843  0.051749
          coeff        sd
phone  0.016637  0.009929
         coeff        sd
age2 -0.085707  0.044664
        coeff        sd
age2 -0.22747  0.101592
         coeff        sd
age2 -0.383247  0.162684
         coeff      sd
age2 -0.132819  0.0399
             coeff        sd
sex_name  0.012552  0.016934
            coeff        sd
sex_name  0.00651  0.043983
             coeff       sd
sex_name  0.113924  0.05534
             coeff        sd
sex_name  0.019368  0.015171
count:
{'bog95': 3661, 'bog97': 1736, 'jam93': 335, 'combined': 5732}


In [7]:
#### B: won voucher

In [8]:
# Panel-B regression samples: age2 in [9, 25] plus the sample's *asd indicator.
# There is no restriction on vouch0 here; vouch0 is the regressor of interest.
in_age_range = df['age2'].between(9, 25)
B_won_bog95 = df[in_age_range & (df['bog95asd'] == 1)]
B_won_bog97 = df[in_age_range & (df['bog97asd'] == 1)]
B_won_jam93 = df[in_age_range & (df['jam93asd'] == 1)]
B_won_combine = df[
    in_age_range
    & ((df['bog95asd'] == 1) | (df['bog97asd'] == 1) | (df['jam93asd'] == 1))
]
B_won_coeff = [B_won_bog95, B_won_bog97, B_won_jam93, B_won_combine]

# Dependent variables (one regression each) and the regressors used in all of them.
y_var = ['phone', 'age2', 'sex_name']
x_var = ['vouch0', 'dbogota', 'djamundi', 'd1993', 'd1995', 'd1997']

# For every dependent variable, fit one OLS per sample (order: bog95, bog97,
# jam93, combined) and print the vouch0 coefficient with its HC1 standard error.
for y in y_var:
    for sample in B_won_coeff:
        # Drop rows with a missing dependent variable for EVERY y, not only
        # 'sex_name'. sm.OLS is called without a `missing=` option, so missing
        # values would otherwise be handed to the fit unfiltered.
        tbl = sample[sample[y].notna()]
        # Add a constant to the regressors.
        X = sm.add_constant(tbl[x_var])
        model = sm.OLS(tbl[y], X)
        results = model.fit(cov_type='HC1')
        coeff = {y: results.params['vouch0']}
        sd = {y: results.bse['vouch0']}
        B_won = pd.DataFrame({'coeff': coeff, 'sd': sd})
        print(B_won)

# NOTE: these are the sample sizes BEFORE rows with a missing dependent variable
# are dropped, so they can exceed the number of rows used in a given regression.
print('count:')
print({'bog95':len(B_won_bog95),'bog97':len(B_won_bog97),'jam93':len(B_won_jam93),'combined':len(B_won_combine)})

              coeff        sd
phone -5.152129e-16  0.000966
              coeff            sd
phone  9.367507e-17  1.691363e-17
          coeff        sd
phone  0.082184  0.059682
          coeff       sd
phone  0.008027  0.00588
         coeff       sd
age2 -0.110417  0.06087
         coeff        sd
age2 -0.192837  0.136414
         coeff        sd
age2 -0.595296  0.183253
         coeff        sd
age2 -0.177093  0.052562
             coeff        sd
sex_name -0.006918  0.022201
             coeff       sd
sex_name -0.020227  0.04769
             coeff        sd
sex_name  0.101591  0.060943
             coeff       sd
sex_name  0.001133  0.01911
count:
{'bog95': 2067, 'bog97': 448, 'jam93': 272, 'combined': 2787}


#### C: won voucher

In [9]:
# Panel-C regression samples: rows flagged by a *smp indicator.
# There is no restriction on vouch0 here; vouch0 is the regressor of interest.
C_won_bog95 = df[df['bog95smp'] == 1]
C_won_bog97 = df[df['bog97smp'] == 1]
C_won_jam93 = df[df['jam93smp'] == 1]
C_won_combine = df[(df['bog95smp'] == 1) | (df['bog97smp'] == 1) | (df['jam93smp'] == 1)]
C_won_coeff = [C_won_bog95, C_won_bog97, C_won_jam93, C_won_combine]

# Dependent variables (one regression each).
y_var = ['age', 'sex2', 'mom_sch', 'dad_sch', 'mom_age', 'dad_age', 'dad_mw']
# Regressors: vouch0, the svy/hsvisit/city/year columns, then dmonth1..dmonth12
# and darea1..darea19 (same names and order as the hand-written list).
x_var = (
    ['vouch0', 'svy', 'hsvisit', 'djamundi', 'dbogota', 'd1993', 'd1995', 'd1997']
    + [f'dmonth{i}' for i in range(1, 13)]
    + [f'darea{i}' for i in range(1, 20)]
)

# For every dependent variable, fit one OLS per sample (order: bog95, bog97,
# jam93, combined) and print the vouch0 coefficient with its HC1 standard error.
for y in y_var:
    for sample in C_won_coeff:
        # Drop rows with a missing dependent variable for EVERY y, including
        # 'sex2'. sm.OLS is called without a `missing=` option, so missing
        # values would otherwise be handed to the fit unfiltered.
        tbl = sample[sample[y].notna()]
        # Add a constant to the regressors.
        X = sm.add_constant(tbl[x_var])
        model = sm.OLS(tbl[y], X)
        results = model.fit(cov_type='HC1')
        coeff = {y: results.params['vouch0']}
        sd = {y: results.bse['vouch0']}
        C_won = pd.DataFrame({'coeff': coeff, 'sd': sd})
        print(C_won)

# NOTE: these are the sample sizes BEFORE rows with a missing dependent variable
# are dropped, so they can exceed the number of rows used in a given regression.
print('count:')
print({'bog95':len(C_won_bog95),'bog97':len(C_won_bog97),'jam93':len(C_won_jam93),'combined':len(C_won_combine)})

        coeff        sd
age -0.013303  0.079095
        coeff        sd
age -0.259007  0.175177
        coeff        sd
age -0.374875  0.221477
        coeff        sd
age -0.106201  0.068724
         coeff        sd
sex2  0.003554  0.029683
         coeff        sd
sex2 -0.047065  0.062126
         coeff       sd
sex2  0.110268  0.07872
         coeff        sd
sex2  0.010124  0.025157
            coeff        sd
mom_sch -0.078517  0.168646
            coeff        sd
mom_sch  0.654107  0.380194
            coeff        sd
mom_sch  1.462128  0.505468
            coeff        sd
mom_sch  0.183343  0.146139
            coeff        sd
dad_sch -0.431082  0.203114
            coeff        sd
dad_sch  0.929246  0.399743
            coeff        sd
dad_sch  0.737059  0.661103
            coeff        sd
dad_sch -0.039288  0.172978
            coeff        sd
mom_age -0.089659  0.430998
            coeff        sd
mom_age -0.145763  0.827477
            coeff        sd
mom_age -0.736044  1.4

#### Test: loser means

In [10]:
# Read the second Stata file.
data_path2 = "tab5v1.dta"
test = pd.read_stata(data_path2)

# Rows with vouch0 == 0, summarised by mean, standard deviation and
# non-missing count per column.
test_loser = test.loc[test['vouch0'] == 0]
loser_cols = ['age', 'sex2', 'mom_sch', 'dad_sch', 'mom_age', 'dad_age', 'dad_mw']
test_loser_means = test_loser[loser_cols].agg(['mean', 'std', 'count'])
print(test_loser_means)

              age        sex2     mom_sch     dad_sch     mom_age     dad_age  \
mean    14.927419    0.451613    5.459677    4.032258   40.289256   43.495238   
std      1.397869    0.499672    2.911707    3.283206    6.575761    7.652455   
count  124.000000  124.000000  124.000000  124.000000  121.000000  105.000000   

          dad_mw  
mean    0.051546  
std     0.222258  
count  97.000000  


#### Test: won voucher

In [11]:
# Dependent variables (one regression each).
y_var = ['age', 'sex2', 'mom_sch', 'dad_sch', 'mom_age', 'dad_age', 'dad_mw']
# Regressors: vouch0, the svy/hsvisit/city/year columns, then dmonth1..dmonth12
# and darea1..darea19.
x_var = (
    ['vouch0', 'svy', 'hsvisit', 'djamundi', 'dbogota', 'd1993', 'd1995', 'd1997']
    + [f'dmonth{i}' for i in range(1, 13)]
    + [f'darea{i}' for i in range(1, 20)]
)

# One OLS per dependent variable on the `test` frame; print the vouch0
# coefficient together with its HC1 standard error.
for y in y_var:
    # Keep only the rows where the dependent variable is present.
    has_y = test[y].notna()
    tbl = test[has_y]
    # Regressors plus a constant term.
    X = sm.add_constant(tbl[x_var])
    model = sm.OLS(tbl[y], X)
    results = model.fit(cov_type='HC1')
    coeff = {y: results.params['vouch0']}
    sd = {y: results.bse['vouch0']}
    test_coeff = pd.DataFrame({'coeff': coeff, 'sd': sd})
    print(test_coeff)
# Size of the full `test` frame (before any rows are dropped for missing y).
print({'count':len(test)})

        coeff       sd
age -0.128327  0.18407
        coeff        sd
sex2  0.06202  0.064468
            coeff       sd
mom_sch -0.121869  0.38476
            coeff        sd
dad_sch -0.073584  0.433849
            coeff        sd
mom_age  0.155623  0.919073
           coeff        sd
dad_age  0.62002  1.232901
           coeff      sd
dad_mw  0.128317  0.0497
{'count': 283}
