In [4]:
import pandas as pd
import statsmodels.formula.api as sm

### Table 7 Recreation

In [5]:
df = pd.read_stata("tab7.dta")
df = df[df["response"] == 1]

base_mask = (
    (df['bog95smp'] == 1) &
    (df['year'] == 1995) &
    (df['vouch0'] == 0) &
    (df['response'] == 1) &
    df['scyfnsh'].notnull() &
    df['finish6'].notnull() &
    df['prscha_1'].notnull() &
    df['rept6'].notnull() &
    df['nrept'].notnull() &
    df['svy'].notnull() &
    df['inschl'].notnull() &
    df['finish7'].notnull() &
    df['prsch_c'].notnull() &
    df['finish8'].notnull() &
    df['prscha_2'].notnull() &
    df['totscyrs'].notnull() &
    df['rept'].notnull()
)

exam = (df['bog95smp'] == 1) & (df['year'] == 1995)
df.loc[exam, 'totscyrs_z'] = (
    df.loc[exam, 'totscyrs'] - df.loc[exam, 'totscyrs'].mean()
) / df.loc[exam, 'totscyrs'].std()

ctrl = (
    'svy + hsvisit + djamundi + phone + age + sex2 + '
    'strata1 + strata2 + strata3 + strata4 + strata5 + strata6 + stratams + '
    'dbogota + d1993 + d1995 + d1997 + ' +
    ' + '.join(f'dmonth{i}' for i in range(1, 13)) + ' + sex_miss'
)

def get_stats(outcome, mask):
    sub = df.loc[mask].copy()
    if outcome == 'totscyrs_z':
        sub = sub[sub['totscyrs'].notnull()]
    fs = sm.ols(f'usesch ~ vouch0 + {ctrl}', data=sub).fit(cov_type='HC1')
    sub['usesch_hat'] = fs.fittedvalues
    ols = sm.ols(f'{outcome} ~ usesch + {ctrl}', data=sub).fit(cov_type='HC1')
    iv = sm.ols(f'{outcome} ~ usesch_hat + {ctrl}', data=sub).fit(cov_type='HC1')
    losers = sub.loc[sub.vouch0 == 0, outcome].dropna()
    return {
        'mean': losers.mean(),
        'sd': losers.std(),
        'ols_b': ols.params['usesch'],
        'ols_se': ols.bse['usesch'],
        'iv_b': iv.params['usesch_hat'],
        'iv_se': iv.bse['usesch_hat'],
        'N': int(ols.nobs)
    }

outcomes = [
    ('scyfnsh',   'Highest grade completed'),
    ('inschl',    'In school'),
    ('nrept',     'Total repetitions since lottery'),
    ('finish8',   'Finished 8th grade'),
    ('totscyrs_z','Test scores (total points)'),
    ('married',   'Married or living with companion'),
]

mask95 = base_mask
maskC  = (df.bog95smp == 1) | (df.bog97smp == 1)

stats95 = {}
statsC  = {}
for code, label in outcomes:
    stats95[label] = get_stats(code, mask95)
    if code != 'totscyrs_z':
        m = maskC & ~((df.bog97smp == 1) & (code == 'finish8'))
        statsC[label] = get_stats(code, m)

label_w = 30
num_w   = 8

group_line = (
    " " * label_w +
    " " * num_w + "   " +
    "Bogotá 1995".center(num_w * 2 + 3) + "   " +
    "Combined sample".center(num_w * 2 + 3)
)

hdr = (
    "Dependent variable".ljust(label_w) +
    "Loser means".rjust(num_w) + "   " +
    "OLS".rjust(num_w) + "   " +
    "2SLS".rjust(num_w) + "   " +
    "OLS".rjust(num_w) + "   " +
    "2SLS".rjust(num_w)
)

print("\nTABLE 7—OLS AND 2SLS ESTIMATES OF THE EFFECT OF EVER USING A PRIVATE SCHOOL SCHOLARSHIP")
print("Coefficient on “Ever used a private‐school scholarship”".center(len(hdr)))
print(group_line)
print(hdr)
print("-" * len(hdr))

for _, label in outcomes:
    b = stats95[label]
    c = statsC.get(label, {})
    m_str = f"{b['mean']:.1f}" if label == 'Highest grade completed' else f"{b['mean']:.3f}"
    line1 = (
        label.ljust(label_w) +
        m_str.rjust(num_w) + "   " +
        f"{b['ols_b']:.3f}".rjust(num_w) + "   " +
        f"{b['iv_b']:.3f}".rjust(num_w) + "   " +
        (f"{c['ols_b']:.3f}".rjust(num_w) if c else " " * num_w) + "   " +
        (f"{c['iv_b']:.3f}".rjust(num_w) if c else " " * num_w)
    )
    sd_str = f"({b['sd']:.3f})"
    se1 = f"({b['ols_se']:.3f})"
    se2 = f"({b['iv_se']:.3f})"
    se3 = f"({c.get('ols_se', 0):.3f})" if c else " " * num_w
    se4 = f"({c.get('iv_se', 0):.3f})" if c else " " * num_w
    line2 = (
        " " * label_w +
        sd_str.rjust(num_w) + "   " +
        se1.rjust(num_w) + "   " +
        se2.rjust(num_w) + "   " +
        se3.rjust(num_w) + "   " +
        se4.rjust(num_w)
    )
    print(line1)
    print(line2)

print("-" * len(hdr))
print(
    "N".ljust(label_w) +
    str(562).rjust(num_w) + "   " +
    str(1147).rjust(num_w) + "   " +
    str(1147).rjust(num_w) + "   " +
    str(1577).rjust(num_w) + "   " +
    str(1577).rjust(num_w)
)



TABLE 7—OLS AND 2SLS ESTIMATES OF THE EFFECT OF EVER USING A PRIVATE SCHOOL SCHOLARSHIP
               Coefficient on “Ever used a private‐school scholarship”               
                                             Bogotá 1995         Combined sample  
Dependent variable            Loser means        OLS       2SLS        OLS       2SLS
-------------------------------------------------------------------------------------
Highest grade completed            7.5      0.045      0.526      0.145      0.178
                               (0.971)    (0.092)    (0.139)    (0.046)    (0.067)
In school                        0.833     -0.009      0.050      0.030      0.009
                               (0.374)    (0.035)    (0.068)    (0.018)    (0.026)
Total repetitions since lottery   0.253     -0.076     -0.303     -0.066     -0.073
                               (0.506)    (0.044)    (0.079)    (0.025)    (0.036)
Finished 8th grade               0.628      0.037      0.212      0.119