In [101]:
import pandas as pd
import statsmodels.api as sm
from IPython.display import display, HTML

In [75]:
df = pd.read_stata("tab5v1.dta")
print(df.columns.tolist())


['id', 'vouch0', 'sex', 'svy', 'age', 'hsvisit', 'strata1', 'strata2', 'strata3', 'strata4', 'strata5', 'strata6', 'mom_sch', 'dad_sch', 'dad_miss', 'mom_miss', 'math', 'reading', 'writing', 't_site', 'totalpts', 'tsite1', 'tsite2', 'tsite3', 'sex_name', 'bog95smp', 'bog95asd', 'bog97smp', 'bog97asd', 'jam93smp', 'jam93asd', 'test_tak', 'dbogota', 'djamundi', 'd1995', 'd1993', 'd1997', 'phone', 'scyfnsh', 'darea1', 'darea2', 'darea3', 'darea4', 'darea5', 'darea6', 'darea7', 'darea8', 'darea9', 'darea10', 'darea11', 'darea12', 'darea13', 'darea14', 'darea15', 'darea16', 'darea17', 'darea18', 'darea19', 'dmonth1', 'dmonth2', 'dmonth3', 'dmonth4', 'dmonth5', 'dmonth6', 'dmonth7', 'dmonth8', 'dmonth9', 'dmonth10', 'dmonth11', 'dmonth12', 'bog95', 'bog97', 'mom_age', 'mom_mw', 'dad_age', 'dad_mw', 'sex2', 'stratams', 'age2']


In [77]:
print(df['scyfnsh'].describe())
print(df['scyfnsh'].value_counts())


count    283.000000
mean       7.749117
std        0.869473
min        5.000000
25%        8.000000
50%        8.000000
75%        8.000000
max       11.000000
Name: scyfnsh, dtype: float64
scyfnsh
8.0     203
7.0      30
6.0      20
9.0      18
5.0       9
11.0      2
10.0      1
Name: count, dtype: int64


In [78]:
df_subset = df[
    (df['test_tak'] == 1) &
    ((df['bog95smp'] == 1) | (df['bog97smp'] == 1) | (df['jam93smp'] == 1))
]


# TABLE 5

## A: All Applications

In [102]:
df_subset = df.rename(columns={'id': 'total_points'})
mask = (
    ((df_subset['bog95smp'] == 1) |
     (df_subset['bog97smp'] == 1) |
     (df_subset['jam93smp'] == 1)) &
    (df_subset['test_tak'] == 1)
)
df_exp = df_subset[mask].copy()

outcomes = {
    'Total points': 'scyfnsh',
    'Math scores': 'math',
    'Reading scores': 'reading',
    'Writing scores': 'writing'
}

results = []

for label, outcome_var in outcomes.items():
    df_model = df_exp[[outcome_var, 'vouch0']].dropna()
    X = sm.add_constant(df_model['vouch0'])
    y = df_model[outcome_var]
    model = sm.OLS(y, X).fit()
    coef = round(model.params['vouch0'], 3)
    se = round(model.bse['vouch0'], 3)
    n = df_model.shape[0]
    results.append((label, coef, se, n))

results_df['Estimate (SE)'] = results_df.apply(
    lambda row: f"{row['Coefficient']:.3f}<br>({row['Std. Error']:.3f})", axis=1
)

html_table = results_df[['Outcome', 'Estimate (SE)', 'N']].to_html(escape=False, index=False)
display(HTML(html_table))

Outcome,Estimate (SE),N
Total points,0.202 (0.103),271
Math scores,0.152 (0.122),270
Reading scores,0.182 (0.121),271
Writing scores,0.149 (0.120),271


In [109]:
# NEED TO FIX CELL BELOW

In [108]:
df_subset['pooled_score'] = df_subset[['math', 'reading', 'writing']].mean(axis=1)
df_subset['math_reading_score'] = df_subset[['math', 'reading']].mean(axis=1)

pooled_outcomes = [
    ('Pooled test scores', 'pooled_score'),
    ('Math and reading scores', 'math_reading_score')
]

results = []

for label, outcome_var in pooled_outcomes:
    df_model = df_subset[[outcome_var, 'vouch0']].dropna()
    X = sm.add_constant(df_model['vouch0'])
    y = df_model[outcome_var]
    model = sm.OLS(y, X).fit()
    
    coef = round(model.params['vouch0'], 3)
    se = round(model.bse['vouch0'], 3)
    n = df_model.shape[0]
    
    results.append((label, coef, se, n))

pooled_df['Estimate (SE)'] = pooled_df.apply(
    lambda row: f"{row['Coefficient']:.3f}<br>({row['Std. Error']:.3f})", axis=1
)
display(HTML(pooled_df[['Outcome', 'Estimate (SE)', 'N']].to_html(escape=False, index=False)))

Outcome,Estimate (SE),N
Pooled test scores,0.145 (0.095),283
Math and reading scores,0.157 (0.102),283
