In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats

# Read the analysis dataset from CSV (path is relative to the working directory).
df = pd.read_csv('../../data/cleand_df2018.csv')

# Column names of the treatment indicator and the outcome.
treatment = "computer"
outcome = "achievement"

# Every column whose name begins with "country_" is used as a covariate.
country_cols = list(filter(lambda name: name.startswith("country_"), df.columns))

# Adjustment set: the fixed covariates followed by the country_* columns.
confounders = [
    'escs',
    'mother_educ',
    'father_educ',
    'desk',
    'room',
    'book',
    'gender',
    *country_cols,
]

# Pieces consumed by the estimators below: treatment, outcome, covariates.
T = df[treatment]
y = df[outcome]
X = df[confounders]

# Unadjusted ATE: difference in mean outcome between rows with T == 1 and
# rows with T == 0, reported with a normal-approximation 95% CI.
y1 = y[T == 1]
y0 = y[T == 0]
ate_naive = y1.mean() - y0.mean()

# Count only non-missing outcomes. .mean() and .var() skip NaN by default, so
# len() would overstate n (and understate the SE) if any outcome were missing.
n1 = int(y1.count())
n0 = int(y0.count())
if min(n1, n0) < 2:
    # With fewer than 2 observations the sample variance (ddof=1) is undefined
    # and the interval below would silently come out as NaN.
    raise ValueError(
        f"Need at least 2 non-missing outcomes per group (T==1: {n1}, T==0: {n0})"
    )

# Standard error of the difference in means, using each group's own variance.
se_naive = np.sqrt(y1.var(ddof=1) / n1 + y0.var(ddof=1) / n0)

z_95 = 1.96  # approximate two-sided 95% critical value of the standard normal
ci_low = ate_naive - z_95 * se_naive
ci_high = ate_naive + z_95 * se_naive
print(f"Unadjusted ATE: {ate_naive:.2f} (95% CI: [{ci_low:.2f}, {ci_high:.2f}])")

# Covariate-adjusted ATE: OLS of the outcome on the treatment indicator plus
# the confounders (with an intercept); the coefficient on the treatment column
# is reported as the adjusted estimate.
X_adj = pd.concat([T, X], axis=1)
X_adj = sm.add_constant(X_adj)
model = sm.OLS(y, X_adj).fit()

# Look the coefficient up through `treatment` (the column name T was selected
# with) instead of repeating the literal, so changing the treatment variable
# in one place cannot leave this reading a stale or missing coefficient.
ate_adjusted = model.params[treatment]
ci_lower, ci_upper = model.conf_int().loc[treatment]
# NOTE(review): conf_int() is based on the fit's default covariance estimate,
# whereas the unadjusted SE uses separate group variances. If
# heteroskedasticity-robust intervals are wanted, consider
# .fit(cov_type="HC1") -- this would change the reported CI; confirm first.
print(f"Adjusted ATE: {ate_adjusted:.2f} (95% CI: [{ci_lower:.2f}, {ci_upper:.2f}])")


Unadjusted ATE: 67.80 (95% CI: [67.21, 68.39])
Adjusted ATE: 15.79 (95% CI: [15.19, 16.39])
