In [468]:
import numpy as np
import pandas as pd
import math
import scipy as sp
import statsmodels.discrete.discrete_model as sm_model
import statsmodels.tools as sm_tools
import statsmodels.api as sm
from sklearn.utils import resample

In [328]:
file1_path = "nswre74_treated.mat"
mat_data = sp.io.loadmat(file1_path)
mat_data.keys()

dict_keys(['__header__', '__version__', '__globals__', 't', 'age', 'education', 'black', 'hispanic', 'married', 'nodegree', 're74', 're75', 're78', 'u74', 'u75'])

In [249]:
# Extract the variables
treated = mat_data['t'].flatten()
age = mat_data['age'].flatten()
educ = mat_data['education'].flatten()
black = mat_data['black'].flatten()
hisp = mat_data['hispanic'].flatten()
married = mat_data['married'].flatten()
nodegree = mat_data['nodegree'].flatten()
earn_74 = mat_data['re74'].flatten()
earn_75 = mat_data['re75'].flatten()
earn_78 = mat_data['re78'].flatten()
u_74 = mat_data['u74'].flatten()
u_75 = mat_data['u75'].flatten()

# Create a DataFrame
df_treat = pd.DataFrame({
    'treated': treated,
    'age': age,
    'educ': educ,
    'black': black,
    'hisp': hisp,
    'married': married,
    'nodegree': nodegree,
    'earn_74': earn_74/1000,
    'earn_75': earn_75/1000,
    'earn_78': earn_78/1000,
    'u_74': u_74,
    'u_75': u_75
})
df_treat

Unnamed: 0,treated,age,educ,black,hisp,married,nodegree,earn_74,earn_75,earn_78,u_74,u_75
0,1,37,11,1,0,1,1,0.00000,0.00000,9.930046,1,1
1,1,22,9,0,1,0,1,0.00000,0.00000,3.595894,1,1
2,1,30,12,1,0,0,0,0.00000,0.00000,24.909450,1,1
3,1,27,11,1,0,0,1,0.00000,0.00000,7.506146,1,1
4,1,33,8,1,0,0,1,0.00000,0.00000,0.289790,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
180,1,33,12,1,0,1,0,20.27995,10.94135,15.952600,0,0
181,1,25,14,1,0,1,0,35.04007,11.53657,36.646950,0,0
182,1,35,9,1,0,1,1,13.60243,13.83064,12.803970,0,0
183,1,35,8,1,0,1,1,13.73207,17.97615,3.786628,0,0


In [298]:
file3_path = "cps_controls.mat"
mat_data = sp.io.loadmat(file3_path)
mat_data.keys()

dict_keys(['__header__', '__version__', '__globals__', 't', 'age', 'education', 'black', 'hispanic', 'married', 'nodegree', 're74', 're75', 're78', 'u74', 'u75'])

In [303]:
file3_path = "cps_controls.mat"
mat_data = sp.io.loadmat(file3_path)
mat_data.keys()

# Extract the variables
treated = mat_data['t'].flatten()
age = mat_data['age'].flatten()
educ = mat_data['education'].flatten()
black = mat_data['black'].flatten()
hisp = mat_data['hispanic'].flatten()
married = mat_data['married'].flatten()
nodegree = mat_data['nodegree'].flatten()
earn_74 = mat_data['re74'].flatten()
earn_75 = mat_data['re75'].flatten()
earn_78 = mat_data['re78'].flatten()
u_74 = mat_data['u74'].flatten()
u_75 = mat_data['u75'].flatten()

# Create a DataFrame
df_control = pd.DataFrame({
    'treated': treated,
    'age': age,
    'educ': educ,
    'black': black,
    'hisp': hisp,
    'married': married,
    'nodegree': nodegree,
    'earn_74': earn_74/1000,
    'earn_75': earn_75/1000,
    'earn_78': earn_78/1000,
    'u_74': u_74,
    'u_75': u_75,
})

1.a. Contruct table with normalized differences in covars

In [305]:
treat_covars = df_treat.iloc[:, 1:].drop(columns=['earn_78'])
control_covars = df_control.iloc[:, 1:].drop(columns=['earn_78'])

In [307]:
df_diff = treat_covars.mean()-control_covars.mean()
df_diff

age         -7.409021
educ        -1.681568
black        0.769706
hisp        -0.012577
married     -0.522542
nodegree     0.412273
earn_74    -11.921227
earn_75    -12.118748
u_74         0.588486
u_75         0.490695
dtype: float64

In [322]:
s_0 = (np.sum((control_covars - control_covars.mean())**2))/(len(control_covars)-1)
s_0

  return reduction(axis=axis, out=out, **passkwargs)


age         121.996792
educ          8.241754
black         0.068133
hisp          0.066851
married       0.205183
nodegree      0.208330
earn_74      91.580993
earn_75      85.940376
u_74          0.105319
u_75          0.097363
dtype: float64

In [327]:
s_1 = (np.sum((treat_covars - treat_covars.mean())**2))/(len(treat_covars)-1)
s_1

  return reduction(axis=axis, out=out, **passkwargs)


age         51.194301
educ         4.042714
black        0.132902
hisp         0.056228
married      0.154230
nodegree     0.207814
earn_74     23.879058
earn_75     10.363576
u_74         0.207814
u_75         0.241304
dtype: float64

In [310]:
norm_dist = df_diff/np.sqrt((s_0 + s_1)/2)
norm_dist

age        -0.796183
educ       -0.678502
black       2.427747
hisp       -0.050697
married    -1.232648
nodegree    0.903811
earn_74    -1.568990
earn_75    -1.746428
u_74        1.487257
u_75        1.192450
dtype: float64

b. Estimate propensity score (linear covars)

In [311]:
df = pd.concat([df_treat, df_control], axis=0, ignore_index=True)
df

Unnamed: 0,treated,age,educ,black,hisp,married,nodegree,earn_74,earn_75,earn_78,u_74,u_75
0,1,37,11,1,0,1,1,0.000000,0.000000,9.930046,1,1
1,1,22,9,0,1,0,1,0.000000,0.000000,3.595894,1,1
2,1,30,12,1,0,0,0,0.000000,0.000000,24.909450,1,1
3,1,27,11,1,0,0,1,0.000000,0.000000,7.506146,1,1
4,1,33,8,1,0,0,1,0.000000,0.000000,0.289790,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
16172,0,22,12,1,0,0,0,3.975352,6.801435,2.757438,0,0
16173,0,20,12,1,0,1,0,1.445939,11.832240,6.895072,0,0
16174,0,37,12,0,0,0,0,1.733951,1.559371,4.221865,0,0
16175,0,47,9,0,0,1,1,16.914350,11.384660,13.671930,0,0


In [315]:
covars = df.iloc[:, 1:].drop(columns=['earn_78'])
covars = sm_tools.add_constant(covars)
treat = df['treated']

In [316]:
probit = sm_model.Probit(treat, covars)
probit_results = probit.fit(maxiter=10000)
print(probit_results.summary())

Optimization terminated successfully.
         Current function value: -0.000000
         Iterations 9612
                          Probit Regression Results                           
Dep. Variable:                treated   No. Observations:                16177
Model:                         Probit   Df Residuals:                    16166
Method:                           MLE   Df Model:                           10
Date:                Sat, 10 May 2025   Pseudo R-squ.:                     nan
Time:                        14:33:21   Log-Likelihood:                 0.0000
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.2856   8.74e+08   8.34e-09      1.000   -1.71e+09    1.71e+09
age            0.0982    

  return 1 - self.llf/self.llnull


In [333]:
# Calculate propensity scores
df['propensity_score'] = probit_results.predict(covars, linear=True)

# Examine the distribution of propensity scores
print(df['propensity_score'].describe())

count    16177.000000
mean        10.528588
std          1.269856
min          8.314903
25%          9.449845
50%         10.323356
75%         11.525749
max         13.738076
Name: propensity_score, dtype: float64


c. Calculate weighted ATT

In [338]:
df['weight'] = np.where(
    df['treated'] == 1,
    1,
    df['propensity_score'] / (1 - df['propensity_score'])    # equation when flag == 0
)

In [341]:
weights_control = df['weight'][df['treated'] == 0]
weights_control_norm = weights_control * len(weights_control)/sum(weights_control)
print(weights_control_norm.describe())

count    15992.000000
mean         1.000000
std          0.012524
min          0.974515
25%          0.989298
50%          1.000418
75%          1.010493
max          1.027105
Name: weight, dtype: float64


In [346]:
top_5_weights = weights_control_norm.sort_values(ascending=False).head(5)
top_5_weights

10833    1.027105
704      1.026181
884      1.026053
7079     1.025724
15673    1.025685
Name: weight, dtype: float64

In [349]:
covars.loc[top_5_weights.index]

Unnamed: 0,const,age,educ,black,hisp,married,nodegree,earn_74,earn_75,u_74,u_75
10833,1.0,17,14,0,0,0,0,1.780974,6.615242,0,0
704,1.0,17,14,0,0,0,0,0.0,1.226371,1,0
884,1.0,17,12,0,0,0,0,0.423202,5.061242,0,0
7079,1.0,17,12,0,0,0,0,0.211601,1.484177,0,0
15673,1.0,19,16,0,0,0,0,0.215519,2.857355,0,0


In [358]:
def att(df):
    treated = df['treated'] == 1
    control = df['treated'] == 0
        
    weights_control = df['propensity_score'][control] / (1 - df['propensity_score'][control])
    weights_control_norm = weights_control * len(weights_control) / sum(weights_control)
        
    att = np.mean(df['earn_78'][treated]) - np.average(df['earn_78'][control], weights=weights_control_norm)
    return att

In [359]:
att_estimate = att(df)
att_estimate

np.float64(-8.472230160094853)

In [362]:
att_boots = []
for _ in range(1000):
    boot_sample = resample(df)
    att_boots.append(att(boot_sample))

att_std_error = np.std(att_boots)
print("SE=", att_std_error)

SE= 0.574976052363246


d. Repeat with logit

In [363]:
logit = sm_model.Logit(treat, covars)
logit_results = logit.fit(maxiter=10000)
print("logit:",logit_results.summary())

Optimization terminated successfully.
         Current function value: inf
         Iterations 12
logit:                            Logit Regression Results                           
Dep. Variable:                treated   No. Observations:                16177
Model:                          Logit   Df Residuals:                    16166
Method:                           MLE   Df Model:                           10
Date:                Sat, 10 May 2025   Pseudo R-squ.:                     inf
Time:                        22:03:49   Log-Likelihood:                   -inf
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.3408      0.812     -7.804      0.000      -7.933      -4.748
age           -0.0179     

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


In [367]:
# Calculate propensity scores
df['propensity_score'] = logit_results.predict(covars, linear=True)

# Examine the distribution of propensity scores
print(df['propensity_score'].describe())

count    16177.000000
mean        -7.947289
std          2.409537
min        -12.267045
25%         -9.937779
50%         -8.475962
75%         -6.350024
max          0.564578
Name: propensity_score, dtype: float64




In [368]:
df['weight'] = np.where(
    df['treated'] == 1,
    1,
    df['propensity_score'] / (1 - df['propensity_score'])    # equation when flag == 0
)

In [370]:
weights_control = df['weight'][df['treated'] == 0]
weights_control_norm = weights_control * len(weights_control)/sum(weights_control)
print(weights_control_norm.describe())

count    15992.000000
mean         1.000000
std          0.122440
min         -1.485408
25%          0.991857
50%          1.025243
75%          1.040981
max          1.059250
Name: weight, dtype: float64


In [371]:
top_5_weights = weights_control_norm.sort_values(ascending=False).head(5)
top_5_weights

7480     1.059250
14533    1.058716
15980    1.057552
7045     1.057309
9098     1.057222
Name: weight, dtype: float64

In [372]:
covars.loc[top_5_weights.index]

Unnamed: 0,const,age,educ,black,hisp,married,nodegree,earn_74,earn_75,u_74,u_75
7480,1.0,46,16,0,0,1,0,0.869914,25.24355,0,0
14533,1.0,51,16,0,0,1,0,3.60701,25.24355,0,0
15980,1.0,37,18,0,0,1,0,1.76334,25.24355,0,0
7045,1.0,44,12,0,0,1,0,6.208916,25.24355,0,0
9098,1.0,45,18,0,0,1,0,4.829593,25.24355,0,0


In [373]:
att_estimate_logit = att(df)
att_estimate_logit

np.float64(-8.799531451887969)

In [375]:
att_logit_boots = []
for _ in range(1000):
    boot_sample = resample(df)
    att_logit_boots.append(att(boot_sample))

att_logit_std_error = np.std(att_logit_boots)
print("SE=", att_logit_std_error)

SE= 0.5692725643236859


e. Construct strata

In [377]:
treated_ps = df[df['treated'] == 1]['propensity_score']
quintiles = np.percentile(treated_ps, [20, 40, 60, 80])
df['stratum'] = pd.cut(df['propensity_score'], 
                       bins=[-np.inf] + list(quintiles) + [np.inf], 
                       labels=[1, 2, 3, 4, 5])
control_counts = df[df['treated'] == 0]['stratum'].value_counts().sort_index()
control_counts

stratum
1    15640
2      228
3       57
4       41
5       26
Name: count, dtype: int64

f. Normalized differences in each stratum

In [427]:
s_0.index.values

array(['age', 'educ', 'black', 'hisp', 'married', 'nodegree', 'earn_74',
       'earn_75', 'u_74', 'u_75'], dtype=object)

In [441]:
s_0['age']

np.float64(3.895384615384616)

In [442]:
norm_dist[stratum]

{'age': age         1.010489
 educ        0.036830
 black       0.000000
 hisp        0.000000
 married     0.000000
 nodegree    0.000000
 earn_74     0.000000
 earn_75     0.000000
 u_74        0.000000
 u_75        0.000000
 dtype: float64,
 'educ': age         2.566712
 educ        0.093550
 black       0.000000
 hisp        0.000000
 married     0.000000
 nodegree    0.000000
 earn_74     0.000000
 earn_75     0.000000
 u_74        0.000000
 u_75        0.000000
 dtype: float64,
 'black': 0,
 'hisp': 0,
 'married': 0,
 'nodegree': 0,
 'earn_74': 0,
 'earn_75': 0,
 'u_74': 0,
 'u_75': 0}

In [450]:
norm_dist[1]

{'age': age        -0.834958
 educ       -0.130291
 black       0.030015
 hisp        0.013162
 married    -0.035405
 nodegree    0.028087
 earn_74    -1.046289
 earn_75    -1.075145
 u_74        0.036003
 u_75        0.021974
 dtype: float64,
 'educ': age        -3.188174
 educ       -0.497498
 black       0.114607
 hisp        0.050257
 married    -0.135190
 nodegree    0.107245
 earn_74    -3.995115
 earn_75    -4.105295
 u_74        0.137471
 u_75        0.083904
 dtype: float64,
 'black': age        -20.105287
 educ        -3.137325
 black        0.722733
 hisp         0.316932
 married     -0.852538
 nodegree     0.676310
 earn_74    -25.194028
 earn_75    -25.888848
 u_74         0.866923
 u_75         0.529113
 dtype: float64,
 'hisp': age        -22.384220
 educ        -3.492940
 black        0.804654
 hisp         0.352857
 married     -0.949173
 nodegree     0.752970
 earn_74    -28.049768
 earn_75    -28.823346
 u_74         0.965188
 u_75         0.589088
 dtype: float64,


why is this spitting out too many lists for each cov? there should just be one per stratum

In [453]:
norm_dist = {}
for stratum in range(1, 6):
    strat_cov = {}
    stratum_data = df[df['stratum'] == stratum].drop(columns=['earn_78','propensity_score', 'stratum','weight', 'weight_log'])
    treat_covars = stratum_data[stratum_data['treated'] == 1].iloc[:, 1:]
    control_covars = stratum_data[stratum_data['treated'] == 0].iloc[:, 1:]
    df_diff = treat_covars.mean()-control_covars.mean()
    s_0 = (np.sum((control_covars - control_covars.mean())**2))/(len(control_covars)-1)
    s_1 = (np.sum((treat_covars - treat_covars.mean())**2))/(len(treat_covars)-1)

    for cov in s_0.index.values:
        if s_0[cov] == 0 and s_1[cov] == 0:
            strat_cov[cov] = 0
        elif s_0[cov] == 0 or s_1[cov] == 0:
            strat_cov[cov] = np.inf if df_diff[cov]!= 0 else 0
        else:
            strat_cov[cov] = df_diff[cov]/np.sqrt((s_0[cov] + s_1[cov])/2)

        norm_dist[stratum] = strat_cov

norm_dist

  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


{1: {'age': np.float64(-0.8349576068166848),
  'educ': np.float64(-0.49749777871901985),
  'black': np.float64(0.7227325873537844),
  'hisp': np.float64(0.35285671315376954),
  'married': np.float64(-0.6707483642248573),
  'nodegree': np.float64(0.5248097322768831),
  'earn_74': np.float64(-1.1003250192884422),
  'earn_75': np.float64(-1.2610067155153504),
  'u_74': np.float64(0.7719165315845047),
  'u_75': np.float64(0.5046337189295692)},
 2: {'age': np.float64(-0.06071993986732214),
  'educ': np.float64(-0.29511108822457477),
  'black': np.float64(0.18113349331692472),
  'hisp': np.float64(-0.1811334933169248),
  'married': np.float64(-0.2169114450737581),
  'nodegree': np.float64(0.3387941813610128),
  'earn_74': np.float64(0.24556951625016615),
  'earn_75': np.float64(0.4130350497864614),
  'u_74': np.float64(-0.33615135864580875),
  'u_75': np.float64(-0.49820650586251625)},
 3: {'age': np.float64(-0.46155222426246095),
  'educ': np.float64(0.307899343379012),
  'black': 0,
  'his

In [440]:
stratum_sizes = df[df['treated'] == 1]['stratum'].value_counts().sort_index()

In [456]:
weighted_diffs = {}
for cov in covars.columns[1:]:
    weighted_diff = sum(norm_dist[s][cov] * stratum_sizes[s] for s in range(1, 6)) / sum(stratum_sizes)
    weighted_diffs[cov] = weighted_diff

for cov, diff in weighted_diffs.items():
    print(cov, diff)

age 0.0025856076774800458
educ -0.08280931144696672
black 0.18077321613414182
hisp 0.03434464396736895
married -0.15890677901347236
nodegree 0.12734497440106768
earn_74 -0.18303848846861157
earn_75 -0.2061529540586667
u_74 0.1593039724774064
u_75 0.1506833061701739


In [465]:
# Identify unbalanced covariates (e.g., |diff| > 0.1)
unbalanced = {cov: diff for cov, diff in weighted_diffs.items() if abs(diff) > 0.1}
if unbalanced:
    print("\nUnbalanced covariates:")
    for cov, diff in unbalanced.items():
        print(f"{cov}: {diff:.4f}")
else:
    print("\nNo covariates are very unbalanced.")

# Identify most imbalanced stratum
stratum_imbalance = {}
for stratum in range(1, 6):
    stratum_imbalance[stratum] = max(abs(diff) for diff in norm_dist[stratum].values())

most_imbalanced = max(stratum_imbalance, key=stratum_imbalance.get)
print(f"\nMost imbalanced stratum: {most_imbalanced} (max normalized difference: {stratum_imbalance[most_imbalanced]:.4f})")


Unbalanced covariates:
black: 0.1808
married: -0.1589
nodegree: 0.1273
earn_74: -0.1830
earn_75: -0.2062
u_74: 0.1593
u_75: 0.1507

Most imbalanced stratum: 1 (max normalized difference: 1.2610)


g. Avg effect within each strata 

In [475]:
def lin_model(df):
    covars = df.drop(columns=['earn_78'])
    covars = sm_tools.add_constant(covars)
    out = df['earn_78']

    lin_reg = sm.OLS(out, covars).fit()
    return lin_reg.params['treated'], lin_reg.bse['treated'], lin_reg.pvalues['treated'], lin_reg

results = {}
for strata in range(1, 6):
    df_stat = df[df['stratum'] == strata].drop(columns=['propensity_score', 'stratum','weight', 'weight_log'])
    effect, se, p_value, model = lin_model(df_stat)
    results[strata]= {
            'effect': effect,
            'se': se,
            'p_value': p_value,
            'n_treated': sum(stratum_data['treated'] == 1),
            'n_control': sum(stratum_data['treated'] == 0),
            'model': model
        }
results

{1: {'effect': np.float64(-0.5473752309469494),
  'se': np.float64(1.1584031156630086),
  'p_value': np.float64(0.6365581924021968),
  'n_treated': 37,
  'n_control': 26,
  'model': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x169b92dc0>},
 2: {'effect': np.float64(-0.5616618207808856),
  'se': np.float64(1.1020698805314393),
  'p_value': np.float64(0.6107443439556998),
  'n_treated': 37,
  'n_control': 26,
  'model': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x169b664c0>},
 3: {'effect': np.float64(5.148354818875074),
  'se': np.float64(1.4987489302534485),
  'p_value': np.float64(0.0009227051752241839),
  'n_treated': 37,
  'n_control': 26,
  'model': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x168984700>},
 4: {'effect': np.float64(5.1275315210809405),
  'se': np.float64(2.13162855721193),
  'p_value': np.float64(0.018760286790387357),
  'n_treated': 37,
  'n_control': 26,
  'model': <statsmodels.regression.linear_m

In [476]:
# Calculate overall weighted average effect
total_treated = sum(res['n_treated'] for res in results.values())
weighted_effect = sum(res['effect'] * res['n_treated'] for res in results.values()) / total_treated
weighted_effect

np.float64(1.8445073423172256)

h. ATT using within strata

In [481]:
def strata_att(df):
    strata_effect = {}
    strata_weight = {}

    for strata in range(1, 6):
        df_stat = df[df['stratum'] == strata].drop(columns=['propensity_score', 'stratum','weight', 'weight_log'])
        effect = results[strata]['effect']
        n_treated = results[strata]['n_treated']
        
        strata_effect[strata] = effect
        strata_weight[strata] = n_treated
    total_weight = sum(strata_weight.values())
    weighted_effect = sum(strata_effect[s] * strata_weight[s] for s in range(1, 6)) / total_weight

    return weighted_effect

In [482]:
att_strata = strata_att(df)

In [484]:
att_boots = []
for _ in range(1000):
    boot_sample = resample(df)
    att_boots.append(strata_att(boot_sample))

att_std_error = np.std(att_boots)
print("SE=", att_std_error)

SE= 2.220446049250313e-16


The standard error of the strata estimates is smaller than for the strictly weighted estimates. This is expected because by construction, although total N in the regression is less, the control group more closely resembles the treated group. This makes the variance of observables and outcomes smaller, leading to smaller SE. 

i. Are these estimates reasonable? 

These estimates both assume unconfoundedness, which may or may not be valid depending on whether you believe an argument about treatment being a related to the distrobutions of previous earnings or actions. Also, although the SE is smaller and the estimate is more precise for the stratified regression, this regression loses some of the nuance of the relationships of differences between groups as opposed to the more strictly defined differences within groups. 