# Week 13 coding quizz

In [7]:
!pip install statsmodels



In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import (roc_auc_score, f1_score, accuracy_score, confusion_matrix,
                             precision_recall_curve, roc_curve, average_precision_score,
                             classification_report)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import re

In [3]:
import pandas as pd
import numpy as np

# ============================
# 1. Load the data
# ============================

# Option 1: from the same folder as your notebook
df = pd.read_csv("homework_13.1.csv")

# # Option 2: from the course GitHub (uncomment if you prefer URL)
# url = "https://raw.githubusercontent.com/joshua-vonkorff/DX702-mod-6/refs/heads/main/homework_13.1.csv"
# df = pd.read_csv(url)

print(df.head())

# Expected columns: Z (instrument), X (treatment), Y (outcome)


# ============================
# 2. IV estimate using covariance formula
#     beta_IV = Cov(Z, Y) / Cov(Z, X)
# ============================

cov_matrix = df[['Z', 'X', 'Y']].cov()
cov_ZY = cov_matrix.loc['Z', 'Y']
cov_ZX = cov_matrix.loc['Z', 'X']

beta_iv = cov_ZY / cov_ZX

print("Cov(Z, Y) =", cov_ZY)
print("Cov(Z, X) =", cov_ZX)
print("IV estimate (Cov(Z,Y)/Cov(Z,X)) =", beta_iv)


# ============================
# 3. (Optional) Wald estimate (since Z is 0/1)
#     beta_Wald = (E[Y|Z=1] - E[Y|Z=0]) / (E[X|Z=1] - E[X|Z=0])
# ============================

EY_Z1 = df.loc[df['Z'] == 1, 'Y'].mean()
EY_Z0 = df.loc[df['Z'] == 0, 'Y'].mean()
EX_Z1 = df.loc[df['Z'] == 1, 'X'].mean()
EX_Z0 = df.loc[df['Z'] == 0, 'X'].mean()

beta_wald = (EY_Z1 - EY_Z0) / (EX_Z1 - EX_Z0)

print("Wald estimate =", beta_wald)

   Unnamed: 0  Z  X         Y
0           0  1  0  0.056584
1           1  1  1  1.387519
2           2  0  1  4.289658
3           3  0  1  0.743003
4           4  0  1  2.718527
Cov(Z, Y) = 0.011011824700138641
Cov(Z, X) = 0.05667830783078288
IV estimate (Cov(Z,Y)/Cov(Z,X)) = 0.19428640553305204
Wald estimate = 0.1942864055330518


In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# ============================
# 1. Load the data
# ============================
# If the file is in the same folder as your notebook:
df = pd.read_csv("homework_13.2.csv")

print(df.head())
# Columns: Z2 (instrument), X2 (endogenous regressor), Y2 (outcome)


# ============================
# 2. IV estimate via covariance formula
#     beta_IV = Cov(Z2, Y2) / Cov(Z2, X2)
# ============================
cov = df[['Z2', 'X2', 'Y2']].cov()
cov_ZY = cov.loc['Z2', 'Y2']
cov_ZX = cov.loc['Z2', 'X2']

beta_iv = cov_ZY / cov_ZX

print("Cov(Z2, Y2) =", cov_ZY)
print("Cov(Z2, X2) =", cov_ZX)
print("IV estimate (Cov(Z2,Y2)/Cov(Z2,X2)) =", beta_iv)


# ============================
# 3. 2SLS using two regressions
#    Stage 1: X2 ~ Z2
#    Stage 2: Y2 ~ X2_hat
# ============================

# ---- Stage 1 ----
X_stage1 = sm.add_constant(df['Z2'])
model_stage1 = sm.OLS(df['X2'], X_stage1).fit()
df['X2_hat'] = model_stage1.fittedvalues

print("\nStage 1 results:")
print(model_stage1.summary())

# ---- Stage 2 ----
X_stage2 = sm.add_constant(df['X2_hat'])
model_stage2 = sm.OLS(df['Y2'], X_stage2).fit()

print("\nStage 2 results (2SLS):")
print(model_stage2.summary())

print("\n2SLS estimate of effect of X2 on Y2 =", model_stage2.params['X2_hat'])

   Unnamed: 0        Z2        X2        Y2
0           0  0.215107 -2.069048  3.304559
1           1  0.459804  0.647129 -1.122530
2           2 -1.015782 -0.167446 -1.270340
3           3  0.849650  0.424037 -1.835687
4           4  0.157479  0.224863 -1.105033
Cov(Z2, Y2) = -2.4409809909295253
Cov(Z2, X2) = 0.9753706095085332
IV estimate (Cov(Z2,Y2)/Cov(Z2,X2)) = -2.502618970813032

Stage 1 results:
                            OLS Regression Results                            
Dep. Variable:                     X2   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.324
Method:                 Least Squares   F-statistic:                     4804.
Date:                Thu, 04 Dec 2025   Prob (F-statistic):               0.00
Time:                        02:51:54   Log-Likelihood:                -17731.
No. Observations:               10000   AIC:                         3.547e+04
Df Residuals:                    9998   B

In [5]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

# ============================
# 1. Load the data
# ============================
df = pd.read_csv("homework_13.3.csv")
print(df.head())

# Columns used:
# Z3  = instrument
# X3  = treatment
# W3  = covariate
# ZW_int = Z3 * W3 (given in the file)


# ============================
# 2. Regress X3 on Z3, W3, and Z3*W3
#    X3 = b0 + bZ*Z3 + bW*W3 + bZW*(Z3*W3)
# ============================
X = sm.add_constant(df[['Z3', 'W3', 'ZW_int']])
y = df['X3']

model = sm.OLS(y, X).fit()
print(model.summary())

bZ   = model.params['Z3']
bZW  = model.params['ZW_int']

print("\nEstimated coefficients:")
print("bZ  =", bZ)
print("bZW =", bZW)

# ============================
# 3. Marginal effect of Z3 on X3 for each person:
#    dX/dZ = bZ + bZW * W3
# ============================
df['dX_dZ'] = bZ + bZW * df['W3']

# Compliers: effect of Z on X is positive
df['is_complier'] = df['dX_dZ'] > 0

print("\nFirst 10 rows with marginal effect and complier flag:")
print(df[['Z3', 'W3', 'X3', 'dX_dZ', 'is_complier']].head(10))

# Optional: how many compliers?
n_compliers = df['is_complier'].sum()
n_total = len(df)
print(f"\nNumber of compliers: {n_compliers} out of {n_total}")

   Unnamed: 0        Z3        X3    ZW_int        W3        Y3
0           0  0.560354  1.787329  0.690434  1.232138  2.167410
1           1  0.352335  0.035811 -0.147173 -0.417707  0.110531
2           2 -0.459113 -0.125720 -0.222680  0.485023 -0.550161
3           3 -1.042234  3.866518 -2.082372  1.997988  5.568673
4           4 -0.770318  0.402082  0.275651 -0.357840  1.360916
                            OLS Regression Results                            
Dep. Variable:                     X3   R-squared:                       0.749
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     9943.
Date:                Thu, 04 Dec 2025   Prob (F-statistic):               0.00
Time:                        02:54:39   Log-Likelihood:                -14211.
No. Observations:               10000   AIC:                         2.843e+04
Df Residuals:                    9996   BIC:                   

In [6]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# ============================
# 1. Generate the data
# ============================

num = 10000

Z4 = np.random.normal(0, 1, num)
W4 = np.random.normal(0, 1, num)
ZW_int = W4 * Z4

X4 = Z4 + W4 - 2 * ZW_int + np.random.normal(0, 1, num)
Y4 = W4 + X4 - Z4 + np.random.normal(0, 1, num)

df = pd.DataFrame({"Z4": Z4, "W4": W4, "ZW_int": ZW_int, "X4": X4, "Y4": Y4})
print(df.head())


# ============================
# 2. First: Relevance check
#    Does Z4 affect X4?  (It should)
# ============================

X = sm.add_constant(df["Z4"])
model_relevance = sm.OLS(df["X4"], X).fit()

print("\n=== Relevance Test: Z4 → X4 ===")
print(model_relevance.summary())


# ============================
# 3. Exclusion restriction check
#    Does Z4 affect Y4 directly?
# ============================

X = sm.add_constant(df[["X4", "Z4"]])
model_exclusion = sm.OLS(df["Y4"], X).fit()

print("\n=== Exclusion Test: Y4 ~ X4 + Z4 ===")
print(model_exclusion.summary())

         Z4        W4    ZW_int        X4        Y4
0 -0.129236  0.279492 -0.036121  2.352731  4.960388
1 -0.231480  2.973879 -0.688394  4.713131  9.311083
2 -0.823604  0.298109 -0.245524  0.278529  1.882410
3 -0.095741 -0.062636  0.005997  0.359641 -1.015091
4  0.023348  2.453625  0.057288  1.607277  3.301321

=== Relevance Test: Z4 → X4 ===
                            OLS Regression Results                            
Dep. Variable:                     X4   R-squared:                       0.146
Model:                            OLS   Adj. R-squared:                  0.146
Method:                 Least Squares   F-statistic:                     1706.
Date:                Thu, 04 Dec 2025   Prob (F-statistic):               0.00
Time:                        02:57:18   Log-Likelihood:                -23133.
No. Observations:               10000   AIC:                         4.627e+04
Df Residuals:                    9998   BIC:                         4.629e+04
Df Model:              