# In [None]:
# =========================================
# Causal Inference: Difference-in-Differences & Matching
# =========================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------
# Part 1: Simulated Policy Data
# ------------------------------

np.random.seed(42)

n = 500  # number of units; every unit is observed once pre and once post

# Long-format panel: rows alternate pre/post, so each unit occupies two
# consecutive rows and the frame has 2*n rows in total.
time = np.tile([0, 1], n)  # pre=0, post=1

# The first half of the units is control and the second half treated. The
# unit-level label is repeated twice so both rows of a unit share one group
# (this also keeps units intact when n is odd).
unit_group = (np.arange(n) >= n // 2).astype(int)
group = np.repeat(unit_group, 2)  # control=0, treated=1

# Potential outcomes
# Draw one noise term per row (2*n of them); drawing only n values would not
# broadcast against the 2*n-long `group` array and raises a ValueError.
noise = np.random.normal(0, 5, size=time.size)
baseline = 50 + 5 * group + noise  # treated units start 5 points higher
trend = 2 * time  # common time trend shared by both groups (parallel trends)
treatment_effect = 5 * (group * time)  # true effect: treated units, post only

Y = baseline + trend + treatment_effect

df = pd.DataFrame({"Y": Y, "time": time, "group": group})
df["treated"] = df["group"]

# ------------------------------
# Part 2: Difference-in-Differences
# ------------------------------

# 2x2 table of mean outcomes: rows are group (0/1), columns are time (0/1).
cell_means = df.groupby(["group", "time"])["Y"].mean()
mean_outcomes = cell_means.unstack()

# DiD = (post - pre change among treated) - (post - pre change among control).
treated_change = mean_outcomes.loc[1, 1] - mean_outcomes.loc[1, 0]
control_change = mean_outcomes.loc[0, 1] - mean_outcomes.loc[0, 0]
did = treated_change - control_change

print("Difference-in-Differences Estimate:", did)

# Plot the group means at each period with no error band.
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None`; confirm the target seaborn version before switching.
sns.lineplot(
    data=df,
    x="time",
    y="Y",
    hue="group",
    ci=None,
    estimator="mean",
)
plt.title("Average Outcomes Over Time (Treatment vs Control)")
plt.show()

# ------------------------------
# Part 3: Matching Example (Propensity Score)
# ------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# Simulate observational data in which treatment uptake depends on X1 and X2.
np.random.seed(123)
n = 200
X1 = np.random.normal(loc=0, scale=1, size=n)
X2 = np.random.normal(loc=0, scale=1, size=n)

# True assignment probability is a logistic function of the covariates.
linear_index = 0.5*X1 - 0.25*X2
propensity = 1/(1+np.exp(-linear_index))
treat = np.random.binomial(1, propensity)

# Outcome: confounded by X1 and X2, with an additive treatment effect of 3.
outcome_noise = np.random.normal(loc=0, scale=1, size=n)
Y = 2*X1 + X2 + 3*treat + outcome_noise

obs = pd.DataFrame({"X1": X1, "X2": X2, "treat": treat, "Y": Y})

# Fit a logistic model of treatment on the covariates and store P(treat=1).
covariates = ["X1", "X2"]
logit = LogisticRegression()
logit.fit(obs[covariates], obs["treat"])
obs["pscore"] = logit.predict_proba(obs[covariates])[:, 1]

# Split the sample by treatment status (treat only takes the values 0 and 1).
is_treated = obs["treat"] == 1
treated = obs[is_treated]
control = obs[~is_treated]

# 1-nearest-neighbour matching on the propensity score. A control row may be
# matched to several treated rows (matching with replacement).
nn = NearestNeighbors(n_neighbors=1)
nn.fit(control[["pscore"]])
distances, indices = nn.kneighbors(treated[["pscore"]])

# `indices` are positions within `control`, hence iloc rather than loc.
matched_control = control.iloc[indices.flatten()]

# Reset both indexes so the subtraction pairs rows by position, not by label.
treated_outcome = treated["Y"].reset_index(drop=True)
matched_outcome = matched_control["Y"].reset_index(drop=True)
att = (treated_outcome - matched_outcome).mean()

print("Matching ATT Estimate:", att)

# ------------------------------
# Mission Task
# ------------------------------
# 1. Simulate a policy intervention where treatment has no effect and run DiD.
#    Then break the parallel trends assumption — what happens to the estimate?
# 2. Try propensity score matching with different covariates (X1, X2).
# 3. Discuss: When would you choose Matching vs DiD?
# 4. Explore adding a regression adjustment after matching.
