# week 7

In [1]:
#Q1 

import numpy as np

# Seed the legacy global RNG so the printed correlation is reproducible.
np.random.seed(0)

# Simulate the data-generating process from the question:
#   X = W + noise_X
#   Y = X + Z + W + noise_Y
# The draw order (W, noise_X, Z, noise_Y) is deliberately kept fixed so the
# random stream consumed after the seed does not change.
n = 100000
W = np.random.normal(0, 1, n)
X = W + np.random.normal(0, 1, n)
Z = np.random.normal(0, 1, n)
epsilon_Y = np.random.normal(0, 1, n)  # idiosyncratic noise in Y
Y = X + Z + W + epsilon_Y

# Error term when Y is regressed on X alone: everything in Y besides the
# X term (true coefficient 1), i.e. the omitted Z and W plus the noise in Y.
# epsilon_Y is kept from the draw above instead of being recovered as
# Y - (X + Z + W), which avoids floating-point round-off and a duplicated
# definition of u.
u = Z + W + epsilon_Y

# X and u share W, so they are correlated: Cov = Var(W) = 1, Var(X) = 2,
# Var(u) = 3, giving a theoretical correlation of 1 / sqrt(6) ≈ 0.408.
corr = np.corrcoef(X, u)[0, 1]
print(f"Correlation between X and the error term u: {corr:.3f}")


Correlation between X and the error term u: 0.407


In [None]:
# Q2

# Seed the legacy global RNG so the printed correlation is reproducible.
np.random.seed(0)

# Simulate the data-generating process:
#   X = W + noise_X
#   Y = X + Z + W + noise_Y
# The draw order (W, noise_X, Z, noise_Y) is deliberately kept fixed so the
# random stream consumed after the seed does not change.
n = 100000
W = np.random.normal(0, 1, n)
X = W + np.random.normal(0, 1, n)
Z = np.random.normal(0, 1, n)
epsilon_Y = np.random.normal(0, 1, n)  # idiosyncratic noise in Y
Y = X + Z + W + epsilon_Y

# Error term when regressing Y on X and Z only: W is omitted, so it ends up
# in the error together with the noise in Y. epsilon_Y is kept from the draw
# above instead of being recovered as Y - (X + Z + W), which avoids
# floating-point round-off.
u = W + epsilon_Y

# Compute the correlation between X and the regression error.
# Cov(X, u) = Var(W) = 1 and Var(X) = Var(u) = 2, so in theory it is 0.5.
corr = np.corrcoef(X, u)[0, 1]
print(f"Correlation between X and the error term (u): {corr:.3f}")


Correlation between X and the error term (u): 0.499


In [4]:
# Q3

import pandas as pd

# Read the homework dataset; the relative path means the CSV is looked up in
# the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="homework_7.1.csv")
# Preview the first five rows (the default number for head()).
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,X,W,Z,Y
0,0,1.137055,1.221768,0.327829,1.944532
1,1,-0.112905,0.465835,0.59965,0.655514
2,2,2.077755,1.795414,-0.063393,5.934411
3,3,0.456373,-0.512159,1.177413,-0.188064
4,4,-1.012402,0.080002,-0.275697,-0.533775


In [7]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading statsmodels-0.14.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Installing collected packages: patsy, statsmodels
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [statsmodels][0m [statsmodels]
[1A[2KSuccessfully installed patsy-1.0.1 statsmodels-0.14.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, r

In [9]:
import statsmodels.api as sm
import numpy as np

# Drop the leftover index-like columns. The preview of the loaded frame shows
# two of them ('Unnamed: 0.1' and 'Unnamed: 0'), presumably row indices that
# were written to the CSV; neither is a model variable. errors='ignore' makes
# the cell safe to re-run after the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

# helper function to estimate coefficient of X when W ≈ constant
def estimate_coef_at_W(w_value, bandwidth=0.5, data=None):
    """Estimate the coefficient of X using only rows whose W is near ``w_value``.

    Rows with ``w_value - bandwidth < W < w_value + bandwidth`` (both bounds
    exclusive) are kept, and Y is regressed by OLS on X and Z plus an
    intercept using that subset only.

    Parameters
    ----------
    w_value : float
        Centre of the W window.
    bandwidth : float, default 0.5
        Half-width of the W window.
    data : pandas.DataFrame, optional
        Frame with columns 'X', 'Z', 'W' and 'Y'. When omitted, the
        notebook-level ``df`` is used, so existing calls keep working.

    Returns
    -------
    tuple
        ``(coef_X, n_rows)``: the fitted coefficient on X and the number of
        rows that fell inside the window.

    Raises
    ------
    ValueError
        If no row falls inside the window, since a regression cannot be fit
        on zero observations.
    """
    frame = df if data is None else data
    in_window = (frame['W'] > w_value - bandwidth) & (frame['W'] < w_value + bandwidth)
    subset = frame[in_window]
    if subset.empty:
        # Fail early with a readable message instead of an opaque error
        # raised from inside the regression fit.
        raise ValueError(
            f"no rows with W within {bandwidth} of {w_value}; cannot fit a regression"
        )
    X_vars = sm.add_constant(subset[['X', 'Z']])
    model = sm.OLS(subset['Y'], X_vars).fit()
    return model.params['X'], len(subset)

# Evaluate the local X coefficient in the W windows centred on -1, 0 and 1;
# each entry maps the window centre to (coefficient of X, rows used).
results = dict((w, estimate_coef_at_W(w)) for w in (-1, 0, 1))
results

{-1: (np.float64(0.990090408694134), 2433),
 0: (np.float64(1.4859822514480032), 3821),
 1: (np.float64(1.9936504417092331), 2407)}

In [12]:
# Q4

# error generator with correlation
def make_error(corr_const, num):
    """Simulate ``num`` values of a serially correlated error series.

    The recursion starts from a standard-normal draw that is not itself
    returned; every returned value equals ``corr_const`` times the previous
    value plus a fresh standard-normal shock. Draws come from NumPy's global
    RNG, one scalar at a time.

    Returns a 1-D NumPy array holding the ``num`` simulated values.
    """
    def _recursion():
        # Starting value: drawn once, used only to seed the recursion.
        state = np.random.normal(0, 1)
        for _ in range(num):
            state = corr_const * state + np.random.normal(0, 1)
            yield state

    return np.array(list(_recursion()))

# simulation settings
num_obs = 200                  # observations per simulated dataset
num_trials = 500               # simulated datasets per correlation setting
corr_values = [0.2, 0.5, 0.8]  # autocorrelation constants passed to make_error

# Seed the global RNG (as the Q1/Q2 cells do) so the Monte Carlo numbers are
# reproducible on a fresh kernel instead of depending on leftover RNG state.
np.random.seed(0)

# For each correlation constant, store two quantities to compare:
#   beta_std    - empirical spread of the estimated X coefficient across trials
#   mean_SE_est - average standard error that OLS reports for that coefficient
results = {}

for rho in corr_values:
    betas = []
    se_estimates = []

    for _ in range(num_trials):
        # autocorrelated errors, generated independently for X and for Y
        err_X = make_error(rho, num_obs)
        err_Y = make_error(rho, num_obs)

        # treatment and outcome (X and Y)
        X = 0.5 * np.random.normal(0, 1, num_obs) + err_X
        Y = 2 + 3 * X + err_Y  # True β_X = 3

        # fit regression WITH intercept; index 0 is the constant, index 1 is X
        model = sm.OLS(Y, sm.add_constant(X)).fit()

        betas.append(model.params[1])
        se_estimates.append(model.bse[1])

    results[rho] = {
        "beta_std": np.std(betas),
        "mean_SE_est": np.mean(se_estimates)
    }

# results: if the reported SE were accurate, the two columns would be close
# to each other for every correlation constant.
for rho, vals in results.items():
    print(f"corr_const = {rho}:  std(β̂_X) = {vals['beta_std']:.4f},  mean(SE) = {vals['mean_SE_est']:.4f}")


corr_const = 0.2:  std(β̂_X) = 0.0628,  mean(SE) = 0.0640
corr_const = 0.5:  std(β̂_X) = 0.0844,  mean(SE) = 0.0653
corr_const = 0.8:  std(β̂_X) = 0.1349,  mean(SE) = 0.0678
