In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

np.random.seed(42)

In [2]:
# Simulate a small structural model in which W drives both X and Y:
#   X = W + noise,   Y = X + Z + W + noise.
# The draw order (W, X-noise, Z, Y-noise) is kept fixed so that a given
# seed of NumPy's global RNG always yields the same sample.
n_samples = 1000

W = np.random.normal(loc=0, scale=1, size=(n_samples,))
X = W + np.random.normal(loc=0, scale=1, size=(n_samples,))
Z = np.random.normal(loc=0, scale=1, size=(n_samples,))
Y = X + Z + W + np.random.normal(loc=0, scale=1, size=(n_samples,))

# Collect the four series column-wise, in the order W, X, Z, Y.
df = pd.DataFrame({'W': W, 'X': X, 'Z': Z, 'Y': Y})

summary_table = df.describe()
print("Data Summary:")
print(summary_table)


Data Summary:
                 W            X            Z            Y
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.019332     0.090168     0.005834     0.096615
std       0.979216     1.369255     0.983454     2.576029
min      -3.241267    -4.002005    -3.019512    -6.852770
25%      -0.647590    -0.727131    -0.648000    -1.742549
50%       0.025301     0.042303    -0.000251     0.010380
75%       0.647944     0.943026     0.660915     1.825532
max       3.852731     5.602308     3.926238     9.905556


In [3]:
# Strip X, Z and W back out of Y; what remains is the noise term that was
# added when Y was generated (up to floating-point rounding).
epsilon = Y - X - Z - W

# np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry
# is the correlation between X and epsilon.
corr_X_epsilon_matrix = np.corrcoef(X, epsilon)
correlation_X_epsilon = corr_X_epsilon_matrix[0, 1]

print(f"Correlation between X and error term: {correlation_X_epsilon:.6f}")


Correlation between X and error term: -0.049372


In [4]:
# Fold W into the error term, as if W were left out of the regressors.
error_new = W + epsilon

# Off-diagonal entry of the 2x2 correlation matrix of (X, error_new).
corr_X_error_new_matrix = np.corrcoef(X, error_new)
correlation_X_error_new = corr_X_error_new_matrix[0, 1]

print(f"Correlation between X and new error term: {correlation_X_error_new:.6f}")


Correlation between X and new error term: 0.440364


In [5]:
# Read the homework dataset from the working directory.
df_hw = pd.read_csv('homework_7.1.csv')

# Show per-column summary statistics followed by the (rows, columns) shape.
hw_summary = df_hw.describe()
print("Homework Data Summary:")
print(hw_summary)
print(f"Dataset shape: {df_hw.shape}")


Homework Data Summary:
        Unnamed: 0             X             W             Z             Y
count  10000.00000  10000.000000  10000.000000  10000.000000  10000.000000
mean    4999.50000     -0.021965     -0.001364      0.011044      0.479223
std     2886.89568      1.430124      1.006911      0.985831      3.345664
min        0.00000     -5.484932     -3.675430     -3.512546     -7.416559
25%     2499.75000     -1.007841     -0.675183     -0.655389     -1.938402
50%     4999.50000     -0.015165     -0.005634      0.007613     -0.050854
75%     7499.25000      0.915812      0.670535      0.687495      2.227702
max     9999.00000      5.807050      4.087876      3.698120     25.530534
Dataset shape: (10000, 5)


In [6]:
# Regress Y on (X, Z) separately inside narrow bands of W, so the X
# coefficient can be compared across different values of W.
tolerance = 0.1
W_values = [-1, 0, 1]
coefficients = []

for w_target in W_values:
    # Rows whose W lies within +/- tolerance of the target (both bounds inclusive).
    mask = df_hw['W'].between(w_target - tolerance, w_target + tolerance)
    df_subset = df_hw[mask]

    # Design matrix has columns [X, Z]; the response is Y.
    X_subset = df_subset[['X', 'Z']].values
    Y_subset = df_subset['Y'].values

    # fit() returns the estimator itself, so construction and fitting chain.
    model = LinearRegression().fit(X_subset, Y_subset)

    # coef_ follows the column order of the design matrix: X first, then Z.
    coef_X, coef_Z = model.coef_
    intercept = model.intercept_

    # One summary record per band; turned into a DataFrame later.
    coefficients.append({
        'W_target': w_target,
        'W_mean': df_subset['W'].mean(),
        'n_obs': len(df_subset),
        'coef_X': coef_X,
        'coef_Z': coef_Z,
        'intercept': intercept,
        'R2': r2_score(Y_subset, model.predict(X_subset))
    })



In [None]:
# Tabulate the per-band regression records collected in `coefficients`.
df_coef = pd.DataFrame(coefficients)

# Rows are read by position, so the labels below must match the order in
# which the bands were fitted (-1, 0, 1).
coef_X_column = df_coef['coef_X']
for row_pos, w_label in enumerate((-1, 0, 1)):
    print(f"At W = {w_label}: Coefficient of X = {coef_X_column.iloc[row_pos]:.4f}")


At W = -1: Coefficient of X = 0.8580
At W = 0: Coefficient of X = 1.3832
At W = 1: Coefficient of X = 1.9581


In [8]:
def make_error(corr_const, num):
    """Generate a serially correlated error series of length ``num``.

    A starting state is drawn from N(0, 1) and is not itself returned.
    Each returned value is then
    ``corr_const * previous + (1 - corr_const) * fresh``, where ``fresh``
    is a new N(0, 1) draw from NumPy's global RNG.

    Parameters
    ----------
    corr_const : float
        Weight placed on the previous value; the fresh draw gets the
        complementary weight ``1 - corr_const``.
    num : int
        Number of values to generate.

    Returns
    -------
    numpy.ndarray
        The generated series, in order.
    """
    fresh_weight = 1 - corr_const
    state = np.random.normal(0, 1)
    series = []

    step = 0
    while step < num:
        # Blend the previous state with a fresh standard-normal draw.
        state = corr_const * state + fresh_weight * np.random.normal(0, 1)
        series.append(state)
        step += 1

    return np.array(series)


In [9]:
import statsmodels.api as sm

def run_simulation(corr_const, n_trials=1000, n_obs=100,
                   true_beta=2.0, true_intercept=1.0, seed=42):
    """Repeatedly fit OLS on simulated data with serially correlated errors.

    For each trial, two error series are drawn with
    ``make_error(corr_const, n_obs)``. The regressor is an N(0, 1) draw
    plus the first series, and the response is
    ``true_intercept + true_beta * regressor`` plus the second series.
    ``Y ~ const + X`` is fitted with ``statsmodels`` OLS, and the slope
    estimate and its reported standard error (``bse``) are recorded.

    Parameters
    ----------
    corr_const : float
        Passed through to ``make_error`` for both error series.
    n_trials : int, default 1000
        Number of simulated datasets / OLS fits.
    n_obs : int, default 100
        Observations per simulated dataset.
    true_beta : float, default 2.0
        Slope used to generate the response.
    true_intercept : float, default 1.0
        Intercept used to generate the response.
    seed : int or None, default 42
        Seed applied to NumPy's global RNG at the start of the call, so
        repeated calls reuse the same random stream. Pass ``None`` to
        leave the global RNG state untouched.

    Returns
    -------
    tuple of numpy.ndarray
        ``(coefficients, std_errors)``: the estimated slope and its
        reported standard error for every trial, in trial order.

    Notes
    -----
    With the default ``seed`` this function resets NumPy's global RNG as a
    side effect, which also affects any random draws made after the call.
    """
    if seed is not None:
        np.random.seed(seed)

    coefficients = []
    std_errors = []

    for trial in range(n_trials):
        # Draw order (error_X, error_Y, X_base) is part of the reproducible
        # stream; reordering these calls would change the results.
        error_X = make_error(corr_const, n_obs)
        error_Y = make_error(corr_const, n_obs)

        X_base = np.random.normal(0, 1, n_obs)
        X_treatment = X_base + error_X

        Y = true_intercept + true_beta * X_treatment + error_Y

        # add_constant prepends the intercept column, so index 1 is the slope.
        X_with_intercept = sm.add_constant(X_treatment)
        model = sm.OLS(Y, X_with_intercept).fit()

        coefficients.append(model.params[1])
        std_errors.append(model.bse[1])

    return np.array(coefficients), np.array(std_errors)


In [10]:
# Run the Monte Carlo study for each correlation constant and compare the
# empirical spread of the slope estimates with the average standard error
# that OLS reports.
corr_constants = [0.2, 0.5, 0.8]
results = []

for corr_const in corr_constants:
    coeffs, std_errs = run_simulation(corr_const, n_trials=1000, n_obs=100)

    # (i) sample standard deviation of the slope estimates across trials
    std_of_estimates = coeffs.std(ddof=1)
    # (ii) average of the per-trial reported standard errors
    mean_of_std_errors = std_errs.mean()
    # (i) / (ii)
    ratio = std_of_estimates / mean_of_std_errors

    summary = {
        'corr_const': corr_const,
        'std_of_estimates': std_of_estimates,
        'mean_of_std_errors': mean_of_std_errors,
        'ratio': ratio,
        'coeffs': coeffs,
        'std_errs': std_errs
    }
    results.append(summary)



In [11]:
# Fixed-width, left-aligned table: one (label, width) pair per column.
table_columns = [('corr_const', 12), ('i', 20), ('ii', 20), ('i/ii', 15)]
print(" ".join(label.ljust(width) for label, width in table_columns))

# Each row pulls its fields from the result record by key; keys in the
# record that the template does not mention are ignored.
row_template = (
    "{corr_const:<12.1f} {std_of_estimates:<20.4f} "
    "{mean_of_std_errors:<20.4f} {ratio:<15.4f}"
)
for r in results:
    print(row_template.format_map(r))


corr_const   i                    ii                   i/ii           
0.2          0.0659               0.0641               1.0284         
0.5          0.0556               0.0503               1.1048         
0.8          0.0411               0.0323               1.2740         
