In [42]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.spatial.distance import cdist
from scipy.spatial.distance import mahalanobis
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.neighbors import NearestNeighbors

# Seed NumPy's legacy global RNG once, so the np.random.* draws in the cells
# below are repeatable when the notebook is run top to bottom on a fresh kernel.
np.random.seed(42)


Coding Quiz 5

In [43]:
# Simulate the Quiz 5 dataset: road difficulty, driving speed and accident risk.
num = 100000

# Difficulty is uniform on [0, 1).
difficulty = np.random.uniform(0, 1, (num,))

# Speed is a noisy baseline reduced by 10 per unit of difficulty, floored at zero.
speed = np.maximum(np.random.normal(15, 5, (num,)) - difficulty * 10, 0)

# Accident risk rises with both speed and difficulty and is clipped to [0, 1].
accident = np.clip(0.03 * speed + 0.4 * difficulty + np.random.normal(0, 0.3, (num,)), 0, 1)

df = pd.DataFrame({'difficulty': difficulty, 'speed': speed, 'accident': accident})

print(f"Dataset shape: {df.shape}")

# A bare `df.head(10)` used to sit here; because it was not the last expression
# of the cell its value was discarded and never shown, so it has been removed.
print(df.corr())


Dataset shape: (100000, 3)
            difficulty     speed  accident
difficulty    1.000000 -0.499545  0.090820
speed        -0.499545  1.000000  0.323241
accident      0.090820  0.323241  1.000000


Question 1:

In [44]:
# Question 1: regress speed on difficulty alone, over many freshly simulated
# datasets, and average the estimated slope.
# LinearRegression is provided by the notebook's import cell; the mid-notebook
# import that used to live here made later cells depend on this one having run.
n_experiments = 1000
sample_size = 100000

coefficients = np.empty(n_experiments)

for experiment in range(n_experiments):
    # Same data-generating process as the Quiz 5 dataset (difficulty, then speed).
    difficulty = np.random.uniform(0, 1, (sample_size,))
    speed = np.maximum(np.random.normal(15, 5, (sample_size,)) - difficulty * 10, 0)

    model = LinearRegression()
    model.fit(difficulty.reshape(-1, 1), speed)

    coefficients[experiment] = model.coef_[0]

avg_coefficient = np.mean(coefficients)
std_coefficient = np.std(coefficients)
median_coefficient = np.median(coefficients)

print(f"Average coefficient of X: {avg_coefficient:.4f}")


Average coefficient of X: -9.6656


Question 2:

In [45]:
# Question 2: repeat the simulation, but regress speed on difficulty AND
# accident, recording both slopes for every simulated dataset.
coefficients_x_with_z = np.empty(n_experiments)
coefficients_z = np.empty(n_experiments)

for experiment in range(n_experiments):
    # Draw order (difficulty, speed noise, accident noise) matches the data cell.
    difficulty = np.random.uniform(0, 1, (sample_size,))
    speed = np.maximum(np.random.normal(15, 5, (sample_size,)) - difficulty * 10, 0)
    accident = np.clip(0.03 * speed + 0.4 * difficulty + np.random.normal(0, 0.3, (sample_size,)), 0, 1)

    X_predictors = np.column_stack([difficulty, accident])
    model = LinearRegression().fit(X_predictors, speed)

    # Column 0 of the design matrix is difficulty, column 1 is accident.
    coefficients_x_with_z[experiment] = model.coef_[0]
    coefficients_z[experiment] = model.coef_[1]

avg_coef_x = coefficients_x_with_z.mean()
std_coef_x = coefficients_x_with_z.std()
median_coef_x = np.median(coefficients_x_with_z)

avg_coef_z = coefficients_z.mean()
std_coef_z = coefficients_z.std()

print(f"Average coefficient of X: {avg_coef_x:.4f}")


Average coefficient of X: -10.3246


Coding Quiz 6

In [46]:
# Load the Quiz 6 data, using the first CSV column as the row index.
df = pd.read_csv('homework_6.1.csv', index_col=0)

print("Dataset shape:", df.shape)
print(df.head(10))
print("\nDataset summary:")
print(df.describe())

# Independent copies of the rows with X == 1 (treated) and X == 0 (untreated).
treated, untreated = (df[df['X'] == flag].copy() for flag in (1, 0))


Dataset shape: (1000, 3)
          Z  X         Y
0  0.548814  0 -0.823220
1  0.715189  1  0.842405
2  0.602763  1  0.898618
3  0.544883  0 -0.817325
4  0.423655  0 -0.635482
5  0.645894  0 -0.968841
6  0.437587  0 -0.656381
7  0.891773  1  0.754113
8  0.963663  1  0.718169
9  0.383442  1  1.008279

Dataset summary:
                 Z            X            Y
count  1000.000000  1000.000000  1000.000000
mean      0.495922     0.491000     0.163240
std       0.290744     0.500169     0.754302
min       0.000546     0.000000    -1.459379
25%       0.247613     0.000000    -0.457581
50%       0.481323     0.000000    -0.017813
75%       0.737260     1.000000     0.855020
max       0.999809     1.000000     1.190403


Question 1:

In [47]:
# One-nearest-neighbour matching on Z, in both directions.
# For each treated unit: the closest untreated unit.
nn_untreated = NearestNeighbors(n_neighbors=1).fit(untreated[['Z']])
distances_t, indices_t = nn_untreated.kneighbors(treated[['Z']])

# For each untreated unit: the closest treated unit.
nn_treated = NearestNeighbors(n_neighbors=1).fit(treated[['Z']])
distances_u, indices_u = nn_treated.kneighbors(untreated[['Z']])

# Every effect is (treated outcome) - (untreated outcome); the neighbour
# indices are positional, so they index the raw outcome arrays directly.
treated_effects = treated['Y'].values - untreated['Y'].values[indices_t.ravel()]

untreated_effects = treated['Y'].values[indices_u.ravel()] - untreated['Y'].values

# Pool both sets of unit-level effects and average them.
ate = np.concatenate([treated_effects, untreated_effects]).mean()

print(f"ATE: {ate:.6f}")


ATE: 1.695270


Question 2:

In [48]:
# Mean matched effect over the treated units only.
att = treated_effects.mean()

print(f"ATE on the Treated: {att:.6f}")

ATE on the Treated: 1.846409


Question 3:

In [49]:
# Mean matched effect over the untreated units only.
atu = untreated_effects.mean()

print(f"ATE on the Untreated: {atu:.6f}")


ATE on the Untreated: 1.549477


Question 4:

In [50]:
# Largest single matched effect among the untreated units.
# NOTE(review): only untreated_effects is searched, not treated_effects —
# confirm that is the definition the question intends.
optimal_te = untreated_effects.max()

print(f"OTE: {optimal_te:.6f}")


OTE: 2.172470


Coding Quiz 7

In [51]:
# Quiz 7 simulation: W enters both X and Y, Z enters only Y.
# The four draws are made in the order W, X-noise, Z, Y-noise.
W = np.random.normal(loc=0, scale=1, size=1000)
X = W + np.random.normal(loc=0, scale=1, size=1000)
Z = np.random.normal(loc=0, scale=1, size=1000)
Y = X + Z + W + np.random.normal(loc=0, scale=1, size=1000)

df = pd.DataFrame(dict(W=W, X=X, Z=Z, Y=Y))

print("Data Summary:")
print(df.describe())


Data Summary:
                 W            X            Z            Y
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.036230    -0.016050    -0.010848     0.040669
std       0.996836     1.381812     1.013083     2.600948
min      -2.900977    -4.394492    -2.920314    -8.923099
25%      -0.669570    -0.959383    -0.713635    -1.603463
50%       0.040267    -0.014778    -0.010873    -0.061440
75%       0.694238     0.887522     0.691716     1.753937
max       3.678982     4.362581     2.741314     7.979927


Question 1:

In [52]:
# Recover the noise term of Y by removing the X, Z and W contributions.
epsilon = Y - X - Z - W

# Off-diagonal entry of the 2x2 correlation matrix of (X, epsilon).
correlation_X_epsilon = np.corrcoef(X, epsilon)[0][1]

print(f"Correlation between X and error term: {correlation_X_epsilon:.6f}")


Correlation between X and error term: -0.018938


Question 2:

In [53]:
# With W left out of the model, it is absorbed into the error term.
error_new = epsilon + W

# Off-diagonal entry of the 2x2 correlation matrix of (X, error_new).
correlation_X_error_new = np.corrcoef(X, error_new)[0][1]

print(f"Correlation between X and new error term: {correlation_X_error_new:.6f}")


Correlation between X and new error term: 0.494580


Question 3:

In [54]:
# Load the Quiz 7 homework data and print summary statistics and the shape.
# NOTE(review): no index_col is passed, so the CSV's unnamed first column is
# kept as a data column ("Unnamed: 0" in the recorded summary). It is
# presumably a saved row index, as with the 6.1 file loaded with index_col=0 —
# confirm before relying on the column count.
df_hw = pd.read_csv('homework_7.1.csv')

print("Homework Data Summary:")
print(df_hw.describe())
print(f"Dataset shape: {df_hw.shape}")


Homework Data Summary:
        Unnamed: 0             X             W             Z             Y
count  10000.00000  10000.000000  10000.000000  10000.000000  10000.000000
mean    4999.50000     -0.021965     -0.001364      0.011044      0.479223
std     2886.89568      1.430124      1.006911      0.985831      3.345664
min        0.00000     -5.484932     -3.675430     -3.512546     -7.416559
25%     2499.75000     -1.007841     -0.675183     -0.655389     -1.938402
50%     4999.50000     -0.015165     -0.005634      0.007613     -0.050854
75%     7499.25000      0.915812      0.670535      0.687495      2.227702
max     9999.00000      5.807050      4.087876      3.698120     25.530534
Dataset shape: (10000, 5)


In [55]:
# Question 3: hold W approximately fixed by keeping only rows within
# +/- tolerance of each target value, then regress Y on X and Z in that band.
tolerance = 0.1
W_values = [-1, 0, 1]
coefficients = []

for w_target in W_values:
    # Inclusive on both ends, i.e. w_target - tolerance <= W <= w_target + tolerance.
    in_band = df_hw['W'].between(w_target - tolerance, w_target + tolerance)
    df_subset = df_hw.loc[in_band]

    X_subset = df_subset[['X', 'Z']].to_numpy()
    Y_subset = df_subset['Y'].to_numpy()

    model = LinearRegression().fit(X_subset, Y_subset)

    # Slopes come back in design-matrix column order: X first, then Z.
    coef_X, coef_Z = model.coef_
    intercept = model.intercept_

    coefficients.append(dict(
        W_target=w_target,
        W_mean=df_subset['W'].mean(),
        n_obs=len(df_subset),
        coef_X=coef_X,
        coef_Z=coef_Z,
        intercept=intercept,
        R2=r2_score(Y_subset, model.predict(X_subset)),
    ))

# One row per W target, in the same order as W_values.
df_coef = pd.DataFrame(coefficients)

print(f"At W = -1: Coefficient of X = {df_coef.loc[0, 'coef_X']:.4f}")
print(f"At W = 0: Coefficient of X = {df_coef.loc[1, 'coef_X']:.4f}")
print(f"At W = 1: Coefficient of X = {df_coef.loc[2, 'coef_X']:.4f}")


At W = -1: Coefficient of X = 0.8580
At W = 0: Coefficient of X = 1.3832
At W = 1: Coefficient of X = 1.9581


Question 4:

In [56]:
def make_error(corr_const, num):
    """Return `num` serially correlated noise values as a NumPy array.

    The chain starts from one standard-normal draw that is not itself
    returned. Each returned value is `corr_const` times the previous value
    plus `(1 - corr_const)` times a fresh standard-normal draw, using
    NumPy's global random state.
    """
    state = np.random.normal(0, 1)
    series = []

    for _ in range(num):
        shock = np.random.normal(0, 1)
        state = corr_const * state + (1 - corr_const) * shock
        series.append(state)

    return np.asarray(series)

def run_simulation(corr_const, n_trials=1000, n_obs=100, seed=42,
                   true_beta=2.0, true_intercept=1.0):
    """Repeatedly fit OLS of Y on X when both carry serially correlated noise.

    For every trial, X is a standard-normal draw plus a `make_error` series
    and Y is `true_intercept + true_beta * X` plus another `make_error`
    series. An OLS model with a constant is fitted with statsmodels, and the
    slope estimate and its reported standard error are recorded.

    Parameters
    ----------
    corr_const : float
        Passed through to `make_error` for both noise series.
    n_trials : int
        Number of simulated datasets.
    n_obs : int
        Observations per dataset.
    seed : int or None
        Value passed to `np.random.seed` before simulating. NOTE: this resets
        NumPy's *global* random state, so it affects every later
        `np.random.*` call in the notebook. Pass None to leave the global
        state untouched. The default (42) reproduces the previously
        hard-coded behaviour.
    true_beta, true_intercept : float
        Slope and intercept of the simulated relationship; the defaults are
        the values that were previously hard-coded inside the loop.

    Returns
    -------
    (coefficients, std_errors) : tuple of np.ndarray
        Two arrays of length `n_trials`: the slope estimates and their
        OLS standard errors.
    """
    if seed is not None:
        np.random.seed(seed)

    coefficients = np.empty(n_trials)
    std_errors = np.empty(n_trials)

    for trial in range(n_trials):
        # Draw order (error_X, error_Y, X_base) is kept fixed so results for a
        # given seed do not change.
        error_X = make_error(corr_const, n_obs)
        error_Y = make_error(corr_const, n_obs)

        X_base = np.random.normal(0, 1, n_obs)
        X_treatment = X_base + error_X

        Y = true_intercept + true_beta * X_treatment + error_Y

        X_with_intercept = sm.add_constant(X_treatment)
        model = sm.OLS(Y, X_with_intercept).fit()

        # Index 0 is the added constant, index 1 is the slope on X.
        coefficients[trial] = model.params[1]
        std_errors[trial] = model.bse[1]

    return coefficients, std_errors

# Compare, for each correlation constant, (i) the empirical spread of the slope
# estimates across trials with (ii) the average standard error OLS reports.
corr_constants = [0.2, 0.5, 0.8]
results = []

for corr_const in corr_constants:
    coeffs, std_errs = run_simulation(corr_const, n_trials=1000, n_obs=100)

    # Sample standard deviation (ddof=1) of the estimates across trials.
    std_of_estimates = coeffs.std(ddof=1)
    mean_of_std_errors = std_errs.mean()
    ratio = std_of_estimates / mean_of_std_errors

    results.append(dict(
        corr_const=corr_const,
        std_of_estimates=std_of_estimates,
        mean_of_std_errors=mean_of_std_errors,
        ratio=ratio,
        coeffs=coeffs,
        std_errs=std_errs,
    ))

# Fixed-width table: header first, then one row per correlation constant.
print(f"{'corr_const':<12} {'i':<20} {'ii':<20} {'i/ii':<15}")

for r in results:
    print(f"{r['corr_const']:<12.1f} {r['std_of_estimates']:<20.4f} {r['mean_of_std_errors']:<20.4f} {r['ratio']:<15.4f}")


corr_const   i                    ii                   i/ii           
0.2          0.0659               0.0641               1.0284         
0.5          0.0556               0.0503               1.1048         
0.8          0.0411               0.0323               1.2740         


Coding Quiz 8

Question 1:

In [57]:
# Load the first Quiz 8 dataset and print its shape, first rows and summary.
# NOTE(review): no index_col is passed, so the CSV's unnamed first column shows
# up as the data column "Unnamed: 0" in the recorded output; presumably a saved
# row index — confirm. This also rebinds `df`, replacing the earlier frame.
df = pd.read_csv('homework_8.1.csv')

print("Dataset shape:", df.shape)
print(df.head())
print("\nBasic statistics:")
print(df.describe())


Dataset shape: (1000, 4)
   Unnamed: 0  X         Y         Z
0           0  1  4.109218  1.764052
1           1  0  2.259504  0.400157
2           2  0 -0.647584  0.978738
3           3  0  2.106071  2.240893
4           4  1  3.583464  1.867558

Basic statistics:
        Unnamed: 0            X            Y            Z
count  1000.000000  1000.000000  1000.000000  1000.000000
mean    499.500000     0.481000     1.014397    -0.045257
std     288.819436     0.499889     1.998531     0.987527
min       0.000000     0.000000    -5.491184    -3.046143
25%     249.750000     0.000000    -0.509696    -0.698420
50%     499.500000     0.000000     1.013902    -0.058028
75%     749.250000     1.000000     2.567960     0.606951
max     999.000000     1.000000     6.150865     2.759355


In [63]:
# Question 1: inverse-propensity-weighted estimate of the average treatment effect.
X_treatment = df['X']
Y_outcome = df['Y']

# Every column whose name starts with "Z" is used as a covariate.
Z_covariates = df.filter(regex='^Z')

# Propensity model: P(X = 1 | Z).
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(Z_covariates, X_treatment)

# Score the very same frame the model was fitted on. The previous
# `Z_covariates.values.reshape(-1, 1)` dropped the feature names (triggering
# scikit-learn's feature-name mismatch warning) and would have collapsed the
# matrix into a single column if more than one Z column were present.
# Column 1 of predict_proba is the probability of the larger class label (X == 1).
propensity_scores = logreg.predict_proba(Z_covariates)[:, 1]

# Treated units are weighted by 1/p, untreated units by 1/(1 - p).
weights = np.where(X_treatment == 1,
                   1 / propensity_scores,
                   1 / (1 - propensity_scores))

# Weighted mean outcome among the treated.
treated_mask = (X_treatment == 1)
weighted_outcome_treated = (weights[treated_mask] * Y_outcome[treated_mask]).sum()
sum_weights_treated = weights[treated_mask].sum()
avg_outcome_treated = weighted_outcome_treated / sum_weights_treated

# Weighted mean outcome among the untreated.
control_mask = (X_treatment == 0)
weighted_outcome_control = (weights[control_mask] * Y_outcome[control_mask]).sum()
sum_weights_control = weights[control_mask].sum()
avg_outcome_control = weighted_outcome_control / sum_weights_control

# Difference of the two normalised weighted means.
ATE = avg_outcome_treated - avg_outcome_control

print(f"Average Treatment Effect: {ATE:.6f}")


Average Treatment Effect: 2.274341




Question 2:

In [59]:
# Show the propensity scores of the first three rows.
# NOTE(review): this cell's recorded execution count (59) is lower than that of
# the cell that computes propensity_scores (63), so the output shown below may
# predate the latest fit — re-run top to bottom to refresh it.
print(propensity_scores[:3])

[0.84011371 0.58464597 0.71108245]


Question 3:

In [None]:
# Question 3: match every treated unit to its nearest untreated unit in
# Mahalanobis distance over the Z covariates, then average the outcome gaps.
df2 = pd.read_csv('homework_8.2.csv')

print(f"Shape: {df2.shape}")
print("\nFirst few rows:")
print(df2.head(10))
print("\nBasic statistics:")
print(df2.describe())

X2 = df2['X']
Y2 = df2['Y']

# Every column whose name starts with "Z" is a matching covariate.
Z_cols_2 = df2.filter(regex='^Z').columns.tolist()

treated_df = df2[X2 == 1].copy()
untreated_df = df2[X2 == 0].copy()

# np.cov treats rows as variables, hence the transpose. The covariance is
# taken over the full sample (treated and untreated together).
Z_matrix = df2[Z_cols_2].values.T

# atleast_2d keeps the inverse well-defined if only one Z column exists.
cov_matrix = np.atleast_2d(np.cov(Z_matrix))

inv_cov_matrix = np.linalg.inv(cov_matrix)

treated_Z = treated_df[Z_cols_2].values
untreated_Z = untreated_df[Z_cols_2].values

# All treated-vs-untreated distances in one call, replacing a Python double
# loop that invoked scipy's `mahalanobis` once per pair.
# Shape: (number of treated, number of untreated).
pairwise_distances = cdist(treated_Z, untreated_Z, metric='mahalanobis', VI=inv_cov_matrix)

# argmin returns the first minimum, matching the old strict `<` tie-breaking.
best_match_pos = pairwise_distances.argmin(axis=1)

match_columns = {
    'treated_idx': treated_df.index.to_numpy(),
    'untreated_idx': untreated_df.index.to_numpy()[best_match_pos],
    'distance': pairwise_distances[np.arange(len(treated_Z)), best_match_pos],
}
# One treated_<col> / untreated_<col> pair per covariate (treated_Z1,
# treated_Z2, untreated_Z1, untreated_Z2 for this file), instead of
# hard-coding exactly two covariates.
for col_pos, col_name in enumerate(Z_cols_2):
    match_columns[f'treated_{col_name}'] = treated_Z[:, col_pos]
for col_pos, col_name in enumerate(Z_cols_2):
    match_columns[f'untreated_{col_name}'] = untreated_Z[best_match_pos, col_pos]

matches_df = pd.DataFrame(match_columns)

# Look outcomes up by the integer index labels directly; going through
# `iterrows()` upcast those labels to floats before they reached `.loc`.
Y_treated = df2.loc[matches_df['treated_idx'], 'Y'].to_numpy()
Y_untreated = df2.loc[matches_df['untreated_idx'], 'Y'].to_numpy()
treatment_effects = Y_treated - Y_untreated

# NOTE(review): only treated units are matched here, so this is the mean effect
# over treated units (the quantity the Quiz 6 cells call `att`) — confirm this
# is what the question means by "Average Treatment Effect".
ATE_matched = np.mean(treatment_effects)

print(f"Average Treatment Effect: {ATE_matched:.6f}")


Shape: (1000, 5)

First few rows:
   Unnamed: 0  X         Y        Z1        Z2
0           0  1  4.652085  1.764052  2.320015
1           1  1  2.590221  0.400157  1.292631
2           2  1  3.898981  0.978738  0.556423
3           3  1  5.857179  2.240893  2.345607
4           4  1  3.647489  1.867558  2.095611
5           5  1  2.813448 -0.977278 -0.775798
6           6  1  2.842384  0.950088  1.490862
7           7  0 -0.065011 -0.151357 -1.969435
8           8  0 -0.104002 -0.103219 -0.152543
9           9  1  4.003199  0.410599  0.649632

Basic statistics:
        Unnamed: 0            X            Y           Z1           Z2
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean    499.500000     0.483000     1.552274    -0.045257    -0.031640
std     288.819436     0.499961     2.650695     0.987527     1.361324
min       0.000000     0.000000    -5.797134    -3.046143    -4.024931
25%     249.750000     0.000000    -0.601263    -0.698420    -1.007259
50% 

Question 4:

In [None]:
# Question 4: find the matched pair with the largest distance and report the
# covariates of its untreated member.
max_distance_idx = matches_df['distance'].idxmax()
max_distance_row = matches_df.loc[max_distance_idx]

# Index label (in df2) of the untreated unit in that pair.
untreated_idx = max_distance_row['untreated_idx']
untreated_Z1, untreated_Z2 = df2.loc[untreated_idx, ['Z1', 'Z2']]

print(f"Z1 = {untreated_Z1:.6f}")
print(f"Z2 = {untreated_Z2:.6f}")


  Z1 = 1.519995
  Z2 = -1.282208
