In [5]:
import numpy as np
import pandas as pd

# Synthetic ecosystem data: `lightning` feeds into every other variable,
# `bears` feeds into `deer`, and `deer` feeds into `flowers`.
# Each population is floored at zero after its Gaussian noise is added.
np.random.seed(42)
n = 10_000

# Exogenous driver, drawn uniformly between 0 and 10.
lightning = np.random.uniform(0, 10, n)

# The noise vectors are drawn one at a time, immediately before the variable
# that uses them, so the global random stream is consumed in a fixed order:
# lightning, bears noise, deer noise, flowers noise.
bears_noise = np.random.normal(0, 5, n)
bears = np.maximum(50 - 3 * lightning + bears_noise, 0)

deer_noise = np.random.normal(0, 8, n)
deer = np.maximum(100 - 2 * lightning - 0.5 * bears + deer_noise, 0)

flowers_noise = np.random.normal(0, 10, n)
flowers = np.maximum(80 + 4 * lightning - 0.3 * deer + flowers_noise, 0)

# One column per simulated variable, in generation order.
df = pd.DataFrame(
    dict(lightning=lightning, bears=bears, deer=deer, flowers=flowers)
)

# Summary statistics first, then a blank line and the first five rows.
summary_table = df.describe()
print(summary_table)
print(f'\n{df.head()}')


          lightning         bears          deer       flowers
count  10000.000000  10000.000000  10000.000000  10000.000000
mean       4.941596     35.236990     72.483995     77.820916
std        2.876301     10.021085      8.387137     15.706696
min        0.000116      4.202123     38.650502     26.481312
25%        2.463289     27.460030     66.802101     66.623606
50%        4.925286     35.343102     72.457393     77.707862
75%        7.400063     42.913020     78.053531     89.075974
max        9.997177     67.477813    105.099210    130.746881

   lightning      bears       deer    flowers
0   3.745401  31.324488  71.917769  81.243094
1   9.507143  15.852640  69.917996  97.128560
2   7.319939  29.984276  62.113371  97.193879
3   5.986585  26.170879  67.827374  73.454957
4   1.560186  50.882612  74.913336  65.200607


In [19]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Simulated observational study: Z drives both the treatment indicator X and
# the outcome Y, and the effect of X on Y varies with Z.
np.random.seed(42)
n = 5000

Z = np.random.normal(0, 1, n)

# A unit is treated when Z plus assignment noise is positive.
assignment_noise = np.random.normal(0, 0.5, n)
X = ((Z + assignment_noise) > 0).astype(int)

# Unit-level effect of treatment, linear in Z.
true_effect = 2 + 0.5 * Z
outcome_noise = np.random.normal(0, 1, n)
Y = 5 + true_effect * X + 2 * Z + outcome_noise

df = pd.DataFrame({'Z': Z, 'X': X, 'Y': Y})

# Split into the two arms; copies keep the subsets independent of `df`.
is_treated = df['X'] == 1
treated = df.loc[is_treated].copy()
untreated = df.loc[df['X'] == 0].copy()

# For every untreated unit, look up the treated unit with the closest Z.
nn_treated = NearestNeighbors(n_neighbors=1).fit(treated[['Z']])
distances, indices = nn_treated.kneighbors(untreated[['Z']])

# Matched outcome difference: treated neighbour's Y minus the untreated unit's Y.
matched_positions = indices.ravel()
matched_treated_outcomes = treated['Y'].to_numpy()[matched_positions]
untreated_effects = matched_treated_outcomes - untreated['Y'].to_numpy()

# Naive summary: the single largest matched difference.
naive_mte = untreated_effects.max()

# Robust summary: 95th percentile among the top decile of matched differences.
top_10_percent_cutoff = np.percentile(untreated_effects, 90)
in_top_decile = untreated_effects >= top_10_percent_cutoff
top_candidates = untreated_effects[in_top_decile]
robust_mte = np.percentile(top_candidates, 95)

mte_gap = naive_mte - robust_mte
n_top_candidates = len(top_candidates)

print(f"Naive MTE: {naive_mte:.4f}")
print(f"Robust MTE: {robust_mte:.4f}")
print(f"Difference: {mte_gap:.4f}")
print(f"\nNumber of top candidates: {n_top_candidates}")
print(f"Mean of top candidates: {top_candidates.mean():.4f}")
print(f"Std of top candidates: {top_candidates.std():.4f}")


Naive MTE: 6.4472
Robust MTE: 5.3958
Difference: 1.0514

Number of top candidates: 251
Mean of top candidates: 4.2247
Std of top candidates: 0.6019


In [18]:
import numpy as np

# Compare two summaries of noisy effect estimates: the sample maximum and the
# 90th percentile, each measured against its noise-free counterpart.
np.random.seed(42)
n = 5000

# Noise-free effects, then additive zero-mean measurement noise on top.
true_effects = np.random.normal(10, 3, n)
measurement_noise = np.random.normal(loc=0, scale=2, size=n)
observed_effects = true_effects + measurement_noise

# Reference values computed from the noise-free effects.
true_max_effect = true_effects.max()
true_90th_percentile_effect = np.percentile(true_effects, 90)

# The same two summaries computed from the noisy observations.
mte_by_max = observed_effects.max()
mte_by_percentile = np.percentile(observed_effects, 90)

# Absolute gap between each noisy summary and its reference value.
max_method_error = abs(mte_by_max - true_max_effect)
percentile_method_error = abs(mte_by_percentile - true_90th_percentile_effect)

print(f"True Max Effect: {true_max_effect:.4f}")
print(f"True 90th Percentile: {true_90th_percentile_effect:.4f}")

print(f"\nProblematic MTE: {mte_by_max:.4f}")
print(f"Robust MTE: {mte_by_percentile:.4f}")

print(f"\nMax method error: {max_method_error:.4f}")
print(f"Percentile method error: {percentile_method_error:.4f}")

True Max Effect: 21.7787
True 90th Percentile: 13.8184

Problematic MTE: 23.0254
Robust MTE: 14.6763

Max method error: 1.2467
Percentile method error: 0.8579


In [17]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Omitted-variable demonstration: W enters both X and Y, so a regression of Y
# on X and Z that leaves W out is compared with one that includes it.
np.random.seed(42)
n = 1000

# W is drawn first because X is built from it.
W = np.random.normal(0, 1, n)
x_noise = np.random.normal(0, 1, n)
X = W + x_noise

# Z is drawn independently of W and X.
Z = np.random.normal(0, 1, n)

# Every regressor enters Y with a unit coefficient, plus standard-normal noise.
y_noise = np.random.normal(0, 1, n)
Y = X + Z + W + y_noise

# Fit 1: W left out of the design matrix.
X_partial = np.column_stack((X, Z))
model_biased = LinearRegression().fit(X_partial, Y)

# Fit 2: W included as a third column.
X_full = np.column_stack((X, Z, W))
model_correct = LinearRegression().fit(X_full, Y)

# Coefficients come back in design-matrix column order.
biased_coef_x, biased_coef_z = model_biased.coef_
correct_coef_x, correct_coef_z, correct_coef_w = model_correct.coef_

print("Model without W:")
print(f"Coefficient of X: {biased_coef_x:.4f}")
print(f"Coefficient of Z: {biased_coef_z:.4f}")

print("\nModel with W:")
print(f"Coefficient of X: {correct_coef_x:.4f}")
print(f"Coefficient of Z: {correct_coef_z:.4f}")
print(f"Coefficient of W: {correct_coef_w:.4f}")



Model without W:
Coefficient of X: 1.4531
Coefficient of Z: 1.0397

Model with W:
Coefficient of X: 0.9433
Coefficient of Z: 1.0223
Coefficient of W: 1.0399


In [21]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from scipy import stats

def two_sided_p_value(t_stat, dof):
    """Return the two-sided p-value of a Student-t statistic.

    The tail probability is taken from the survival function rather than
    computed as ``1 - cdf``: for large ``|t_stat|`` the CDF rounds to 1.0 and
    the subtraction collapses to exactly 0.0, whereas the survival function
    keeps the small tail value.

    Parameters
    ----------
    t_stat : float or array-like
        Observed t statistic(s); the sign is ignored.
    dof : int or float
        Degrees of freedom of the reference t distribution.

    Returns
    -------
    float or numpy.ndarray
        ``2 * P(T > |t_stat|)``, with the same shape as ``t_stat``.
    """
    return 2 * stats.t.sf(np.abs(t_stat), dof)


# Repeat the same regression many times on fresh data in which W has no
# effect on Y, and record the p-value of W's coefficient in each repetition.
np.random.seed(42)
n_experiments = 1000
n_samples = 100

# Residual degrees of freedom: observations minus the three fitted parameters
# (intercept, W, X). Identical for every experiment, so computed once.
dof = n_samples - 3

t_stats = np.empty(n_experiments)

for i in range(n_experiments):
    # Draw order (W, X, noise) fixes how the seeded random stream is consumed.
    W = np.random.normal(0, 1, n_samples)
    X = np.random.normal(0, 1, n_samples)
    Y = 2 * X + np.random.normal(0, 1, n_samples)

    # One design matrix serves both the coefficient estimate and its standard
    # error. Columns: intercept, W, X.
    X_with_intercept = np.column_stack([np.ones(n_samples), W, X])
    beta, *_ = np.linalg.lstsq(X_with_intercept, Y, rcond=None)

    residuals = Y - X_with_intercept @ beta
    mse = np.sum(residuals**2) / dof

    # OLS covariance of the estimates; index 1 is the W column.
    var_covar = mse * np.linalg.inv(X_with_intercept.T @ X_with_intercept)
    se_W = np.sqrt(var_covar[1, 1])

    t_stat = beta[1] / se_W
    t_stats[i] = t_stat

# All p-values in one vectorised call.
p_values = two_sided_p_value(t_stats, dof)
min_p_value = np.min(p_values)
# Number of experiments nominally significant at the 5% level (not printed).
num_significant = np.sum(p_values < 0.05)

print(f"Smallest p-value: {min_p_value:.6f}")


Smallest p-value: 0.000475
