In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
def simulate(A=1, B=1, C=10, D=1000):
    """Draw one synthetic dataset in which W influences both X and Y.

    The data-generating process is:
        W ~ Normal(0, 1)
        X = W + Normal(0, B)
        Y = A*X - W + Normal(0, C)

    Parameters
    ----------
    A : coefficient of X in the equation for Y.
    B : standard deviation of the noise added to W to form X.
    C : standard deviation of the noise added to Y.
    D : number of observations to draw.

    Returns
    -------
    (Y, X, W) : tuple of three 1-D arrays, each of length D.

    Draws come from NumPy's global random state, so call np.random.seed
    beforehand for reproducible output.
    """
    confounder = np.random.normal(0, 1, D)

    # X shares the W component and adds its own independent noise.
    x_noise = np.random.normal(0, B, D)
    predictor = confounder + x_noise

    # Y depends on X with slope A and on W with slope -1, plus noise.
    y_noise = np.random.normal(0, C, D)
    outcome = A * predictor - confounder + y_noise

    return outcome, predictor, confounder

In [3]:
# Question 1 - how often is a nonzero effect of X on Y detected when W is
# included as a regressor?
# Setup: A=1, B=1, C=10, D=1000.
# A run counts as a detection when the X t-statistic satisfies |t| > 1.96.

np.random.seed(42)  # fixed seed so the estimate is reproducible
num_simulations = 10000

# Gather the X t-statistic from every replication first, then count the
# exceedances in a single vectorised comparison.
t_values_X = []
for _ in range(num_simulations):
    Y, X, W = simulate(A=1, B=1, C=10, D=1000)

    # Design matrix columns: [const, X, W]
    design = sm.add_constant(np.column_stack([X, W]))
    fitted = sm.OLS(Y, design).fit()

    # Position 1 is X because the constant occupies position 0.
    t_values_X.append(fitted.tvalues[1])

significant_count = int(np.sum(np.abs(t_values_X) > 1.96))

# Share of replications in which X was flagged as significant
probability = significant_count / num_simulations
print(f"Probability of detecting nonzero effect of X on Y: {probability:.4f} or {probability*100:.2f}%")
print(f"Number of significant results: {significant_count} out of {num_simulations}")

Probability of detecting nonzero effect of X on Y: 0.8853 or 88.53%
Number of significant results: 8853 out of 10000


In [4]:
# Question 2 - skewness of the sampling distribution of the X coefficient.
# Each replication refits Y ~ const + X + W and records the estimated X slope.

np.random.seed(42)  # fixed seed so the estimate is reproducible
num_simulations = 10000

# Preallocate one slot per replication instead of growing a list.
coefficient_estimates = np.empty(num_simulations)

for run in range(num_simulations):
    Y, X, W = simulate(A=1, B=1, C=10, D=1000)

    # Design matrix columns: [const, X, W]
    design = sm.add_constant(np.column_stack([X, W]))
    fitted = sm.OLS(Y, design).fit()

    # Position 1 is the X slope because the constant occupies position 0.
    coefficient_estimates[run] = fitted.params[1]

# Sample skewness across replications (scipy's default estimator)
skewness = stats.skew(coefficient_estimates)

print(f"Skewness of X coefficient estimates: {skewness:.4f}")
print(f"\nAdditional statistics:")
print(f"Mean: {coefficient_estimates.mean():.4f}")
print(f"Std: {coefficient_estimates.std():.4f}")
print(f"Min: {coefficient_estimates.min():.4f}")
print(f"Max: {coefficient_estimates.max():.4f}")


Skewness of X coefficient estimates: -0.0151

Additional statistics:
Mean: 0.9991
Std: 0.3164
Min: -0.3156
Max: 2.1066


In [5]:
# Question 3: Find value of B where detection probability is about 50%
# Fixed parameters: A=1, C=10, D=1000
# We need to test different values of B

def test_B_value(B_value, num_sims=1000):
    """Estimate the detection rate for the X effect at a given B.

    Each of the ``num_sims`` replications draws a dataset from ``simulate``
    with A=1, C=10, D=1000 and B set to ``B_value``, fits Y on a constant,
    X and W by OLS, and checks whether the X t-statistic exceeds 1.96 in
    absolute value.

    Parameters
    ----------
    B_value : value passed to ``simulate`` as B.
    num_sims : number of replications to run.

    Returns
    -------
    float : fraction of replications in which X was flagged as significant.
    """
    def x_is_flagged():
        # One replication: simulate, fit, and test the X coefficient.
        outcome, exposure, covariate = simulate(A=1, B=B_value, C=10, D=1000)
        design = sm.add_constant(np.column_stack([exposure, covariate]))
        t_stat = sm.OLS(outcome, design).fit().tvalues[1]
        return abs(t_stat) > 1.96

    flagged_runs = sum(1 for _ in range(num_sims) if x_is_flagged())
    return flagged_runs / num_sims

# Evaluate every candidate B from the answer choices under one shared seed
np.random.seed(42)
B_options = [5.4, 0.2, 1.8, 0.6]

separator = "=" * 60
print("Testing different B values (A=1, C=10, D=1000):")
print(separator)

for candidate_B in B_options:
    detection_rate = test_B_value(candidate_B, num_sims=2000)
    print(f"B = {candidate_B:4.1f} → Detection probability: {detection_rate:.4f} ({detection_rate*100:.2f}%)")

print(separator)


Testing different B values (A=1, C=10, D=1000):
B =  5.4 → Detection probability: 1.0000 (100.00%)
B =  5.4 → Detection probability: 1.0000 (100.00%)
B =  0.2 → Detection probability: 0.1050 (10.50%)
B =  0.2 → Detection probability: 0.1050 (10.50%)
B =  1.8 → Detection probability: 1.0000 (100.00%)
B =  1.8 → Detection probability: 1.0000 (100.00%)
B =  0.6 → Detection probability: 0.4600 (46.00%)
B =  0.6 → Detection probability: 0.4600 (46.00%)


In [6]:
# Finer search on a 0.1-spaced grid around B=0.6: re-estimate the detection
# probability at each grid point and report the one closest to the 50% target.
np.random.seed(42)
B_values_fine = np.arange(0.4, 0.9, 0.1)

print("\nFiner search around B=0.6:")
print("=" * 60)

# Track the best grid point while iterating so the printed conclusion is
# derived from the simulation results instead of being a fixed literal.
best_B, best_distance = None, float("inf")

for B_val in B_values_fine:
    prob = test_B_value(B_val, num_sims=2000)
    diff_from_50 = abs(prob - 0.5)
    print(f"B = {B_val:.1f} → Detection: {prob:.4f} ({prob*100:.2f}%) | Distance from 50%: {diff_from_50:.4f}")

    # Strict '<' keeps the first grid point in the event of a tie.
    if diff_from_50 < best_distance:
        best_B, best_distance = B_val, diff_from_50

print("=" * 60)
print(f"\nAnswer: B = {best_B:.1f} gives the closest probability to 50%")



Finer search around B=0.6:
B = 0.4 → Detection: 0.2465 (24.65%) | Distance from 50%: 0.2535
B = 0.5 → Detection: 0.3530 (35.30%) | Distance from 50%: 0.1470
B = 0.6 → Detection: 0.4705 (47.05%) | Distance from 50%: 0.0295
B = 0.7 → Detection: 0.5970 (59.70%) | Distance from 50%: 0.0970
B = 0.8 → Detection: 0.7360 (73.60%) | Distance from 50%: 0.2360

Answer: B = 0.6 gives the closest probability to 50%


In [7]:
# Question 4: Find value of A where detection probability is about 50%
# Fixed parameters: B=1, C=10, D=100 (note smaller sample size)
# We need to test different values of A

def test_A_value(A_value, num_sims=2000):
    """Estimate the detection rate for the X effect at a given A.

    Each of the ``num_sims`` replications draws a dataset from ``simulate``
    with B=1, C=10, D=100 and A set to ``A_value``, fits Y on a constant,
    X and W by OLS, and checks whether the X t-statistic exceeds 1.96 in
    absolute value.

    Parameters
    ----------
    A_value : value passed to ``simulate`` as A.
    num_sims : number of replications to run.

    Returns
    -------
    float : fraction of replications in which X was flagged as significant.
    """
    def x_is_flagged():
        # One replication: simulate, fit, and test the X coefficient.
        outcome, exposure, covariate = simulate(A=A_value, B=1, C=10, D=100)
        design = sm.add_constant(np.column_stack([exposure, covariate]))
        t_stat = sm.OLS(outcome, design).fit().tvalues[1]
        return abs(t_stat) > 1.96

    flagged_runs = sum(1 for _ in range(num_sims) if x_is_flagged())
    return flagged_runs / num_sims

# Evaluate every candidate A from the answer choices under one shared seed
np.random.seed(42)
A_options = [0.5, 4.0, 1.0, 2.0]

separator = "=" * 60
print("Testing different A values (B=1, C=10, D=100):")
print(separator)

for candidate_A in A_options:
    detection_rate = test_A_value(candidate_A, num_sims=2000)
    gap_to_half = abs(detection_rate - 0.5)
    print(f"A = {candidate_A:4.1f} → Detection: {detection_rate:.4f} ({detection_rate*100:.2f}%) | Distance from 50%: {gap_to_half:.4f}")

print(separator)


Testing different A values (B=1, C=10, D=100):
A =  0.5 → Detection: 0.0830 (8.30%) | Distance from 50%: 0.4170
A =  4.0 → Detection: 0.9730 (97.30%) | Distance from 50%: 0.4730
A =  1.0 → Detection: 0.1760 (17.60%) | Distance from 50%: 0.3240
A =  2.0 → Detection: 0.4925 (49.25%) | Distance from 50%: 0.0075


In [8]:
# Finer search on a 0.25-spaced grid around A=2.0: re-estimate the detection
# probability at each grid point and report the one closest to the 50% target.
np.random.seed(42)
A_values_fine = np.arange(1.5, 2.6, 0.25)

print("\nFiner search around A=2.0:")
print("=" * 60)

# Track the best grid point while iterating so the printed conclusion is
# derived from the simulation results instead of being a fixed literal.
best_A, best_distance = None, float("inf")

for A_val in A_values_fine:
    prob = test_A_value(A_val, num_sims=2000)
    diff_from_50 = abs(prob - 0.5)
    print(f"A = {A_val:.2f} → Detection: {prob:.4f} ({prob*100:.2f}%) | Distance from 50%: {diff_from_50:.4f}")

    # Strict '<' keeps the first grid point in the event of a tie.
    if diff_from_50 < best_distance:
        best_A, best_distance = A_val, diff_from_50

print("=" * 60)
# round(..., 2) shows quarter-step grid values in full (e.g. 1.75) while
# still rendering whole values such as 2.0 with one decimal.
print(f"\nAnswer: A = {round(float(best_A), 2)} gives the closest probability to 50%")



Finer search around A=2.0:
A = 1.50 → Detection: 0.3180 (31.80%) | Distance from 50%: 0.1820
A = 1.75 → Detection: 0.4285 (42.85%) | Distance from 50%: 0.0715
A = 2.00 → Detection: 0.5075 (50.75%) | Distance from 50%: 0.0075
A = 2.25 → Detection: 0.5925 (59.25%) | Distance from 50%: 0.0925
A = 2.50 → Detection: 0.6890 (68.90%) | Distance from 50%: 0.1890

Answer: A = 2.0 gives the closest probability to 50%
