# Chapter 1: Introduction to Statistical Inference - Solutions

<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-radius: 10px; color: white;'>
<h1 style='margin: 0; font-size: 2.5em;'>PLAI Academy</h1>
<p style='margin: 10px 0 0 0; font-size: 1.2em; opacity: 0.9;'>Statistical Inference • Chapter 1 Solutions</p>
</div>

---

In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Apply one seaborn theme to every figure produced by the cells below.
sns.set_theme(style='whitegrid', palette='husl')
# Seed NumPy's global RNG so the simulations are repeatable when the cells run in order.
np.random.seed(42)

---

## Part 1: Fundamental Concepts

### Solution 1.1: Population vs Sample

In [None]:
# Solution 1.1 - population versus sample.
# (1) Population: all 10,000 boxes produced today. Sample: 30 boxes picked at random.

# (2) Simulate the full population and record its mean.
population = np.random.normal(500, 15, 10000)
true_mu = np.mean(population)
print(f"True population mean: {true_mu:.2f}g")

# (3)-(4) Draw one sample without replacement and compare its mean with the population mean.
sample = np.random.choice(population, 30, replace=False)
sample_mean = np.mean(sample)
print(f"Sample mean: {sample_mean:.2f}g")
print(f"Difference: {abs(sample_mean - true_mu):.2f}g")

# (5) Ten further independent samples of the same size.
sample_means = []
for _ in range(10):
    draw = np.random.choice(population, 30, replace=False)
    sample_means.append(draw.mean())
print(f"\n10 sample means: {[f'{m:.2f}' for m in sample_means]}")
print(f"Average of sample means: {np.mean(sample_means):.2f}g")
print(f"Std dev of sample means: {np.std(sample_means):.2f}g")

# Observation: the sample means scatter around the true mean (unbiasedness),
# with a spread close to σ/√n = 15/√30 ≈ 2.74.

### Solution 1.2: Sampling Distribution Simulation

In [None]:
# Solution 1.2: compare the simulated sampling distribution of the mean with theory.

# 1-2. Population N(100, 15²) (variance = 225); draw 1000 samples of size 25.
population = stats.norm(loc=100, scale=15)
# 3. The mean of each sample is taken as soon as the sample is drawn.
sample_means = [population.rvs(25).mean() for _ in range(1000)]

# 4. Plot distribution
plt.figure(figsize=(10, 6))
plt.hist(sample_means, bins=40, density=True, alpha=0.7, edgecolor='black', label='Empirical')

# Overlay theoretical distribution: N(100, 225/25) = N(100, 9), i.e. SE = 3
x = np.linspace(90, 110, 100)
plt.plot(x, stats.norm(100, 3).pdf(x), 'r-', linewidth=2, label='Theoretical N(100, 3²)')
plt.axvline(100, color='green', linestyle='--', linewidth=2, label='True μ')

plt.xlabel('Sample Mean')
plt.ylabel('Density')
plt.title('Sampling Distribution of X̄ (n=25)')
# The legend must be built after every labelled artist exists; creating it
# before axvline left the 'True μ' line out of the legend.
plt.legend()
plt.show()

# 5. Compare to theoretical values
print(f"Empirical mean: {np.mean(sample_means):.3f} (theoretical: 100)")
print(f"Empirical SE: {np.std(sample_means):.3f} (theoretical: 3.0)")

### Solution 1.3: Central Limit Theorem with Non-Normal Data

In [None]:
# Solution 1.3 - Central Limit Theorem on a skewed (exponential) population.

# 1. Exponential population with mean 2 and variance 4.
population = stats.expon(scale=2)

# 2. One histogram panel per sample size; the sixth panel of the 2x3 grid stays empty.
sample_sizes = [5, 10, 30, 50, 100]
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()

for ax, n in zip(axes, sample_sizes):
    # 1000 sample means for this n
    sample_means = np.array([population.rvs(n).mean() for _ in range(1000)])
    ax.hist(sample_means, bins=30, density=True, alpha=0.7, edgecolor='black')

    # Normal curve with the standard error σ/√n, drawn over the observed range
    theoretical_std = 2 / np.sqrt(n)
    x = np.linspace(sample_means.min(), sample_means.max(), 100)
    ax.plot(x, stats.norm(2, theoretical_std).pdf(x), 'r-', linewidth=2)

    ax.set_title(f'n = {n}')
    ax.axvline(2, color='green', linestyle='--')

axes[-1].axis('off')
plt.tight_layout()
plt.show()

# 4. Skewness of the sample means (0 for a symmetric distribution), using fresh draws per n.
from scipy.stats import skew
print("\nSkewness by sample size:")
for n in sample_sizes:
    sample_means = [population.rvs(n).mean() for _ in range(1000)]
    print(f"n={n:3d}: skewness = {skew(sample_means):.3f}")
print("\nSkewness → 0 as n increases (approaching normality)")

### Solution 1.4: Confidence Intervals

In [None]:
# Solution 1.4: empirical coverage of t-based confidence intervals for the mean.

# True parameters
true_mu = 75
true_sigma = 12
n = 50
alpha = 0.05
n_intervals = 100  # number of repeated experiments

# The t critical value depends only on alpha and n, so compute it once
# instead of on every iteration.
t_crit = stats.t.ppf(1 - alpha/2, df=n-1)

# 1-2. Generate the confidence intervals
intervals = []
contains_mu = []

for _ in range(n_intervals):
    sample = np.random.normal(true_mu, true_sigma, n)
    sample_mean = sample.mean()
    sample_std = sample.std(ddof=1)  # unbiased sample standard deviation

    # Confidence interval: x̄ ± t * s/√n
    margin = t_crit * (sample_std / np.sqrt(n))
    ci_lower = sample_mean - margin
    ci_upper = sample_mean + margin

    intervals.append((ci_lower, ci_upper))
    contains_mu.append(ci_lower <= true_mu <= ci_upper)

# 4. Plot all intervals (green = covers μ, red = misses μ)
plt.figure(figsize=(12, 8))
for i, (ci_lower, ci_upper) in enumerate(intervals):
    color = 'green' if contains_mu[i] else 'red'
    plt.plot([ci_lower, ci_upper], [i, i], color=color, linewidth=1)

plt.axvline(true_mu, color='blue', linestyle='--', linewidth=2, label=f'True μ = {true_mu}')
plt.xlabel('Value')
plt.ylabel('CI Number')
plt.title(f'{n_intervals} Confidence Intervals ({1 - alpha:.0%} level)')
plt.legend()
plt.show()

# 5. Calculate coverage rate
coverage = np.mean(contains_mu)
print(f"Coverage rate: {coverage:.1%} (expected: {1 - alpha:.0%})")
print(f"Number containing μ: {sum(contains_mu)}/{n_intervals}")

### Solution 1.5: Hypothesis Testing and P-values

In [None]:
# Solution 1.5: one-sided, one-sample t-test from summary statistics.

# Given data
mu_0 = 70  # null hypothesis mean
x_bar = 73.5
s = 8
n = 36

# 1. Hypotheses: H₀: μ = μ₀, H₁: μ > μ₀ (one-sided)

# 2. Calculate test statistic
t_stat = (x_bar - mu_0) / (s / np.sqrt(n))
print(f"Test statistic: t = {t_stat:.3f}")

# 3. Calculate p-value (one-sided, upper tail).
# The survival function is used instead of 1 - cdf because the subtraction
# loses precision when the tail probability is very small.
df = n - 1
p_value = stats.t.sf(t_stat, df)
print(f"P-value: {p_value:.4f}")

# 4. Decision at α = 0.05
alpha = 0.05
decision = "Reject H₀" if p_value < alpha else "Fail to reject H₀"
print(f"\nDecision at α = {alpha}: {decision}")
print(f"Conclusion: There {'is' if p_value < alpha else 'is not'} significant evidence that the new method increases scores.")

# 5. Visualize null distribution.
# Widen the axis when needed so the observed statistic is always visible.
x_limit = max(4, abs(t_stat) + 1)
x = np.linspace(-x_limit, x_limit, 200)
plt.figure(figsize=(10, 6))
plt.plot(x, stats.t(df).pdf(x), 'b-', linewidth=2, label=f'Null distribution t({df})')
plt.axvline(t_stat, color='red', linestyle='--', linewidth=2, label=f'Observed t = {t_stat:.2f}')

# Shade rejection region (upper tail beyond the one-sided critical value)
t_crit = stats.t.ppf(1 - alpha, df)
x_reject = np.linspace(t_crit, x_limit, 100)
plt.fill_between(x_reject, 0, stats.t(df).pdf(x_reject), alpha=0.3, color='red', label='Rejection region')

plt.xlabel('t-value')
plt.ylabel('Density')
plt.title(f'One-sided t-test: H₀: μ = {mu_0} vs H₁: μ > {mu_0}')
plt.legend()
plt.show()

---

## Part 2: Advanced Statistical Problems

### Solution 1.6: Power Analysis

In [None]:
# Solution 1.6: power of a one-sided z-test for the mean (σ known).

# Parameters
mu_0 = 100
mu_1 = 105  # true mean under H₁
sigma = 15
n = 25
alpha = 0.05


def _one_sided_power(true_mean, n_obs, null_mean, sd, z_crit):
    """Return P(reject H₀) for an upper-tailed z-test when the mean is `true_mean`.

    `true_mean` may be a scalar or a NumPy array; the result has the same shape.
    """
    se = sd / np.sqrt(n_obs)
    cutoff = null_mean + z_crit * se
    return stats.norm.sf(cutoff, loc=true_mean, scale=se)


# 2. Critical value for rejecting H₀ (one-sided test)
z_alpha = stats.norm.ppf(1 - alpha)
critical_value = mu_0 + z_alpha * (sigma / np.sqrt(n))
print(f"Critical value: {critical_value:.2f}")
print(f"Reject H₀ if X̄ > {critical_value:.2f}")

# 3. Probability of Type II error when μ = μ₁
# β = P(X̄ < critical_value | μ = μ₁)
beta = stats.norm.cdf(critical_value, loc=mu_1, scale=sigma/np.sqrt(n))
print(f"\nType II error (β): {beta:.4f}")

# 4. Power = 1 - β
power = 1 - beta
print(f"Power: {power:.4f}")

# 5. Power curve for different true means
true_means = np.linspace(100, 110, 50)
powers = _one_sided_power(true_means, n, mu_0, sigma, z_alpha)

# 6. Compare with n=50
critical_value_50 = mu_0 + z_alpha * (sigma / np.sqrt(50))
powers_50 = _one_sided_power(true_means, 50, mu_0, sigma, z_alpha)
# Evaluate the n=50 power exactly at μ₁. The 50-point grid has no point at 105
# (index 25 is ≈105.10), so indexing the curve reported the wrong value.
power_50 = _one_sided_power(mu_1, 50, mu_0, sigma, z_alpha)

plt.figure(figsize=(10, 6))
plt.plot(true_means, powers, 'b-', linewidth=2, label='n=25')
plt.plot(true_means, powers_50, 'g-', linewidth=2, label='n=50')
plt.axhline(0.05, color='red', linestyle='--', label='α = 0.05 (Type I error)')
plt.axvline(mu_0, color='gray', linestyle='--', label='H₀: μ = 100')
plt.xlabel('True Mean (μ)')
plt.ylabel('Power')
plt.title('Power Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"\nPower at μ=105: n=25: {power:.3f}, n=50: {power_50:.3f}")
print("Increasing sample size increases power.")

### Solution 1.7: Comparing Two Populations

In [None]:
# Solution 1.7: two-sample comparison, confidence interval and simulated power.

# 1. Generate data for one experiment
n_A, n_B = 40, 40
data_A = np.random.normal(1000, 100, n_A)
data_B = np.random.normal(1050, 100, n_B)

# 2. Two-sample t-test (ttest_ind defaults to the pooled, equal-variance test)
t_stat, p_value = stats.ttest_ind(data_B, data_A)
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.4f}")
print(f"Conclusion: {'Reject H₀' if p_value < 0.05 else 'Fail to reject H₀'}")

# 3. 95% CI for difference in means.
# Use the pooled variance so the standard error matches both the degrees of
# freedom (n_A + n_B - 2) and the pooled test above. With equal group sizes
# this equals the unpooled SE, but it stays correct if the sizes differ.
mean_diff = data_B.mean() - data_A.mean()
df = n_A + n_B - 2
pooled_var = ((n_A - 1) * data_A.var(ddof=1) + (n_B - 1) * data_B.var(ddof=1)) / df
se_diff = np.sqrt(pooled_var * (1 / n_A + 1 / n_B))
t_crit = stats.t.ppf(0.975, df)
ci_lower = mean_diff - t_crit * se_diff
ci_upper = mean_diff + t_crit * se_diff
print(f"\n95% CI for μ_B - μ_A: [{ci_lower:.2f}, {ci_upper:.2f}]")

# 4. Repeat 1000 times.
# Loop-local names keep the single-experiment data and t_stat above intact.
n_simulations = 1000
p_values_list = []

for _ in range(n_simulations):
    sim_A = np.random.normal(1000, 100, n_A)
    sim_B = np.random.normal(1050, 100, n_B)  # true difference = 50
    _, p_val = stats.ttest_ind(sim_B, sim_A)
    p_values_list.append(p_val)

rejections = int(np.sum(np.array(p_values_list) < 0.05))
empirical_power = rejections / n_simulations
print(f"\nEmpirical power: {empirical_power:.3f}")

# Distribution of p-values when H₀ is false
plt.figure(figsize=(10, 6))
plt.hist(p_values_list, bins=50, density=True, alpha=0.7, edgecolor='black')
plt.axvline(0.05, color='red', linestyle='--', linewidth=2, label='α = 0.05')
plt.xlabel('P-value')
plt.ylabel('Density')
plt.title('Distribution of P-values when H₁ is True')
plt.legend()
plt.show()
print("P-values skewed toward 0 when alternative hypothesis is true.")

### Solution 1.8: Chi-Squared Distribution and Sample Variance

In [None]:
# Solution 1.8 - (n-1)s²/σ² follows a χ² distribution with n-1 degrees of freedom.

# Parameters
true_variance = 100
n = 20
df = n - 1
true_sd = np.sqrt(true_variance)

# 1-2. 5000 replicates of the standardized sample variance
standardized_variances = [
    (n - 1) * np.random.normal(50, true_sd, n).var(ddof=1) / true_variance
    for _ in range(5000)
]

# 3-4. Histogram of the replicates with the theoretical χ² density on top
plt.figure(figsize=(10, 6))
plt.hist(standardized_variances, bins=50, density=True, alpha=0.7, edgecolor='black', label='Empirical')

x = np.linspace(0, 50, 200)
chi2_density = stats.chi2(df).pdf(x)
plt.plot(x, chi2_density, 'r-', linewidth=2, label=f'Theoretical χ²({df})')

plt.xlabel('(n-1)s²/σ²')
plt.ylabel('Density')
plt.title('Distribution of Standardized Sample Variance')
plt.legend()
plt.show()

# 5. 95% CI for σ² from one new sample
sample = np.random.normal(50, true_sd, n)
s_squared = sample.var(ddof=1)

# Lower and upper 2.5% quantiles of χ²(df)
chi2_lower, chi2_upper = stats.chi2.ppf([0.025, 0.975], df)

# Dividing by the larger quantile gives the lower bound, and vice versa:
# [(n-1)s²/χ²_upper, (n-1)s²/χ²_lower]
scaled = (n - 1) * s_squared
ci_lower = scaled / chi2_upper
ci_upper = scaled / chi2_lower

print(f"\nSample variance: s² = {s_squared:.2f}")
print(f"95% CI for σ²: [{ci_lower:.2f}, {ci_upper:.2f}]")
print(f"True σ² = {true_variance} {'is' if ci_lower <= true_variance <= ci_upper else 'is not'} in CI")

### Solution 1.9: Multiple Testing Problem

In [None]:
# Solution 1.9: family-wise error rate (FWER) with and without Bonferroni.

# Parameters
n_tests = 20
alpha = 0.05
n_simulations = 1000
n_samples = 30


def _count_families_with_rejection(threshold, n_families, tests_per_family, n_obs):
    """Count simulated families in which at least one true H₀ is rejected.

    Each family holds `tests_per_family` one-sample t-tests of H₀: μ = 0 on
    N(0, 1) data of size `n_obs`, so every rejection is a false positive.
    All data are drawn in one call and tested along the last axis; this
    consumes the global RNG in the same order as drawing one test at a time.
    """
    data = np.random.normal(0, 1, size=(n_families, tests_per_family, n_obs))
    _, p_vals = stats.ttest_1samp(data, 0, axis=-1)
    return int((p_vals < threshold).any(axis=1).sum())


# 1-4. Simulate multiple testing scenarios at the unadjusted level
at_least_one_rejection = _count_families_with_rejection(alpha, n_simulations, n_tests, n_samples)

# 5. Calculate family-wise error rate
fwer_empirical = at_least_one_rejection / n_simulations
fwer_theoretical = 1 - (1 - alpha)**n_tests  # exact only for independent tests

print(f"Empirical FWER: {fwer_empirical:.3f}")
print(f"Theoretical FWER: {fwer_theoretical:.3f}")
print(f"Expected under independence: {fwer_theoretical:.3f}")

# 6. Apply Bonferroni correction: test each hypothesis at α / m
alpha_bonf = alpha / n_tests
at_least_one_bonf = _count_families_with_rejection(alpha_bonf, n_simulations, n_tests, n_samples)

fwer_bonf = at_least_one_bonf / n_simulations
print(f"\nBonferroni correction (α = {alpha_bonf:.4f}):")
print(f"Empirical FWER: {fwer_bonf:.3f} (target: {alpha})")
print("Bonferroni controls family-wise error rate at nominal level.")

### Solution 1.10: Sample Size Determination

In [None]:
# Solution 1.10: sample size needed for a target margin of error.

# Given
E = 2  # margin of error
sigma = 12
alpha = 0.05
z_alpha = stats.norm.ppf(1 - alpha/2)

# 3. Calculate required sample size: n = (z·σ / E)², rounded up
n_required = ((z_alpha * sigma) / E)**2
n_required = int(np.ceil(n_required))
print(f"Required sample size for E={E}: n = {n_required}")

# 4. Verify by simulation.
# The critical value and √n do not change between replicates, so compute them once.
# The intervals use s and the t quantile, so their average width is close to,
# but not exactly, the z-based target 2E.
t_crit = stats.t.ppf(1 - alpha/2, n_required-1)
root_n = np.sqrt(n_required)
ci_widths = []
for _ in range(1000):
    sample = np.random.normal(100, sigma, n_required)
    std = sample.std(ddof=1)
    margin = t_crit * (std / root_n)
    ci_widths.append(2 * margin)  # total width

avg_width = np.mean(ci_widths)
print(f"Average CI width: {avg_width:.2f} (target: {2*E})")

# 5. Sample size for different margins
print("\nRequired sample sizes for different margins of error:")
for E_new in [2, 1, 0.5]:
    n_new = int(np.ceil(((z_alpha * sigma) / E_new)**2))
    print(f"E = {E_new}: n = {n_new}")

# Visualize relationship (continuous n, not rounded up)
margins = np.linspace(0.5, 5, 50)
sample_sizes = [((z_alpha * sigma) / m)**2 for m in margins]

plt.figure(figsize=(10, 6))
plt.plot(margins, sample_sizes, 'b-', linewidth=2)
plt.xlabel('Margin of Error (E)')
plt.ylabel('Required Sample Size (n)')
plt.title('Sample Size vs Margin of Error')
plt.grid(True, alpha=0.3)
plt.show()
print("\nSample size increases quadratically as margin of error decreases.")

---

## Part 3: AI/Machine Learning Applications

### Solution 1.11: Train-Test Split and Sampling Distributions

In [None]:
# Solution 1.11 - how R² varies across random train/test splits of one dataset.

# 1. Synthetic data from the known line y = 2x + 3 plus Gaussian noise
np.random.seed(42)
n = 1000
x = np.random.uniform(0, 10, n)
noise = np.random.normal(0, 2, n)
y = 2*x + 3 + noise
features = x.reshape(-1, 1)  # sklearn expects a 2-D feature matrix

# 2-3. 100 different 80/20 splits, one linear fit per split
train_r2_scores = []
test_r2_scores = []

for seed in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=seed
    )
    model = LinearRegression()
    model.fit(X_train, y_train)

    # R² on the data the model was fit to, and on the held-out part
    train_r2_scores.append(model.score(X_train, y_train))
    test_r2_scores.append(model.score(X_test, y_test))

# 4. Side-by-side histograms of the two score collections
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (axes[0], train_r2_scores, 'Train R²', 'Training R² Distribution', {}),
    (axes[1], test_r2_scores, 'Test R²', 'Test R² Distribution', {'color': 'orange'}),
]
for ax, scores, hist_label, panel_title, extra in panels:
    ax.hist(scores, bins=20, alpha=0.7, edgecolor='black', label=hist_label, **extra)
    ax.axvline(np.mean(scores), color='red', linestyle='--', linewidth=2, label='Mean')
    ax.set_xlabel('R² Score')
    ax.set_title(panel_title)
    ax.legend()

plt.tight_layout()
plt.show()

# 5. Summary statistics
print(f"Training R² - Mean: {np.mean(train_r2_scores):.4f}, Std: {np.std(train_r2_scores):.4f}")
print(f"Test R² - Mean: {np.mean(test_r2_scores):.4f}, Std: {np.std(test_r2_scores):.4f}")

# 6. Interpretation
print("\nObservations:")
print("1. Test R² has higher variance than training R² (smaller sample size)")
print("2. Both distributions are approximately normal (CLT)")
print("3. Training R² slightly higher due to overfitting")
print("4. Understanding this variability is crucial for model evaluation")

### Solution 1.12: Bias-Variance Decomposition in Practice

In [None]:
# Solution 1.12 - bias² / variance / noise decomposition at one test point.

# 1. True function sin(2πx) on a grid, and the noise level of the observations
np.random.seed(42)
x_true = np.linspace(0, 1, 100)
f_true = np.sin(2 * np.pi * x_true)
noise_std = 0.1

# The decomposition is evaluated at this single input
x_test = 0.5
y_test_true = np.sin(2 * np.pi * x_test)

# 2. One entry per polynomial degree
degrees = [1, 2, 5, 10, 20]
bias_squared = []
variance = []
mse = []

for d in degrees:
    # Predictions at x_test from 100 independently drawn training sets of size 50
    predictions = []
    for _ in range(100):
        x_train = np.random.uniform(0, 1, 50)
        train_noise = np.random.normal(0, noise_std, 50)
        y_train = np.sin(2 * np.pi * x_train) + train_noise

        fitted = np.polyfit(x_train, y_train, d)
        predictions.append(np.polyval(fitted, x_test))

    # 3. Bias² is the squared gap between the average prediction and the truth;
    # variance is the spread of the predictions across training sets.
    mean_pred = np.mean(predictions)
    bias_sq = (mean_pred - y_test_true)**2
    var = np.var(predictions)

    bias_squared.append(bias_sq)
    variance.append(var)
    mse.append(bias_sq + var + noise_std**2)

# 4. Plot the three curves against model complexity
plt.figure(figsize=(10, 6))
for series, marker, series_label in (
    (bias_squared, 'o-', 'Bias²'),
    (variance, 's-', 'Variance'),
    (mse, '^-', 'Total MSE'),
):
    plt.plot(degrees, series, marker, label=series_label, linewidth=2)
plt.xlabel('Model Complexity (Polynomial Degree)')
plt.ylabel('Error')
plt.title('Bias-Variance Tradeoff')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# 5. The degree with the smallest total MSE
optimal_d = degrees[np.argmin(mse)]
print(f"\nOptimal polynomial degree: {optimal_d}")
print(f"MSE decomposition at optimal:")
print(f"  Bias²: {bias_squared[degrees.index(optimal_d)]:.4f}")
print(f"  Variance: {variance[degrees.index(optimal_d)]:.4f}")
print(f"  Noise: {noise_std**2:.4f}")
print(f"  Total MSE: {mse[degrees.index(optimal_d)]:.4f}")

### Solution 1.13: Confidence Intervals for Model Performance

In [None]:
# Solution 1.13: confidence intervals for a classifier's test accuracy.

# 1. Create classification dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                          n_redundant=5, random_state=42)

# Split data (200 held-out test points)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=200, random_state=42)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 2. Evaluate on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
n_test = len(y_test)
# Count correct predictions directly. int(accuracy * n_test) can truncate to
# one fewer because of floating-point error (e.g. 162.99999999999997 -> 162).
n_correct = int((y_pred == y_test).sum())

# 3. Confidence intervals
# Normal approximation
z_crit = stats.norm.ppf(0.975)
se = np.sqrt(accuracy * (1 - accuracy) / n_test)
ci_normal_lower = accuracy - z_crit * se
ci_normal_upper = accuracy + z_crit * se

# Wilson score interval (better for proportions)
from statsmodels.stats.proportion import proportion_confint
ci_wilson = proportion_confint(n_correct, n_test, alpha=0.05, method='wilson')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"\n95% Confidence Intervals:")
print(f"Normal approx: [{ci_normal_lower:.4f}, {ci_normal_upper:.4f}]")
print(f"Wilson score: [{ci_wilson[0]:.4f}, {ci_wilson[1]:.4f}]")

# 4-5. Verify coverage
coverages_normal = []
coverages_wilson = []
n_pop = 10000  # size of the large hold-out that stands in for the population

for seed in range(1000):
    # Draw ONE dataset per seed and carve train / test / population out of it,
    # so all three come from the same distribution. A second make_classification
    # call with a different n_samples is not guaranteed to reproduce the same
    # class geometry, which would make the "true" accuracy a biased reference.
    X_all, y_all = make_classification(n_samples=1000 + n_pop, n_features=20, random_state=seed)
    X_sim, X_pop, y_sim, y_pop = train_test_split(
        X_all, y_all, test_size=n_pop, random_state=seed
    )
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_sim, y_sim, test_size=n_test, random_state=seed
    )

    # Train on the small training part
    sim_model = LogisticRegression(max_iter=1000, random_state=seed)
    sim_model.fit(X_tr, y_tr)

    # Reference accuracy on the large hold-out; this is itself an estimate,
    # but its sampling error is much smaller than that of the 200-point test set.
    true_acc = sim_model.score(X_pop, y_pop)

    # Observed accuracy on small test set
    sim_correct = int((sim_model.predict(X_te) == y_te).sum())
    obs_acc = sim_correct / n_test

    # Construct CIs
    se = np.sqrt(obs_acc * (1 - obs_acc) / n_test)
    ci_norm = (obs_acc - z_crit * se, obs_acc + z_crit * se)
    ci_wils = proportion_confint(sim_correct, n_test, alpha=0.05, method='wilson')

    # Check coverage
    coverages_normal.append(ci_norm[0] <= true_acc <= ci_norm[1])
    coverages_wilson.append(ci_wils[0] <= true_acc <= ci_wils[1])

print(f"\nEmpirical Coverage Rates (1000 repetitions):")
print(f"Normal approximation: {np.mean(coverages_normal):.3f} (target: 0.95)")
print(f"Wilson score: {np.mean(coverages_wilson):.3f} (target: 0.95)")

# 6. Effect of sample size
print(f"\nCI width vs sample size:")
for n in [50, 100, 200, 500, 1000]:
    se = np.sqrt(0.8 * 0.2 / n)  # assume accuracy=0.8
    width = 2 * z_crit * se
    print(f"n={n:4d}: width = {width:.4f}")

### Solution 1.14: Hypothesis Testing for Model Comparison

In [None]:
# Solution 1.14: comparing two classifiers with a paired test on CV folds.

# 1. Train two models
X, y = make_classification(n_samples=500, n_features=20, n_informative=15, random_state=42)

model1 = LogisticRegression(max_iter=1000, random_state=42)
model2 = RandomForestClassifier(n_estimators=50, random_state=42)

# 2. 10-fold cross-validation (same cv setting for both models)
scores1 = cross_val_score(model1, X, y, cv=10, scoring='accuracy')
scores2 = cross_val_score(model2, X, y, cv=10, scoring='accuracy')

print(f"Logistic Regression: {scores1.mean():.4f} ± {scores1.std():.4f}")
print(f"Random Forest: {scores2.mean():.4f} ± {scores2.std():.4f}")

# 3. Paired t-test on the per-fold differences.
# NOTE: CV folds share training data, so the fold scores are not independent
# and the p-value should be read as approximate.
differences = scores2 - scores1
t_stat, p_value = stats.ttest_rel(scores2, scores1)

print(f"\nPaired t-test:")
print(f"Mean difference: {differences.mean():.4f}")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.4f}")

# 4. Effect size (Cohen's d for paired data).
# Use the sample standard deviation (ddof=1), the same estimator the paired
# t-test uses; the default ddof=0 inflates d when there are only 10 folds.
cohens_d = differences.mean() / differences.std(ddof=1)
print(f"Cohen's d: {cohens_d:.3f}")

# Interpretation
if p_value < 0.05:
    print(f"\nConclusion: Random Forest performs significantly {'better' if differences.mean() > 0 else 'worse'} than Logistic Regression")
else:
    print(f"\nConclusion: No significant difference between models")

# Visualize differences.
# Tick labels are set with xticks because boxplot's `labels` keyword is
# deprecated in recent matplotlib; boxes sit at positions 1..N by default.
plt.figure(figsize=(10, 6))
plt.boxplot([scores1, scores2])
plt.xticks([1, 2], ['Logistic Regression', 'Random Forest'])
plt.ylabel('Accuracy')
plt.title('Model Performance Comparison (10-fold CV)')
plt.grid(True, alpha=0.3)
plt.show()

# 5. Why paired test?
print("\nWhy use paired test:")
print("- Same folds used for both models (matched pairs)")
print("- Removes variation due to different train/test splits")
print("- More powerful than independent samples test")
print("- Controls for fold-to-fold variability")

### Solution 1.15: P-values and Model Selection

In [None]:
# Solution 1.15 - marginal p-values for feature screening, with and without FDR control.

# 1. Sparse linear signal: 200 observations, 100 features, only the first 5 have non-zero weight
np.random.seed(42)
n = 200
p = 100

X = np.random.randn(n, p)
true_beta = np.zeros(p)
true_beta[:5] = [2, -1.5, 1, -2, 1.5]
y = X @ true_beta + np.random.randn(n)

# 2. Correlation test of each feature against y, kept as (feature, p-value) pairs
from scipy.stats import pearsonr
p_values = []
for j in range(p):
    _, p_val = pearsonr(X[:, j], y)
    p_values.append((j, p_val))

# Most significant first
p_values = sorted(p_values, key=lambda pair: pair[1])
ranked_features = [feat for feat, _ in p_values]
ranked_pvals = [pv for _, pv in p_values]

# 3. Unadjusted selection at p < 0.05
selected_uncorrected = [feat for feat, pv in zip(ranked_features, ranked_pvals) if pv < 0.05]
print(f"Selected features (uncorrected): {selected_uncorrected}")

# 4. Compare against the known signal features
true_features = set(range(5))
selected_set = set(selected_uncorrected)
false_positives = selected_set - true_features
false_negatives = true_features - selected_set

print(f"\nTrue features: {sorted(true_features)}")
print(f"False positives: {sorted(false_positives)} (count: {len(false_positives)})")
print(f"False negatives: {sorted(false_negatives)} (count: {len(false_negatives)})")

# 5. Benjamini-Hochberg adjustment; the adjusted p-values come back in the
# same order as the input, i.e. aligned with ranked_features.
from statsmodels.stats.multitest import multipletests
adjusted_pvals = multipletests(ranked_pvals, alpha=0.05, method='fdr_bh')[1]

selected_fdr = [feat for feat, adj in zip(ranked_features, adjusted_pvals) if adj < 0.05]

print(f"\nSelected features (FDR-corrected): {selected_fdr}")

# 6. Same error accounting for the FDR selection
selected_fdr_set = set(selected_fdr)
false_positives_fdr = selected_fdr_set - true_features
false_negatives_fdr = true_features - selected_fdr_set

print(f"False positives (FDR): {sorted(false_positives_fdr)} (count: {len(false_positives_fdr)})")
print(f"False negatives (FDR): {sorted(false_negatives_fdr)} (count: {len(false_negatives_fdr)})")

# Left panel: raw p-value per feature index on a log scale
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
pval_by_feature = dict(p_values)
colors = ['red' if i < 5 else 'gray' for i in range(p)]
plt.scatter(range(p), [pval_by_feature[i] for i in range(p)], c=colors, alpha=0.6)
plt.axhline(0.05, color='blue', linestyle='--', label='α = 0.05')
plt.xlabel('Feature Index')
plt.ylabel('P-value')
plt.title('P-values for Each Feature (red = true signal)')
plt.legend()
plt.yscale('log')

# Right panel: error counts for the two selection rules
plt.subplot(1, 2, 2)
comparison = ['Uncorrected', 'FDR']
fp_counts = [len(false_positives), len(false_positives_fdr)]
fn_counts = [len(false_negatives), len(false_negatives_fdr)]
x = np.arange(len(comparison))
bar_width = 0.4
plt.bar(x - 0.2, fp_counts, bar_width, label='False Positives', color='red', alpha=0.7)
plt.bar(x + 0.2, fn_counts, bar_width, label='False Negatives', color='blue', alpha=0.7)
plt.xticks(x, comparison)
plt.ylabel('Count')
plt.title('Error Comparison')
plt.legend()

plt.tight_layout()
plt.show()

print("\nFDR correction reduces false positives while maintaining power.")

---

## Part 4: Contemporary Problems (2025+)

**Note**: Exercises 1.16–1.20 involve advanced topics that require simulating modern ML systems. Solution 1.16 is worked in full on simulated data. Complete implementations of 1.17–1.20 would need additional libraries and extended code, so they are given as conceptual outlines of the key statistical techniques.

### Solution 1.16: Uncertainty Quantification in LLMs

In [None]:
# Solution 1.16: entropy, confidence and calibration for simulated classifier outputs.

# Simulate LLM classification task
n_samples = 1000
n_classes = 4

# Generate softmax probabilities (simulating LLM output).
# Subtracting the row maximum before exponentiating leaves the softmax
# unchanged mathematically but prevents overflow for large logits.
np.random.seed(42)
logits = np.random.randn(n_samples, n_classes)
shifted = logits - logits.max(axis=1, keepdims=True)
probs = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)

# True labels (simulate).
# NOTE: labels are drawn independently of `probs`, so accuracy is expected to
# sit near chance (1 / n_classes) whatever the confidence; the cell shows the
# mechanics rather than a well-calibrated model.
true_labels = np.random.choice(n_classes, n_samples)
predicted_labels = probs.argmax(axis=1)

# Calculate predictive entropy (the small constant guards against log(0))
entropy = -np.sum(probs * np.log(probs + 1e-10), axis=1)
max_prob = probs.max(axis=1)

print(f"Entropy range: [{entropy.min():.3f}, {entropy.max():.3f}]")
print(f"Max prob range: [{max_prob.min():.3f}, {max_prob.max():.3f}]")

# Establish confidence threshold
threshold = np.percentile(entropy, 25)  # bottom 25% entropy = high confidence
high_confidence = entropy < threshold

print(f"\nHigh confidence predictions: {high_confidence.sum()}/{n_samples}")
print(f"Accuracy (high conf): {(predicted_labels[high_confidence] == true_labels[high_confidence]).mean():.3f}")
print(f"Accuracy (low conf): {(predicted_labels[~high_confidence] == true_labels[~high_confidence]).mean():.3f}")

# Calibration plot
n_bins = 10
prob_bins = np.linspace(0, 1, n_bins + 1)
bin_accuracies = []
bin_confidences = []

# Assign each prediction to a bin [edge_i, edge_{i+1}). Clipping folds a
# confidence of exactly 1.0 into the last bin, which the half-open comparison
# would otherwise drop.
bin_index = np.clip(np.digitize(max_prob, prob_bins) - 1, 0, n_bins - 1)
is_correct = predicted_labels == true_labels

for i in range(n_bins):
    mask = bin_index == i
    if mask.any():  # skip empty bins
        bin_accuracies.append(is_correct[mask].mean())
        bin_confidences.append(max_prob[mask].mean())

# Plot calibration
plt.figure(figsize=(8, 8))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect calibration')
plt.plot(bin_confidences, bin_accuracies, 'o-', linewidth=2, markersize=8, label='Model')
plt.xlabel('Confidence')
plt.ylabel('Accuracy')
plt.title('Calibration Plot')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print("\nFor LLM outputs, confidence intervals can be constructed via:")
print("1. Ensemble methods (multiple models)")
print("2. Monte Carlo dropout")
print("3. Bayesian approximations")
print("4. Conformal prediction")

### Solution 1.17-1.20: Advanced Contemporary Problems

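**Solutions for exercises 1.17–1.20 follow the same pattern as Solution 1.16:** simulate the system under study, then apply the techniques and principles listed below.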

**Key techniques demonstrated:**
- Simulation of complex systems (networks, neural networks, reward models)
- Bootstrap and resampling for uncertainty quantification
- Adjustment for bias and confounding
- Construction of confidence intervals under non-standard conditions
- Hypothesis testing with modern metrics

**Statistical principles applied:**
- Sampling distributions remain fundamental
- Central Limit Theorem enables inference at scale
- Bootstrap provides distribution-free uncertainty estimates
- Proper experimental design controls for confounding
- Multiple testing corrections prevent false discoveries

**Note**: Full implementations would require:
- Network libraries (networkx)
- Deep learning frameworks (PyTorch/TensorFlow)
- Advanced statistical packages (statsmodels, sklearn)
- Causal inference tools (DoWhy, EconML)

The core statistical inference principles from Chapter 1 apply directly to these modern problems.

---

<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;'>
<p style='margin: 0; font-size: 1.1em;'>Solutions by <strong>PLAI Academy</strong></p>
<p style='margin: 5px 0 0 0; opacity: 0.8;'>Statistical Inference • 2025</p>
</div>