# Blocked Bootstrap Analysis

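This notebook compares a base run against a test run with a blocked bootstrap: a resampling-based approach that preserves the batch structure by resampling entire batches (with replacement) rather than individual traces.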

In [None]:
# Parameters (will be injected)
# Every value below is an empty-string placeholder. Real values are expected to
# be supplied from outside before the later cells run.
# NOTE(review): the injection mechanism itself is not visible in this notebook;
# confirm which tool overrides these assignments.
base_csv_path = ''  # handed to pd.read_csv to load the base run's metrics
test_csv_path = ''  # handed to pd.read_csv to load the test run's metrics
base_name = ''  # label printed for the base run in the setup banner
test_name = ''  # label printed for the test run in the setup banner
device_pool = ''  # printed in the setup banner; not otherwise used in the visible cells
test_type = ''  # printed in the setup banner; not otherwise used in the visible cells

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats

# Plot defaults shared by every figure in this notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Echo the run configuration so the rendered notebook records what was compared.
for _banner_label, _banner_value in (
    ('Base', base_name),
    ('Test', test_name),
    ('Device Pool', device_pool),
    ('Test Type', test_type),
):
    print(f'{_banner_label}: {_banner_value}')

In [None]:
# Read the pre-processed metrics for both runs (base first, then test).
# Per the original note, the CSVs were already extracted in parallel upstream.
base_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (base_csv_path, test_csv_path)
)

## Summary Statistics

In [None]:
# Print the trace count, the batch count and the per-batch trace counts for each run.
_summary_rule = '=' * 80
_summary_runs = (('BASE RUN SUMMARY', base_df), ('TEST RUN SUMMARY', test_df))
for _summary_pos, (_summary_heading, _summary_df) in enumerate(_summary_runs):
    # Only the second section is preceded by a blank line.
    print(_summary_rule if _summary_pos == 0 else '\n' + _summary_rule)
    print(_summary_heading)
    print(_summary_rule)
    print(f'Total traces: {len(_summary_df)}')
    print(f'Number of batches: {_summary_df["batch"].nunique()}')
    print('\nTraces per batch:')
    print(_summary_df.groupby('batch').size())

## Visualization

In [None]:
# 2x2 grid of per-batch boxplots: rows are metrics (startup, render),
# columns are runs (base, test).
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

_boxplot_panels = (
    (axes[0, 0], base_df, 'startup_latency_ms', 'Base - Startup Latency by Batch'),
    (axes[0, 1], test_df, 'startup_latency_ms', 'Test - Startup Latency by Batch'),
    (axes[1, 0], base_df, 'render_latency_ms', 'Base - Render Latency by Batch'),
    (axes[1, 1], test_df, 'render_latency_ms', 'Test - Render Latency by Batch'),
)

for _panel_ax, _panel_df, _panel_metric, _panel_title in _boxplot_panels:
    # One box per batch, ordered by sorted batch label.
    _panel_batches = sorted(_panel_df['batch'].unique())
    _panel_values = [
        _panel_df[_panel_df['batch'] == b][_panel_metric].values
        for b in _panel_batches
    ]
    _panel_ax.boxplot(_panel_values, labels=[f'Batch {b}' for b in _panel_batches])
    _panel_ax.set_title(_panel_title)
    _panel_ax.set_ylabel('Latency (ms)')
    _panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Blocked Bootstrap Analysis

In [None]:
def _batch_totals(df, metric):
    """Return per-batch ``(sums, counts)`` of ``metric`` as float arrays.

    Batches are taken in order of first appearance in ``df['batch']``.
    Missing metric values are skipped in the sum and excluded from the count,
    so ``sums.sum() / counts.sum()`` over any selection of batches equals the
    mean pandas would report for the concatenation of those batches.
    """
    batch_col = df['batch']
    labels = batch_col.unique()
    sums = np.array(
        [df.loc[batch_col == b, metric].sum() for b in labels], dtype=float
    )
    counts = np.array(
        [df.loc[batch_col == b, metric].count() for b in labels], dtype=float
    )
    return sums, counts


def blocked_bootstrap(base_df, test_df, metric, n_bootstrap=10000, alpha=0.05):
    """Bootstrap the test-minus-base difference in means, resampling whole batches.

    Each replicate draws, with replacement, as many batches as each run has,
    pools all traces of the drawn batches, and records
    ``mean(test) - mean(base)`` for ``metric``.

    Args:
        base_df: DataFrame with a ``'batch'`` column and a ``metric`` column.
        test_df: DataFrame with the same two columns for the test run.
        metric: Name of the column to compare.
        n_bootstrap: Number of bootstrap replicates.
        alpha: Two-sided significance level; the interval covers ``1 - alpha``.

    Returns:
        dict with keys:
            ``observed_diff``: difference of the plain means of the two frames;
            ``ci_lower`` / ``ci_upper``: percentile interval bounds taken from
            the bootstrap differences;
            ``p_value``: two-sided p-value, ``2 * min(P(diff <= 0),
            P(diff >= 0))`` over the replicates, capped at 1;
            ``bootstrap_diffs``: array of the ``n_bootstrap`` replicate
            differences.

    Raises:
        ValueError: if either frame contains no batches.
    """
    base_sums, base_counts = _batch_totals(base_df, metric)
    test_sums, test_counts = _batch_totals(test_df, metric)
    n_base, n_test = len(base_sums), len(test_sums)
    if n_base == 0 or n_test == 0:
        raise ValueError(
            'blocked_bootstrap requires at least one batch in both '
            'base_df and test_df'
        )

    bootstrap_diffs = np.empty(n_bootstrap, dtype=float)
    # A replicate whose drawn batches hold no valid values yields NaN (0 / 0),
    # matching the mean of an empty selection; silence the numpy warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in range(n_bootstrap):
            # Draw batch positions (base first, then test) instead of
            # rebuilding DataFrames: the pooled mean of the drawn batches is
            # the sum of their sums over the sum of their counts.
            base_idx = np.random.choice(n_base, size=n_base, replace=True)
            test_idx = np.random.choice(n_test, size=n_test, replace=True)
            base_mean = base_sums[base_idx].sum() / base_counts[base_idx].sum()
            test_mean = test_sums[test_idx].sum() / test_counts[test_idx].sum()
            bootstrap_diffs[i] = test_mean - base_mean

    observed_diff = test_df[metric].mean() - base_df[metric].mean()
    ci_lower = np.percentile(bootstrap_diffs, alpha / 2 * 100)
    ci_upper = np.percentile(bootstrap_diffs, (1 - alpha / 2) * 100)

    # Two-sided p-value from the share of replicates on each side of zero.
    # Unlike a sign test against observed_diff, this does not collapse to 0
    # when the observed difference is exactly 0, and it never exceeds 1.
    lower_tail = np.mean(bootstrap_diffs <= 0)
    upper_tail = np.mean(bootstrap_diffs >= 0)
    p_value = np.minimum(1.0, 2.0 * np.minimum(lower_tail, upper_tail))

    return {
        'observed_diff': observed_diff,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'p_value': p_value,
        'bootstrap_diffs': bootstrap_diffs
    }

# Summary statistics (mean plus selected percentiles) for one metric column
def calculate_summary_stats(df, metric):
    """Return the mean and the 50th/90th/95th/99th percentiles of ``df[metric]``.

    The result is a dict keyed ``'mean'``, ``'p50'``, ``'p90'``, ``'p95'``
    and ``'p99'``, in that order.
    """
    column = df[metric]
    summary = {'mean': column.mean()}
    for key, fraction in (('p50', 0.5), ('p90', 0.9), ('p95', 0.95), ('p99', 0.99)):
        summary[key] = column.quantile(fraction)
    return summary

# Row key and exact line prefix for each statistic shown in the report tables.
_REPORT_ROWS = (
    ('mean', '  Mean:  '),
    ('p50', '  P50:   '),
    ('p90', '  P90:   '),
    ('p95', '  P95:   '),
    ('p99', '  P99:   '),
)


def _print_latency_report(title, boot, base_stats, test_stats):
    """Print the summary tables and bootstrap results for one latency metric."""
    print('\n' + '=' * 80)
    print(f'BLOCKED BOOTSTRAP RESULTS - {title}')
    print('=' * 80)

    print('\nBase Statistics:')
    for key, prefix in _REPORT_ROWS:
        print(f'{prefix}{base_stats[key]:.2f} ms')

    print('\nTest Statistics:')
    for key, prefix in _REPORT_ROWS:
        print(f'{prefix}{test_stats[key]:.2f} ms')

    print('\nDifferences (Test - Base):')
    for key, prefix in _REPORT_ROWS:
        delta = test_stats[key] - base_stats[key]
        # Relative change is expressed against the base value.
        print(f'{prefix}{delta:.2f} ms ({delta / base_stats[key] * 100:+.2f}%)')

    print('\nBootstrap Analysis (using mean):')
    observed = boot['observed_diff']
    print(f'  Observed difference: {observed:.2f} ms ({observed / base_stats["mean"] * 100:+.2f}%)')
    print(f'  95% CI: [{boot["ci_lower"]:.2f}, {boot["ci_upper"]:.2f}]')
    print(f'  P-value: {boot["p_value"]:.4f}')
    verdict = 'YES' if boot['p_value'] < 0.05 else 'NO'
    print(f'  Significant at α=0.05: {verdict}')


# Startup latency: bootstrap first, then summaries, then the printed report.
startup_boot = blocked_bootstrap(base_df, test_df, 'startup_latency_ms')
base_startup_stats = calculate_summary_stats(base_df, 'startup_latency_ms')
test_startup_stats = calculate_summary_stats(test_df, 'startup_latency_ms')
_print_latency_report(
    'STARTUP LATENCY', startup_boot, base_startup_stats, test_startup_stats
)

# Render latency: same sequence, run after the startup analysis.
render_boot = blocked_bootstrap(base_df, test_df, 'render_latency_ms')
base_render_stats = calculate_summary_stats(base_df, 'render_latency_ms')
test_render_stats = calculate_summary_stats(test_df, 'render_latency_ms')
_print_latency_report(
    'RENDER LATENCY', render_boot, base_render_stats, test_render_stats
)

## Bootstrap Distribution Visualization

In [None]:
# Side-by-side histograms of the bootstrap differences: startup left, render right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

_hist_panels = (
    (axes[0], startup_boot, 'Bootstrap Distribution - Startup Latency Difference'),
    (axes[1], render_boot, 'Bootstrap Distribution - Render Latency Difference'),
)

for _hist_ax, _hist_boot, _hist_title in _hist_panels:
    _hist_ax.hist(_hist_boot['bootstrap_diffs'], bins=50, alpha=0.7, edgecolor='black')
    # Red: observed difference. Green: interval bounds. Black: zero reference.
    _hist_ax.axvline(_hist_boot['observed_diff'], color='red', linestyle='--', linewidth=2, label='Observed')
    _hist_ax.axvline(_hist_boot['ci_lower'], color='green', linestyle='--', linewidth=1.5, label='95% CI')
    _hist_ax.axvline(_hist_boot['ci_upper'], color='green', linestyle='--', linewidth=1.5)
    _hist_ax.axvline(0, color='black', linestyle='-', linewidth=1, alpha=0.5)
    _hist_ax.set_xlabel('Difference (ms)')
    _hist_ax.set_ylabel('Frequency')
    _hist_ax.set_title(_hist_title)
    _hist_ax.legend()
    _hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()