# Batch Aggregation Analysis

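A conservative statistical approach: per-trace measurements are first aggregated to batch-level means, and hypothesis tests (permutation tests on the difference of means) are then run on those batch means rather than on individual traces.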

In [None]:
# Parameters (will be injected)
# The empty strings below are placeholders; real values are expected to be
# supplied from outside before the notebook runs (presumably by a
# parameterized-notebook runner such as papermill -- TODO confirm).

# Path to the base run's metrics CSV; read with pd.read_csv in the load cell.
base_csv_path = ''
# Path to the test run's metrics CSV; read with pd.read_csv in the load cell.
test_csv_path = ''
# Display name of the base run; printed in the Setup cell.
base_name = ''
# Display name of the test run; printed in the Setup cell.
test_name = ''
# Device pool label; printed in the Setup cell.
device_pool = ''
# Test type label; printed in the Setup cell.
test_type = ''

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats

# Notebook-wide plotting defaults: seaborn grid theme and a wide default figure.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Echo the injected run context so the rendered notebook is self-describing.
print(
    f'Base: {base_name}\n'
    f'Test: {test_name}\n'
    f'Device Pool: {device_pool}\n'
    f'Test Type: {test_type}'
)

In [None]:
# Read the already-extracted per-trace metrics for each run.
# The CSVs are produced upstream (in parallel); this notebook only consumes them.
base_df = pd.read_csv(filepath_or_buffer=base_csv_path)
test_df = pd.read_csv(filepath_or_buffer=test_csv_path)

## Summary Statistics

In [None]:
def _print_run_summary(title, df, leading_blank=False):
    """Print the trace count, batch count and traces-per-batch table for one run.

    Args:
        title: Heading printed between the two separator rules.
        df: Per-trace metrics frame; must contain a 'batch' column.
        leading_blank: When True, emit an empty line before the first rule.
    """
    rule = '=' * 80
    print('\n' + rule if leading_blank else rule)
    print(title)
    print(rule)
    print(f'Total traces: {len(df)}')
    print(f'Number of batches: {df["batch"].nunique()}')
    print('\nTraces per batch:')
    print(df.groupby('batch').size())


_print_run_summary('BASE RUN SUMMARY', base_df)
_print_run_summary('TEST RUN SUMMARY', test_df, leading_blank=True)

## Visualization

In [None]:
def _boxplot_by_batch(ax, df, column, title):
    """Draw one box per batch of ``df[column]`` on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        df: Per-trace metrics frame; must contain 'batch' and ``column``.
        column: Name of the latency column to plot.
        title: Title for the panel.
    """
    batches = sorted(df['batch'].unique())
    samples = [df.loc[df['batch'] == b, column].values for b in batches]
    ax.boxplot(samples)
    # Label the ticks after plotting instead of passing ``labels=`` to boxplot:
    # that keyword is deprecated in matplotlib 3.9+ (renamed ``tick_labels``),
    # while set_xticklabels behaves the same on old and new versions.
    ax.set_xticklabels([f'Batch {b}' for b in batches])
    ax.set_title(title)
    ax.set_ylabel('Latency (ms)')
    ax.grid(True, alpha=0.3)


# 2x2 grid: rows are metrics (startup, render), columns are runs (base, test).
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

_boxplot_by_batch(axes[0, 0], base_df, 'startup_latency_ms', 'Base - Startup Latency by Batch')
_boxplot_by_batch(axes[0, 1], test_df, 'startup_latency_ms', 'Test - Startup Latency by Batch')
_boxplot_by_batch(axes[1, 0], base_df, 'render_latency_ms', 'Base - Render Latency by Batch')
_boxplot_by_batch(axes[1, 1], test_df, 'render_latency_ms', 'Test - Render Latency by Batch')

plt.tight_layout()
plt.show()

## Batch Aggregation Analysis

In [None]:
def _batch_level_stats(df):
    """Return per-batch mean, std and count of both latency metrics for one run."""
    per_metric = ['mean', 'std', 'count']
    summary = df.groupby('batch').agg({
        'startup_latency_ms': per_metric,
        'render_latency_ms': per_metric,
    })
    # Turn the 'batch' group key back into a regular column.
    return summary.reset_index()


base_batch_stats = _batch_level_stats(base_df)
test_batch_stats = _batch_level_stats(test_df)

print('\n'.join(['=' * 80, 'BATCH-LEVEL STATISTICS', '=' * 80]))
print('\nBase Run - Batch Means:', base_batch_stats, sep='\n')
print('\nTest Run - Batch Means:', test_batch_stats, sep='\n')

In [None]:
# Extract the per-batch means as plain arrays; each batch contributes exactly
# one observation to the hypothesis tests below.
base_startup_means = base_batch_stats[('startup_latency_ms', 'mean')].to_numpy()
test_startup_means = test_batch_stats[('startup_latency_ms', 'mean')].to_numpy()
base_render_means = base_batch_stats[('render_latency_ms', 'mean')].to_numpy()
test_render_means = test_batch_stats[('render_latency_ms', 'mean')].to_numpy()

def permutation_test(group1, group2, n_permutations=10000, seed=None):
    """Two-sided Monte Carlo permutation test on the difference of means.

    The pooled values are repeatedly shuffled and re-split into groups of the
    original sizes; the p-value is the share of shuffles whose absolute mean
    difference is at least as large as the observed one.

    Args:
        group1: 1-D sequence of values for the reference group.
        group2: 1-D sequence of values for the comparison group.
        n_permutations: Number of random shuffles to draw (must be >= 1).
        seed: Optional seed for a dedicated random generator, making the
            result reproducible. When None, NumPy's global random state is
            used, so a prior ``np.random.seed(...)`` call is still honoured.

    Returns:
        Tuple ``(observed_diff, p_value)`` where ``observed_diff`` is
        ``mean(group2) - mean(group1)``. If the observed difference is NaN
        (NaN in the input), the p-value is NaN as well.

    Raises:
        ValueError: If either group is empty or ``n_permutations < 1``.
    """
    group1 = np.asarray(group1, dtype=float)
    group2 = np.asarray(group2, dtype=float)
    if group1.size == 0 or group2.size == 0:
        raise ValueError('permutation_test requires at least one value in each group')
    if n_permutations < 1:
        raise ValueError('n_permutations must be at least 1')

    observed_diff = np.mean(group2) - np.mean(group1)
    if np.isnan(observed_diff):
        # Every comparison against NaN is False, which would otherwise be
        # reported as an extremely small (i.e. "significant") p-value.
        return observed_diff, float('nan')

    # concatenate copies, so shuffling never touches the caller's arrays.
    combined = np.concatenate([group1, group2])
    n1 = len(group1)
    shuffle = np.random.shuffle if seed is None else np.random.default_rng(seed).shuffle

    perm_diffs = np.empty(n_permutations)
    for i in range(n_permutations):
        shuffle(combined)
        perm_diffs[i] = np.mean(combined[n1:]) - np.mean(combined[:n1])

    # Small relative tolerance so that shuffles which tie with the observed
    # split are not lost to floating-point summation order.
    tolerance = 100 * np.finfo(float).eps * np.abs(observed_diff)
    n_extreme = int(np.count_nonzero(np.abs(perm_diffs) >= np.abs(observed_diff) - tolerance))

    # Count the observed arrangement itself as one permutation: a randomized
    # permutation test can never justify a p-value of exactly zero.
    p_value = (n_extreme + 1) / (n_permutations + 1)
    return observed_diff, p_value

def _report_latency_comparison(title, base_means, test_means, diff, pval):
    """Print the batch-aggregated comparison for one latency metric.

    Args:
        title: Heading printed between the separator rules.
        base_means: Per-batch means of the base run.
        test_means: Per-batch means of the test run.
        diff: Observed difference of means (test minus base).
        pval: Permutation-test p-value for ``diff``.
    """
    rule = '=' * 80
    base_avg = np.mean(base_means)
    test_avg = np.mean(test_means)
    verdict = 'YES' if pval < 0.05 else 'NO'

    print('\n' + rule)
    print(title)
    print(rule)
    print(f'Base mean (batch averages): {base_avg:.2f} ms')
    print(f'Test mean (batch averages): {test_avg:.2f} ms')
    print(f'Difference: {diff:.2f} ms ({diff/base_avg*100:+.2f}%)')
    print(f'P-value (permutation test): {pval:.4f}')
    print(f'Significant at α=0.05: {verdict}')


# Each metric is tested and reported in turn, startup first.
startup_diff, startup_pval = permutation_test(base_startup_means, test_startup_means)
_report_latency_comparison(
    'BATCH AGGREGATION RESULTS - STARTUP LATENCY',
    base_startup_means,
    test_startup_means,
    startup_diff,
    startup_pval,
)

render_diff, render_pval = permutation_test(base_render_means, test_render_means)
_report_latency_comparison(
    'BATCH AGGREGATION RESULTS - RENDER LATENCY',
    base_render_means,
    test_render_means,
    render_diff,
    render_pval,
)