# QAT Rounding Mode Analysis

**Hypothesis**: Systematic rounding bias (floor/ceil) is easier to optimize against than nearest rounding's "random flipping".

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide plotting defaults: whitegrid theme and a wide default canvas.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Load the per-run summary table for this experiment, then print its shape,
# its columns, and the distinct values of each swept factor.
summary_csv = '../runs/experiment_20260118_151239/summary.csv'
df = pd.read_csv(summary_csv)

print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"\nRounding modes: {df['rounding'].unique()}")
print(f"Widths: {sorted(df['width'].unique())}")
print(f"Depths: {sorted(df['depth'].unique())}")
print(f"Seeds: {sorted(df['seed'].unique())}")

# Preview the first five rows as the cell's rich output.
df.head(5)

## 1. Overall Comparison: Rounding Modes

In [None]:
# Descriptive statistics of final test accuracy for each rounding mode,
# rounded to four decimals for display.
acc_by_mode = df.groupby('rounding')['final_test_acc']
summary = acc_by_mode.agg(['mean', 'std', 'min', 'max', 'count']).round(4)

print("Overall accuracy by rounding mode:")
summary

In [None]:
# Two-panel overview of accuracy per rounding mode:
# left = distribution over all runs, right = mean with ± std error bars.
order = ['baseline', 'nearest', 'floor', 'ceil']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
box_ax, bar_ax = axes

# Left panel: box plot of every run, modes shown in a fixed order.
sns.boxplot(data=df, x='rounding', y='final_test_acc', order=order, ax=box_ax)
box_ax.set(
    title='Test Accuracy Distribution by Rounding Mode',
    ylabel='Test Accuracy',
    xlabel='Rounding Mode',
)

# Right panel: one groupby pass yields both the bar heights and the error bars.
acc_stats = df.groupby('rounding')['final_test_acc'].agg(['mean', 'std']).reindex(order)
means = acc_stats['mean']
stds = acc_stats['std']
bar_ax.bar(order, means, yerr=stds, capsize=5, alpha=0.7, color=['gray', 'blue', 'green', 'red'])
bar_ax.set(
    title='Mean Test Accuracy (± std)',
    ylabel='Test Accuracy',
    xlabel='Rounding Mode',
    ylim=(0.5, 1.0),
)

plt.tight_layout()
plt.show()

## 2. Statistical Tests

In [None]:
# Paired comparison: each (seed, width, depth) configuration is one matched
# sample. Pivot so every configuration is a row and every rounding mode is a
# column. pivot_table averages duplicate runs of the same configuration (its
# default aggfunc is the mean) and leaves NaN where a mode has no run.
pivot = df.pivot_table(index=['seed', 'width', 'depth'], columns='rounding', values='final_test_acc')


def paired_ttest(table, mode_a, mode_b):
    """Run a paired t-test of ``mode_a`` against ``mode_b`` on complete pairs.

    Rows where either mode is NaN are dropped before testing.
    ``stats.ttest_rel`` propagates NaN by default, which would turn the
    statistic and p-value into NaN, while a pandas ``.mean()`` of the
    difference silently skips those rows. Dropping incomplete pairs once keeps
    the reported difference and the test on the same sample.

    Parameters
    ----------
    table : pandas.DataFrame
        One row per configuration, one column per rounding mode.
    mode_a, mode_b : str
        Column names to compare; the difference is ``mode_a - mode_b``.

    Returns
    -------
    tuple
        ``(mean_diff, t_stat, p_val, n_pairs)``.
    """
    pairs = table[[mode_a, mode_b]].dropna()
    t_stat, p_val = stats.ttest_rel(pairs[mode_a], pairs[mode_b])
    diff = (pairs[mode_a] - pairs[mode_b]).mean()
    return diff, t_stat, p_val, len(pairs)


print("Paired t-tests (vs nearest):")
for mode in ['floor', 'ceil', 'baseline']:
    diff, t_stat, p_val, n_pairs = paired_ttest(pivot, mode, 'nearest')
    print(f"  {mode} vs nearest: diff={diff:+.4f}, t={t_stat:.2f}, p={p_val:.4f}, n={n_pairs}")

print("\nPaired t-tests (floor vs ceil):")
diff, t_stat, p_val, n_pairs = paired_ttest(pivot, 'floor', 'ceil')
print(f"  floor vs ceil: diff={diff:+.4f}, t={t_stat:.2f}, p={p_val:.4f}, n={n_pairs}")

## 3. Accuracy by Model Capacity (Width × Depth)

In [None]:
# One heatmap per rounding mode: mean accuracy on a depth (rows) × width
# (columns) grid, all panels sharing the same 0.5–1.0 colour scale.
fig, axes = plt.subplots(1, 4, figsize=(18, 4))
heatmap_modes = ['baseline', 'nearest', 'floor', 'ceil']

for panel_idx, (ax, mode) in enumerate(zip(axes, heatmap_modes)):
    heatmap_data = (
        df.loc[df['rounding'] == mode]
        .pivot_table(index='depth', columns='width', values='final_test_acc', aggfunc='mean')
    )
    sns.heatmap(heatmap_data, annot=True, fmt='.2f', cmap='RdYlGn', vmin=0.5, vmax=1.0, ax=ax)
    ax.set_title(f'{mode}')
    ax.set_xlabel('Width')
    # Only the left-most panel carries the y-axis label.
    ax.set_ylabel('Depth' if panel_idx == 0 else '')

plt.suptitle('Mean Test Accuracy by Width × Depth', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Difference heatmaps: for each non-nearest mode, subtract the nearest-rounding
# grid of mean accuracies from that mode's grid (depth rows × width columns).
def _mean_acc_grid(rounding_mode):
    """Mean final test accuracy per (depth, width) cell for one rounding mode."""
    runs = df[df['rounding'] == rounding_mode]
    return runs.pivot_table(index='depth', columns='width', values='final_test_acc', aggfunc='mean')


nearest_pivot = _mean_acc_grid('nearest')
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel_idx, (ax, mode) in enumerate(zip(axes, ['baseline', 'floor', 'ceil'])):
    diff = _mean_acc_grid(mode) - nearest_pivot
    # Diverging colour map centred on zero, clipped to ±0.2.
    sns.heatmap(diff, annot=True, fmt='+.2f', cmap='RdBu', center=0, vmin=-0.2, vmax=0.2, ax=ax)
    ax.set_title(f'{mode} - nearest')
    ax.set_xlabel('Width')
    ax.set_ylabel('Depth' if panel_idx == 0 else '')

plt.suptitle('Accuracy Difference vs Nearest Rounding (positive = better)', y=1.02)
plt.tight_layout()
plt.show()

## 4. Filter Out Collapsed Models

Models with very small width often collapse to chance-level accuracy (about 50%). To keep these runs from dominating the comparison, the analysis below is restricted to models that actually learned.

In [None]:
# A run counts as "learned" when its final test accuracy is strictly above
# this threshold. It is defined once so the filter, the printed label and the
# per-mode breakdown cannot drift apart.
LEARNED_ACC_THRESHOLD = 0.55

# Boolean mask over all runs; reused for the filtered copy and the breakdown.
is_learned = df['final_test_acc'] > LEARNED_ACC_THRESHOLD
df_learned = df[is_learned].copy()

print(f"Total runs: {len(df)}")
print(f"Learned (acc > {LEARNED_ACC_THRESHOLD}): {len(df_learned)} ({100*len(df_learned)/len(df):.1f}%)")
print(f"\nBreakdown by rounding mode:")
# Summing the boolean mask per group counts the learned runs. This avoids
# groupby(...).apply over the whole frame, which newer pandas versions
# deprecate when the function also receives the grouping column.
print(is_learned.groupby(df['rounding']).sum().rename('n_learned'))

In [None]:
# Repeat the per-mode accuracy summary on the learned subset only.
print("Learned models only - accuracy by rounding mode:")
learned_acc_by_mode = df_learned.groupby('rounding')['final_test_acc']
summary_learned = learned_acc_by_mode.agg(['mean', 'std', 'count'])

# Rounded for display only; summary_learned itself keeps full precision.
summary_learned.round(decimals=4)

## 5. Analysis by Width Category

In [None]:
# Group widths: small (4-8), medium (16-32), large (64-128)
def width_category(w):
    """Return the size-bucket label for a model width.

    Widths up to and including 8 map to 'small (4-8)', widths above 8 and up
    to and including 32 map to 'medium (16-32)', and everything else maps to
    'large (64-128)'.
    """
    # Upper bounds are checked in ascending order; the first match wins.
    bucket_bounds = ((8, 'small (4-8)'), (32, 'medium (16-32)'))
    for upper_bound, label in bucket_bounds:
        if w <= upper_bound:
            return label
    return 'large (64-128)'

# Tag every run with its width bucket, then draw one box-plot panel per bucket
# so rounding modes can be compared within models of similar width.
df['width_cat'] = df['width'].map(width_category)

order = ['baseline', 'nearest', 'floor', 'ceil']
width_cats = ['small (4-8)', 'medium (16-32)', 'large (64-128)']
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, cat in zip(axes, width_cats):
    subset = df.loc[df['width_cat'] == cat]
    sns.boxplot(data=subset, x='rounding', y='final_test_acc', order=order, ax=ax)
    ax.set_title(f'Width: {cat}')
    # Same y-range on every panel so the buckets are directly comparable.
    ax.set_ylim(0.4, 1.0)

plt.suptitle('Accuracy by Rounding Mode, Split by Model Width', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Mean accuracy table: width buckets as rows, rounding modes as columns.
print("Mean accuracy by rounding mode and width category:")
mean_acc_by_width_cat = df.pivot_table(
    index='width_cat',
    columns='rounding',
    values='final_test_acc',
    aggfunc='mean',
)
mean_acc_by_width_cat.round(3)

## 6. Variance Analysis: Which rounding mode is most consistent?

In [None]:
# Seed-to-seed spread: standard deviation of final test accuracy over all runs
# sharing the same (width, depth, rounding), one row per such group.
variance_by_config = (
    df.groupby(['width', 'depth', 'rounding'])['final_test_acc']
    .std()
    .rename('std_across_seeds')
    .reset_index()
)

print("Mean std across seeds by rounding mode:")
mean_std_by_mode = variance_by_config.groupby('rounding')['std_across_seeds'].mean()
print(mean_std_by_mode.round(4))

In [None]:
# Distribution of the per-configuration std (across seeds) for each rounding
# mode. `order` is the mode ordering defined in the earlier plotting cells.
fig, ax = plt.subplots(figsize=(10, 5))
# Draw on the axes created above explicitly rather than relying on pyplot's
# implicit "current axes", matching the other plotting cells.
sns.boxplot(data=variance_by_config, x='rounding', y='std_across_seeds', order=order, ax=ax)
ax.set_title('Consistency: Std of Accuracy Across Seeds')
ax.set_xlabel('Rounding Mode')
ax.set_ylabel('Std (lower = more consistent)')
plt.show()

## 7. Summary & Conclusions

In [None]:
# Text summary of the headline numbers. Relies on `order` (mode ordering) and
# `pivot` (one row per configuration, one column per mode) from earlier cells.
banner = "=" * 60
print(banner)
print("SUMMARY")
print(banner)

# Overall winner: mean accuracy per rounding mode over every run.
means = df.groupby('rounding')['final_test_acc'].mean()
print(f"\nOverall mean accuracy:")
for mode in order:
    print(f"  {mode:10s}: {means[mode]:.4f}")

print(f"\nBest overall: {means.idxmax()} ({means.max():.4f})")

# QAT comparison: the same ranking with the baseline excluded.
qat_modes = ['nearest', 'floor', 'ceil']
qat_means = means.loc[qat_modes]
print(f"\nBest QAT mode: {qat_means.idxmax()} ({qat_means.max():.4f})")

# Hypothesis check: mean per-configuration accuracy gap of each biased mode
# relative to nearest rounding (positive means the biased mode is better).
print(f"\nHypothesis: systematic bias (floor/ceil) beats nearest")
nearest_acc = pivot['nearest']
floor_vs_nearest = pivot['floor'].sub(nearest_acc).mean()
ceil_vs_nearest = pivot['ceil'].sub(nearest_acc).mean()
print(f"  floor - nearest: {floor_vs_nearest:+.4f}")
print(f"  ceil - nearest:  {ceil_vs_nearest:+.4f}")