# Principal Component — Label Alignment Analysis

This notebook investigates whether the principal components (PCs) of HTS-AT attention head representations encode **task-relevant semantic information** about audio classes.

**Hypothesis**: If PC directions in a head's representation space capture semantic structure, then projecting samples onto those directions should yield features that are more discriminative than random projections of equal dimensionality.

**Method**: For each attention head, we:
1. Compute PCA on the pooled head representations $\mathbf{R} \in \mathbb{R}^{n \times d_h}$
2. Train a logistic regression classifier using only the top-$k$ PCs as features
3. Compare accuracy against a random-projection baseline of equal dimensionality
4. Aggregate results across heads, blocks, and layers to identify specialization patterns

**Interpretation**: Heads where PC-based accuracy significantly exceeds the random baseline are those whose principal directions are semantically aligned with class structure — precisely the heads that are candidates for ResiDual spectral reweighting.

## 0. Imports & Configuration

In [1]:
# Standard library
import json
import os
import warnings
from collections import defaultdict

warnings.filterwarnings('ignore')

# Scientific computing
import numpy as np
import pandas as pd
import torch
from scipy import stats
from tqdm.notebook import tqdm

# ML & Analysis
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec
import seaborn as sns

# ── Palette & Style ──────────────────────────────────────────────────────────
# One colour per HTS-AT stage, an accent for reference lines, and a neutral
# tone used for the random baseline.
COLORS = {
    'layer0': '#2d77a6',
    'layer1': '#bf7b04',
    'layer2': '#6ea66d',
    'layer3': '#808080',
    'accent':  '#d62728',
    'neutral': '#7f7f7f'
}
# Stage colours in stage order, so plots can index them by layer number.
LAYER_COLORS = [COLORS[key] for key in ('layer0', 'layer1', 'layer2', 'layer3')]

# Base style first, then the per-key overrides below take precedence.
plt.style.use('seaborn-v0_8-paper')
sns.set_context("paper", font_scale=1.2)

_RC_FONTS = {
    'font.family': 'serif',
    'font.serif': ['Times New Roman'],
    'mathtext.fontset': 'stix',
}
_RC_TEXT_SIZES = {
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 9,
    'figure.titlesize': 13,
}
_RC_GRID = {
    'axes.grid': True,
    'grid.alpha': 0.3,
    'axes.axisbelow': True
}
plt.rcParams.update({**_RC_FONTS, **_RC_TEXT_SIZES, **_RC_GRID})

# ── Architecture constants ───────────────────────────────────────────────────
# Per-stage block counts and head counts, listed in stage order.
HTSAT_DEPTHS = [2, 2, 6, 2]
HTSAT_HEADS  = [4, 8, 16, 32]
HTSAT_EMBED  = 96
HEAD_DIM     = 24

# Input directory for saved head outputs and output directory for figures.
SAVE_DIR     = 'heads_representations'
FIGURES_DIR  = 'figures'
os.makedirs(FIGURES_DIR, exist_ok=True)

# ── Block index table ────────────────────────────────────────────────────────
# Flatten (stage, block-in-stage) pairs first; the running position in this
# flat list is the global block index.
_stage_blocks = [
    (stage, blk, heads)
    for stage, (depth, heads) in enumerate(zip(HTSAT_DEPTHS, HTSAT_HEADS))
    for blk in range(depth)
]
block_info = [
    {
        'global_block':   flat_idx,
        'layer':          stage,
        'block_in_layer': blk,
        'n_heads':        heads,
        'head_dim':       HEAD_DIM,
    }
    for flat_idx, (stage, blk, heads) in enumerate(_stage_blocks)
]
block_info_df = pd.DataFrame(block_info)

print(f'✅ Configuration loaded — {len(block_info_df)} blocks, '
      f'{sum(d*h for d,h in zip(HTSAT_DEPTHS, HTSAT_HEADS))} total heads')

✅ Configuration loaded — 12 blocks, 184 total heads


## 1. Load Saved Head Representations

In [3]:
DATASET_NAME = 'esc50'  # ← change to 'tinysol' or 'vocalsound' as needed

# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
# objects — only point this at files produced by a trusted extraction run.
load_path = f"{SAVE_DIR}/{DATASET_NAME}_head_outputs_final.pt"
data = torch.load(load_path, map_location='cpu', weights_only=False)

# Expected layout (per the extraction step — confirm against that notebook):
#   head_outputs : dict mapping head_id → tensor [N, 24]
#   labels       : np.array [N]
head_outputs, labels = data['head_outputs_final'], data['labels']

# Dataset-level sizes reused by the probe and by every figure below.
N_SAMPLES  = len(labels)
N_CLASSES  = np.unique(labels).size
head_ids   = sorted(head_outputs)
N_HEADS    = len(head_ids)

print(
    f'✅ Loaded {DATASET_NAME.upper()}',
    f'   Samples  : {N_SAMPLES}',
    f'   Classes  : {N_CLASSES}',
    f'   Heads    : {N_HEADS}',
    f'   Head dim : {HEAD_DIM}',
    sep='\n',
)

✅ Loaded ESC50
   Samples  : 2000
   Classes  : 50
   Heads    : 184
   Head dim : 24


## 2. Linear Probe Analysis

For each head $h$ with representation matrix $\mathbf{R}_h \in \mathbb{R}^{n \times d_h}$:

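1. Standardise each feature of $\mathbf{R}_h$ to zero mean and unit variance, giving $\tilde{\mathbf{R}}_h$, then compute PCA on $\tilde{\mathbf{R}}_h$: obtain eigenvectors $\mathbf{V} = [\mathbf{v}_1, \ldots, \mathbf{v}_{d_h}]$ ordered by explained variance
2. Project: $\mathbf{Z}^{(k)} = \tilde{\mathbf{R}}_h \mathbf{V}_{:,1:k} \in \mathbb{R}^{n \times k}$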
3. Evaluate: 5-fold stratified cross-validation accuracy of logistic regression on $\mathbf{Z}^{(k)}$
4. Baseline: same procedure with $k$ random orthonormal directions (averaged over 10 seeds)

In [None]:
# Linear probe: for every head, compare cross-validated accuracy of a logistic
# regression trained on the top-k principal components against the same
# classifier trained on k random orthonormal directions.
N_RANDOM_SEEDS = 10             # random bases averaged per (head, k)
K_VALUES       = [1, 2, 3, 5]   # number of PCs to use
CV_FOLDS       = 5
MAX_ITER       = 1000

# Fixed shuffled folds: every head, k and random seed is scored on the same splits.
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=42)

# Unfitted template; cross_val_score clones it for every fold, so one instance
# can be shared by all probes. `multi_class` is deliberately not passed: it is
# deprecated since scikit-learn 1.5, and with lbfgs a multinomial (softmax) loss
# is already what is fitted for problems with more than two classes.
clf = LogisticRegression(max_iter=MAX_ITER, random_state=42, solver='lbfgs')

results = []

for head_id in tqdm(head_ids, desc='Linear probe per head'):
    # Parse metadata from head_id string e.g. 'L0_B1_H3'
    parts      = head_id.split('_')
    layer_idx  = int(parts[0][1:])
    block_idx  = int(parts[1][1:])
    head_idx   = int(parts[2][1:])
    global_blk = block_info_df[
        (block_info_df['layer'] == layer_idx) &
        (block_info_df['block_in_layer'] == block_idx)
    ]['global_block'].values[0]

    R = head_outputs[head_id].numpy()   # [N, d_h]
    d_h = R.shape[1]                    # take the width from the data, not a constant
    assert max(K_VALUES) <= d_h, (
        f'{head_id}: cannot take {max(K_VALUES)} directions from a {d_h}-dim head'
    )

    # Standardise before PCA (zero mean, unit variance per feature).
    # NOTE(review): scaler and PCA are fit on all samples, including those later
    # held out by the CV folds. Both are unsupervised, so no labels leak, but the
    # directions are not re-estimated per fold.
    scaler = StandardScaler()
    R_scaled = scaler.fit_transform(R)

    # PCA decomposition (all components kept; sliced per k below)
    pca = PCA(n_components=d_h, random_state=42)
    pca.fit(R_scaled)
    R_pca = pca.transform(R_scaled)   # [N, d_h] — all PCs

    for k in K_VALUES:
        # ── PC-based probe ───────────────────────────────────────────────────
        Z_pc  = R_pca[:, :k]
        acc_pc = cross_val_score(clf, Z_pc, labels, cv=cv, scoring='accuracy').mean()

        # ── Random baseline (averaged over seeds) ────────────────────────────
        acc_rand_list = []
        for seed in range(N_RANDOM_SEEDS):
            rng = np.random.default_rng(seed)
            # Random orthonormal basis via QR decomposition of a Gaussian matrix
            rand_mat = rng.standard_normal((d_h, k))
            Q, _ = np.linalg.qr(rand_mat)
            Z_rand = R_scaled @ Q[:, :k]
            acc_rand_list.append(
                cross_val_score(clf, Z_rand, labels, cv=cv, scoring='accuracy').mean()
            )
        acc_rand = np.mean(acc_rand_list)
        std_rand = np.std(acc_rand_list)   # spread across random bases (ddof=0)

        # ── Delta: how much better than random ──────────────────────────────
        delta = acc_pc - acc_rand

        results.append({
            'head_id':      head_id,
            'layer':        layer_idx,
            'global_block': global_blk,
            'block':        block_idx,
            'head':         head_idx,
            'k':            k,
            'acc_pc':       acc_pc,
            'acc_rand':     acc_rand,
            'std_rand':     std_rand,
            'delta':        delta,
        })

results_df = pd.DataFrame(results)
print(f'✅ Linear probe complete — {len(results_df)} rows')
display(results_df.head(10))

Linear probe per head:   0%|          | 0/184 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Saving: persist the probe outputs so the analysis cells below (and any other
# notebook) can reload them without re-running the expensive probe loop.
# `json` comes from the notebook's import cell rather than a mid-notebook import.
RESULTS_DIR = 'probe_results'
os.makedirs(RESULTS_DIR, exist_ok=True)

# ── 1. Full results dataframe ─────────────────────────────────────────────────
results_df.to_csv(f'{RESULTS_DIR}/probe_{DATASET_NAME}.csv', index=False)

# ── 2. Metadata needed by the analysis notebook ──────────────────────────────
# Sizes are cast to plain int so json can serialise them even if they are
# NumPy integer scalars.
meta = {
    'dataset_name': DATASET_NAME,
    'n_samples':    int(N_SAMPLES),
    'n_classes':    int(N_CLASSES),
    'head_dim':     int(HEAD_DIM),
    'k_values':     K_VALUES,
    'cv_folds':     CV_FOLDS,
    'n_random_seeds': N_RANDOM_SEEDS,
    'htsat_depths': HTSAT_DEPTHS,
    'htsat_heads':  HTSAT_HEADS,
}
with open(f'{RESULTS_DIR}/meta_{DATASET_NAME}.json', 'w') as f:
    json.dump(meta, f, indent=2)

# ── 3. Block info table ───────────────────────────────────────────────────────
# Not dataset-specific: the file name carries no dataset suffix.
block_info_df.to_csv(f'{RESULTS_DIR}/block_info.csv', index=False)

print(f'✅ Saved to {RESULTS_DIR}/')
print(f'   probe_{DATASET_NAME}.csv      — {len(results_df)} rows')
print(f'   meta_{DATASET_NAME}.json      — experiment metadata')
print(f'   block_info.csv               — block/layer/head mapping')

## 3. Summary Statistics by Layer

In [None]:
# Reload the saved probe table and keep only the k=1 rows: PC1 alone is the
# primary indicator of semantic concentration used in the rest of the notebook.
results_df = pd.read_csv(f'{RESULTS_DIR}/probe_{DATASET_NAME}.csv')
k1_df = results_df.loc[results_df['k'].eq(1)].copy()

# Output column → (source column, reducer); dict order fixes the column order.
_layer_aggs = {
    'acc_pc_mean':   ('acc_pc',   'mean'),
    'acc_pc_std':    ('acc_pc',   'std'),
    'acc_rand_mean': ('acc_rand', 'mean'),
    'acc_rand_std':  ('acc_rand', 'std'),
    'delta_mean':    ('delta',    'mean'),
    'delta_max':     ('delta',    'max'),
    'n_heads':       ('head_id',  'count'),
}
layer_summary = k1_df.groupby('layer').agg(**_layer_aggs).reset_index()

print('Layer-wise summary (k=1, PC1 only):')
display(layer_summary.round(4))

# Top-10 most semantically informative heads (by delta at k=1)
print('\nTop-10 heads by Δ accuracy (PC1 vs random):')
_top10_cols = ['head_id','layer','global_block','head','acc_pc','acc_rand','delta']
display(k1_df.nlargest(10, 'delta')[_top10_cols].round(4))

## 4. Visualisation

### Figure A — PC1 accuracy vs. random baseline across all heads, grouped by layer

In [None]:
# Figure A: one panel per stage; each head gets a PC1 bar drawn over its
# random-baseline bar, with the spread across random bases as an error bar.
fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharey=True)
fig.suptitle(
    r'Linear Probe Accuracy: PC$_1$ vs. Random Baseline (k=1)',
    fontsize=13, y=1.02
)

layer_labels = ['Stage 1 (L0)', 'Stage 2 (L1)', 'Stage 3 (L2)', 'Stage 4 (L3)']

# Accuracy of a uniform guess; identical in every panel.
chance = 1.0 / N_CLASSES

# Shared upper y-limit. It has to cover everything that is drawn — PC1 bars,
# baseline bars plus their error bars, and the chance line — otherwise heads
# whose baseline beats PC1 would be clipped.
tallest = max(
    k1_df['acc_pc'].max(),
    (k1_df['acc_rand'] + k1_df['std_rand']).max(),
    chance,
)
y_upper = min(1.05, tallest * 1.25)

for layer_idx, ax in enumerate(axes):
    # head_id strings sort lexicographically (H10 before H2), so order the
    # bars numerically by block and head instead of relying on row order.
    sub = k1_df[k1_df['layer'] == layer_idx].sort_values(['global_block', 'head'])
    color = LAYER_COLORS[layer_idx]

    x = np.arange(len(sub))
    ax.bar(x, sub['acc_pc'].values,  color=color,             alpha=0.85,
           label='PC$_1$', zorder=3)
    ax.bar(x, sub['acc_rand'].values, color=COLORS['neutral'], alpha=0.45,
           label='Random', zorder=2)

    # Error bars for random baseline std
    ax.errorbar(x, sub['acc_rand'].values, yerr=sub['std_rand'].values,
                fmt='none', color='#333333', capsize=2, linewidth=0.8, zorder=4)

    # Chance level
    ax.axhline(chance, color=COLORS['accent'], linewidth=1.0,
               linestyle='--', label=f'Chance ({chance:.2f})', zorder=5)

    ax.set_title(layer_labels[layer_idx], color=color, fontweight='bold')
    ax.set_xlabel('Head index')
    ax.set_xticks(x)
    ax.set_xticklabels(sub['head'].values, fontsize=7)
    if layer_idx == 0:
        ax.set_ylabel('5-fold CV accuracy')
    ax.set_ylim(0, y_upper)

# Shared legend (built by hand so it appears once, not once per panel)
handles = [
    mpatches.Patch(color=LAYER_COLORS[0], alpha=0.85, label='PC$_1$ accuracy'),
    mpatches.Patch(color=COLORS['neutral'], alpha=0.45, label='Random baseline'),
    plt.Line2D([0],[0], color=COLORS['accent'], linestyle='--', label='Chance level'),
]
fig.legend(handles=handles, loc='lower center', ncol=3,
           bbox_to_anchor=(0.5, -0.08), frameon=True, edgecolor='#cccccc')

plt.tight_layout()
fig.savefig(f'{FIGURES_DIR}/pc1_probe_by_layer.pdf', bbox_inches='tight', dpi=300)
fig.savefig(f'{FIGURES_DIR}/pc1_probe_by_layer.png', bbox_inches='tight', dpi=300)
plt.show()
print('✅ Saved figure A')

### Figure B — Delta heatmap: Δ accuracy (PC vs random) across blocks × heads

In [None]:
# Build a 2D matrix per metric: rows = global blocks (0–11), cols = head index
# within the block. Stages have different head counts, so every row is padded
# with NaN up to the widest stage (32 heads, Stage 4).
max_heads = max(HTSAT_HEADS)
N_BLOCKS  = sum(HTSAT_DEPTHS)

delta_matrix = np.full((N_BLOCKS, max_heads), np.nan)
acc_matrix   = np.full((N_BLOCKS, max_heads), np.nan)

# One (block, head) cell per k=1 row, filled in a single vectorised assignment.
_block_rows = k1_df['global_block'].to_numpy(dtype=int)
_head_cols  = k1_df['head'].to_numpy(dtype=int)
delta_matrix[_block_rows, _head_cols] = k1_df['delta'].to_numpy()
acc_matrix[_block_rows, _head_cols]   = k1_df['acc_pc'].to_numpy()

chance = 1.0 / N_CLASSES

# Row positions of the separators between stages, and the vertical centre of
# each stage (used to place the S1..S4 labels on the right-hand axis).
stage_boundaries = np.cumsum(HTSAT_DEPTHS)[:-1] - 0.5
stage_centers = []
cum = 0
for d in HTSAT_DEPTHS:
    stage_centers.append(cum + d/2 - 0.5)
    cum += d
stage_names = ['S1','S2','S3','S4']


def _draw_head_heatmap(ax, matrix, *, cmap, vmin, title, cbar_label, text_color):
    """Draw one blocks × heads heatmap panel.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    matrix : 2D float array (blocks × heads); NaN marks cells without a head.
    cmap, vmin : colormap name and lower colour limit; the upper limit is the
        largest non-NaN value in ``matrix``.
    title, cbar_label : panel title and colorbar label.
    text_color : callable mapping a cell value to the colour of its annotation.

    Uses the cell-level ``stage_boundaries``, ``stage_centers``, ``stage_names``,
    ``N_BLOCKS`` and ``LAYER_COLORS``.
    """
    im = ax.imshow(matrix, aspect='auto', cmap=cmap,
                   vmin=vmin, vmax=np.nanmax(matrix))
    # rcParams enable a grid on every axes; on a heatmap it would cut through
    # the cells and their annotations.
    ax.grid(False)

    # Stage boundary lines
    for boundary in stage_boundaries:
        ax.axhline(boundary, color='black', linewidth=1.5, linestyle='--', alpha=0.7)

    # Annotate every populated cell (padding cells are NaN and are skipped)
    for b, h in np.argwhere(~np.isnan(matrix)):
        val = matrix[b, h]
        ax.text(h, b, f'{val:.2f}', ha='center', va='center',
                fontsize=6, color=text_color(val))

    cb = plt.colorbar(im, ax=ax, fraction=0.03, pad=0.03)
    cb.set_label(cbar_label, fontsize=9)

    ax.set_xlabel('Head index within block')
    ax.set_ylabel('Global block index')
    ax.set_title(title)
    ax.set_yticks(range(N_BLOCKS))
    ax.set_yticklabels([f'B{b}' for b in range(N_BLOCKS)], fontsize=8)

    # Stage annotations on the right. The twin axis copies the (inverted)
    # y-limits of the image so the labels line up with the block rows.
    stage_ax = ax.twinx()
    stage_ax.grid(False)
    stage_ax.set_ylim(ax.get_ylim())
    stage_ax.set_yticks(stage_centers)
    stage_ax.set_yticklabels(stage_names, fontsize=9, fontweight='bold')
    # set_yticklabels applies its keyword arguments to every label, so a list of
    # colours is rejected as an invalid colour; colour each label individually.
    for tick_label, stage_color in zip(stage_ax.get_yticklabels(), LAYER_COLORS):
        tick_label.set_color(stage_color)
    stage_ax.tick_params(length=0)
    return im


# ── Plot ─────────────────────────────────────────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(14, 5.5))
fig.suptitle(
    r'Head-level Semantic Alignment: PC$_1$ Linear Probe on ' + DATASET_NAME.upper(),
    fontsize=13
)

# --- Panel (a): Δ accuracy heatmap ---
_draw_head_heatmap(
    axes[0], delta_matrix,
    cmap='RdYlGn', vmin=-0.05,
    title=r'(a) $\Delta$ accuracy: PC$_1$ vs. random baseline',
    cbar_label=r'$\Delta$ accuracy (PC$_1$ − random)',
    text_color=lambda val: 'black' if abs(val) < 0.08 else 'white',
)

# --- Panel (b): PC1 accuracy heatmap ---
_draw_head_heatmap(
    axes[1], acc_matrix,
    cmap='Blues', vmin=chance,
    title=r'(b) Absolute PC$_1$ probe accuracy',
    cbar_label=r'PC$_1$ probe accuracy',
    text_color=lambda val: 'white' if val > (chance + 0.15) else 'black',
)

plt.tight_layout()
fig.savefig(f'{FIGURES_DIR}/pc1_alignment_heatmap.pdf', bbox_inches='tight', dpi=300)
fig.savefig(f'{FIGURES_DIR}/pc1_alignment_heatmap.png', bbox_inches='tight', dpi=300)
plt.show()
print('✅ Saved figure B')

### Figure C — Effect of k: accuracy vs. number of PCs retained

In [None]:
# Figure C: one panel per stage; mean probe accuracy over the stage's heads as
# a function of k, with a ±1 std band, for PC directions and random directions.
fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharey=True)
fig.suptitle('Probe Accuracy vs. Number of PCs Retained, by Stage', fontsize=13, y=1.02)

for layer_idx, (ax, color) in enumerate(zip(axes, LAYER_COLORS)):
    stage_rows = results_df.loc[results_df['layer'] == layer_idx]

    # Aggregate over heads within layer for each k
    per_k = (
        stage_rows
        .groupby('k')
        .agg(**{
            'acc_pc_mean':   ('acc_pc',   'mean'),
            'acc_pc_std':    ('acc_pc',   'std'),
            'acc_rand_mean': ('acc_rand', 'mean'),
            'acc_rand_std':  ('acc_rand', 'std'),
        })
        .reset_index()
    )
    ks = per_k['k']

    # PC directions: solid line in the stage colour
    pc_mean, pc_std = per_k['acc_pc_mean'], per_k['acc_pc_std']
    ax.plot(ks, pc_mean, color=color, marker='o',
            linewidth=1.8, markersize=5, label='PC directions', zorder=4)
    ax.fill_between(ks, pc_mean - pc_std, pc_mean + pc_std,
                    alpha=0.15, color=color)

    # Random directions: dashed neutral line
    rand_mean, rand_std = per_k['acc_rand_mean'], per_k['acc_rand_std']
    ax.plot(ks, rand_mean, color=COLORS['neutral'],
            marker='s', linewidth=1.4, markersize=4,
            linestyle='--', label='Random dirs', zorder=3)
    ax.fill_between(ks, rand_mean - rand_std, rand_mean + rand_std,
                    alpha=0.10, color=COLORS['neutral'])

    # Chance level of a uniform guess
    ax.axhline(1.0/N_CLASSES, color=COLORS['accent'], linewidth=1.0,
               linestyle=':', label='Chance', zorder=2)

    ax.set_title(f'Stage {layer_idx+1} (L{layer_idx})', color=color, fontweight='bold')
    ax.set_xlabel('Number of PCs ($k$)')
    if layer_idx == 0:
        ax.set_ylabel('Mean 5-fold CV accuracy')
    ax.set_xticks(K_VALUES)

# Every panel carries the same three labelled artists; take them from the first.
legend_handles, legend_labels = axes[0].get_legend_handles_labels()
fig.legend(legend_handles, legend_labels, loc='lower center', ncol=3,
           bbox_to_anchor=(0.5, -0.10), frameon=True, edgecolor='#cccccc')

plt.tight_layout()
fig.savefig(f'{FIGURES_DIR}/probe_vs_k.pdf', bbox_inches='tight', dpi=300)
fig.savefig(f'{FIGURES_DIR}/probe_vs_k.png', bbox_inches='tight', dpi=300)
plt.show()
print('✅ Saved figure C')

### Figure D — Distribution of Δ accuracy per layer (violin plot)

In [None]:
# Figure D: distribution of Δ accuracy (k=1) over the heads of each stage,
# drawn as a violin with the individual heads scattered on top.
fig, ax = plt.subplots(figsize=(7, 4))

layer_names = ['Stage 1\n(L0)', 'Stage 2\n(L1)', 'Stage 3\n(L2)', 'Stage 4\n(L3)']

# Δ values of every head, one array per stage, reused by violins and scatter.
deltas_by_stage = [k1_df.loc[k1_df['layer'] == l, 'delta'].values for l in range(4)]

violins = ax.violinplot(
    deltas_by_stage,
    positions=range(4),
    showmedians=True,
    showextrema=True,
)

# Violin bodies take the stage colour; medians and whiskers stay dark.
for body, face in zip(violins['bodies'], LAYER_COLORS):
    body.set_facecolor(face)
    body.set_alpha(0.6)
violins['cmedians'].set_color('black')
violins['cmedians'].set_linewidth(1.5)
for whisker in ('cmaxes', 'cmins', 'cbars'):
    violins[whisker].set_color('#333333')

# Overlay jittered scatter. The generator is re-seeded per stage, so the
# horizontal jitter sequence is the same for every stage.
for l, deltas in enumerate(deltas_by_stage):
    jitter = np.random.default_rng(42).uniform(-0.08, 0.08, len(deltas))
    ax.scatter(l + jitter, deltas, s=18, color=LAYER_COLORS[l],
               alpha=0.5, zorder=3, edgecolors='none')

# Reference line: heads on it gain nothing over random directions.
ax.axhline(0, color=COLORS['accent'], linewidth=1.0,
           linestyle='--', label='No gain over random')

ax.set_xticks(range(4))
ax.set_xticklabels(layer_names)
ax.set_ylabel(r'$\Delta$ accuracy = PC$_1$ probe $-$ random baseline')
ax.set_title(r'Distribution of Semantic Gain ($\Delta$ accuracy) per Stage')
ax.legend(frameon=True, edgecolor='#cccccc')

plt.tight_layout()
fig.savefig(f'{FIGURES_DIR}/delta_violin.pdf', bbox_inches='tight', dpi=300)
fig.savefig(f'{FIGURES_DIR}/delta_violin.png', bbox_inches='tight', dpi=300)
plt.show()
print('✅ Saved figure D')

## 5. Statistical Validation

In [None]:
# Per-stage significance of the PC1 gain, then a list of outlier heads.
_rule = '=' * 60
print(_rule)
print('Statistical tests: PC1 accuracy vs. random baseline (k=1)')
print('One-sample t-test: H0: mean(delta) = 0')
print(_rule)

for layer_idx in range(4):
    # Δ of every head in this stage, tested against a population mean of 0
    deltas = k1_df.loc[k1_df['layer'] == layer_idx, 'delta'].values
    t_stat, p_val = stats.ttest_1samp(deltas, popmean=0)

    # Conventional significance markers
    if p_val < 0.001:
        stars = '***'
    elif p_val < 0.01:
        stars = '**'
    elif p_val < 0.05:
        stars = '*'
    else:
        stars = 'ns'

    print(f'Stage {layer_idx+1} (L{layer_idx}): '
          f'mean Δ={deltas.mean():.4f}, '
          f't={t_stat:.3f}, p={p_val:.4f} '
          f'{stars}')

print()
print('── Top specialised heads (Δ > 2σ above mean Δ) ──')
# Cut-off is computed over all heads of all stages, not per stage.
threshold = k1_df['delta'].mean() + 2 * k1_df['delta'].std()
top_heads = (
    k1_df.loc[k1_df['delta'].gt(threshold)]
    .sort_values('delta', ascending=False)
)
display(top_heads[['head_id','layer','global_block','head','acc_pc','acc_rand','delta']].round(4))
print(f'\nThreshold (μ + 2σ) = {threshold:.4f}')
print(f'Specialised heads  = {len(top_heads)} / {len(k1_df)} '
      f'({100*len(top_heads)/len(k1_df):.1f}%)')

## 6. Save Results

In [None]:
# Keep a copy of the full probe table next to the figures it produced.
results_df.to_csv(f'{FIGURES_DIR}/pc_probe_results_{DATASET_NAME}.csv', index=False)
print(f'✅ Results saved → {FIGURES_DIR}/pc_probe_results_{DATASET_NAME}.csv')

# Header block; the trailing empty string yields the blank separator line.
print(
    '\n── Final summary ──',
    f'Dataset   : {DATASET_NAME.upper()}',
    f'Samples   : {N_SAMPLES}   Classes: {N_CLASSES}',
    f'Chance    : {1/N_CLASSES:.4f}',
    '',
    sep='\n',
)

# One line per stage: mean accuracies at k=1 and the head with the largest Δ.
for l in range(4):
    sub = k1_df.loc[k1_df['layer'] == l]
    best_head = sub.loc[sub['delta'].idxmax(), 'head_id']
    print(f'  Stage {l+1}: mean PC1 acc={sub["acc_pc"].mean():.4f} '
          f'rand={sub["acc_rand"].mean():.4f} '
          f'Δ={sub["delta"].mean():.4f} '
          f'(best head: {best_head}, '
          f'Δ={sub["delta"].max():.4f})')