# Accents-PT-BR — Accent Classifier Ablation (CNN vs wav2vec2)

**Projeto:** Controle Explícito de Sotaque Regional em pt-BR  
**Objetivo:** Treinar e avaliar classificadores de sotaque (CNN mel-spectrogram vs wav2vec2 fine-tuned) no dataset combinado Accents-PT-BR (CORAA-MUPE + Common Voice PT). Esses classificadores servem como **avaliadores externos** para os Stages 2-3 (medir se o áudio gerado pelo LoRA carrega o sotaque-alvo).  
**Config:** `configs/accent_classifier.yaml` (single source of truth).  
**Dataset:** Accents-PT-BR = CORAA-MUPE (entrevistados) + Common Voice PT (accent label normalizado).  

**Seções:**
1. Setup do ambiente
2. Dataset pipeline (shared with dataset notebook)
3. CNN accent classifier (treinamento + avaliação)
4. wav2vec2 accent classifier (treinamento + avaliação)
5. Robustness check (multiple seeds)
6. Cross-source evaluation (confound check)
7. Ablation summary + report

Este notebook é a **camada de orquestração**. Toda lógica está em `src/` (testável, auditável).  
O notebook apenas: instala deps → configura ambiente → chama módulos → exibe resultados.

In [None]:
# Bootstrap: clone repo, install deps, check NumPy ABI.
# This module uses only stdlib — safe to import before pip install.
# On first Colab run, this cell may restart the runtime once (NumPy ABI fix).
# NOTE(review): the import below requires `src/` to be importable before
# bootstrap() runs, i.e. before anything could have been cloned by it —
# confirm how the working directory is prepared on a fresh runtime.
from src.utils.notebook_bootstrap import bootstrap
bootstrap()

In [None]:
import sys, yaml, json, logging
from pathlib import Path
from collections import Counter

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Platform-aware persistent cache setup
from src.utils.platform import detect_platform, setup_environment

# `platform` is reused by later cells (`platform.name`, `platform.cache_base`).
# NOTE: the name would shadow the stdlib `platform` module if that were ever
# imported in this notebook.
platform = detect_platform()
setup_environment(platform)

# Fix #1: use seed_worker from src (includes random.seed — the inline version missed it)
from src.utils.seed import set_global_seed, seed_worker
from src.utils.git import get_commit_hash
from src.data.manifest import compute_file_hash
from src.classifier import (
    AccentCNN, AccentWav2Vec2,
    train_classifier, evaluate_classifier,
    TrainingConfig, TrainingResult,
)
from src.classifier.mel_dataset import MelSpectrogramDataset
from src.classifier.wav2vec2_dataset import WaveformDataset
from src.classifier.trainer import compute_class_weights

# Load config — single source of truth for all experiment parameters.
# The file is decoded as UTF-8 explicitly so the parsed values do not depend
# on the locale encoding of the machine running the notebook.
with open('configs/accent_classifier.yaml', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Apply the global seed from the config; the value returned by
# set_global_seed() is kept in `generator`.
SEED = config['seed']['global']
generator = set_global_seed(SEED)

# force=True replaces any handlers the hosting environment already attached
# to the root logger. Without it basicConfig() silently does nothing in that
# situation and INFO records emitted by the `src/` modules never show up.
logging.basicConfig(
    level=logging.INFO,
    format='%(name)s - %(levelname)s - %(message)s',
    force=True,
)

print(f'Platform: {platform.name}')
print(f'Config loaded: {config["experiment"]["name"]}')
print(f'Seed global: {SEED}')

In [None]:
# Environment check: interpreter, framework and accelerator details
_cuda_ok = torch.cuda.is_available()
for _label, _value in (
    ('Python', sys.version),
    ('PyTorch', torch.__version__),
    ('CUDA available', _cuda_ok),
):
    print(f'{_label}: {_value}')

if _cuda_ok:
    # Only query device 0 when a CUDA device is actually present
    print(f'CUDA device: {torch.cuda.get_device_name(0)}')
    print(f'CUDA version: {torch.version.cuda}')
    _vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'VRAM total: {_vram_gb:.1f} GB')

DEVICE = 'cuda' if _cuda_ok else 'cpu'
print(f'\nUsando device: {DEVICE}')

# Persistent cache root — taken from the detected platform, created on demand
DRIVE_BASE = platform.cache_base
DRIVE_BASE.mkdir(parents=True, exist_ok=True)
print(f'Cache base: {DRIVE_BASE}')

## 2. Dataset Pipeline

Executa a pipeline completa via `src.data.pipeline.load_or_build_accents_dataset()`.  
A mesma função usada pelo dataset notebook — elimina duplicação (DRY).

In [None]:
from src.data.pipeline import load_or_build_accents_dataset

# One call produces the whole dataset bundle; the pieces used by the rest of
# the notebook are unpacked into globals below.
bundle = load_or_build_accents_dataset(config, DRIVE_BASE)

combined_entries = bundle.combined_entries
split_info = bundle.split_info
split_entries = bundle.split_entries
confound_results = bundle.confound_results

train_entries, val_entries, test_entries = (
    split_entries[_split] for _split in ('train', 'val', 'test')
)


def _accent_crosstab(attr):
    """Cross-tabulate accent against another entry attribute, with margins."""
    return pd.crosstab(
        [e.accent for e in combined_entries],
        [getattr(e, attr) for e in combined_entries],
        margins=True,
    )


# Cross-tabulations for reference
gender_table = _accent_crosstab('gender')
print('\n=== ACCENT x GENDER TABLE ===')
print(gender_table)

source_table = _accent_crosstab('source')
print('\n=== ACCENT x SOURCE TABLE ===')
print(source_table)

## 3. CNN Accent Classifier

3-block CNN operating on mel-spectrograms.  
Architecture: Conv2d -> BatchNorm -> ReLU -> MaxPool (x3) -> AdaptiveAvgPool -> Linear.  
Early stopping on validation balanced accuracy.  
Class-weighted CrossEntropyLoss for imbalanced accent distributions.

In [None]:
# Label mapping is built once from the full dataset (sorted, deterministic)
label_to_idx = MelSpectrogramDataset.build_label_mapping(combined_entries)
n_classes = len(label_to_idx)
idx_to_label = {idx: name for name, idx in label_to_idx.items()}
label_names = [idx_to_label[i] for i in range(n_classes)]

print(f'Classes ({n_classes}): {label_names}')
print(f'Label mapping: {label_to_idx}')

# Persist label_to_idx for reproducibility — ensures same mapping across runs
label_map_path = Path(config['output']['report_dir']) / 'label_to_idx.json'
label_map_path.parent.mkdir(parents=True, exist_ok=True)
label_map_path.write_text(json.dumps(label_to_idx, indent=2))
print(f'Label mapping saved to: {label_map_path}')

# CNN hyperparameters from config
cnn_cfg = config['cnn']


def _mel_dataset(entries):
    """Wrap *entries* in a MelSpectrogramDataset using the CNN feature settings."""
    return MelSpectrogramDataset(
        entries=entries,
        label_to_idx=label_to_idx,
        n_mels=cnn_cfg['n_mels'],
        max_frames=cnn_cfg['max_frames'],
    )


# One dataset per split, all sharing the same label mapping
train_mel_ds = _mel_dataset(train_entries)
val_mel_ds = _mel_dataset(val_entries)
test_mel_ds = _mel_dataset(test_entries)

print(f'\nMel datasets: train={len(train_mel_ds)}, val={len(val_mel_ds)}, test={len(test_mel_ds)}')

# Dedicated generator so the training shuffle order is driven by SEED;
# seed_worker (from src.utils.seed) is used as the worker init function.
g = torch.Generator()
g.manual_seed(SEED)

cnn_batch_size = cnn_cfg['training']['batch_size']
cnn_num_workers = cnn_cfg['training']['num_workers']

# Settings common to the three loaders
_mel_loader_kwargs = dict(
    batch_size=cnn_batch_size,
    num_workers=cnn_num_workers,
    pin_memory=True,
)

train_mel_loader = torch.utils.data.DataLoader(
    train_mel_ds, shuffle=True, worker_init_fn=seed_worker, generator=g,
    **_mel_loader_kwargs,
)
val_mel_loader = torch.utils.data.DataLoader(val_mel_ds, shuffle=False, **_mel_loader_kwargs)
test_mel_loader = torch.utils.data.DataLoader(test_mel_ds, shuffle=False, **_mel_loader_kwargs)

# Class weights from the train split only (train_labels is reused by the
# wav2vec2 section further down)
train_labels = [label_to_idx[e.accent] for e in train_entries]
cnn_class_weights = compute_class_weights(train_labels, n_classes)
print(f'CNN class weights: {cnn_class_weights.tolist()}')

In [None]:
# Train CNN
cnn_model = AccentCNN(
    n_classes=n_classes,
    n_mels=cnn_cfg['n_mels'],
    conv_channels=cnn_cfg['conv_channels'],
)

cnn_checkpoint_dir = Path(config['output']['checkpoint_dir']) / 'cnn'

# All optimisation hyperparameters come from the `cnn.training` config section
_cnn_train_cfg = cnn_cfg['training']
cnn_training_config = TrainingConfig(
    learning_rate=_cnn_train_cfg['learning_rate'],
    batch_size=_cnn_train_cfg['batch_size'],
    n_epochs=_cnn_train_cfg['n_epochs'],
    patience=_cnn_train_cfg['patience'],
    device=DEVICE,
    seed=SEED,
    checkpoint_dir=cnn_checkpoint_dir,
    experiment_name='accent_cnn',
    use_amp=_cnn_train_cfg['use_amp'],
)

print(f'Training CNN: lr={cnn_training_config.learning_rate}, '
      f'epochs={cnn_training_config.n_epochs}, '
      f'patience={cnn_training_config.patience}')

cnn_result = train_classifier(
    model=cnn_model,
    train_loader=train_mel_loader,
    val_loader=val_mel_loader,
    config=cnn_training_config,
    class_weights=cnn_class_weights,
)

print('\n'.join([
    '\nCNN training complete:',
    f'  Best epoch: {cnn_result.best_epoch}',
    f'  Best val bal_acc: {cnn_result.best_val_bal_acc:.4f}',
    f'  Total epochs: {cnn_result.total_epochs_run}',
    f'  Checkpoint: {cnn_result.best_checkpoint_path}',
]))

# Training curves: loss on the left, validation balanced accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(cnn_result.train_losses, label='Train Loss')
ax1.set(xlabel='Epoch', ylabel='Loss', title='CNN Training Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Dashed line marks the uniform-guess baseline (1 / number of classes)
_chance_level = 1.0 / n_classes
ax2.plot(cnn_result.val_bal_accs, label='Val Balanced Accuracy', color='orange')
ax2.axhline(y=_chance_level, color='red', linestyle='--', label=f'Chance ({_chance_level:.2f})')
ax2.set(xlabel='Epoch', ylabel='Balanced Accuracy', title='CNN Validation Balanced Accuracy')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
# figures_dir is also used by the later plotting cells
figures_dir = Path(config['output']['figures_dir'])
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'cnn_training_curves.png', dpi=150)
plt.show()

In [None]:
# Evaluate CNN on test set
# weights_only=False required: our checkpoint dict contains non-tensor metadata
# (config, epoch, optimizer state) — safe since we saved it ourselves.
# The checkpoint is deserialized on CPU: load_state_dict() copies the values
# into the model's existing parameters, so the extra copy of the weights and
# optimizer state held by `checkpoint` does not occupy accelerator memory.
checkpoint = torch.load(cnn_result.best_checkpoint_path, map_location='cpu', weights_only=False)
cnn_model.load_state_dict(checkpoint['model_state_dict'])

cnn_eval = evaluate_classifier(
    model=cnn_model,
    test_loader=test_mel_loader,
    label_names=label_names,
    device=DEVICE,
    n_bootstrap=config['evaluation']['bootstrap_n_samples'],
)

print('=== CNN TEST EVALUATION ===')
print(f'Balanced Accuracy: {cnn_eval["balanced_accuracy"]:.4f} '
      f'(CI 95%: [{cnn_eval["ci_95_lower"]:.4f}, {cnn_eval["ci_95_upper"]:.4f}])')
print(f'F1 Macro: {cnn_eval["f1_macro"]:.4f}')
print(f'Chance level: {1.0/n_classes:.4f}')
print(f'\nPer-class recall:')
for name, recall in cnn_eval['per_class_recall'].items():
    print(f'  {name}: {recall:.4f}')

# Display confusion matrix
cm = np.array(cnn_eval['confusion_matrix'])
# Row-normalize (each row sums to 1 -> diagonal is per-class recall).
# A class with no test samples has an all-zero row; dividing it would give
# NaN and a RuntimeWarning, so such rows are left at 0 instead.
cm_row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm.astype(float),
    cm_row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=cm_row_totals > 0,
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

sns.heatmap(cm, annot=True, fmt='d', xticklabels=label_names,
            yticklabels=label_names, cmap='Blues', ax=ax1)
ax1.set_title('CNN Confusion Matrix (counts)')
ax1.set_xlabel('Predicted')
ax1.set_ylabel('True')

sns.heatmap(cm_norm, annot=True, fmt='.2f', xticklabels=label_names,
            yticklabels=label_names, cmap='Blues', ax=ax2)
ax2.set_title('CNN Confusion Matrix (row-normalized recall)')
ax2.set_xlabel('Predicted')
ax2.set_ylabel('True')

plt.tight_layout()
plt.savefig(figures_dir / 'cnn_confusion_matrix.png', dpi=150)
plt.show()

## 4. wav2vec2 Accent Classifier

Pre-trained wav2vec2-base with frozen CNN feature extractor + fine-tuned transformer + linear head.  
Operates on raw waveforms (no mel-spectrogram preprocessing).  
Smaller batch size due to VRAM constraints.

In [None]:
# wav2vec2 hyperparameters from config
w2v_cfg = config['wav2vec2']


def _waveform_dataset(entries):
    """Wrap *entries* in a WaveformDataset using the configured max length."""
    return WaveformDataset(
        entries=entries,
        label_to_idx=label_to_idx,
        max_length_s=w2v_cfg['max_length_s'],
    )


# Raw-waveform view of the same three splits used by the CNN
train_wav_ds = _waveform_dataset(train_entries)
val_wav_ds = _waveform_dataset(val_entries)
test_wav_ds = _waveform_dataset(test_entries)

print(f'Waveform datasets: train={len(train_wav_ds)}, val={len(val_wav_ds)}, test={len(test_wav_ds)}')

# Separate generator, seeded with SEED, drives the wav2vec2 training shuffle
g_w2v = torch.Generator()
g_w2v.manual_seed(SEED)

# Batch size / workers come from the wav2vec2 training section of the config
w2v_batch_size = w2v_cfg['training']['batch_size']
w2v_num_workers = w2v_cfg['training']['num_workers']

# Settings common to the three loaders
_wav_loader_kwargs = dict(
    batch_size=w2v_batch_size,
    num_workers=w2v_num_workers,
    pin_memory=True,
)

train_wav_loader = torch.utils.data.DataLoader(
    train_wav_ds, shuffle=True, worker_init_fn=seed_worker, generator=g_w2v,
    **_wav_loader_kwargs,
)
val_wav_loader = torch.utils.data.DataLoader(val_wav_ds, shuffle=False, **_wav_loader_kwargs)
test_wav_loader = torch.utils.data.DataLoader(test_wav_ds, shuffle=False, **_wav_loader_kwargs)

# Class weights — same train distribution, reuses train_labels from the CNN section
w2v_class_weights = compute_class_weights(train_labels, n_classes)
print(f'wav2vec2 class weights: {w2v_class_weights.tolist()}')

In [None]:
# Train wav2vec2
# Free CNN memory before loading wav2vec2. The name may already be gone when
# this cell is re-executed, so a missing `cnn_model` is tolerated instead of
# aborting the cell with NameError.
try:
    del cnn_model
except NameError:
    pass
torch.cuda.empty_cache()

w2v_model = AccentWav2Vec2(
    n_classes=n_classes,
    model_name=w2v_cfg['model_name'],
    freeze_feature_extractor=w2v_cfg['freeze_feature_extractor'],
)

w2v_checkpoint_dir = Path(config['output']['checkpoint_dir']) / 'wav2vec2'

# Optimisation hyperparameters come from the `wav2vec2.training` config section
w2v_training_config = TrainingConfig(
    learning_rate=w2v_cfg['training']['learning_rate'],
    batch_size=w2v_cfg['training']['batch_size'],
    n_epochs=w2v_cfg['training']['n_epochs'],
    patience=w2v_cfg['training']['patience'],
    device=DEVICE,
    seed=SEED,
    checkpoint_dir=w2v_checkpoint_dir,
    experiment_name='accent_wav2vec2',
    use_amp=w2v_cfg['training']['use_amp'],
)

print(f'Training wav2vec2: lr={w2v_training_config.learning_rate}, '
      f'epochs={w2v_training_config.n_epochs}, '
      f'patience={w2v_training_config.patience}')
print(f'VRAM before training: {torch.cuda.memory_allocated()/1e9:.2f} GB')

w2v_result = train_classifier(
    model=w2v_model,
    train_loader=train_wav_loader,
    val_loader=val_wav_loader,
    config=w2v_training_config,
    class_weights=w2v_class_weights,
)

print(f'\nwav2vec2 training complete:')
print(f'  Best epoch: {w2v_result.best_epoch}')
print(f'  Best val bal_acc: {w2v_result.best_val_bal_acc:.4f}')
print(f'  Total epochs: {w2v_result.total_epochs_run}')
print(f'  Checkpoint: {w2v_result.best_checkpoint_path}')

# Plot training curves: loss on the left, validation balanced accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(w2v_result.train_losses, label='Train Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.set_title('wav2vec2 Training Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Dashed line marks the uniform-guess baseline (1 / number of classes)
ax2.plot(w2v_result.val_bal_accs, label='Val Balanced Accuracy', color='orange')
ax2.axhline(y=1.0/n_classes, color='red', linestyle='--', label=f'Chance ({1.0/n_classes:.2f})')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Balanced Accuracy')
ax2.set_title('wav2vec2 Validation Balanced Accuracy')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(figures_dir / 'wav2vec2_training_curves.png', dpi=150)
plt.show()

In [None]:
# Evaluate wav2vec2 on test set
# weights_only=False required: our checkpoint dict contains non-tensor metadata
# (config, epoch, optimizer state) — safe since we saved it ourselves.
# The checkpoint is deserialized on CPU: load_state_dict() copies the values
# into the model's existing parameters, so the second copy of the weights plus
# the optimizer state kept alive by `checkpoint_w2v` does not sit in VRAM
# during the later (robustness / cross-source) training runs.
checkpoint_w2v = torch.load(w2v_result.best_checkpoint_path, map_location='cpu', weights_only=False)
w2v_model.load_state_dict(checkpoint_w2v['model_state_dict'])

w2v_eval = evaluate_classifier(
    model=w2v_model,
    test_loader=test_wav_loader,
    label_names=label_names,
    device=DEVICE,
    n_bootstrap=config['evaluation']['bootstrap_n_samples'],
)

print('=== WAV2VEC2 TEST EVALUATION ===')
print(f'Balanced Accuracy: {w2v_eval["balanced_accuracy"]:.4f} '
      f'(CI 95%: [{w2v_eval["ci_95_lower"]:.4f}, {w2v_eval["ci_95_upper"]:.4f}])')
print(f'F1 Macro: {w2v_eval["f1_macro"]:.4f}')
print(f'Chance level: {1.0/n_classes:.4f}')
print(f'\nPer-class recall:')
for name, recall in w2v_eval['per_class_recall'].items():
    print(f'  {name}: {recall:.4f}')

# Display confusion matrix
cm_w2v = np.array(w2v_eval['confusion_matrix'])
# Row-normalize (each row sums to 1 -> diagonal is per-class recall).
# A class with no test samples has an all-zero row; dividing it would give
# NaN and a RuntimeWarning, so such rows are left at 0 instead.
cm_w2v_row_totals = cm_w2v.sum(axis=1, keepdims=True)
cm_w2v_norm = np.divide(
    cm_w2v.astype(float),
    cm_w2v_row_totals,
    out=np.zeros(cm_w2v.shape, dtype=float),
    where=cm_w2v_row_totals > 0,
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

sns.heatmap(cm_w2v, annot=True, fmt='d', xticklabels=label_names,
            yticklabels=label_names, cmap='Greens', ax=ax1)
ax1.set_title('wav2vec2 Confusion Matrix (counts)')
ax1.set_xlabel('Predicted')
ax1.set_ylabel('True')

sns.heatmap(cm_w2v_norm, annot=True, fmt='.2f', xticklabels=label_names,
            yticklabels=label_names, cmap='Greens', ax=ax2)
ax2.set_title('wav2vec2 Confusion Matrix (row-normalized recall)')
ax2.set_xlabel('Predicted')
ax2.set_ylabel('True')

plt.tight_layout()
plt.savefig(figures_dir / 'wav2vec2_confusion_matrix.png', dpi=150)
plt.show()

## 5. Robustness Check (Multiple Seeds)

Protocolo requer mínimo 3 seeds para claims válidos (`experiment-protocol.md`).  
Retreina ambos os classifiers com seeds [42, 1337, 7] e reporta média ± std da balanced accuracy.  
Se diferença entre CNN e wav2vec2 for menor que o desvio, NÃO afirmar superioridade.

**Nota:** esta seção retreina 2 modelos × 3 seeds = 6 treinamentos completos.  
Estimativa: ~2-4h em GPU (CNN ~10min/run, wav2vec2 ~30-40min/run).

In [None]:
# Robustness check: retrain both classifiers once per seed and compare the
# spread of test balanced accuracy against the gap between the two models.
ROBUSTNESS_SEEDS = config['seed']['robustness_seeds']
print(f'=== ROBUSTNESS CHECK (seeds: {ROBUSTNESS_SEEDS}) ===')
print(f'Training {len(ROBUSTNESS_SEEDS)} seeds x 2 models = {len(ROBUSTNESS_SEEDS)*2} runs\n')

# Free wav2vec2 model before robustness runs. The name may already be gone
# when this cell is re-executed, so a missing `w2v_model` is tolerated.
try:
    del w2v_model
except NameError:
    pass
torch.cuda.empty_cache()

robustness_results = {'cnn': [], 'wav2vec2': []}

for s in ROBUSTNESS_SEEDS:
    print(f'--- Seed {s} ---')
    set_global_seed(s)

    # CNN: retrain from scratch with this seed
    g_rob = torch.Generator()
    g_rob.manual_seed(s)

    rob_train_loader = torch.utils.data.DataLoader(
        train_mel_ds, batch_size=cnn_batch_size, shuffle=True,
        num_workers=cnn_num_workers, worker_init_fn=seed_worker, generator=g_rob, pin_memory=True,
    )

    rob_cnn = AccentCNN(
        n_classes=n_classes, n_mels=cnn_cfg['n_mels'],
        conv_channels=cnn_cfg['conv_channels'],
    )
    rob_cnn_config = TrainingConfig(
        learning_rate=cnn_cfg['training']['learning_rate'],
        batch_size=cnn_cfg['training']['batch_size'],
        n_epochs=cnn_cfg['training']['n_epochs'],
        patience=cnn_cfg['training']['patience'],
        device=DEVICE, seed=s,
        checkpoint_dir=Path(config['output']['checkpoint_dir']) / f'cnn_seed{s}',
        experiment_name=f'accent_cnn_seed{s}',
        use_amp=cnn_cfg['training']['use_amp'],
    )
    rob_cnn_result = train_classifier(rob_cnn, rob_train_loader, val_mel_loader, rob_cnn_config, cnn_class_weights)

    # weights_only=False: checkpoint contains non-tensor metadata (safe, self-saved).
    # Loaded on CPU so the checkpoint dict (weights + optimizer state) does not
    # stay in VRAM; load_state_dict() copies into the model's own parameters.
    ckpt = torch.load(rob_cnn_result.best_checkpoint_path, map_location='cpu', weights_only=False)
    rob_cnn.load_state_dict(ckpt['model_state_dict'])
    # n_bootstrap=0: only the point estimate is collected per seed
    rob_cnn_eval = evaluate_classifier(rob_cnn, test_mel_loader, label_names, DEVICE, n_bootstrap=0)
    robustness_results['cnn'].append(rob_cnn_eval['balanced_accuracy'])
    print(f'  CNN: bal_acc={rob_cnn_eval["balanced_accuracy"]:.4f}')
    del rob_cnn
    torch.cuda.empty_cache()

    # wav2vec2: retrain from scratch with this seed.
    # Re-apply the seed so this model's initialisation depends only on `s`,
    # not on how much randomness the CNN run above happened to consume.
    set_global_seed(s)
    g_rob_w2v = torch.Generator()
    g_rob_w2v.manual_seed(s)

    rob_wav_loader = torch.utils.data.DataLoader(
        train_wav_ds, batch_size=w2v_batch_size, shuffle=True,
        num_workers=w2v_num_workers, worker_init_fn=seed_worker, generator=g_rob_w2v, pin_memory=True,
    )

    rob_w2v = AccentWav2Vec2(
        n_classes=n_classes, model_name=w2v_cfg['model_name'],
        freeze_feature_extractor=w2v_cfg['freeze_feature_extractor'],
    )
    rob_w2v_config = TrainingConfig(
        learning_rate=w2v_cfg['training']['learning_rate'],
        batch_size=w2v_cfg['training']['batch_size'],
        n_epochs=w2v_cfg['training']['n_epochs'],
        patience=w2v_cfg['training']['patience'],
        device=DEVICE, seed=s,
        checkpoint_dir=Path(config['output']['checkpoint_dir']) / f'wav2vec2_seed{s}',
        experiment_name=f'accent_wav2vec2_seed{s}',
        use_amp=w2v_cfg['training']['use_amp'],
    )
    rob_w2v_result = train_classifier(rob_w2v, rob_wav_loader, val_wav_loader, rob_w2v_config, w2v_class_weights)

    # weights_only=False: checkpoint contains non-tensor metadata (safe, self-saved).
    # CPU load for the same reason as above — `ckpt_w` outlives this iteration.
    ckpt_w = torch.load(rob_w2v_result.best_checkpoint_path, map_location='cpu', weights_only=False)
    rob_w2v.load_state_dict(ckpt_w['model_state_dict'])
    rob_w2v_eval = evaluate_classifier(rob_w2v, test_wav_loader, label_names, DEVICE, n_bootstrap=0)
    robustness_results['wav2vec2'].append(rob_w2v_eval['balanced_accuracy'])
    print(f'  wav2vec2: bal_acc={rob_w2v_eval["balanced_accuracy"]:.4f}')
    del rob_w2v
    torch.cuda.empty_cache()

# Summary: mean and standard deviation of balanced accuracy per model
print(f'\n=== ROBUSTNESS SUMMARY ({len(ROBUSTNESS_SEEDS)} seeds) ===')
for model_name, accs in robustness_results.items():
    mean_acc = np.mean(accs)
    std_acc = np.std(accs)
    print(f'  {model_name}: {mean_acc:.4f} +/- {std_acc:.4f}  '
          f'(seeds: {dict(zip(ROBUSTNESS_SEEDS, [f"{a:.4f}" for a in accs]))})')

# Check if difference is meaningful given the variance
cnn_mean = np.mean(robustness_results['cnn'])
w2v_mean = np.mean(robustness_results['wav2vec2'])
pooled_std = np.sqrt(np.std(robustness_results['cnn'])**2 + np.std(robustness_results['wav2vec2'])**2)
mean_gap = abs(cnn_mean - w2v_mean)
if len(ROBUSTNESS_SEEDS) < 2:
    # With fewer than two runs the std is 0 by construction, so any gap
    # would look "significant" — refuse to name a winner.
    print('\nFewer than 2 seeds: seed variance cannot be estimated.')
    print('Cannot claim one model is superior based on seed variance.')
elif mean_gap <= pooled_std:
    # `<=` also covers identical means with zero spread, which must not be
    # reported as one model beating the other.
    print(f'\nDifference ({mean_gap:.4f}) <= pooled std ({pooled_std:.4f})')
    print('Cannot claim one model is superior based on seed variance.')
else:
    winner = 'wav2vec2' if w2v_mean > cnn_mean else 'CNN'
    print(f'\n{winner} appears consistently better across seeds.')

# Restore original seed
set_global_seed(SEED)

## 6. Cross-Source Evaluation (Source Confound Check)

**Objetivo:** verificar se o classificador aprendeu *sotaque* ou *fonte*.  
- Treinamos na fonte A (CORAA-MUPE), testamos na fonte B (Common Voice) e vice-versa.  
- Se ambas as direções ficam em chance level -> classificador aprendeu source, não accent.  
- Se ao menos uma direção fica acima de chance -> sinal de accent é transferível entre fontes.  

Usamos o CNN (mais rápido) para este cross-source check.

In [None]:
# Margin above chance, configured in percentage points and converted to a fraction
above_chance_margin = config['cross_source']['above_chance_margin_pp'] / 100

# Sanity check: the better of the two primary classifiers should itself clear
# the chance + margin bar before the cross-source numbers are interpreted
chance = 1.0 / n_classes
best_primary = max(cnn_eval['balanced_accuracy'], w2v_eval['balanced_accuracy'])
if best_primary <= chance + above_chance_margin:
    print(f'WARNING: Best primary classifier ({best_primary:.4f}) is at chance ({chance:.4f}).')
    print('Cross-source evaluation may not be meaningful.')


def _entries_from(entries, source):
    """Return the entries of *entries* whose `source` equals *source*."""
    return [e for e in entries if e.source == source]


# Partition the existing train/test splits by originating corpus
coraa_train = _entries_from(train_entries, 'CORAA-MUPE')
coraa_test_split = _entries_from(test_entries, 'CORAA-MUPE')
cv_train = _entries_from(train_entries, 'CommonVoice-PT')
cv_test_split = _entries_from(test_entries, 'CommonVoice-PT')

print(f'CORAA-MUPE: train={len(coraa_train)}, test={len(coraa_test_split)}')
print(f'CommonVoice-PT: train={len(cv_train)}, test={len(cv_test_split)}')

# Filled by the two direction blocks below, keyed by direction name
cross_source_results = {}

# Direction 1: Train on CORAA-MUPE, test on CommonVoice-PT
# Runs only when there is training data from the first source and test data
# from the second one.
if len(coraa_train) > 0 and len(cv_test_split) > 0:
    print('\n--- Direction 1: Train CORAA-MUPE -> Test CommonVoice-PT ---')

    cs_train_ds = MelSpectrogramDataset(
        coraa_train, label_to_idx, n_mels=cnn_cfg['n_mels'], max_frames=cnn_cfg['max_frames'],
    )
    cs_test_ds = MelSpectrogramDataset(
        cv_test_split, label_to_idx, n_mels=cnn_cfg['n_mels'], max_frames=cnn_cfg['max_frames'],
    )

    # Validation: use same-source val split (never train data)
    coraa_val = [e for e in val_entries if e.source == 'CORAA-MUPE']
    if len(coraa_val) < 10:
        # Not enough val samples from this source — use full val split as fallback.
        # NOTE(review): the fallback set also contains entries from the target
        # source, so early stopping / checkpoint selection is then no longer
        # blind to it — confirm this is acceptable for the confound check.
        coraa_val = list(val_entries)
        print(f'  Val fallback: using full val split ({len(coraa_val)} entries, mixed source)')

    cs_val_ds = MelSpectrogramDataset(
        coraa_val, label_to_idx, n_mels=cnn_cfg['n_mels'], max_frames=cnn_cfg['max_frames'],
    )

    # Dedicated generator seeded with SEED for the training shuffle
    g_cs1 = torch.Generator()
    g_cs1.manual_seed(SEED)

    cs_train_loader = torch.utils.data.DataLoader(
        cs_train_ds, batch_size=cnn_batch_size, shuffle=True,
        num_workers=cnn_num_workers, worker_init_fn=seed_worker, generator=g_cs1, pin_memory=True,
    )
    cs_val_loader = torch.utils.data.DataLoader(
        cs_val_ds, batch_size=cnn_batch_size, shuffle=False, num_workers=cnn_num_workers, pin_memory=True,
    )
    cs_test_loader = torch.utils.data.DataLoader(
        cs_test_ds, batch_size=cnn_batch_size, shuffle=False, num_workers=cnn_num_workers, pin_memory=True,
    )

    # Class weights from the single-source training subset only
    cs_labels = [label_to_idx[e.accent] for e in coraa_train]
    cs_weights = compute_class_weights(cs_labels, n_classes)

    cs_model_1 = AccentCNN(n_classes=n_classes, n_mels=cnn_cfg['n_mels'], conv_channels=cnn_cfg['conv_channels'])
    cs_config_1 = TrainingConfig(
        learning_rate=cnn_cfg['training']['learning_rate'],
        batch_size=cnn_batch_size, n_epochs=cnn_cfg['training']['n_epochs'],
        patience=cnn_cfg['training']['patience'], device=DEVICE, seed=SEED,
        checkpoint_dir=Path(config['output']['checkpoint_dir']) / 'cross_source_coraa2cv',
        experiment_name='cross_source_coraa2cv', use_amp=cnn_cfg['training']['use_amp'],
    )

    cs_result_1 = train_classifier(cs_model_1, cs_train_loader, cs_val_loader, cs_config_1, cs_weights)

    # weights_only=False: checkpoint contains non-tensor metadata (safe, self-saved).
    # Loaded on CPU; load_state_dict() copies into the model's own parameters,
    # so the checkpoint dict does not keep a second copy on the accelerator.
    checkpoint_cs1 = torch.load(cs_result_1.best_checkpoint_path, map_location='cpu', weights_only=False)
    cs_model_1.load_state_dict(checkpoint_cs1['model_state_dict'])

    # Same bootstrap sample count as the primary evaluation, so the reported
    # confidence intervals are computed under one configuration.
    cs_eval_1 = evaluate_classifier(
        cs_model_1, cs_test_loader, label_names, DEVICE,
        n_bootstrap=config['evaluation']['bootstrap_n_samples'],
    )
    cross_source_results['coraa2cv'] = cs_eval_1

    print(f'CORAA->CV bal_acc: {cs_eval_1["balanced_accuracy"]:.4f} '
          f'(CI: [{cs_eval_1["ci_95_lower"]:.4f}, {cs_eval_1["ci_95_upper"]:.4f}])')

    del cs_model_1
    torch.cuda.empty_cache()
else:
    print('Skipping direction 1: insufficient data')

# Direction 2: Train on CommonVoice-PT, test on CORAA-MUPE
# Runs only when there is training data from the first source and test data
# from the second one.
if len(cv_train) > 0 and len(coraa_test_split) > 0:
    print('\n--- Direction 2: Train CommonVoice-PT -> Test CORAA-MUPE ---')

    cs_train_ds2 = MelSpectrogramDataset(
        cv_train, label_to_idx, n_mels=cnn_cfg['n_mels'], max_frames=cnn_cfg['max_frames'],
    )
    cs_test_ds2 = MelSpectrogramDataset(
        coraa_test_split, label_to_idx, n_mels=cnn_cfg['n_mels'], max_frames=cnn_cfg['max_frames'],
    )

    # Validation: use same-source val split (never train data)
    cv_val = [e for e in val_entries if e.source == 'CommonVoice-PT']
    if len(cv_val) < 10:
        # Too few same-source val samples — fall back to the full val split.
        # NOTE(review): the fallback set also contains entries from the target
        # source, so early stopping / checkpoint selection is then no longer
        # blind to it — confirm this is acceptable for the confound check.
        cv_val = list(val_entries)
        print(f'  Val fallback: using full val split ({len(cv_val)} entries, mixed source)')

    cs_val_ds2 = MelSpectrogramDataset(
        cv_val, label_to_idx, n_mels=cnn_cfg['n_mels'], max_frames=cnn_cfg['max_frames'],
    )

    # Dedicated generator seeded with SEED for the training shuffle
    g_cs2 = torch.Generator()
    g_cs2.manual_seed(SEED)

    cs_train_loader2 = torch.utils.data.DataLoader(
        cs_train_ds2, batch_size=cnn_batch_size, shuffle=True,
        num_workers=cnn_num_workers, worker_init_fn=seed_worker, generator=g_cs2, pin_memory=True,
    )
    cs_val_loader2 = torch.utils.data.DataLoader(
        cs_val_ds2, batch_size=cnn_batch_size, shuffle=False, num_workers=cnn_num_workers, pin_memory=True,
    )
    cs_test_loader2 = torch.utils.data.DataLoader(
        cs_test_ds2, batch_size=cnn_batch_size, shuffle=False, num_workers=cnn_num_workers, pin_memory=True,
    )

    # Class weights from the single-source training subset only
    cs_labels2 = [label_to_idx[e.accent] for e in cv_train]
    cs_weights2 = compute_class_weights(cs_labels2, n_classes)

    cs_model_2 = AccentCNN(n_classes=n_classes, n_mels=cnn_cfg['n_mels'], conv_channels=cnn_cfg['conv_channels'])
    cs_config_2 = TrainingConfig(
        learning_rate=cnn_cfg['training']['learning_rate'],
        batch_size=cnn_batch_size, n_epochs=cnn_cfg['training']['n_epochs'],
        patience=cnn_cfg['training']['patience'], device=DEVICE, seed=SEED,
        checkpoint_dir=Path(config['output']['checkpoint_dir']) / 'cross_source_cv2coraa',
        experiment_name='cross_source_cv2coraa', use_amp=cnn_cfg['training']['use_amp'],
    )

    cs_result_2 = train_classifier(cs_model_2, cs_train_loader2, cs_val_loader2, cs_config_2, cs_weights2)

    # weights_only=False: checkpoint contains non-tensor metadata (safe, self-saved).
    # Loaded on CPU; load_state_dict() copies into the model's own parameters,
    # so the checkpoint dict does not keep a second copy on the accelerator.
    checkpoint_cs2 = torch.load(cs_result_2.best_checkpoint_path, map_location='cpu', weights_only=False)
    cs_model_2.load_state_dict(checkpoint_cs2['model_state_dict'])

    # Same bootstrap sample count as the primary evaluation, so the reported
    # confidence intervals are computed under one configuration.
    cs_eval_2 = evaluate_classifier(
        cs_model_2, cs_test_loader2, label_names, DEVICE,
        n_bootstrap=config['evaluation']['bootstrap_n_samples'],
    )
    cross_source_results['cv2coraa'] = cs_eval_2

    print(f'CV->CORAA bal_acc: {cs_eval_2["balanced_accuracy"]:.4f} '
          f'(CI: [{cs_eval_2["ci_95_lower"]:.4f}, {cs_eval_2["ci_95_upper"]:.4f}])')

    del cs_model_2
    torch.cuda.empty_cache()
else:
    print('Skipping direction 2: insufficient data')

# Interpretation — threshold from config.
# A direction counts as "above chance" only when its balanced accuracy
# exceeds chance by more than the configured margin.
print(f'\n=== CROSS-SOURCE SUMMARY ===')
print(f'Chance level: {chance:.4f}')
print(f'Above-chance margin: {config["cross_source"]["above_chance_margin_pp"]}pp')
for direction, eval_result in cross_source_results.items():
    ba = eval_result['balanced_accuracy']
    above_chance = ba > chance + above_chance_margin
    status = 'ABOVE CHANCE (accent signal transfers)' if above_chance else 'AT CHANCE (possible source confound)'
    print(f'  {direction}: bal_acc={ba:.4f} -> {status}')

# Overall verdict. The empty case is handled first: all() over no results is
# vacuously True and would otherwise report a confound that was never tested.
if not cross_source_results:
    print('\nWARNING: No cross-source direction could be evaluated. Source confound check is inconclusive.')
elif all(r['balanced_accuracy'] <= chance + above_chance_margin for r in cross_source_results.values()):
    if len(cross_source_results) == 2:
        print('\nWARNING: Both directions at chance. Classifier may have learned source, not accent.')
    else:
        # Only one direction had enough data; do not claim both were tested.
        print('\nWARNING: The only evaluated direction is at chance. Classifier may have learned source, not accent.')
else:
    print('\nAt least one direction shows transfer. Accent signal appears generalizable across sources.')

## 7. Ablation Summary

Comparison table: CNN vs wav2vec2, with balanced accuracy, CI 95%, F1 macro, and cross-source results.  
All metrics follow the protocol: balanced accuracy (primary), CI 95% (bootstrap, 1000 samples).

In [None]:
# Side-by-side table of the two primary test evaluations
chance = 1.0 / n_classes

# Fixed order everywhere below: CNN first, wav2vec2 second
_evals = (cnn_eval, w2v_eval)
_runs = (cnn_result, w2v_result)


def _fmt_metric(key):
    """Format metric *key* of both evaluations with four decimals."""
    return [f'{ev[key]:.4f}' for ev in _evals]


comparison_data = {
    'Model': ['CNN (mel-spectrogram)', 'wav2vec2-base'],
    'Balanced Accuracy': _fmt_metric('balanced_accuracy'),
    'CI 95% Lower': _fmt_metric('ci_95_lower'),
    'CI 95% Upper': _fmt_metric('ci_95_upper'),
    'F1 Macro': _fmt_metric('f1_macro'),
    'Best Epoch': [run.best_epoch for run in _runs],
    'Total Epochs': [run.total_epochs_run for run in _runs],
}

comparison_df = pd.DataFrame(comparison_data)
print('=== ABLATION: CNN vs wav2vec2 ===')
print(f'Chance level: {chance:.4f}')
print()
print(comparison_df.to_string(index=False))

# Two intervals overlap when each one starts no later than the other ends;
# with overlapping CIs neither model can be declared better
cnn_ci, w2v_ci = ((ev['ci_95_lower'], ev['ci_95_upper']) for ev in _evals)

overlap = cnn_ci[0] <= w2v_ci[1] and w2v_ci[0] <= cnn_ci[1]
if not overlap:
    winner = 'wav2vec2' if w2v_eval['balanced_accuracy'] > cnn_eval['balanced_accuracy'] else 'CNN'
    print(f'\nCIs do NOT overlap -> {winner} is significantly better.')
else:
    print('\nCIs overlap -> cannot claim one model is superior.')

# Cross-source results, printed only when at least one direction was evaluated
if cross_source_results:
    print('\n=== CROSS-SOURCE EVALUATION ===')
    for direction, result in cross_source_results.items():
        print(f'  {direction}: bal_acc={result["balanced_accuracy"]:.4f} '
              f'(CI: [{result["ci_95_lower"]:.4f}, {result["ci_95_upper"]:.4f}])')

In [None]:
# Save full report as JSON with all metrics, configs, and hashes
from datetime import datetime

# Provenance — get_commit_hash comes from src.utils.git (imported in cell-2)
commit_hash = get_commit_hash()

# Hash of the combined manifest; falls back to 'N/A' when the file is absent
COMBINED_MANIFEST_PATH = DRIVE_BASE / 'accents_pt_br' / 'manifest.jsonl'
if COMBINED_MANIFEST_PATH.exists():
    manifest_sha256 = compute_file_hash(COMBINED_MANIFEST_PATH)
else:
    manifest_sha256 = 'N/A'

report = {
    'experiment': config['experiment']['name'],
    'date': datetime.now().isoformat(),
    'commit_hash': commit_hash,
    'seed': SEED,
    'environment': {
        'python_version': sys.version,
        'torch_version': torch.__version__,
        'cuda_version': torch.version.cuda if torch.cuda.is_available() else None,
        'gpu_name': torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
    },
    'dataset': {
        'name': config['dataset']['name'],
        'combined_manifest_sha256': manifest_sha256,
        'total_entries': len(combined_entries),
        'total_speakers': len({e.speaker_id for e in combined_entries}),
        'n_classes': n_classes,
        'label_names': label_names,
        'per_source': dict(Counter(e.source for e in combined_entries)),
        'per_accent': dict(Counter(e.accent for e in combined_entries)),
    },
    'splits': {
        'method': config['splits']['method'],
        'ratios': config['splits']['ratios'],
        'seed': config['splits']['seed'],
        'train_utterances': len(train_entries),
        'val_utterances': len(val_entries),
        'test_utterances': len(test_entries),
        'train_speakers': len(split_info.train_speakers),
        'val_speakers': len(split_info.val_speakers),
        'test_speakers': len(split_info.test_speakers),
    },
    'confounds': [
        {
            'test': r.test_name,
            'variables': f'{r.variable_a} x {r.variable_b}',
            'statistic': r.statistic,
            'p_value': r.p_value,
            'effect_size': r.effect_size,
            'effect_size_name': r.effect_size_name,
            'is_blocking': r.is_blocking,
            'is_significant': r.is_significant,
            'interpretation': r.interpretation,
        }
        for r in confound_results
    ],
    'cnn': {
        'config': {
            'n_mels': cnn_cfg['n_mels'],
            'max_frames': cnn_cfg['max_frames'],
            'conv_channels': cnn_cfg['conv_channels'],
            'learning_rate': cnn_cfg['training']['learning_rate'],
            'batch_size': cnn_cfg['training']['batch_size'],
            'n_epochs': cnn_cfg['training']['n_epochs'],
            'patience': cnn_cfg['training']['patience'],
        },
        'training': {
            'best_epoch': cnn_result.best_epoch,
            'best_val_bal_acc': cnn_result.best_val_bal_acc,
            'total_epochs_run': cnn_result.total_epochs_run,
            'checkpoint_path': str(cnn_result.best_checkpoint_path),
        },
        'evaluation': cnn_eval,
    },
    'wav2vec2': {
        'config': {
            'model_name': w2v_cfg['model_name'],
            'freeze_feature_extractor': w2v_cfg['freeze_feature_extractor'],
            'max_length_s': w2v_cfg['max_length_s'],
            'learning_rate': w2v_cfg['training']['learning_rate'],
            'batch_size': w2v_cfg['training']['batch_size'],
            'n_epochs': w2v_cfg['training']['n_epochs'],
            'patience': w2v_cfg['training']['patience'],
        },
        'training': {
            'best_epoch': w2v_result.best_epoch,
            'best_val_bal_acc': w2v_result.best_val_bal_acc,
            'total_epochs_run': w2v_result.total_epochs_run,
            'checkpoint_path': str(w2v_result.best_checkpoint_path),
        },
        'evaluation': w2v_eval,
    },
    'robustness': robustness_results,
    'cross_source': cross_source_results,
    'chance_level': chance,
}

# Save report
report_dir = Path(config['output']['report_dir'])
report_dir.mkdir(parents=True, exist_ok=True)
report_path = Path(config['output']['report_json'])

with open(report_path, 'w') as f:
    json.dump(report, f, indent=2, default=str)

print(f'Report saved to: {report_path}')
print(f'Combined manifest SHA-256: {manifest_sha256}')
print(f'Commit hash: {commit_hash}')
print(f'\n=== EXPERIMENT COMPLETE ===')
print(f'CNN bal_acc: {cnn_eval["balanced_accuracy"]:.4f} '
      f'(CI: [{cnn_eval["ci_95_lower"]:.4f}, {cnn_eval["ci_95_upper"]:.4f}])')
print(f'wav2vec2 bal_acc: {w2v_eval["balanced_accuracy"]:.4f} '
      f'(CI: [{w2v_eval["ci_95_lower"]:.4f}, {w2v_eval["ci_95_upper"]:.4f}])')
print(f'Chance level: {chance:.4f}')