# Stage 1.5 — Latent Separability Audit

**Projeto:** Controle Explícito de Sotaque Regional em pt-BR  
**Objetivo:** Verificar se representações internas do Qwen3-TTS codificam informação suficiente de sotaque regional para classificação acima de chance, com leakage controlado.  
**Backbone:** Qwen3-TTS 1.7B-CustomVoice (frozen)  
**Dataset:** CORAA-MUPE (speaker-disjoint splits)  

Este notebook é a **camada de orquestração**. Toda lógica está em `src/` (testável, auditável).  
O notebook apenas: instala deps → configura ambiente → chama módulos → exibe resultados.

## 0. Setup do Ambiente

In [None]:
# Bootstrap cell: clone repo, install deps, check NumPy ABI.
# The bootstrap module uses only the stdlib — safe to import before pip install.
# On first Colab run, this cell may restart the runtime once (NumPy ABI fix).
from src.utils.notebook_bootstrap import bootstrap
bootstrap()

In [None]:
# Platform-aware persistent cache setup
# - Colab: Google Drive mount → /content/drive/MyDrive/tcc-cache
# - Lightning.ai: persistent storage → /teamspace/studios/this_studio/cache
# - Paperspace: persistent storage → /storage/tcc-cache
# - Local: ./cache (relative to repo root)

from src.utils.platform import detect_platform, setup_environment

# NOTE(review): the global name `platform` would shadow the stdlib `platform`
# module if that module were ever imported in this notebook — confirm it is not needed.
platform = detect_platform()
setup_environment(platform)
# Note: setup_environment() already handles Drive mounting on Colab

# Root directory of the persistent cache; created up front (idempotent).
DRIVE_BASE = platform.cache_base
DRIVE_BASE.mkdir(parents=True, exist_ok=True)

# Echo the detected environment so the run log records where artifacts live.
print(f'Platform: {platform.name}')
print(f'Cache base: {DRIVE_BASE}')
print(f'GPU: {platform.has_gpu}')

In [None]:
# Seeds and determinism — MANDATORY before any other operation
from src.utils.seed import set_global_seed

# SEED is reused by later cells (bootstrap CIs, probes, report).
SEED = 42
# The return value is kept as `generator`
# (presumably an RNG handle — TODO confirm against src.utils.seed).
generator = set_global_seed(SEED)
print(f'Seed global configurado: {SEED}')

In [None]:
# Report interpreter / PyTorch / CUDA versions and choose the compute device
import torch
import sys

# Query CUDA once and reuse the answer for both the report and the device choice.
cuda_ready = torch.cuda.is_available()

print(f'Python: {sys.version}')
print(f'PyTorch: {torch.__version__}')
print(f'CUDA available: {cuda_ready}')
if cuda_ready:
    # Device 0 is the only GPU inspected here.
    total_vram_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f'CUDA device: {torch.cuda.get_device_name(0)}')
    print(f'CUDA version: {torch.version.cuda}')
    print(f'VRAM total: {total_vram_bytes / 1e9:.1f} GB')

# DEVICE is the string later handed to the feature extractors.
if cuda_ready:
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'\nUsando device: {DEVICE}')

In [None]:
# Load experiment config YAML — single source of truth
import yaml
from pathlib import Path

# Decode explicitly as UTF-8. Without `encoding=`, open() uses the platform's
# locale encoding, which can corrupt non-ASCII values in the config on
# non-UTF-8 systems.
with open('configs/stage1_5.yaml', encoding='utf-8') as f:
    # safe_load: builds plain Python objects only, never arbitrary constructors.
    config = yaml.safe_load(f)

# Echo the key settings so the run log records which experiment was loaded.
print(f'Config loaded: {config["experiment"]["name"]}')
print(f'Dataset: {config["dataset"]["name"]}')
print(f'Splits: {config["splits"]["method"]} (seed={config["splits"]["seed"]})')

## 1. Download e Build Manifest

Carrega o CORAA-MUPE-ASR do HuggingFace, filtra por `speaker_type='R'` (entrevistados),
duração 3–15s e mínimo de speakers por região. O manifest é o artefato versionado (SHA-256).

In [None]:
from src.data.cache import PipelineCache

# Cache handle built from the experiment config and the persistent cache root.
cache = PipelineCache(config, drive_base=DRIVE_BASE)
print(cache.report())
print()

# Initialize variables for both code paths (cache hit vs miss).
# `entries is None` afterwards signals to the next cell that the manifest
# still has to be built from the downloaded dataset.
entries = None
build_stats = None

if cache.has_manifest():
    print('Loading manifest from Drive cache...')
    entries = cache.load_manifest()
    print(f'Loaded {len(entries):,} entries from cache')
else:
    # Imported lazily: `datasets` is only needed on a cache miss.
    from datasets import load_dataset, concatenate_datasets

    print('Downloading CORAA-MUPE-ASR from HuggingFace...')
    # Fixed mis-encoded characters in the user-facing message.
    print('(~42 GB na primeira vez — usa cache nas próximas execuções)\n')

    ds = load_dataset("nilc-nlp/CORAA-MUPE-ASR")
    print(f'Splits disponíveis: {list(ds)}')
    for split_name, split_data in ds.items():
        print(f'  {split_name}: {len(split_data):,} rows')

    # Concatenate every upstream split — our own speaker-disjoint splits are created later.
    all_data = concatenate_datasets(list(ds.values()))
    print(f'\nTotal concatenado: {len(all_data):,} rows')
    print(f'Colunas: {all_data.column_names}')

In [None]:
# Build manifest from HF dataset (only when not loaded from cache)
if entries is None:
    from src.data.manifest_builder import build_manifest_from_hf_dataset

    AUDIO_DIR = Path('data/audio/')
    MANIFEST_PATH = Path(config['dataset']['manifest_path'])
    # All filter thresholds come from the config (single source of truth).
    manifest_filters = config['dataset']['filters']

    entries, build_stats = build_manifest_from_hf_dataset(
        dataset=all_data,
        audio_output_dir=AUDIO_DIR,
        manifest_output_path=MANIFEST_PATH,
        speaker_type_filter=manifest_filters['speaker_type'],
        min_duration_s=manifest_filters['min_duration_s'],
        max_duration_s=manifest_filters['max_duration_s'],
        min_speakers_per_region=manifest_filters['min_speakers_per_region'],
    )

    # Save to cache for next run
    cache.save_manifest(entries)

    print(f"Manifest: {len(entries):,} entries")
    print(f"SHA-256: {build_stats['manifest_sha256']}")
    print("\nFilter stats:")
    for key, count in build_stats['filter_stats'].items():
        # 'dropped_regions' is a list, not a count — it is reported separately below.
        if key == 'dropped_regions':
            continue
        print(f"  {key}: {count:,}")

    # Report dropped regions (protocol §4.3 fallback)
    dropped = build_stats['filter_stats'].get('dropped_regions', [])
    if dropped:
        print(f"\nDropped regions (< {manifest_filters['min_speakers_per_region']} speakers): {dropped}")
        # Fixed mis-encoded section sign in the user-facing message.
        print("Fallback per TECHNICAL_VALIDATION_PROTOCOL.md §4.3")

    # Fixed mis-encoded characters in the user-facing header.
    print("\nRegiões mantidas:")
    for region, info in build_stats['regions'].items():
        print(f"  {region}: {info['n_speakers']} speakers, {info['n_utterances']:,} utterances")
else:
    print(f'Manifest already loaded from cache: {len(entries):,} entries')
    print('Skipping HF dataset build.')

## 2. Speaker-Disjoint Splits

In [None]:
from src.data.splits import (
    generate_speaker_disjoint_splits,
    generate_stratified_splits,
    save_splits,
    save_stratified_splits,
    assign_entries_to_splits,
    assign_entries_to_stratified_splits,
)

# Speaker-disjoint split: ratios and seed come from the experiment config.
split_info = generate_speaker_disjoint_splits(
    entries,
    train_ratio=config['splits']['ratios']['train'],
    val_ratio=config['splits']['ratios']['val'],
    test_ratio=config['splits']['ratios']['test'],
    seed=config['splits']['seed'],
)

# Persist splits
split_path = save_splits(split_info, Path(config['splits']['output_dir']))
print(f"Splits salvos em: {split_path}")
print(f"Train: {len(split_info.train_speakers)} speakers, {split_info.utterances_per_split['train']} utts")
print(f"Val:   {len(split_info.val_speakers)} speakers, {split_info.utterances_per_split['val']} utts")
print(f"Test:  {len(split_info.test_speakers)} speakers, {split_info.utterances_per_split['test']} utts")

# Assign entries (speaker-disjoint)
split_entries = assign_entries_to_splits(entries, split_info)

# Verify speaker-disjoint (HARD FAIL if violated — KB_HARD_FAIL_RULES §1).
# The check raises explicitly instead of using `assert`, because assert
# statements are removed when Python runs with -O / PYTHONOPTIMIZE and the
# guard would then silently disappear. Exception type and message are the
# same as the previous assert-based check.
train_spk = {e.speaker_id for e in split_entries['train']}
val_spk = {e.speaker_id for e in split_entries['val']}
test_spk = {e.speaker_id for e in split_entries['test']}

for pair_label, shared_speakers in (
    ('train->val', train_spk & val_spk),
    ('train->test', train_spk & test_spk),
    ('val->test', val_spk & test_spk),
):
    if shared_speakers:
        raise AssertionError(f'Speaker leakage {pair_label}: {shared_speakers}')
print('\nSpeaker-disjoint verification: PASSED')

# Generate stratified split for leakage A→speaker probes
stratified_split_info = generate_stratified_splits(
    entries,
    train_ratio=config['splits']['ratios']['train'],
    seed=config['splits']['seed'],
)
stratified_split_path = save_stratified_splits(
    stratified_split_info, Path(config['splits']['output_dir'])
)
stratified_entries = assign_entries_to_stratified_splits(entries, stratified_split_info)
print(f"\nStratified splits salvos em: {stratified_split_path}")
print(f"Stratified Train: {stratified_split_info.utterances_per_split['train']} utts")
print(f"Stratified Test:  {stratified_split_info.utterances_per_split['test']} utts")
print(f"Speakers in common: {stratified_split_info.speakers_in_common}")

## 3. Análise de Confounds

**Sanity checks obrigatórios** (recomendação do mentor):  
- Tabela accent × gender com chi-quadrado + Cramer's V  
- Histograma de duração por região + Kruskal-Wallis

In [None]:
from src.analysis.confounds import run_all_confound_checks
import pandas as pd

# Thresholds for each confound check come from the experiment config.
confound_cfg = config['evaluation']['confounds']
confound_results = run_all_confound_checks(
    entries,
    gender_blocking_threshold=confound_cfg['accent_x_gender']['threshold_blocker'],
    duration_practical_diff_s=confound_cfg['accent_x_duration']['practical_diff_s'],
    snr_practical_diff_db=confound_cfg['accent_x_snr']['practical_diff_db'],
)

print("=== CONFOUND ANALYSIS ===")
for result in confound_results:
    # Blocking takes precedence over merely significant.
    # Status markers fixed: they were mis-encoded and printed as garbage.
    if result.is_blocking:
        status = '🔴 BLOCKING'
    elif result.is_significant:
        status = '🟡 SIGNIFICANT'
    else:
        status = '🟢 OK'
    print(f"\n{result.variable_a} × {result.variable_b}: {status}")
    print(f"  Test: {result.test_name}")
    print(f"  Statistic: {result.statistic:.4f}")
    print(f"  p-value: {result.p_value:.6f}")
    print(f"  Effect size ({result.effect_size_name}): {result.effect_size:.4f}")
    print(f"  Interpretation: {result.interpretation}")

# Accent x gender contingency table (with row/column totals)
gender_table = pd.crosstab(
    [e.accent for e in entries],
    [e.gender for e in entries],
    margins=True,
)
print("\n=== ACCENT × GENDER TABLE ===")
print(gender_table)

In [ ]:
# Duration box plot by region + summary stats
import numpy as np
import matplotlib.pyplot as plt

# Group utterance durations by accent region.
durations_by_region = {}
for e in entries:
    durations_by_region.setdefault(e.accent, []).append(e.duration_s)

fig, ax = plt.subplots(figsize=(10, 5))
regions_sorted = sorted(durations_by_region)
ax.boxplot(
    [durations_by_region[r] for r in regions_sorted],
    showfliers=False,
)
# The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed
# to `tick_labels`). Setting the tick labels on the axis works on old and new
# versions alike. Boxes are drawn at positions 1..N by default.
ax.set_xticks(list(range(1, len(regions_sorted) + 1)))
ax.set_xticklabels(regions_sorted)
ax.set_xlabel('Region (IBGE macro-region)')
ax.set_ylabel('Duration (seconds)')
ax.set_title('Duration distribution by accent region')
plt.tight_layout()

# Save before show(): some backends clear the figure once it is displayed.
Path('reports/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('reports/figures/duration_by_region.png', dpi=150)
plt.show()

print('\nDuration summary:')
for r in regions_sorted:
    durs = durations_by_region[r]
    print(f'  {r}: mean={np.mean(durs):.2f}s, std={np.std(durs):.2f}s, '
          f'median={np.median(durs):.2f}s, n={len(durs)}')

## 4. Feature Extraction

Quatro fontes de features para probing:
1. **Acoustic** (MFCC + pitch + energy) — baseline rápido, CPU-only
2. **ECAPA-TDNN** — embeddings de speaker (192-dim)
3. **WavLM** — SSL features por camada
4. **Qwen3-TTS backbone** — features internas do modelo-alvo (GPU)

In [None]:
import numpy as np
from tqdm.auto import tqdm
from src.features.acoustic import extract_acoustic_features, features_to_vector
from src.features.ecapa import extract_ecapa_embedding

# 4.1 Acoustic features (CPU, fast): reuse the cached vectors when present,
# otherwise extract one vector per manifest entry and cache the result.
print('=== Acoustic features ===')
if not cache.has_features('acoustic'):
    acoustic_vectors = {}
    for entry in tqdm(entries, desc='Acoustic'):
        raw_feats = extract_acoustic_features(
            Path(entry.audio_path),
            entry.utt_id,
            n_mfcc=config['features']['acoustic']['n_mfcc'],
        )
        # Vectors are keyed by utterance id.
        acoustic_vectors[entry.utt_id] = features_to_vector(raw_feats)
    cache.save_features('acoustic', acoustic_vectors)
    print(f'Extracted and cached {len(acoustic_vectors)} vectors')
else:
    acoustic_vectors = cache.load_features('acoustic')
    print(f'Loaded {len(acoustic_vectors)} vectors from cache')

# Shape of one vector (the first one in the dict).
first_vector = next(iter(acoustic_vectors.values()))
print(f'Dimension: {first_vector.shape}')
if torch.cuda.is_available():
    vram_now = torch.cuda.memory_allocated()
    vram_peak = torch.cuda.max_memory_allocated()
    print(f'VRAM: {vram_now/1e9:.2f} GB allocated, '
          f'{vram_peak/1e9:.2f} GB peak')

In [None]:
# 4.2 ECAPA-TDNN speaker embeddings: load from cache when available,
# otherwise embed every manifest entry on DEVICE and cache the result.
print('=== ECAPA embeddings ===')
if not cache.has_features('ecapa'):
    ecapa_embeddings = {}
    for entry in tqdm(entries, desc='ECAPA'):
        # One embedding per utterance, keyed by utterance id.
        ecapa_embeddings[entry.utt_id] = extract_ecapa_embedding(
            Path(entry.audio_path), device=DEVICE
        )
    cache.save_features('ecapa', ecapa_embeddings)
    print(f'Extracted and cached {len(ecapa_embeddings)} embeddings')
else:
    ecapa_embeddings = cache.load_features('ecapa')
    print(f'Loaded {len(ecapa_embeddings)} embeddings from cache')

# Shape of one embedding (the first one in the dict).
first_embedding = next(iter(ecapa_embeddings.values()))
print(f'Dimension: {first_embedding.shape}')
if torch.cuda.is_available():
    vram_now = torch.cuda.memory_allocated()
    vram_peak = torch.cuda.max_memory_allocated()
    print(f'VRAM: {vram_now/1e9:.2f} GB allocated, '
          f'{vram_peak/1e9:.2f} GB peak')

In [None]:
# 4.3 WavLM SSL features (layer-wise)
from src.features.ssl import extract_ssl_features

SSL_LAYERS = config['features']['ssl']['layers']
print(f'=== WavLM features (layers {SSL_LAYERS}) ===')

# Check that EVERY layer is cached before loading any of them. Extraction
# yields all layers in one pass, so a single missing layer forces a full
# re-extraction; loading the cached layers first would be wasted I/O and memory.
all_cached = all(
    cache.has_features(f'wavlm_layer_{layer}') for layer in SSL_LAYERS
)

if all_cached:
    ssl_features = {}
    for layer in SSL_LAYERS:
        ssl_features[layer] = cache.load_features(f'wavlm_layer_{layer}')
        print(f'  Layer {layer}: loaded {len(ssl_features[layer])} vectors from cache')
else:
    print('  Extracting from scratch...')
    ssl_features = {layer: {} for layer in SSL_LAYERS}
    for entry in tqdm(entries, desc='WavLM'):
        # One call returns the features of all requested layers for this utterance.
        layer_feats = extract_ssl_features(
            Path(entry.audio_path),
            layers=SSL_LAYERS,
            device=DEVICE,
        )
        for layer_idx, feat_vec in layer_feats.items():
            ssl_features[layer_idx][entry.utt_id] = feat_vec

    for layer in SSL_LAYERS:
        cache.save_features(f'wavlm_layer_{layer}', ssl_features[layer])

print('WavLM extraction complete')
for layer in SSL_LAYERS:
    # Guard against an empty layer: next(iter(...)) would raise StopIteration.
    if len(ssl_features[layer]) > 0:
        dim = next(iter(ssl_features[layer].values())).shape
        print(f'  Layer {layer}: {len(ssl_features[layer])} vectors, dim={dim}')
    else:
        print(f'  Layer {layer}: 0 vectors')
if torch.cuda.is_available():
    print(f'VRAM: {torch.cuda.memory_allocated()/1e9:.2f} GB allocated, '
          f'{torch.cuda.max_memory_allocated()/1e9:.2f} GB peak')

In [None]:
# 4.4 Qwen3-TTS backbone features (layer-wise) — GPU required
from src.features.backbone import extract_backbone_features

BACKBONE_LAYERS = config['features']['backbone']['layers']
NEUTRAL_TEXT = config['features']['backbone']['neutral_text']  # from config, not hardcoded
print(f'=== Backbone features (layers {BACKBONE_LAYERS}) ===')
print(f'Neutral text: "{NEUTRAL_TEXT}"')

# Check that EVERY layer is cached before loading any of them. Extraction
# yields all layers in one pass, so a single missing layer forces a full
# re-extraction; loading the cached layers first would be wasted I/O and memory.
all_cached = all(
    cache.has_features(f'backbone_layer_{layer}') for layer in BACKBONE_LAYERS
)

if all_cached:
    backbone_features = {}
    for layer in BACKBONE_LAYERS:
        backbone_features[layer] = cache.load_features(f'backbone_layer_{layer}')
        print(f'  Layer {layer}: loaded {len(backbone_features[layer])} vectors from cache')
else:
    print('  Extracting from scratch...')
    backbone_features = {layer: {} for layer in BACKBONE_LAYERS}

    for entry in tqdm(entries, desc='Backbone'):
        # The same neutral text is passed for every utterance.
        layer_feats = extract_backbone_features(
            Path(entry.audio_path),
            text=NEUTRAL_TEXT,
            layers=BACKBONE_LAYERS,
            device=DEVICE,
        )
        for layer_idx, feat_vec in layer_feats.items():
            backbone_features[layer_idx][entry.utt_id] = feat_vec

    for layer in BACKBONE_LAYERS:
        # Layers that produced no features are not written to the cache.
        if backbone_features[layer]:
            cache.save_features(f'backbone_layer_{layer}', backbone_features[layer])

print('Backbone extraction complete')
for layer in BACKBONE_LAYERS:
    if backbone_features[layer]:
        dim = next(iter(backbone_features[layer].values())).shape
        print(f'  Layer {layer}: {len(backbone_features[layer])} vectors, dim={dim}')

# Free GPU memory after heaviest extraction (only meaningful when CUDA is present)
if torch.cuda.is_available():
    print(f'VRAM before cleanup: {torch.cuda.memory_allocated()/1e9:.2f} GB allocated, '
          f'{torch.cuda.max_memory_allocated()/1e9:.2f} GB peak')
    torch.cuda.empty_cache()

## 5. Baseline ECAPA Speaker Similarity

Mede similaridade intra-speaker (mesmo speaker, utterances diferentes) e inter-speaker no áudio real.  
Este baseline é referência obrigatória para Stage 2 (preservação de identidade com LoRA).

In [None]:
from src.features.ecapa import compute_speaker_similarity_baseline
from src.evaluation.bootstrap_ci import bootstrap_cosine_similarity

# Group ECAPA embeddings by speaker
speaker_embs = {}
for entry in entries:
    speaker_embs.setdefault(entry.speaker_id, []).append(
        ecapa_embeddings[entry.utt_id]
    )

sim_baseline = compute_speaker_similarity_baseline(speaker_embs)

# Bootstrap confidence intervals for the intra- and inter-speaker similarity values
intra_ci = bootstrap_cosine_similarity(
    np.array(sim_baseline['intra']['values']), seed=SEED
)
inter_ci = bootstrap_cosine_similarity(
    np.array(sim_baseline['inter']['values']), seed=SEED
)

# The "±" sign was mis-encoded and printed as garbage; fixed in the two lines below.
print('=== SPEAKER SIMILARITY BASELINE (ECAPA-TDNN, 192-dim) ===')
print(f"Intra-speaker: {sim_baseline['intra']['mean']:.4f} ± {sim_baseline['intra']['std']:.4f}")
print(f"  CI 95%: [{intra_ci.ci_lower:.4f}, {intra_ci.ci_upper:.4f}]")
print(f"  N pairs: {sim_baseline['intra']['n_pairs']}")
print(f"\nInter-speaker: {sim_baseline['inter']['mean']:.4f} ± {sim_baseline['inter']['std']:.4f}")
print(f"  CI 95%: [{inter_ci.ci_lower:.4f}, {inter_ci.ci_upper:.4f}]")
print(f"  N pairs: {sim_baseline['inter']['n_pairs']}")
# Separation = mean intra-speaker similarity minus mean inter-speaker similarity.
print(f"\nSeparation: {sim_baseline['intra']['mean'] - sim_baseline['inter']['mean']:.4f}")

## 6. Linear Probes

Probe architecture: **Logistic Regression** (linear only — protocol requirement).  

Split assignments (corrected — Achado 1 da auditoria):  
- Accent probe: **speaker-disjoint** split  
- Speaker probe: **stratified** split  
- Leakage A→speaker: **stratified** split (same speakers in train/test)  
- Leakage S→accent: **speaker-disjoint** split (different speakers in test)

In [None]:
from src.evaluation.probes import (
    build_probe_data,
    train_linear_probe,
    evaluate_probe_against_thresholds,
    sweep_regularization,
    train_selectivity_control,
)
from src.evaluation.confusion import plot_confusion_matrix

In [None]:
# 6.1 Accent Probe (per layer, speaker-disjoint split)
# Initialize probe result collectors (reset on re-execution for idempotency)
all_probe_results = []
all_selectivity_results = []

print('=== ACCENT PROBES ===')

# Speaker-disjoint partitions: train fits the probe, val selects the
# regularization strength, test is touched only for the final reported score.
train_entries = split_entries['train']
val_entries = split_entries['val']
test_entries = split_entries['test']

# Feature sources to probe, keyed by name (insertion order drives report order)
feature_sources = {
    'acoustic': acoustic_vectors,
    'ecapa': ecapa_embeddings,
}

# WavLM layers
for layer in SSL_LAYERS:
    feature_sources[f'wavlm_layer_{layer}'] = ssl_features[layer]

# Backbone layers (layers without extracted features are left out)
for layer in BACKBONE_LAYERS:
    if backbone_features[layer]:
        feature_sources[f'backbone_layer_{layer}'] = backbone_features[layer]

C_values = config['probes']['regularization_C']

for source_name, feat_dict in feature_sources.items():
    X_train, y_train = build_probe_data(feat_dict, train_entries, 'accent')
    X_test, y_test = build_probe_data(feat_dict, test_entries, 'accent')

    if len(X_train) == 0 or len(X_test) == 0:
        print(f'  {source_name}: SKIPPED (no data)')
        continue

    # Model selection on the VALIDATION split. Choosing C by its score on the
    # test split and then reporting that same test score is optimistically
    # biased, so the test split is kept out of the sweep.
    X_val, y_val = build_probe_data(feat_dict, val_entries, 'accent')
    if len(X_val) > 0:
        sweep_results = sweep_regularization(
            X_train, y_train, X_val, y_val,
            C_values=C_values,
            probe_name=f'accent_{source_name}',
            feature_source=source_name,
            target='accent',
            split_type='speaker_disjoint',
            seed=SEED,
        )
        best_sweep = max(sweep_results, key=lambda r: r.balanced_accuracy)
        best_C = best_sweep.regularization_C
    else:
        # No validation data for this source: use the configured default
        # rather than tuning on the test split.
        best_C = config['probes']['default_C']
        print(f'  {source_name}: no validation data, using default C={best_C}')

    # Re-train with best C and full CI; this is the only evaluation on test.
    result = train_linear_probe(
        X_train, y_train, X_test, y_test,
        probe_name=f'accent_{source_name}',
        feature_source=source_name,
        target='accent',
        split_type='speaker_disjoint',
        C=best_C,
        seed=SEED,
    )
    all_probe_results.append(result)

    decision = evaluate_probe_against_thresholds(
        result, config['thresholds']['accent_probe']
    )
    print(f'  {source_name}: bal_acc={result.balanced_accuracy:.4f} '
          f'CI=[{result.ci.ci_lower:.4f}, {result.ci.ci_upper:.4f}] '
          f'delta={result.delta_pp:+.1f}pp C={best_C} → {decision}')

# Selectivity control for accent probes: each probe is compared against the
# permuted-label control returned by train_selectivity_control.
print('\n=== SELECTIVITY CONTROL (accent probes) ===')
accent_results = [r for r in all_probe_results if r.target == 'accent' and 'leakage' not in r.probe_name]
for result in accent_results:
    feat_dict = feature_sources[result.feature_source]
    X_train, y_train = build_probe_data(feat_dict, train_entries, 'accent')
    X_test, y_test = build_probe_data(feat_dict, test_entries, 'accent')

    sel = train_selectivity_control(
        X_train, y_train, X_test, y_test,
        real_result=result,
        seed=SEED,
        C=result.regularization_C,
    )
    # Tag the control so it can be matched to its probe in the report.
    sel['probe_name'] = result.probe_name
    sel['feature_source'] = result.feature_source
    all_selectivity_results.append(sel)

    print(f'  {result.feature_source}: real={sel["real_bal_acc"]:.4f} '
          f'permuted={sel["permuted_bal_acc_mean"]:.4f}±{sel["permuted_bal_acc_std"]:.4f} '
          f'selectivity={sel["selectivity_pp"]:+.1f}pp')

In [None]:
# 6.2 Leakage Probes
# Remove previous leakage results for idempotent re-execution
all_probe_results = [r for r in all_probe_results if 'leakage' not in r.probe_name]
all_selectivity_results = [s for s in all_selectivity_results if 'leakage' not in s.get('probe_name', '')]

print('\n=== LEAKAGE PROBES ===')

# --- Leakage A→speaker: Do accent features contain speaker identity? ---
# Uses STRATIFIED split (same speakers in train/test — we need known speakers)
print('Leakage A→speaker (accent feature sources, stratified split):')

strat_train_entries = stratified_entries['train']
strat_test_entries = stratified_entries['test']

# Accent feature sources: WavLM layers, backbone layers, acoustic
# (NOT ECAPA — those are speaker embeddings, not accent features)
leakage_a2s_sources = {}
for layer in SSL_LAYERS:
    leakage_a2s_sources[f'wavlm_layer_{layer}'] = ssl_features[layer]
for layer in BACKBONE_LAYERS:
    if backbone_features[layer]:
        leakage_a2s_sources[f'backbone_layer_{layer}'] = backbone_features[layer]
leakage_a2s_sources['acoustic'] = acoustic_vectors

leakage_a2s_results = []
for source_name, feat_dict in leakage_a2s_sources.items():
    X_train, y_train = build_probe_data(feat_dict, strat_train_entries, 'speaker_id')
    X_test, y_test = build_probe_data(feat_dict, strat_test_entries, 'speaker_id')

    if len(X_train) == 0 or len(X_test) == 0:
        print(f'  {source_name}: SKIPPED (no data)')
        continue

    # Leakage probes use the configured default C (no sweep).
    result = train_linear_probe(
        X_train, y_train, X_test, y_test,
        probe_name=f'leakage_a2s_{source_name}',
        feature_source=source_name,
        target='speaker_id',
        split_type='stratified',
        C=config['probes']['default_C'],
        seed=SEED,
    )
    leakage_a2s_results.append(result)
    all_probe_results.append(result)

    leak_decision = evaluate_probe_against_thresholds(
        result, config['thresholds']['leakage']
    )
    print(f'  {source_name}: bal_acc={result.balanced_accuracy:.4f} '
          f'chance={result.chance_level:.4f} '
          f'delta={result.delta_pp:+.1f}pp → {leak_decision}')

# Selectivity control for A→speaker leakage
print('\n=== SELECTIVITY CONTROL (leakage A→speaker) ===')
for result in leakage_a2s_results:
    feat_dict = leakage_a2s_sources[result.feature_source]
    X_train, y_train = build_probe_data(feat_dict, strat_train_entries, 'speaker_id')
    X_test, y_test = build_probe_data(feat_dict, strat_test_entries, 'speaker_id')

    sel = train_selectivity_control(
        X_train, y_train, X_test, y_test,
        real_result=result,
        seed=SEED,
        C=result.regularization_C,
    )
    # Tag the control so it can be matched to its probe in the report.
    sel['probe_name'] = result.probe_name
    sel['feature_source'] = result.feature_source
    all_selectivity_results.append(sel)

    print(f'  {result.feature_source}: real={sel["real_bal_acc"]:.4f} '
          f'permuted={sel["permuted_bal_acc_mean"]:.4f}±{sel["permuted_bal_acc_std"]:.4f} '
          f'selectivity={sel["selectivity_pp"]:+.1f}pp')

# --- Leakage S→accent: Do speaker features contain accent info? ---
# Uses SPEAKER-DISJOINT split (different speakers in test — tests generalization)
print('\nLeakage S→accent (ECAPA embeddings, speaker-disjoint split):')
X_train, y_train = build_probe_data(ecapa_embeddings, train_entries, 'accent')
X_test, y_test = build_probe_data(ecapa_embeddings, test_entries, 'accent')

# Same empty-data guard as the per-source loops above, so a missing ECAPA
# feature set is reported as skipped instead of failing inside the probe.
if len(X_train) == 0 or len(X_test) == 0:
    print('  ecapa: SKIPPED (no data)')
else:
    leakage_s2a = train_linear_probe(
        X_train, y_train, X_test, y_test,
        probe_name='leakage_s2a_ecapa',
        feature_source='ecapa',
        target='accent',
        split_type='speaker_disjoint',
        C=config['probes']['default_C'],
        seed=SEED,
    )
    all_probe_results.append(leakage_s2a)

    leak_decision = evaluate_probe_against_thresholds(
        leakage_s2a, config['thresholds']['leakage']
    )
    print(f'  bal_acc={leakage_s2a.balanced_accuracy:.4f} '
          f'chance={leakage_s2a.chance_level:.4f} '
          f'delta={leakage_s2a.delta_pp:+.1f}pp → {leak_decision}')

    # Selectivity control for S→accent leakage
    sel_s2a = train_selectivity_control(
        X_train, y_train, X_test, y_test,
        real_result=leakage_s2a,
        seed=SEED,
        C=leakage_s2a.regularization_C,
    )
    sel_s2a['probe_name'] = leakage_s2a.probe_name
    sel_s2a['feature_source'] = leakage_s2a.feature_source
    all_selectivity_results.append(sel_s2a)
    print(f'  selectivity: real={sel_s2a["real_bal_acc"]:.4f} '
          f'permuted={sel_s2a["permuted_bal_acc_mean"]:.4f}±{sel_s2a["permuted_bal_acc_std"]:.4f} '
          f'selectivity={sel_s2a["selectivity_pp"]:+.1f}pp')

In [None]:
# 6.3 Confusion matrix for the accent probe with the highest balanced accuracy
accent_results = [
    probe
    for probe in all_probe_results
    if probe.target == 'accent' and 'leakage' not in probe.probe_name
]
if accent_results:
    # `best` is reused by the robustness cell.
    best = max(accent_results, key=lambda probe: probe.balanced_accuracy)
    print(f'Best accent probe: {best.feature_source} (bal_acc={best.balanced_accuracy:.4f})')

    if best.confusion_matrix is not None:
        figure_path = Path('reports/figures/confusion_matrix_accent.png')
        # Make sure the figures directory exists before plotting into it.
        figure_path.parent.mkdir(parents=True, exist_ok=True)
        plot_confusion_matrix(
            best.confusion_matrix,
            best.confusion_labels,
            title=f'Accent Confusion Matrix ({best.feature_source})',
            output_path=figure_path,
        )
        print('Confusion matrix saved to reports/figures/confusion_matrix_accent.png')

## 7. Robustness (Multiple Seeds)

Repete o melhor probe com 3 seeds para reportar média e desvio.

In [None]:
ROBUSTNESS_SEEDS = config['seed']['robustness_seeds']
print(f'=== ROBUSTNESS CHECK (seeds: {ROBUSTNESS_SEEDS}) ===')

# Recompute the best accent probe here so this cell does not depend on a
# `best` variable left behind by the confusion-matrix cell.
accent_results = [r for r in all_probe_results if r.target == 'accent' and 'leakage' not in r.probe_name]

if accent_results:
    best = max(accent_results, key=lambda r: r.balanced_accuracy)
    best_source = best.feature_source
    best_features = feature_sources[best_source]

    seed_results = []
    for s in ROBUSTNESS_SEEDS:
        set_global_seed(s)
        X_tr, y_tr = build_probe_data(best_features, train_entries, 'accent')
        X_te, y_te = build_probe_data(best_features, test_entries, 'accent')

        # Repeat the SAME probe: reuse the regularization strength selected
        # for the best probe. Omitting C would silently fall back to the
        # trainer's default and measure a different model.
        r = train_linear_probe(
            X_tr, y_tr, X_te, y_te,
            probe_name=f'accent_{best_source}_seed{s}',
            feature_source=best_source,
            target='accent',
            split_type='speaker_disjoint',
            C=best.regularization_C,
            seed=s,
            compute_ci=True,
        )
        seed_results.append(r)
        print(f'  Seed {s}: bal_acc={r.balanced_accuracy:.4f} CI=[{r.ci.ci_lower:.4f}, {r.ci.ci_upper:.4f}]')

    accs = [r.balanced_accuracy for r in seed_results]
    # Skip the summary when no seeds were configured (np.mean([]) would be NaN).
    if accs:
        print(f'\n  Mean: {np.mean(accs):.4f} ± {np.std(accs):.4f}')

    # Restore original seed
    set_global_seed(SEED)

## 8. Gate Decision

Avaliação automática contra os thresholds do protocolo.

In [None]:
from src.evaluation.probes import evaluate_probe_against_thresholds

# Gate decision: evaluate all probes against protocol thresholds
print('=== STAGE 1.5 GATE DECISION ===\n')

# Decisions that count as passing a threshold check.
PASSING_DECISIONS = ('GO', 'GO_CONDITIONAL')

overall = 'NOT_EVALUATED'  # safe default — kept when any evidence is missing

# 1. Accent probes — at least one must reach GO or GO_CONDITIONAL
accent_results = [r for r in all_probe_results if r.target == 'accent' and 'leakage' not in r.probe_name]
accent_decisions = []
for r in accent_results:
    d = evaluate_probe_against_thresholds(r, config['thresholds']['accent_probe'])
    accent_decisions.append((r.feature_source, r.balanced_accuracy, r.delta_pp, d))
    print(f'  Accent {r.feature_source}: bal_acc={r.balanced_accuracy:.4f} delta={r.delta_pp:+.1f}pp → {d}')

accent_pass = any(d in PASSING_DECISIONS for _, _, _, d in accent_decisions)
print(f'\n  Accent gate: {"GO" if accent_pass else "FAIL"} (at least one source above threshold)')

# 2. Leakage probes — all must be GO or GO_CONDITIONAL (below threshold)
leakage_results = [r for r in all_probe_results if 'leakage' in r.probe_name]
leakage_decisions = []
for r in leakage_results:
    d = evaluate_probe_against_thresholds(r, config['thresholds']['leakage'])
    leakage_decisions.append((r.probe_name, r.balanced_accuracy, r.delta_pp, d))
    print(f'  Leakage {r.probe_name}: bal_acc={r.balanced_accuracy:.4f} delta={r.delta_pp:+.1f}pp → {d}')

# all() over an empty sequence is True, so require at least one leakage probe:
# otherwise the leakage gate would pass vacuously when no leakage probe ran.
leakage_pass = bool(leakage_decisions) and all(
    d in PASSING_DECISIONS for _, _, _, d in leakage_decisions
)
print(f'\n  Leakage gate: {"GO" if leakage_pass else "FAIL"} (all probes below threshold)')

# 3. Confounds — no blocking confound
confound_pass = not any(r.is_blocking for r in confound_results)
print(f'  Confound gate: {"GO" if confound_pass else "FAIL"} (no blocking confounds)')

# 4. Overall decision. Evidence that was never produced must not turn into a
# verdict: if any of the three inputs is empty, the gate stays NOT_EVALUATED.
missing_evidence = [
    label
    for label, items in (
        ('accent probes', accent_decisions),
        ('leakage probes', leakage_decisions),
        ('confound checks', confound_results),
    )
    if len(items) == 0
]
if missing_evidence:
    print(f'\n  Gate not evaluated, missing: {", ".join(missing_evidence)}')
elif accent_pass and leakage_pass and confound_pass:
    overall = 'GO'
elif accent_pass and confound_pass:
    overall = 'ADJUST'  # signal exists but leakage needs attention
else:
    overall = 'FAIL'

print(f'\n{"="*50}')
print(f'  STAGE 1.5 GATE: {overall}')
print(f'{"="*50}')

In [None]:
import json
from collections import defaultdict
from datetime import datetime
from src.utils.git import get_commit_hash

commit_hash = get_commit_hash()

# Manifest SHA-256: taken from build_stats when the manifest was built in this
# session, otherwise recomputed from the cached manifest file (None if neither).
if build_stats is not None:
    manifest_sha256 = build_stats.get('manifest_sha256')
elif cache.has_manifest():
    from src.data.manifest import compute_file_hash
    manifest_sha256 = compute_file_hash(cache.get_manifest_path())
else:
    manifest_sha256 = None

# Region stats: reuse the builder's numbers when present, otherwise tally
# speakers and utterances per accent directly from the entries.
region_stats = build_stats.get('regions') if build_stats else None
if not region_stats:
    per_region = defaultdict(lambda: {'speakers': set(), 'utterances': 0})
    for item in entries:
        tally = per_region[item.accent]
        tally['speakers'].add(item.speaker_id)
        tally['utterances'] += 1
    region_stats = {}
    for region_name in sorted(per_region):
        region_stats[region_name] = {
            'n_speakers': len(per_region[region_name]['speakers']),
            'n_utterances': per_region[region_name]['utterances'],
        }


def _similarity_entry(stats, ci):
    """Return the report entry for one similarity group (stats plus CI bounds)."""
    return {
        'mean': stats['mean'],
        'std': stats['std'],
        'ci_lower': ci.ci_lower,
        'ci_upper': ci.ci_upper,
        'n_pairs': stats['n_pairs'],
    }


# CUDA / cuDNN versions are recorded only when a CUDA device is available.
has_cuda = torch.cuda.is_available()
environment_info = {
    'cuda_version': torch.version.cuda if has_cuda else None,
    'cudnn_version': torch.backends.cudnn.version() if has_cuda else None,
    'torch_version': torch.__version__,
}

# One row per confound check.
confound_rows = []
for check in confound_results:
    confound_rows.append({
        'test': check.test_name,
        'variables': f'{check.variable_a} x {check.variable_b}',
        'statistic': check.statistic,
        'p_value': check.p_value,
        'effect_size': check.effect_size,
        'is_blocking': check.is_blocking,
        'interpretation': check.interpretation,
    })

# One row per probe (accent and leakage alike); CI bounds are None when absent.
probe_rows = []
for probe in all_probe_results:
    probe_rows.append({
        'name': probe.probe_name,
        'feature_source': probe.feature_source,
        'target': probe.target,
        'split_type': probe.split_type,
        'balanced_accuracy': probe.balanced_accuracy,
        'f1_macro': probe.f1_macro,
        'chance_level': probe.chance_level,
        'delta_pp': probe.delta_pp,
        'ci_lower': probe.ci.ci_lower if probe.ci else None,
        'ci_upper': probe.ci.ci_upper if probe.ci else None,
        'n_train': probe.n_train,
        'n_test': probe.n_test,
        'n_classes': probe.n_classes,
        'C': probe.regularization_C,
    })

# Assemble the report; key order here is the key order of the JSON file.
report = {
    'experiment': config['experiment']['name'],
    'date': datetime.now().isoformat(),
    'commit_hash': commit_hash,
    'seed': SEED,
    'filter_hash': cache.filter_hash,
    'environment': environment_info,
    'dataset': {
        'name': config['dataset']['name'],
        'manifest_sha256': manifest_sha256,
        'total_entries': len(entries),
        'regions': region_stats,
    },
    'splits': split_info.to_dict(),
    'stratified_splits': stratified_split_info.to_dict(),
    'confounds': confound_rows,
    'speaker_similarity_baseline': {
        'intra': _similarity_entry(sim_baseline['intra'], intra_ci),
        'inter': _similarity_entry(sim_baseline['inter'], inter_ci),
    },
    'probes': probe_rows,
    'selectivity_controls': all_selectivity_results,
    'gate_decision': overall,
}

report_path = Path('reports/stage1_5_report.json')
Path('reports').mkdir(exist_ok=True)
# default=str: values json cannot serialize natively are written via str().
with open(report_path, 'w') as f:
    f.write(json.dumps(report, indent=2, default=str))

for summary_line in (
    f'Report saved to {report_path}',
    f'Filter hash: {cache.filter_hash}',
    f'Total probe results: {len(all_probe_results)}',
    f'Total selectivity controls: {len(all_selectivity_results)}',
    f'Gate decision: {report["gate_decision"]}',
):
    print(summary_line)