# 🧪 Kaggle Late Fusion Sweep (3 modelli) — SpeechOcean762

Questo notebook:
- carica **3 modelli CTC** da uno **ZIP** (Kaggle Dataset input)
- esegue **Late Fusion** solo su **coppie** e sulla **tripla** (niente valutazione dei singoli)
- sweep su griglie di pesi (step configurabile)
- valuta su **SpeechOcean762** con i 3 task (A/B/C)
- salva **una riga per ogni configurazione** in Excel (append)

Nota: i 3 modelli devono condividere lo stesso tokenizer/vocab (stesso mapping token→id).
Se includi **WavLM Large**, usa la versione *non weighted* come richiesto.

In [None]:
# Install deps (Kaggle) + clone the project repo (so evaluation uses the repo's own helpers)
import os, sys, subprocess
from pathlib import Path

# ---- deps ----
# Best effort on purpose (check=False): a failed install must not stop the
# notebook when the packages are already available in the image.
subprocess.run([sys.executable, '-m', 'pip', 'install', '-q',
                'transformers>=4.38', 'datasets>=2.18', 'jiwer',
                'soundfile', 'librosa', 'scikit-learn', 'scipy', 'safetensors',
                'openpyxl', 'accelerate', 'tqdm'], check=False)

import zipfile, math
import numpy as np
import torch

print('torch:', torch.__version__, 'cuda:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('gpu:', torch.cuda.get_device_name(0))

# ---- clone repo (ON by default on Kaggle) ----
IS_KAGGLE = Path('/kaggle').exists()
SKIP_CLONE = str(os.environ.get('DL_PHONEME_SKIP_CLONE', '')).strip().lower() in ('1', 'true', 'yes')

REPO_URL_DEFAULT = 'https://github.com/maurocarlu/pronuncIAtion.git'
REPO_URL = str(os.environ.get('DL_PHONEME_REPO_URL', REPO_URL_DEFAULT)).strip()

# On Kaggle the repo is cloned so that EXACTLY the same normalisation/mapping
# functions as the project are used; elsewhere the current directory is used.
PROJECT_DIR = (Path('/kaggle/working/pronuncIAtion') if IS_KAGGLE else Path.cwd())

if IS_KAGGLE and (not SKIP_CLONE) and REPO_URL:
    if not PROJECT_DIR.exists():
        print('Cloning repo:', REPO_URL)
        clone_result = subprocess.run(['git', 'clone', REPO_URL, str(PROJECT_DIR)], check=False)
        if clone_result.returncode != 0:
            # Report the failure explicitly instead of letting os.chdir fail later.
            print('WARNING: git clone failed with exit code', clone_result.returncode)
    else:
        print('Repo gi√† presente:', PROJECT_DIR)

    if PROJECT_DIR.is_dir():
        sys.path.insert(0, str(PROJECT_DIR))
        os.chdir(PROJECT_DIR)
        print('CWD:', os.getcwd())
    else:
        print('WARNING: project directory not available:', PROJECT_DIR,
              '- the project sources will not be importable.')
elif not IS_KAGGLE:
    print('Repo clone skipped (not running on Kaggle).')
elif SKIP_CLONE:
    print('Repo clone skipped (DL_PHONEME_SKIP_CLONE=1).')
else:
    # On Kaggle, cloning enabled, but the URL was overridden with an empty value.
    print('Repo clone skipped (DL_PHONEME_REPO_URL is empty).')

In [None]:
# CONFIG — edit these parameters
KAGGLE_INPUT_DATASET_DIR = '/kaggle/input'
ZIP_DATASET_NAME = 'late-fusion'   # <-- Kaggle dataset that contains the zip
ZIP_FILENAME = 'LateFusion'                    # <-- name of the zip file or of the mounted directory

# Evaluation
FULL_DATASET = True   # True = ~2500 examples; False restricts to the first 500 (see the dataset cell)
MAX_SAMPLES = None    # e.g. 300 for debugging
BATCH_SIZE = 4
WEIGHT_STEP = 0.10    # 0.10 (fast) / 0.05 (finer grid but slower)

# Benchmark logging (schema compatible with scripts/evaluation/track_benchmark.py)
TRAINING_DATA = 'Aug_Comb'
ARCHITECTURE = 'Fusion'  # allowed values: ... 'Fusion'
MODEL_NAME_PREFIX = 'LateFusion'

# Output
EXTRACT_DIR = Path('/kaggle/working/models_extracted')
# On Kaggle the workbook lives inside the project directory, otherwise in the CWD.
BENCHMARK_XLSX = (PROJECT_DIR / 'benchmark_results.xlsx') if IS_KAGGLE else Path('benchmark_results.xlsx')
OUTPUT_XLSX = BENCHMARK_XLSX
ZIP_PATH = Path(KAGGLE_INPUT_DATASET_DIR) / ZIP_DATASET_NAME / ZIP_FILENAME

print('ZIP_PATH:', ZIP_PATH)
print('Exists:', ZIP_PATH.exists())
print('OUTPUT_XLSX (track_benchmark schema):', OUTPUT_XLSX)

In [None]:
# Extract the models ZIP (or use the directory directly) and locate the model folders
EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
assert ZIP_PATH.exists(), f'Percorso non trovato: {ZIP_PATH}. Controlla ZIP_DATASET_NAME / ZIP_FILENAME.'

# Kaggle sometimes mounts a DIRECTORY instead of a .zip file: handle both cases
if ZIP_PATH.is_dir():
    print('‚ÑπÔ∏è ZIP_PATH √® una directory. Salto estrazione zip e uso direttamente questa directory.')
    MODELS_ROOT = ZIP_PATH
else:
    MODELS_ROOT = EXTRACT_DIR
    with zipfile.ZipFile(ZIP_PATH, 'r') as z:
        z.extractall(EXTRACT_DIR)
    print('‚úì Extracted to:', EXTRACT_DIR)

# Find the directories that contain a config.json (Hugging Face model folders).
# A set removes duplicates; sorting makes the order deterministic.
candidate_model_dirs = sorted({p.parent for p in MODELS_ROOT.glob('**/config.json')})
print(f'Found {len(candidate_model_dirs)} candidate model dirs')
# Only the first 30 candidates are listed to keep the output short.
for p in candidate_model_dirs[:30]:
    print('  ‚úì', p)
if len(candidate_model_dirs) > 30:
    print('  ...')

In [None]:
# Model directory selection (AUTO when possible, otherwise manual)
# Note: WavLM Large (non weighted) is meant to be one of the 3 models here.

MODEL_DIRS = None
if len(candidate_model_dirs) == 3:
    MODEL_DIRS = [str(p) for p in candidate_model_dirs]
elif len(candidate_model_dirs) > 3:
    # heuristic: take the last 3 entries of the sorted list (final_model dirs often sit deeper)
    MODEL_DIRS = [str(p) for p in candidate_model_dirs[-3:]]

# To force the selection manually, uncomment and edit:
# MODEL_DIRS = [
#   str(MODELS_ROOT / 'hubert_large' / 'final_model_hubert'),
#   str(MODELS_ROOT / 'wavLM_large' / 'final_model_wavlm_large'),
#   str(MODELS_ROOT / 'wav2vec2_phoneme' / 'final_model'),
# ]

# With fewer than 3 candidates MODEL_DIRS is still None and this assertion fires.
assert MODEL_DIRS is not None and len(MODEL_DIRS) == 3, (
    'Imposta MODEL_DIRS manualmente: nello zip/directory ci sono !=3 cartelle modello.'
 )

# Sanity check: every selected directory must exist and contain a config.json.
for i, p in enumerate(MODEL_DIRS, start=1):
    p = Path(p)
    has_config = (p / 'config.json').exists()
    print(f'Model {i}: {p} | exists={p.exists()} | has_config={has_config}')
    assert p.exists() and has_config

In [None]:
# Audio-decoding helpers + metrics (ALIGNED with the repo: normalize_ipa / arpa_to_ipa)
import math
import os
import io
import numpy as np

# --- import the project utilities (works when the repo was cloned on Kaggle or when running inside the repo) ---
try:
    from src.data.normalize_ipa import IPANormalizer, arpa_to_ipa
    _HAS_PROJECT_NORMALIZER = True
except Exception as e:
    # Any import failure switches the notebook to the simplified fallback helpers;
    # both names are set to None so later code can test for them.
    _HAS_PROJECT_NORMALIZER = False
    IPANormalizer = None
    arpa_to_ipa = None
    print('‚ö†Ô∏è Impossibile importare src.data.normalize_ipa:', repr(e))
    print('   Fallback: normalizzazione semplice + mapping ARPA_TO_IPA minimale (meno affidabile).')

def _to_mono_float32(arr: np.ndarray) -> np.ndarray:
    arr = np.asarray(arr)
    if arr.ndim == 1:
        out = arr
    elif arr.ndim == 2:
        if arr.shape[0] <= 8 and arr.shape[1] > arr.shape[0]:
            out = arr.mean(axis=0)
        elif arr.shape[1] <= 8:
            out = arr.mean(axis=1)
        else:
            out = arr.reshape(-1)
    else:
        out = arr.reshape(-1)
    return out.astype(np.float32, copy=False)

def _resample_1d(arr: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    if int(orig_sr) == int(target_sr):
        return arr.astype(np.float32, copy=False)
    try:
        import librosa
        return librosa.resample(arr.astype(np.float32, copy=False), orig_sr=orig_sr, target_sr=target_sr).astype(np.float32)
    except Exception:
        pass
    try:
        from scipy.signal import resample_poly
        g = math.gcd(int(orig_sr), int(target_sr))
        up = int(target_sr) // g
        down = int(orig_sr) // g
        return resample_poly(arr, up=up, down=down).astype(np.float32)
    except Exception:
        ratio = float(target_sr) / float(orig_sr)
        n = int(max(1, round(len(arr) * ratio)))
        x_old = np.linspace(0.0, 1.0, num=len(arr), endpoint=False)
        x_new = np.linspace(0.0, 1.0, num=n, endpoint=False)
        return np.interp(x_new, x_old, arr).astype(np.float32)

def _decode_audio_dict_payload(audio_dict: dict):
    """Gestisce payload HF datasets quando usi Audio(decode=False)."""
    import soundfile as sf
    # 1) preferisci bytes (Kaggle spesso li usa)
    b = audio_dict.get('bytes', None)
    if b is not None:
        if isinstance(b, memoryview):
            b = b.tobytes()
        arr, sr = sf.read(io.BytesIO(b), dtype='float32', always_2d=False)
        return arr, int(sr)
    # 2) fallback: path (locale o remoto)
    p = audio_dict.get('path', None)
    if p is None:
        raise ValueError(f"Audio dict senza 'bytes' e senza 'path': keys={list(audio_dict.keys())}")
    p = str(p)
    if os.path.exists(p):
        arr, sr = sf.read(p, dtype='float32', always_2d=False)
        return arr, int(sr)
    # 3) prova a leggere via fsspec (es. hf:// o path non locale)
    try:
        import fsspec
        with fsspec.open(p, 'rb') as f:
            data = f.read()
        arr, sr = sf.read(io.BytesIO(data), dtype='float32', always_2d=False)
        return arr, int(sr)
    except Exception as e:
        raise FileNotFoundError(f"Audio path non leggibile: {p} ({e})")

def decode_audio_to_16k(audio_data, target_sr: int = 16000):
    """Decode an audio payload into a mono float32 signal at ``target_sr``.

    Supported payloads, checked in this order:
      * a dict with an ``'array'`` entry, or a dict decoded through
        ``_decode_audio_dict_payload``;
      * an object exposing ``get_all_samples()``;
      * an object exposing ``.array``;
      * a callable returning the decoded audio;
      * anything else, passed as ``str(audio_data)`` to ``soundfile.read``.

    Returns:
        ``(samples, sampling_rate)`` where the rate equals ``int(target_sr)``
        after resampling.

    Raises:
        ValueError: no samples could be obtained from the payload.
    """
    arr = None
    sr = target_sr
    if isinstance(audio_data, dict):
        # HF Audio(decode=False) typically yields: {'path': ..., 'bytes': ..., 'sampling_rate': ...?}
        if audio_data.get('array') is not None:
            arr = audio_data['array']
            # A missing or falsy sampling rate is assumed to be target_sr.
            sr = int(audio_data.get('sampling_rate', target_sr) or target_sr)
        else:
            arr, sr = _decode_audio_dict_payload(audio_data)
    elif hasattr(audio_data, 'get_all_samples'):
        samples = audio_data.get_all_samples()
        # The rate attribute may be called sample_rate or sampling_rate.
        sr = int(getattr(samples, 'sample_rate', getattr(samples, 'sampling_rate', target_sr)) or target_sr)
        data = getattr(samples, 'data', samples)
        arr = data.numpy() if hasattr(data, 'numpy') else np.asarray(data)
    elif hasattr(audio_data, 'array'):
        arr = np.asarray(audio_data.array)
        sr = int(getattr(audio_data, 'sampling_rate', target_sr) or target_sr)
    elif callable(audio_data):
        decoded = audio_data()
        data = getattr(decoded, 'data', decoded)
        arr = data.numpy() if hasattr(data, 'numpy') else np.asarray(data)
        sr = int(getattr(decoded, 'sample_rate', getattr(decoded, 'sampling_rate', target_sr)) or target_sr)
    else:
        # Last resort: treat the payload as a file path.
        import soundfile as sf
        arr, sr = sf.read(str(audio_data), dtype='float32', always_2d=False)
    if arr is None:
        raise ValueError(f'Audio payload non decodificabile: {type(audio_data)}')
    arr = _to_mono_float32(arr)
    if int(sr) != int(target_sr):
        arr = _resample_1d(arr, int(sr), int(target_sr))
        sr = target_sr
    return arr, int(sr)

# --- reference IPA: usa la conversione ARPA‚ÜíIPA del progetto ---
ARPA_TO_IPA_FALLBACK = {
    'AA':'…ë','AE':'√¶','AH':' å','AO':'…î','AW':'a ä','AY':'a…™',
    'B':'b','CH':'t É','D':'d','DH':'√∞','EH':'…õ','ER':'…ù','EY':'e…™',
    'F':'f','G':'…°','HH':'h','IH':'…™','IY':'i','JH':'d í',
    'K':'k','L':'l','M':'m','N':'n','NG':'≈ã','OW':'o ä','OY':'…î…™',
    'P':'p','R':'…π','S':'s','SH':' É','T':'t','TH':'Œ∏','UH':' ä','UW':'u',
    'V':'v','W':'w','Y':'j','Z':'z','ZH':' í',
}

def _arpa_to_ipa_fallback(phone: str) -> str:
    if phone is None:
        return ''
    p = str(phone).strip().upper()
    while p and p[-1].isdigit():
        p = p[:-1]
    return ARPA_TO_IPA_FALLBACK.get(p, '')

def extract_phones_from_words(words_list) -> str:
    """Build the reference IPA string from a list of word dicts.

    Each word's ``'phones'`` entries are converted one by one, using the
    project's ``arpa_to_ipa`` when it was imported and the local fallback
    otherwise. Phones that convert to an empty string are skipped, and the
    results are concatenated without separators.
    """
    def _convert(phone):
        if _HAS_PROJECT_NORMALIZER and arpa_to_ipa is not None:
            return arpa_to_ipa(phone, use_corrected=True)
        return _arpa_to_ipa_fallback(phone)

    symbols = []
    for word in words_list or []:
        phones = word.get('phones', []) or []
        for phone in phones:
            symbol = _convert(phone)
            if symbol:
                symbols.append(symbol)
    return ''.join(symbols)

# --- normalisation + CER/PER (consistent with scripts/evaluation/evaluate_speechocean.py) ---
# The project's strict normaliser is used when available; None selects the fallback below.
normalizer = IPANormalizer(mode='strict') if _HAS_PROJECT_NORMALIZER and IPANormalizer is not None else None

def normalize_for_eval(s: str) -> str:
    """Normalise an IPA string before computing edit-distance metrics.

    With the project normaliser available the work is delegated to
    ``normalizer.normalize``. Otherwise a minimal fallback strips surrounding
    whitespace, removes spaces, and drops stress/length marks.

    Args:
        s: raw IPA string; ``None`` is treated as the empty string.

    Returns:
        The normalised string.
    """
    s = '' if s is None else str(s)
    if normalizer is None:
        # Minimal fallback (less faithful to the repo).
        s = s.strip().replace(' ', '')
        # Written as escapes so the marks survive any re-encoding of this file:
        # U+02C8 primary stress, U+02CC secondary stress,
        # U+02D0 length mark, U+00B7 middle dot.
        for ch in ('\u02c8', '\u02cc', '\u02d0', '\u00b7'):
            s = s.replace(ch, '')
        return s
    return normalizer.normalize(s)

def levenshtein(a: str, b: str) -> int:
    """Return the edit distance between ``a`` and ``b``.

    Insertions, deletions and substitutions all cost 1. Only two rows of the
    dynamic-programming table are kept in memory.
    """
    if a == b:
        return 0
    if not a:
        return len(b)
    if not b:
        return len(a)

    width = len(b)
    previous = list(range(width + 1))
    for row, ch_a in enumerate(a, start=1):
        # current[0] is the cost of deleting the first `row` characters of a.
        current = [row] + [0] * width
        for col in range(1, width + 1):
            substitution = previous[col - 1] + (0 if ch_a == b[col - 1] else 1)
            insertion = current[col - 1] + 1
            deletion = previous[col] + 1
            current[col] = min(insertion, deletion, substitution)
        previous = current
    return previous[width]

def cer_components(pred: str, ref: str):
    """Return ``(edit_distance, reference_length)`` for a prediction/reference pair.

    Falsy inputs (``None`` or ``''``) are treated as the empty string.
    """
    hypothesis = pred if pred else ''
    target = ref if ref else ''
    edits = levenshtein(hypothesis, target)
    return edits, len(target)

def cer(pred: str, ref: str) -> float:
    """Return the edit distance divided by the reference length.

    The denominator is clamped to at least 1, so an empty reference does not
    divide by zero.
    """
    edits, ref_len = cer_components(pred, ref)
    denominator = max(1, ref_len)
    return float(edits) / float(denominator)

In [None]:
# Load the SpeechOcean762 dataset + prepare the reference IPA (decode=False avoids needing torchcodec)
from datasets import load_dataset, Audio

ds_dict = load_dataset('mispeech/speechocean762')
# Prefer 'test', then 'validation', otherwise whatever split comes first.
split_name = 'test' if 'test' in ds_dict else ('validation' if 'validation' in ds_dict else list(ds_dict.keys())[0])
ds = ds_dict[split_name]
print('Loaded split:', split_name, 'len=', len(ds))

# IMPORTANT: disable automatic audio decoding (so torchcodec is not required)
if 'audio' in ds.column_names:
    ds = ds.cast_column('audio', Audio(decode=False))
    print('‚úì audio casted to decode=False')

# Optional sub-sampling: first 500 rows when FULL_DATASET is off, then MAX_SAMPLES on top.
if not FULL_DATASET:
    ds = ds.select(range(min(500, len(ds))))
if MAX_SAMPLES is not None:
    ds = ds.select(range(min(int(MAX_SAMPLES), len(ds))))

# Parallel lists indexed by dataset row; valid_indices lists the rows usable for evaluation.
refs = []
human_scores = []
texts = []
valid_indices = []

for i in range(len(ds)):
    ex = ds[i]
    ref_ipa = extract_phones_from_words(ex.get('words', []))
    # Column names are probed with fallbacks: accuracy/score and text/sentence/prompt.
    score = ex.get('accuracy', ex.get('score', None))
    text = ex.get('text', ex.get('sentence', ex.get('prompt', '')))

    refs.append(ref_ipa)
    # A missing score is stored as NaN so the lists stay aligned with the dataset.
    human_scores.append(float(score) if score is not None else float('nan'))
    texts.append(text)

    # For a reliable comparison both a reference and a score are required
    if ref_ipa and score is not None:
        valid_indices.append(i)

print('Valid examples for eval:', len(valid_indices), '/', len(ds))
assert len(valid_indices) > 0, 'Nessun esempio valido: controlla colonne (words/accuracy).'

In [None]:
# Load 3 models + processors
from transformers import AutoModelForCTC, AutoProcessor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Half precision on GPU, full precision on CPU.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Parallel lists: processors[i], models[i] and model_names[i] describe the same model.
processors = []
models = []
model_names = []

for p in MODEL_DIRS:
    p = str(p)
    proc = AutoProcessor.from_pretrained(p)
    m = AutoModelForCTC.from_pretrained(p, torch_dtype=dtype)
    m.to(device)
    m.eval()
    processors.append(proc)
    models.append(m)
    # The display name is the last path component of the model directory.
    model_names.append(Path(p).name)
    print('‚úì loaded:', p)

def _get_vocab(proc):
    tok = getattr(proc, 'tokenizer', None)
    if tok is None:
        return None
    try:
        return tok.get_vocab()
    except Exception:
        return None

# The fused logits are decoded with the first processor only, so every model
# must map tokens to the same ids. Compare each vocabulary against the first.
vocabs = [_get_vocab(p) for p in processors]
if all(v is not None for v in vocabs):
    base = vocabs[0]
    for i, v in enumerate(vocabs[1:], start=2):
        if base != v:
            raise RuntimeError(f'Vocab mismatch tra model 1 e model {i}. Late fusion richiede stesso mapping token->id.')
    print('‚úì tokenizer vocabs match')
else:
    # At least one vocabulary could not be read: warn instead of failing.
    print('‚ö†Ô∏è Non posso verificare vocab con certezza. Assicurati che i 3 modelli usino lo stesso vocab.json/tokenizer.')

In [None]:
# Generate the weight combinations: ONLY pairs + the triple (no single models)
def generate_weight_combos(step: float):
    """Enumerate the fusion weights for three models on a grid of size ``step``.

    Two families are produced, in this order:
      * pairs: one weight is 0 and the other two are ``a`` and ``1 - a`` for
        every grid value ``a`` strictly between 0 and 1;
      * triples: all three weights strictly positive and summing to 1.

    Returns:
        A list of 3-tuples without duplicates (compared after rounding to six
        decimals), in generation order.

    Raises:
        ValueError: ``step`` does not divide 1.0 evenly.
    """
    assert 0 < step < 1
    n_steps = int(round(1.0 / step))
    if abs(n_steps * step - 1.0) > 1e-6:
        raise ValueError('WEIGHT_STEP deve dividere 1.0 esattamente (es. 0.1, 0.05).')

    candidates = []

    # Pairs: (a, 1-a, 0) and its two other placements; a = 0 or 1 would be a single model.
    pair_layouts = ((0, 1, 2), (0, 2, 1), (1, 2, 0))
    for multiple in range(1, n_steps):
        alpha = multiple * step
        if alpha <= 0.0 or alpha >= 1.0:
            continue
        for first, second, unused in pair_layouts:
            weights = [0.0, 0.0, 0.0]
            weights[first] = float(alpha)
            weights[second] = float(1.0 - alpha)
            weights[unused] = 0.0
            candidates.append(tuple(weights))

    # Triples: integer grid on the simplex with every weight > 0.
    for i in range(1, n_steps):
        for j in range(1, n_steps - i):
            k = n_steps - i - j
            if k > 0:
                candidates.append((i / n_steps, j / n_steps, k / n_steps))

    # Drop duplicates while keeping the first occurrence of each combination.
    already_seen = set()
    result = []
    for combo in candidates:
        signature = tuple(round(value, 6) for value in combo)
        if signature not in already_seen:
            already_seen.add(signature)
            result.append(combo)
    return result

weight_combos = generate_weight_combos(WEIGHT_STEP)
print('weight combos:', len(weight_combos))
print('example:', weight_combos[:5])

# Human scores and references restricted to the rows kept for evaluation.
human_valid = np.array([human_scores[i] for i in valid_indices], dtype=np.float32)
refs_valid = [refs[i] for i in valid_indices]

# One accumulator per weight combination: per-sample PER values plus the
# running edit/character totals of the high-quality subset (Task A).
acc = {c: {'pers': [], 'hq_edits': 0, 'hq_chars': 0} for c in weight_combos}
decode_failures = 0

In [None]:
# Inference loop: one forward pass per batch per model, then the weight sweep on that batch
from tqdm.auto import tqdm

def _batch(iterable, n):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items."""
    for i in range(0, len(iterable), n):
        yield iterable[i:i+n]

decode_failures = 0
first_decode_errors = []

with torch.no_grad():
    for batch_ids in tqdm(list(_batch(valid_indices, BATCH_SIZE))):
        audios = []
        ok = []
        for idx in batch_ids:
            try:
                arr, _ = decode_audio_to_16k(ds[int(idx)]['audio'], 16000)
                audios.append(arr)
                ok.append(True)
            except Exception as e:
                # Keep the batch aligned: feed one second of silence and flag
                # the sample so it is scored as a complete miss below.
                audios.append(np.zeros(16000, dtype=np.float32))
                ok.append(False)
                decode_failures += 1
                if len(first_decode_errors) < 5:
                    payload = ds[int(idx)]['audio']
                    keys = list(payload.keys()) if isinstance(payload, dict) else None
                    first_decode_errors.append((int(idx), type(payload).__name__, keys, repr(e)))

        logits_list = []
        for proc, model in zip(processors, models):
            inputs = proc(audios, sampling_rate=16000, return_tensors='pt', padding=True)
            # The processor returns float32 features while the models may be
            # loaded in float16 on GPU; floating tensors are cast to the
            # model's dtype to avoid an input/weight dtype mismatch. Integer
            # tensors (e.g. attention masks) are only moved to the device.
            inputs = {
                k: (v.to(device=device, dtype=model.dtype) if torch.is_floating_point(v) else v.to(device))
                for k, v in inputs.items()
            }
            out = model(**inputs)
            logits_list.append(out.logits)

        # Align the time axis (T) across models by truncating to the shortest.
        min_t = min(l.shape[1] for l in logits_list)
        logits_list = [l[:, :min_t, :].float() for l in logits_list]

        # References do not depend on the weights: normalise them once per batch.
        ref_batch = [normalize_for_eval(refs[int(i)]) for i in batch_ids]
        score_batch = [human_scores[int(i)] for i in batch_ids]

        for combo in weight_combos:
            w1, w2, w3 = combo
            fused = logits_list[0] * float(w1)
            fused = fused + logits_list[1] * float(w2)
            fused = fused + logits_list[2] * float(w3)

            pred_ids = fused.argmax(dim=-1)
            # All models share one vocabulary, so the first processor decodes.
            pred_texts = processors[0].batch_decode(pred_ids)

            for p_raw, r, s, ok_i in zip(pred_texts, ref_batch, score_batch, ok):
                if not r:
                    # Empty reference after normalisation: counted as a full error.
                    acc[combo]['pers'].append(1.0)
                    continue
                if not ok_i:
                    # Undecodable audio: empty hypothesis, PER fixed at 1.0.
                    e, n = cer_components('', r)
                    per_i = 1.0
                else:
                    p = normalize_for_eval(p_raw)
                    # One edit-distance computation serves both the per-sample
                    # PER and the Task A totals.
                    e, n = cer_components(p, r)
                    per_i = float(e) / float(max(1, n))

                acc[combo]['pers'].append(float(per_i))

                # Task A: high-quality subset, human score >= 8
                if float(s) >= 8.0:
                    acc[combo]['hq_edits'] += int(e)
                    acc[combo]['hq_chars'] += int(n)

print('decode failures:', decode_failures)
if first_decode_errors:
    print('First decode errors (up to 5):')
    for idx, typ, keys, err in first_decode_errors:
        print('  - idx=', idx, '| type=', typ, '| keys=', keys, '| err=', err)
# Every combination must have received exactly one PER per valid sample.
n_valid = len(valid_indices)
for c in weight_combos:
    len_p = len(acc[c]['pers'])
    assert len_p == n_valid, f'Length mismatch for combo {c}: {len_p} vs {n_valid}'

In [None]:
# Compute the Task A/B/C metrics and append them to Excel (schema ALIGNED with scripts/evaluation/track_benchmark.py)
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support

# Column order of the benchmark workbook; every written row uses exactly these keys.
TRACK_BENCHMARK_COLUMNS = [
    'Model_Name',
    'Architecture',
    'Training_Data',
    'TaskA_PER_HighQuality',
    'TaskA_Accuracy',
    'TaskB_Pearson_r',
    'TaskB_Spearman_rho',
    'TaskC_AUC_ROC',
    'TaskC_F1_Score',
    'TaskC_Recall_Errors',
    'TaskC_Precision',
    'TaskC_Threshold',
    'Notes',
 ]

def append_row_track_benchmark_xlsx(path: Path, row: dict, sheet_name: str = 'Benchmark Results'):
    """Append ``row`` to the benchmark workbook at ``path``, creating it if needed.

    The existing workbook is read with pandas' default sheet selection,
    restricted to ``TRACK_BENCHMARK_COLUMNS`` (missing columns are added
    empty), extended with the new row and written back as the single sheet
    ``sheet_name``. If the existing file cannot be read, the table silently
    restarts empty.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    table = None
    if path.exists():
        try:
            table = pd.read_excel(path, engine='openpyxl')
        except Exception:
            table = None
    if table is None:
        table = pd.DataFrame(columns=TRACK_BENCHMARK_COLUMNS)

    # Make sure every schema column exists, then enforce the schema order.
    absent = [name for name in TRACK_BENCHMARK_COLUMNS if name not in table.columns]
    for name in absent:
        table[name] = None
    table = table[TRACK_BENCHMARK_COLUMNS]

    # Append the new record.
    new_record = pd.DataFrame([row])
    table = pd.concat([table, new_record], ignore_index=True)

    with pd.ExcelWriter(path, engine='openpyxl') as writer:
        table.to_excel(writer, index=False, sheet_name=sheet_name)

# Candidate PER thresholds for Task C: 0.05 up to (but excluding) 0.50 in steps of 0.01.
thresholds_to_test = np.arange(0.05, 0.50, 0.01)

rows = []
# Run settings shared by every row; stored in the 'Notes' column after the weights.
notes_common = f"weight_step={WEIGHT_STEP} batch={BATCH_SIZE} n_samples={len(valid_indices)} decode_failures={decode_failures}"

def get_active_models_name(weights, names):
    """Build the model name from the models whose weight is positive.

    Three active models give a ``LateFusion-3way`` label listing all of
    ``names``, and two give a ``LateFusion-2way`` label listing the active
    ones. Any other non-zero count is labelled ``Single`` with the first
    active name, and no active model yields ``'Unknown'``.
    """
    active_names = [name for weight, name in zip(weights, names) if weight > 0]
    n_active = len(active_names)
    if n_active == 3:
        return 'LateFusion-3way: ' + '+'.join(names)
    if n_active == 2:
        return 'LateFusion-2way: ' + '+'.join(active_names)
    if active_names:
        return f'Single: {active_names[0]}'
    return 'Unknown'

def get_weights_description(weights, names):
    """Describe the positive weights as ``name=0.00`` pairs joined by ``', '``."""
    described = []
    for name, weight in zip(names, weights):
        if weight > 0:
            described.append(f"{name}={weight:.2f}")
    return ', '.join(described)

# One benchmark row per weight combination.
for combo in weight_combos:
    pers = np.array(acc[combo]['pers'], dtype=np.float32)
    hs = human_valid

    # Task A (High Quality): aggregated edits / reference characters over the
    # samples with human score >= 8; the denominator is clamped to 1.
    hq_chars = max(1, int(acc[combo]['hq_chars']))
    per_high = float(acc[combo]['hq_edits']) / float(hq_chars)  # 0..1
    per_high_pct = per_high * 100.0
    acc_high_pct = (1.0 - per_high) * 100.0

    # Task B: correlation between (1 - PER) and the human score; NaN on failure.
    try:
        pearson_per, _ = pearsonr(1.0 - pers, hs)
    except Exception:
        pearson_per = float('nan')
    try:
        spearman_per, _ = spearmanr(1.0 - pers, hs)
    except Exception:
        spearman_per = float('nan')

    # Task C: detect low-score utterances (human score <= 6) using PER as the score.
    y_true = (hs <= 6.0).astype(int)
    y_prob = pers

    # Choose the PER threshold with the best F1; ties keep the lowest threshold.
    best_f1 = -1.0
    best_threshold = float(thresholds_to_test[0])
    for t in thresholds_to_test:
        y_pred_t = (y_prob >= t).astype(int)
        _, _, f1_t, _ = precision_recall_fscore_support(y_true, y_pred_t, average='binary', zero_division=0)
        if f1_t > best_f1:
            best_f1 = float(f1_t)
            best_threshold = float(t)

    y_pred = (y_prob >= best_threshold).astype(int)
    try:
        auc = float(roc_auc_score(y_true, y_prob))
    except ValueError:
        # roc_auc_score rejected the inputs: report chance level.
        auc = 0.5

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)

    # Build the model name specific to this combination
    combo_model_name = get_active_models_name(combo, model_names)
    # Build the notes with the weight description
    weights_desc = get_weights_description(combo, model_names)
    notes = f"{weights_desc} | {notes_common}"

    row = {
        'Model_Name': combo_model_name,
        'Architecture': ARCHITECTURE,
        'Training_Data': TRAINING_DATA,
        'TaskA_PER_HighQuality': float(per_high_pct),
        'TaskA_Accuracy': float(acc_high_pct),
        'TaskB_Pearson_r': float(pearson_per),
        'TaskB_Spearman_rho': float(spearman_per),
        'TaskC_AUC_ROC': float(auc),
        'TaskC_F1_Score': float(f1),
        'TaskC_Recall_Errors': float(recall),
        'TaskC_Precision': float(precision),
        'TaskC_Threshold': float(best_threshold),
        'Notes': notes,
    }
    rows.append(row)
    # The workbook is rewritten after every combination.
    append_row_track_benchmark_xlsx(OUTPUT_XLSX, row)

print('‚úì wrote (track_benchmark schema):', OUTPUT_XLSX)
# Show the ten best combinations, ranked by AUC and then F1.
df_out = pd.DataFrame(rows)
df_out = df_out.sort_values(['TaskC_AUC_ROC', 'TaskC_F1_Score'], ascending=False)
df_out.head(10)

## ✅ Output

- Excel (schema `track_benchmark.py`): `/kaggle/working/pronuncIAtion/benchmark_results.xlsx`
- In Kaggle: apri il tab **Output** e scarica il file.
- Ogni combinazione di pesi è una riga; i pesi sono in `Notes`.