# 🚀 Unified Trainer - Phoneme Recognition Benchmark

In [None]:
import os, sys, zipfile, glob, re, shutil

def detect_environment():
    """Return 'colab', 'kaggle' or 'local' for the runtime hosting the notebook.

    Colab is recognised by the COLAB_GPU environment variable or an already
    imported ``google.colab`` module. Kaggle is recognised by a working
    directory containing '/kaggle' or the KAGGLE_KERNEL_RUN_TYPE variable.
    Colab takes precedence; anything else counts as 'local'.
    """
    on_colab = 'COLAB_GPU' in os.environ or 'google.colab' in sys.modules
    if on_colab:
        return 'colab'

    on_kaggle = '/kaggle' in os.getcwd() or 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
    return 'kaggle' if on_kaggle else 'local'

# Detect the runtime once; the setup and utility cells below branch on ENV
# ('colab', 'kaggle' or 'local').
ENV = detect_environment()
print(f'üñ•Ô∏è Ambiente: {ENV.upper()}')

In [None]:
# COLAB setup: mount Drive, unpack the project archive into /content and make
# the project folder both the working directory and the first import location.
if ENV == 'colab':
    from google.colab import drive
    drive.mount('/content/drive')

    ZIP_PATH = '/content/drive/MyDrive/DeepLearning-Phoneme.zip'
    PROJECT_DIR = '/content/DeepLearning-Phoneme'
    DRIVE_BACKUP = '/content/drive/MyDrive/phoneme_checkpoints'

    # Without the archive there is nothing to work on, so stop right away.
    if not os.path.exists(ZIP_PATH):
        raise FileNotFoundError(ZIP_PATH)

    # NOTE(review): presumably the archive contains a top-level
    # 'DeepLearning-Phoneme' folder, since PROJECT_DIR is entered below.
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall('/content')
    print('‚úì Extracted')

    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)

In [None]:
# KAGGLE setup: fetch the repository, link the attached dataset into it and
# make the project folder the working directory / first import location.
if ENV == 'kaggle':
    PROJECT_DIR = '/kaggle/working/pronuncIAtion'
    DRIVE_BACKUP = '/kaggle/working/checkpoints'

    import subprocess
    if not os.path.exists(PROJECT_DIR):
        # A failed clone must stop the cell here: every later step needs the
        # repository, and os.chdir() would otherwise fail with a less helpful error.
        subprocess.run(['git', 'clone', 'https://github.com/maurocarlu/pronuncIAtion.git', PROJECT_DIR], check=True)
    else:
        # Important: the folder already exists, so pull to pick up the latest
        # script fixes. Best effort: a failed pull keeps the existing checkout.
        subprocess.run(['git', '-C', PROJECT_DIR, 'pull', '--rebase'], check=False)

    DATA_INPUT = '/kaggle/input/pronunciation-data/data'
    DATA_TARGET = f'{PROJECT_DIR}/data'

    if os.path.islink(DATA_TARGET):
        print('‚úì Data symlink exists')
    elif not os.path.exists(DATA_INPUT):
        # Never delete the repository's own data folder (or create a dangling
        # link) when the Kaggle dataset is not attached to the notebook.
        print(f'WARNING: dataset not found at {DATA_INPUT}; leaving {DATA_TARGET} untouched')
    else:
        # Replace whatever the clone put at DATA_TARGET with a link to the dataset.
        if os.path.isdir(DATA_TARGET):
            shutil.rmtree(DATA_TARGET)
        elif os.path.exists(DATA_TARGET):
            os.remove(DATA_TARGET)
        os.symlink(DATA_INPUT, DATA_TARGET)

    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)
    print(f'‚úì Kaggle ready: {PROJECT_DIR}')

In [None]:
# LOCAL setup
if ENV == 'local':
    PROJECT_DIR = os.getcwd()
    # Step up one level only when the notebook was started from the project's
    # own 'notebooks' folder. A substring test on the whole path would also
    # fire for unrelated components such as '/home/me/notebooks-old/project'.
    if os.path.basename(PROJECT_DIR) == 'notebooks':
        PROJECT_DIR = os.path.dirname(PROJECT_DIR)
    DRIVE_BACKUP = f'{PROJECT_DIR}/outputs'

# Shared tail for every environment: PROJECT_DIR and DRIVE_BACKUP were set by
# whichever setup cell matched ENV. Ensure the checkpoint folder exists, then
# work (and import) from the project root.
os.makedirs(DRIVE_BACKUP, exist_ok=True)
os.chdir(PROJECT_DIR)
sys.path.insert(0, PROJECT_DIR)
print(f'üìÅ Project: {PROJECT_DIR}')
print(f'üíæ Checkpoints: {DRIVE_BACKUP}')

In [None]:
import subprocess
# Install the dependencies into the interpreter that runs this kernel.
pip_result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'transformers', 'datasets', 'evaluate', 'jiwer', 'soundfile', 'librosa', 'accelerate', 'safetensors', 'bitsandbytes', 'peft', 'torchcodec','scipy', 'scikit-learn'])
if pip_result.returncode != 0:
    # Kept best-effort (no exception), but the failure is no longer silent:
    # a later ImportError can now be traced back to this step.
    print(f'WARNING: pip install exited with code {pip_result.returncode}; some packages may be missing')
import torch
print(f'üî• PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')

## Configuration

In [None]:
# Shared settings for every training/evaluation cell. Paths are derived from
# PROJECT_DIR and DRIVE_BACKUP, which the setup cells define.
CONFIG = dict(
    csv_path=f'{PROJECT_DIR}/data/processed/combined_augmented.csv',
    vocab_path=f'{PROJECT_DIR}/data/processed/vocab.json',
    audio_base=PROJECT_DIR,
    epochs=10,
    output_base=DRIVE_BACKUP,
)

# Report, for every entry whose key mentions 'path', whether it exists on disk.
for key in CONFIG:
    if 'path' not in key:
        continue
    location = CONFIG[key]
    if os.path.exists(location):
        mark = '‚úì'
    else:
        mark = '‚úó'
    print(f'{mark} {key}: {location}')

## Training - Choose Model

In [None]:
import subprocess, shlex, os

def run_streaming(cmd, cwd=None, env=None):
    """Run a command and stream its combined stdout/stderr live in the notebook.

    Args:
        cmd: Argument list handed to ``subprocess.Popen`` (no shell involved).
        cwd: Optional working directory for the child process.
        env: Optional mapping of extra environment variables, layered on top
            of a copy of ``os.environ``.

    Returns:
        The child's exit code; always 0, because any other code raises.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    # Force unbuffered output in Python children so long phases
    # (preprocessing/audio) do not look frozen. A caller-supplied
    # PYTHONUNBUFFERED wins because setdefault() only fills a missing key.
    merged_env = os.environ.copy()
    if env:
        merged_env.update(env)
    merged_env.setdefault('PYTHONUNBUFFERED', '1')

    pretty = ' '.join(shlex.quote(str(x)) for x in cmd)
    print(f'‚ñ∂ {pretty}')
    # The context manager closes the pipe and reaps the child on every exit path.
    with subprocess.Popen(
        cmd,
        cwd=cwd,
        env=merged_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout in one stream
        text=True,
        errors='replace',  # undecodable bytes must not abort a long run
        bufsize=1,  # line-buffered, so each line is shown as soon as it is complete
    ) as proc:
        assert proc.stdout is not None
        try:
            for line in proc.stdout:
                print(line, end='', flush=True)
        except BaseException:
            # E.g. KeyboardInterrupt from the notebook's stop button: do not
            # leave the child running (and holding the GPU) in the background.
            proc.kill()
            raise
        rc = proc.wait()
    if rc != 0:
        raise RuntimeError(f'Command failed with exit code {rc}: {pretty}')
    return rc

In [None]:
# 1. WAV2VEC2-BERT 2.0 (recommended - stable)
# Data paths, output base and epoch count come from CONFIG; batch size is fixed here.
cmd = [sys.executable, 'scripts/training/train_w2v2_bert.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/w2v2_bert"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '8']
run_streaming(cmd)

In [None]:
# 2. MMS (legacy script: mms-300m / older setup)
# For MMS-1B use the "MMS-1B" cell further down (anti-OOM flags + optional QLoRA).
cmd = [sys.executable, 'scripts/training/train_mms.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/mms"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '8']
run_streaming(cmd)

In [None]:
# 3. WHISPER ENCODER
# Same CONFIG-driven arguments as the other training cells, with batch size 4.
cmd = [sys.executable, 'scripts/training/train_whisper_encoder.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/whisper_encoder"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '4']
run_streaming(cmd)

In [None]:
# 4. QWEN2-AUDIO (linear probe - encoder frozen)
# Same CONFIG-driven arguments as the other training cells, with batch size 2.
cmd = [sys.executable, 'scripts/training/train_qwen_audio.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/qwen_audio"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '2']
run_streaming(cmd)

In [None]:
# 5. SPEECHTOKENIZER
# Same CONFIG-driven arguments as the other training cells, with batch size 4.
cmd = [sys.executable, 'scripts/training/train_speechtokenizer.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/speechtokenizer"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '4']
run_streaming(cmd)

In [None]:
# 6. WAV2VEC2 PHONEME (lv60-pmp - domain init)
# NOTE: per the original note, the training script auto-detects a char-level
# vocab and adds an [UNK] sanity check.
if ENV == 'colab':
    extra = ['--save-to-drive']  # passed on Colab only
else:
    extra = []

cmd = [sys.executable, 'scripts/training/train_wav2vec2_phoneme.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
# Dedicated folder so an older run is not overwritten.
cmd += ['--output-dir', f"{CONFIG['output_base']}/wav2vec2_phoneme_charfix"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '4']
cmd += ['--learning-rate', '3e-5']
# Fail fast if the labels are mostly [UNK].
cmd += ['--unk-check-samples', '300']
cmd += ['--max-unk-ratio', '0.05']
cmd += extra
run_streaming(cmd)

## Training - New Benchmark Models

> Queste celle lanciano i nuovi script aggiunti per il benchmark (raw waveform + M-CTC-T mel CTC).

### Sanity run (consigliato)

> Esegue 1 epoca con batch piccoli e salva in `outputs/_sanity/...` per verificare che pipeline/dataloader/model siano ok prima di lanciare run lunghi.

In [None]:
# Sanity: DATA2VEC2 Large (1 epoch, batch size 2, output under _sanity/)
cmd = [sys.executable, 'scripts/training/train_data2vec2.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/_sanity/data2vec2_large"]
cmd += ['--epochs', '1']
cmd += ['--batch-size', '2']
# "state dictionary corrupted" on Kaggle = broken Hugging Face cache:
# uncomment to force a fresh download.
# cmd += ['--force-download']
run_streaming(cmd)

In [None]:
# Sanity: DistilHuBERT (1 epoch)
# Checkpoint: ntu-spml/distilhubert (HubertForCTC)
# Kaggle note: if the HF download is gated/limited, set HF_TOKEN/HUGGINGFACE_HUB_TOKEN
# under "Add-ons -> Secrets".
cmd = [sys.executable, 'scripts/training/train_distilhubert.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/_sanity/distilhubert"]
cmd += ['--epochs', '1']
cmd += ['--batch-size', '4']
cmd += ['--gradient-accumulation-steps', '4']
cmd += ['--learning-rate', '3e-5']
cmd += ['--warmup-ratio', '0.1']
# If the HF cache is corrupted/incomplete, uncomment to force a re-download:
# cmd += ['--force-download']
# The CUDA allocator setting is passed to the child process only.
run_streaming(cmd, env={'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

In [None]:
# Sanity: XLS-R 1B (1 epoch)
# Note: on a 16GB Kaggle GPU you often need to cap the audio duration and/or use QLoRA (4-bit).
cmd = [sys.executable, 'scripts/training/train_xlsr_1b.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/_sanity/xlsr_1b"]
cmd += ['--epochs', '1']
cmd += ['--batch-size', '1']
cmd += ['--eval-batch-size', '1']
cmd += ['--gradient-accumulation-steps', '4']
cmd += ['--max-audio-seconds', '10']
cmd += ['--max-samples', '200']
# CTC stability (guards against blank collapse)
cmd += ['--learning-rate', '1e-5']
cmd += ['--warmup-ratio', '0.2']
cmd += ['--no-spec-augment']
# If it still runs out of memory, try 4-bit QLoRA:
# cmd += ['--load-in-4bit']
# cmd += ['--lora-r', '16', '--lora-alpha', '32', '--lora-dropout', '0.05']
run_streaming(cmd, env={'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

In [None]:
# Sanity: MMS-1B (1 epoch)
# Note: on a 16GB Kaggle GPU you often need to cap the audio duration and/or use QLoRA (4-bit).
cmd = [sys.executable, 'scripts/training/train_mms_1b.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/_sanity/mms_1b"]
cmd += ['--epochs', '1']
cmd += ['--batch-size', '1']
cmd += ['--eval-batch-size', '1']
cmd += ['--gradient-accumulation-steps', '4']
cmd += ['--max-audio-seconds', '10']
cmd += ['--max-samples', '200']
# If it still runs out of memory, try 4-bit QLoRA:
# cmd += ['--load-in-4bit']
# cmd += ['--lora-r', '16', '--lora-alpha', '32', '--lora-dropout', '0.05']
run_streaming(cmd, env={'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

In [None]:
# (Optional) Upgrade Transformers
# Note: the benchmark scripts now use specific classes (NO AutoProcessor/AutoModel).
# If you see an ImportError on MCTCT* or Parakeet*, upgrade and restart the kernel (Colab/Kaggle).
import importlib
import importlib.metadata
import subprocess
import transformers, sys

print('transformers (prima):', transformers.__version__)
# Upgrade through the kernel's own interpreter so the package lands in the
# environment this notebook (and the training subprocesses) actually use.
upgrade = subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', '--upgrade', 'transformers'])
if upgrade.returncode != 0:
    print(f'WARNING: pip exited with code {upgrade.returncode}; transformers was probably not upgraded')

# Read the version from the installed metadata instead of reloading the package:
# a reload refreshes only the top-level module and leaves already-imported
# submodules at the old version.
importlib.invalidate_caches()
installed_version = importlib.metadata.version('transformers')
print('transformers (dopo):', installed_version)
if installed_version != transformers.__version__:
    print('Restart the kernel to use the upgraded transformers inside this notebook.')

In [None]:
# Sanity: M-CTC-T (Meta) (1 epoch)
# Note: per the original note, length bucketing is ON by default in the script;
# pass --no-group-by-length to disable it.
cmd = [sys.executable, 'scripts/training/train_mctct.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/_sanity/mctct_large"]
cmd += ['--epochs', '1']
cmd += ['--batch-size', '1']
# If the checkpoint is gated/private (401), Hugging Face authentication is needed.
# Option A (recommended): set HUGGINGFACE_HUB_TOKEN / HF_TOKEN and re-run the cell.
# Option B: pass the token explicitly (do not commit the notebook with a plain-text token!).
# cmd += ['--hf-token', '<HF_TOKEN>']
# If the checkpoint download is incomplete/corrupted, enable the re-download:
# cmd += ['--force-download']
# Debug: disable length bucketing
# cmd += ['--no-group-by-length']
run_streaming(cmd, env={'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

In [None]:
# Sanity: Parakeet-CTC 1.1B (1 epoch)
# Notes carried over from the original cell (script behaviour is not visible here):
# - the model is always loaded in 4-bit, so bitsandbytes is required;
# - preprocessing stores only the audio (input_values) and the collator computes
#   the features on the fly, which is much faster.
cmd = [sys.executable, 'scripts/training/train_parakeet.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/_sanity/parakeet_ctc_1p1b"]
cmd += ['--epochs', '1']
cmd += ['--batch-size', '1']
cmd += ['--max-samples', '200']
# To force the old behaviour (slower preprocessing):
# cmd += ['--precompute-features']
# If the checkpoint download is incomplete/corrupted, enable the re-download:
# cmd += ['--force-download']
# If the checkpoint is gated/private (401), Hugging Face authentication is needed:
# cmd += ['--hf-token', '<HF_TOKEN>']
run_streaming(cmd)

In [None]:
# 7. DATA2VEC2 Large (raw waveform)
# Full run: epoch count from CONFIG, batch size 4.
cmd = [sys.executable, 'scripts/training/train_data2vec2.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/data2vec2_large"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '4']
run_streaming(cmd)

In [None]:
# 7b. DistilHuBERT (raw waveform)
# Checkpoint: ntu-spml/distilhubert (HubertForCTC)
# Kaggle note: if the HF download is gated/limited, set HF_TOKEN/HUGGINGFACE_HUB_TOKEN
# under "Add-ons -> Secrets".
cmd = [sys.executable, 'scripts/training/train_distilhubert.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/distilhubert"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '4']
cmd += ['--gradient-accumulation-steps', '4']
cmd += ['--learning-rate', '3e-5']
cmd += ['--warmup-ratio', '0.1']
# If the HF cache is corrupted/incomplete, uncomment to force a re-download:
# cmd += ['--force-download']
run_streaming(cmd, env={'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

In [None]:
# 8. XLS-R 1B (raw waveform)
# Kaggle 16GB tips: batch=1 + gradient accumulation + max-audio-seconds. If still OOM: 4-bit QLoRA.
cmd = [sys.executable, 'scripts/training/train_xlsr_1b.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/xlsr_1b"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '1']
cmd += ['--eval-batch-size', '1']
cmd += ['--gradient-accumulation-steps', '4']
cmd += ['--max-audio-seconds', '10']
# CTC stability (guards against blank collapse)
cmd += ['--learning-rate', '1e-5']
cmd += ['--warmup-ratio', '0.2']
cmd += ['--no-spec-augment']
# QLoRA: 4-bit loading is ENABLED in this cell.
# NOTE(review): the original comment called this flag optional while leaving it
# active (the MMS-1B cell keeps it commented out) - confirm which is intended.
cmd += ['--load-in-4bit']
run_streaming(cmd, env={'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

In [None]:
# 9. MMS-1B (raw waveform)
# Kaggle 16GB tips: batch=1 + gradient accumulation + max-audio-seconds. If still OOM: 4-bit QLoRA.
cmd = [sys.executable, 'scripts/training/train_mms_1b.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/mms_1b"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '1']
cmd += ['--eval-batch-size', '1']
cmd += ['--gradient-accumulation-steps', '4']
cmd += ['--max-audio-seconds', '10']
# QLoRA (optional): uncomment to load the model in 4-bit.
# cmd += ['--load-in-4bit']
run_streaming(cmd, env={'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

In [None]:
# 10. M-CTC-T (Meta) (mel spectrogram)
# Full run: epoch count from CONFIG, batch size 2.
cmd = [sys.executable, 'scripts/training/train_mctct.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/mctct_large"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '2']
run_streaming(cmd)

In [None]:
# 11. Parakeet-CTC 1.1B (FastConformer-CTC)
# Note (from the original cell): 4-bit loading + frozen backbone + only the
# ctc_head is trained (linear probing).
cmd = [sys.executable, 'scripts/training/train_parakeet.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', f"{CONFIG['output_base']}/parakeet_ctc_1p1b"]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', '1']
# If the checkpoint download is incomplete/corrupted, enable the re-download:
# cmd += ['--force-download']
# If the checkpoint is gated/private (401), Hugging Face authentication is needed:
# cmd += ['--hf-token', '<HF_TOKEN>']
run_streaming(cmd)

## Evaluation (SpeechOcean762)

Valuta il modello appena allenato su SpeechOcean762 con lo script di benchmark.

In [None]:
# Evaluate the final model written by the "WAV2VEC2 PHONEME" training cell
# (output folder wav2vec2_phoneme_charfix). Edit model_path to evaluate another run.
model_path = f"{CONFIG['output_base']}/wav2vec2_phoneme_charfix/final_model"
cmd = [sys.executable, 'scripts/evaluation/evaluate_speechocean.py']
cmd += ['--model-path', model_path]
cmd += ['--full']
# Uncomment for shorter output:
# cmd += ['--quiet']
run_streaming(cmd)

## Utilities

In [None]:
# Cleanup disk (Kaggle): delete the checkpoint folder and the Hugging Face cache.
# Symlinks and missing paths are left alone.
if ENV == 'kaggle':
    for target in ('/kaggle/working/checkpoints', '/root/.cache/huggingface'):
        if os.path.islink(target) or not os.path.exists(target):
            continue
        shutil.rmtree(target)
        print(f'üóëÔ∏è {target}')

In [None]:
# Download checkpoints as ZIP (Kaggle): archive each model's output folder into
# /kaggle/working/<model>_ckpt.zip so it shows up among the notebook outputs.
if ENV == 'kaggle':
    for model in [
        'w2v2_bert',
        'mms',
        'whisper_encoder',
        'qwen_audio',
        'speechtokenizer',
        # Trained by this notebook but previously missing from the list:
        'wav2vec2_phoneme_charfix',
        'data2vec2_large',
        'distilhubert',
        'xlsr_1b',
        'mms_1b',
        'mctct_large',
        'parakeet_ctc_1p1b',
    ]:
        p = f'{DRIVE_BACKUP}/{model}'
        # Models that were never trained have no folder; skip them silently.
        if os.path.isdir(p):
            shutil.make_archive(f'/kaggle/working/{model}_ckpt', 'zip', p)
            print(f'‚úì {model}')

In [None]:
# Download checkpoints via browser (Colab only).
# For each model, zip its final_model folder (or, failing that, the
# highest-numbered checkpoint-* folder) and trigger a browser download.
if ENV == 'colab':
    from google.colab import files
    import datetime
    
    models_to_download = [
        'w2v2_bert',
        'mms',
        'whisper_encoder',
        'qwen_audio',
        'speechtokenizer',
        # Trained by this notebook but previously missing from the list:
        'wav2vec2_phoneme_charfix',
        'data2vec2_large',
        'distilhubert',
        'xlsr_1b',
        'mms_1b',
        'mctct_large',
        'parakeet_ctc_1p1b',
    ]
    
    for model in models_to_download:
        model_dir = f'{DRIVE_BACKUP}/{model}'
        final_model = f'{model_dir}/final_model'
        
        # Prefer final_model if it exists, otherwise use the latest checkpoint
        if os.path.exists(final_model):
            source_dir = final_model
            zip_name = f'{model}_final'
        elif os.path.exists(model_dir):
            # Order checkpoints by their numeric suffix; a non-numeric suffix
            # sorts first (key 0), so it is never picked over a numbered one.
            checkpoints = sorted(glob.glob(f'{model_dir}/checkpoint-*'), 
                               key=lambda x: int(x.split('-')[-1]) if x.split('-')[-1].isdigit() else 0)
            if checkpoints:
                source_dir = checkpoints[-1]
                zip_name = f'{model}_{os.path.basename(source_dir)}'
            else:
                # Folder exists but holds nothing downloadable yet
                continue
        else:
            # Model was never trained in this session
            continue
        
        # Create the ZIP in /content (faster than writing to Drive)
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')
        zip_path = f'/content/{zip_name}_{timestamp}'
        print(f'üì¶ Zipping {source_dir}...')
        shutil.make_archive(zip_path, 'zip', source_dir)
        zip_file = f'{zip_path}.zip'  # make_archive appends the extension itself
        size_mb = os.path.getsize(zip_file) / (1024*1024)
        print(f'‚úì Created {zip_file} ({size_mb:.1f} MB)')
        
        # Trigger the browser download
        print('‚¨áÔ∏è Starting download...')
        files.download(zip_file)
        print(f'‚úì {model} download complete!')

In [None]:
# === Colab: Evaluate Parakeet on SpeechOcean762 (GPU required) ===
import os, sys, subprocess

# Self-contained Colab setup for the Parakeet evaluation: this cell does not
# rely on variables from earlier cells and stops early if a prerequisite is missing.

# 0) Mount Drive (needed when your checkpoint is stored on Drive)
from google.colab import drive
drive.mount('/content/drive')

# 1) Clone the repo on first run; otherwise pull the latest changes.
#    A failed clone aborts the cell (check=True); a failed pull is tolerated.
REPO_DIR = '/content/DeepLearning-Phoneme'
if not os.path.exists(REPO_DIR):
    subprocess.run(['git', 'clone', 'https://github.com/maurocarlu/pronuncIAtion.git', REPO_DIR], check=True)
else:
    subprocess.run(['git', '-C', REPO_DIR, 'pull', '--rebase'], check=False)

# Work from the repo root so the relative script paths below resolve.
os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)
print('‚úì Repo ready:', os.getcwd())

# 2) Install deps (include bitsandbytes for 4-bit Parakeet).
#    requirements.txt is best effort; the explicit package list must succeed.
subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'], check=False)
subprocess.run([sys.executable, '-m', 'pip', 'install', '-q',
                'transformers', 'datasets', 'evaluate', 'jiwer', 'soundfile', 'librosa',
                'accelerate', 'safetensors', 'bitsandbytes', 'scipy', 'scikit-learn'], check=True)

# 3) Sanity: GPU must be available for quantized Parakeet
import torch
print('PyTorch:', torch.__version__)
print('CUDA available:', torch.cuda.is_available())
if not torch.cuda.is_available():
    raise RuntimeError('Parakeet quantizzato (4/8-bit) richiede GPU in Colab: Runtime -> Change runtime type -> GPU')

# 4) (Optional) HF token if needed for gated models
# os.environ['HF_TOKEN'] = '...'  # not recommended in plain text; prefer Colab secrets/vars
# os.environ['HUGGINGFACE_HUB_TOKEN'] = os.environ['HF_TOKEN']

# 5) Point to your trained model folder
# Example: a checkpoint that was uploaded to Drive
MODEL_PATH = '/content/drive/MyDrive/backup/parakeet/final_model'
assert os.path.exists(MODEL_PATH), f'Model path not found: {MODEL_PATH}'
print('‚úì MODEL_PATH:', MODEL_PATH)

def run_with_full_output(cmd, cwd=None):
    """Run a command to completion, then print everything it wrote.

    Output is captured and printed only after the command ends (stdout first,
    then stderr), so the real error is always visible in the notebook.

    Args:
        cmd: Argument list handed to ``subprocess.run``.
        cwd: Optional working directory for the child process.

    Returns:
        The exit code, which is 0 whenever the function returns normally.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    pretty = ' '.join(map(str, cmd))
    print('‚ñ∂', pretty)
    result = subprocess.run(
        cmd,
        cwd=cwd,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Empty streams are skipped so no blank lines are printed for them.
    for captured in (result.stdout, result.stderr):
        if captured:
            print(captured)
    if result.returncode == 0:
        return result.returncode
    raise RuntimeError(f'Command failed with exit code {result.returncode}: {pretty}')

# 6) Run SpeechOcean evaluation on MODEL_PATH
cmd = [sys.executable, 'scripts/evaluation/evaluate_speechocean.py']
cmd += ['--model-path', MODEL_PATH]
cmd += ['--full']
# Note: --quiet stays disabled so logs and stack traces are fully visible after a crash.
# cmd += ['--quiet']
run_with_full_output(cmd)