## 1. Setup Ambiente

In [None]:
# 1.1 Verifica GPU
!nvidia-smi

import torch
print(f"\n{'='*50}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA disponibile: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# 1.2 Mount Google Drive (for persistent checkpoints)
from google.colab import drive
# The Drive paths used below (ZIP_PATH, DRIVE_OUTPUT_DIR) live under this mount point.
drive.mount('/content/drive')
print("‚úÖ Drive montato")

In [None]:
# 1.3 Estrai progetto da zip
import os
import zipfile
from pathlib import Path

ZIP_PATH = '/content/drive/MyDrive/phonemeRef.zip'
EXTRACT_PATH = '/content/DeepLearning-Phoneme'

if not os.path.exists(ZIP_PATH):
    raise FileNotFoundError(f"‚ùå File non trovato: {ZIP_PATH}\nCarica phonemeRef.zip su Google Drive")

print(f"üì¶ Estrazione {ZIP_PATH}...")
with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall('/content/')

# Trova cartella estratta
extracted = [f for f in os.listdir('/content/') if os.path.isdir(f'/content/{f}') and 'Phoneme' in f]
if extracted:
    EXTRACT_PATH = f'/content/{extracted[0]}'

os.chdir(EXTRACT_PATH)
print(f"‚úÖ Progetto in: {EXTRACT_PATH}")
!ls -la

In [None]:
# 1.4 Install dependencies
!pip install -q transformers datasets evaluate jiwer accelerate soundfile librosa pyyaml tqdm audiomentations

# torchcodec is required for audio decoding in HuggingFace datasets
!pip install -q torchcodec

print("\n‚úÖ Dipendenze installate")

## 2. Preparazione Dataset

**Due opzioni:**
- **A) Usa dataset esistente** (più veloce) - se il dataset augmentato è già nello zip
- **B) Rigenera augmentation** (più lento, ~30 min) - se vuoi ricreare i dati

Esegui le celle della sezione che preferisci.

### 2A. Usa Dataset Esistente

In [None]:
# 2A.1 Load the dataset and print a quick summary
import pandas as pd
from pathlib import Path

# Candidate CSV locations, in priority order
DATASET_OPTIONS = [
    'data/processed/combined_augmented.csv'
]

# First candidate that exists on disk, or None when there is none
DATASET_CSV = next(
    (candidate for candidate in DATASET_OPTIONS if Path(candidate).exists()),
    None,
)

if not DATASET_CSV:
    raise FileNotFoundError("‚ùå Nessun dataset trovato! Esegui la sezione 2B.")

df = pd.read_csv(DATASET_CSV)
print(f"üìä Dataset: {DATASET_CSV}")
print(f"   Samples: {len(df):,}")
print(f"\n=== Distribuzione ===")

# Report value counts for each optional column that is present; only the
# second column gets its own heading.
for column, heading in (
    ('source', None),
    ('is_correct', f"\n=== Corretti vs Errori ==="),
):
    if column not in df.columns:
        continue
    if heading is not None:
        print(heading)
    print(df[column].value_counts())

In [None]:
# 2A.2 IPA quality check (invalid placeholders AND dictionary annotations)
import pandas as pd
import json
import re

df = pd.read_csv(DATASET_CSV)

# Missing transcriptions are turned into '' so that they fail the length check
# below. On the raw column a NaN would pass every mask, because `NaN < 2`
# evaluates to False.
ipa = df['ipa_clean'].fillna('').astype(str)

# 1. Placeholder transcriptions of the form [word]
placeholder_mask = ipa.str.contains(r'^\[.*\]$', regex=True, na=False)

# 2. Dictionary annotations that leaked into the IPA field (adj., n., v., ...)
annotation_mask = ipa.str.contains(
    r'adj\.|n\.|v\.|adv\.|interj\.|for \d|unstressed|stressed|esp\.|also|Brit\.|;',
    regex=True, na=False
)

# 3. Transcriptions that are too short (< 2 characters), including missing ones
short_mask = ipa.str.len() < 2

invalid_mask = placeholder_mask | annotation_mask | short_mask
invalid_count = invalid_mask.sum()
# Guard the percentage against an empty dataset.
invalid_pct = 100 * invalid_count / len(df) if len(df) else 0.0

print(f"üîç Analisi qualit√† IPA:")
print(f"   Totale samples: {len(df):,}")
print(f"   IPA placeholder [word]: {placeholder_mask.sum():,}")
print(f"   IPA con annotazioni (adj., v., etc.): {annotation_mask.sum():,}")
print(f"   IPA troppo corti (<2): {short_mask.sum():,}")
print(f"   Totale invalidi: {invalid_count:,} ({invalid_pct:.1f}%)")

if invalid_count > 0:
    print(f"\n‚ö†Ô∏è ATTENZIONE: {invalid_count} samples hanno IPA problematici!")

    # Show a few offending rows
    print("\n   Esempi di IPA invalidi:")
    examples = df[invalid_mask][['word', 'ipa_clean']].head(10)
    for _, row in examples.iterrows():
        print(f"   - {row['word']}: '{row['ipa_clean']}'")

    # Drop the invalid rows and point DATASET_CSV at the cleaned copy so the
    # following cells pick it up.
    df_clean = df[~invalid_mask].copy()
    DATASET_CLEAN = 'data/processed/phonemeref_clean.csv'
    df_clean.to_csv(DATASET_CLEAN, index=False)
    print(f"\n‚úÖ Dataset pulito salvato: {DATASET_CLEAN}")
    print(f"   Samples validi: {len(df_clean):,}")
    DATASET_CSV = DATASET_CLEAN
else:
    print("\n‚úÖ Tutti gli IPA sono validi!")

In [None]:
# 2A.3 Fix audio paths and drop rows whose audio file is missing
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Reload whichever CSV DATASET_CSV currently points to (assigned by an earlier cell).
df = pd.read_csv(DATASET_CSV)

def fix_path(path_str):
    """Convert a dataset audio path into a path relative to the project root.

    Backslashes are normalised to forward slashes, then the first matching
    rule wins:

    1. Already relative to the project (``data/...``): returned unchanged.
    2. Starts with ``audio/``: placed under ``data/raw/phonemeref_data/``.
    3. Contains ``DeepLearning-Phoneme/``: everything up to and including
       that folder is dropped, and rules 1 and 2 are retried on the remainder.
    4. Contains a ``data`` directory (``/data/``): cut so that it starts at
       ``data/``. The match is on a whole path component, so folder names
       that merely end in ``data`` (e.g. ``phonemeref_data``) are not cut
       in half.
    5. Contains ``/audio/`` but no ``data`` directory: the part from
       ``audio/`` onwards is placed under ``data/raw/phonemeref_data/``.
    6. Was inside the project folder but matched nothing else: placed under
       ``data/raw/phonemeref_data/``.

    Anything else is returned as-is (after slash normalisation).

    Args:
        path_str: Path as stored in the CSV; converted with ``str()``.

    Returns:
        The rewritten path as a string.
    """
    raw_prefix = 'data/raw/phonemeref_data/'
    project_marker = 'DeepLearning-Phoneme/'

    path_str = str(path_str).replace('\\', '/')

    # Strip the machine-specific part in front of the project folder, unless
    # the path is already in one of the two recognised relative forms.
    in_project = False
    if not path_str.startswith(('data/', 'audio/')) and project_marker in path_str:
        path_str = path_str.split(project_marker, 1)[1]
        in_project = True

    if path_str.startswith('data/'):
        return path_str

    # Also covers the 'audio/audio/...' case (the word "audio" itself).
    if path_str.startswith('audio/'):
        return raw_prefix + path_str

    # A real data/ directory takes precedence over the /audio/ rule, so data/
    # paths outside the raw tree are not remapped into it.
    data_idx = path_str.find('/data/')
    if data_idx != -1:
        return path_str[data_idx + 1:]

    audio_idx = path_str.find('/audio/')
    if audio_idx != -1:
        return raw_prefix + path_str[audio_idx + 1:]

    if in_project:
        return raw_prefix + path_str

    return path_str

# Normalise every audio path
df['audio_path'] = df['audio_path'].apply(fix_path)

# === DROP ROWS WHOSE AUDIO FILE IS MISSING ===
print("üîç Verifica esistenza file audio...")

# Work on plain lists: only two columns are needed, so there is no reason to
# build a Series per row with iterrows().
audio_paths = df['audio_path'].tolist()
# 'word' is optional; '?' stands in when the column is absent.
words = df['word'].tolist() if 'word' in df.columns else ['?'] * len(df)

exists_flags = [
    Path(p).exists()
    for p in tqdm(audio_paths, total=len(audio_paths), desc="Checking files")
]
missing_files = [
    (word, path)
    for word, path, found in zip(words, audio_paths, exists_flags)
    if not found
]

# Explicit bool dtype keeps the mask usable for indexing even when it is empty.
existing_mask = pd.Series(exists_flags, index=df.index, dtype=bool)
n_missing = len(missing_files)
n_total = len(df)

# Guard the percentages against an empty dataset.
pct_existing = 100 * (n_total - n_missing) / n_total if n_total else 0.0
pct_missing = 100 * n_missing / n_total if n_total else 0.0

print(f"\nüìä Risultato verifica:")
print(f"   Totale samples: {n_total:,}")
print(f"   File esistenti: {n_total - n_missing:,} ({pct_existing:.1f}%)")
print(f"   File mancanti: {n_missing:,} ({pct_missing:.1f}%)")

if n_missing > 0:
    print(f"\n‚ö†Ô∏è Esempi file mancanti:")
    for word, path in missing_files[:10]:
        print(f"   ‚ùå {word}: {path}")

    # Keep only the rows whose audio file exists
    df_clean = df[existing_mask].copy()
    print(f"\n‚úÖ Rimossi {n_missing} samples con file mancanti")
    print(f"   Dataset finale: {len(df_clean):,} samples")
    df = df_clean
else:
    print("\n‚úÖ Tutti i file audio esistono!")

# Final distribution by source
if 'source' in df.columns:
    print(f"\nüìä Distribuzione finale:")
    print(df['source'].value_counts())

# Save, and point DATASET_CSV at the result for the following cells
DATASET_FINAL = 'data/processed/phonemeref_ready.csv'
df.to_csv(DATASET_FINAL, index=False)
print(f"\n‚úÖ Dataset pronto: {DATASET_FINAL}")
DATASET_CSV = DATASET_FINAL

In [None]:
# 2A.4 Sanity-check vocab.json
import json
from pathlib import Path

vocab_path = Path('data/processed/vocab.json')
if vocab_path.exists():
    with open(vocab_path, encoding='utf-8') as f:
        vocab = json.load(f)

    print(f"üìä Vocab: {len(vocab)} simboli")

    # Special tokens expected in the vocabulary
    special = ['[PAD]', '[UNK]', '|']

    # Stress/length marks and punctuation accepted alongside letters.
    # Written as escapes so the comparison does not depend on how this file
    # was encoded: U+02C8 primary stress, U+02CC secondary stress,
    # U+02D0 length mark, U+02B3 superscript r.
    ipa_marks = ['\u02c8', '\u02cc', '\u02d0', '\u02b3', "'", '-', ' ']

    # Split the remaining symbols into plausible IPA and suspicious ones
    non_ipa = []
    ipa_chars = []
    for char in vocab.keys():
        if char in special:
            continue
        # Single non-ASCII letter: treated as an IPA symbol
        if len(char) == 1 and char.isalpha() and not char.isascii():
            ipa_chars.append(char)
        elif char in ipa_marks:
            ipa_chars.append(char)
        # Single ASCII letter (valid in IPA). An explicit length check is
        # needed: a substring test against the alphabet would also accept ''
        # and multi-letter tokens such as 'ab'.
        elif len(char) == 1 and char.isascii() and char.isalpha():
            ipa_chars.append(char)
        else:
            non_ipa.append(char)

    print(f"\n   Caratteri speciali: {special}")
    print(f"   Caratteri IPA: {len(ipa_chars)}")

    if non_ipa:
        print(f"\n   ‚ö†Ô∏è Caratteri sospetti: {non_ipa}")
        print("      Verifica che siano realmente parte dell'IPA!")
    else:
        print(f"\n   ‚úÖ Tutti i caratteri sembrano IPA validi")

    # Show a sample of symbols (skips the first three entries)
    print(f"\n   Esempio simboli: {list(vocab.keys())[3:15]}...")
else:
    print("‚ùå vocab.json non trovato!")

### 2B. Rigenera Dataset Augmentato (Opzionale)

Esegui questa sezione solo se vuoi ricreare il dataset da zero.

In [None]:
# 2B.1 Regenerate the augmented dataset
# WARNING: takes roughly 10-15 minutes!

RIGENERA = False  # Set to True to regenerate

if RIGENERA:
    print("üîÑ Rigenerazione dataset augmentato...")
    print("   Questo richieder√† ~10-15 minuti.")

    # Acoustic augmentation only (no TTS)
    !python scripts/build_augmented_dataset.py \
        --input data/processed/phonemeref_processed.csv \
        --output data/processed/phonemeref_augmented.csv \
        --num-variants 2

    # Point the following cells at the regenerated CSV.
    DATASET_CSV = 'data/processed/phonemeref_augmented.csv'
    print(f"\n‚úÖ Dataset rigenerato: {DATASET_CSV}")
else:
    print("‚ÑπÔ∏è Rigenerazione saltata. Imposta RIGENERA=True per eseguire.")

## 3. Configurazione Training

In [None]:
# 3.1 Training configuration (tuned for a Tesla T4)
import yaml
import os

# === MAIN SETTINGS ===
DRIVE_OUTPUT_DIR = '/content/drive/MyDrive/phoneme_training_v2'
# DATASET_CSV is defined in section 2

config = {
    'seed': 42,
    'model': {
        'name': 'microsoft/wavlm-large',
        'freeze_feature_encoder': True  # Less VRAM, faster training
    },
    'data': {
        'csv_path': DATASET_CSV,
        'vocab_path': 'data/processed/vocab.json',
        'audio_base_path': '.',
        'val_size': 0.05,
        'test_size': 0.05,
        'sampling_rate': 16000
    },
    'training': {
        'output_dir': DRIVE_OUTPUT_DIR,
        'num_train_epochs': 10,

        # === BATCH (tuned for T4 15GB VRAM) ===
        'per_device_train_batch_size': 8,    # Largest stable size on T4
        'per_device_eval_batch_size': 8,
        'gradient_accumulation_steps': 2,     # Effective batch: 8*2=16

        # === DATALOADER (Colab-safe) ===
        'dataloader_num_workers': 0,          # Avoids memory leaks on Colab
        'dataloader_pin_memory': False,       # Not needed with workers=0

        # === OPTIMIZER ===
        'learning_rate': 3e-5,
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'optim': 'adamw_torch',

        # === GRADIENT CLIPPING (critical for stability) ===
        'max_grad_norm': 1.0,                 # Prevents gradient explosion

        # === MIXED PRECISION (T4 supports FP16) ===
        'fp16': True,                         # ~2x speedup on T4
        'bf16': False,                        # T4 does not support BF16

        # === EVAL/SAVE STRATEGY ===
        'eval_strategy': 'epoch',             # Validate every epoch (more stable)
        'save_strategy': 'epoch',             # Save to Drive every epoch
        'save_total_limit': 3,                # Keep the last 3 checkpoints
        'load_best_model_at_end': True,
        'metric_for_best_model': 'per',
        'greater_is_better': False,

        # === LOGGING ===
        'logging_steps': 100,                 # Log every 100 steps
        'disable_tqdm': False,                # Keep the progress bar

        # === OPTIMISATIONS ===
        'group_by_length': True,              # Group similar-length audio (less padding)
        'gradient_checkpointing': False,      # Not needed with batch 8
    }
}

# Create the output directory
os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)

# Save the config. The configs/ folder is created first so the write does not
# fail when the extracted project does not contain it.
os.makedirs('configs', exist_ok=True)
with open('configs/training_config_colab.yaml', 'w') as f:
    yaml.dump(config, f, default_flow_style=False)

# Summary of the most relevant settings
training_cfg = config['training']
effective_batch = (
    training_cfg['per_device_train_batch_size']
    * training_cfg['gradient_accumulation_steps']
)

print("="*60)
print("üìã CONFIGURAZIONE TRAINING (T4 Optimized)")
print("="*60)
print(f"üìÅ Output: {DRIVE_OUTPUT_DIR}")
print(f"üìä Dataset: {DATASET_CSV}")
print(f"üî¢ Epochs: {training_cfg['num_train_epochs']}")
print(f"üì¶ Batch: {training_cfg['per_device_train_batch_size']} x {training_cfg['gradient_accumulation_steps']} = {effective_batch}")
print(f"üìà LR: {training_cfg['learning_rate']}")
print(f"‚ö° FP16: {training_cfg['fp16']}")
print(f"üîí Gradient Clipping: {training_cfg['max_grad_norm']}")
print(f"üìä TQDM: Enabled")
print("="*60)

In [None]:
# 3.2 Inspect existing checkpoints and training progress
from pathlib import Path
import json


def _checkpoint_order(path):
    """Sort key for 'checkpoint-<step>' folders: numeric step, then name.

    Sorting by name alone is lexicographic and would put 'checkpoint-1000'
    before 'checkpoint-200'. Folders without a numeric suffix sort first.
    """
    suffix = path.name.rsplit('-', 1)[-1]
    step = int(suffix) if suffix.isdigit() else -1
    return (step, path.name)


output_dir = Path(DRIVE_OUTPUT_DIR)
checkpoints = []

if output_dir.exists():
    checkpoints = sorted(
        (
            d for d in output_dir.iterdir()
            if d.is_dir() and d.name.startswith("checkpoint-")
        ),
        key=_checkpoint_order,
    )

print(f"üìÅ Output: {output_dir}")
print("-"*50)

if checkpoints:
    print(f"‚úÖ {len(checkpoints)} checkpoint trovati:")

    last_epoch = 0
    last_step = 0
    best_per = None

    for cp in checkpoints[-3:]:  # Three most recent
        state_file = cp / "trainer_state.json"
        if state_file.exists():
            with open(state_file) as f:
                state = json.load(f)
            epoch = state.get('epoch', 0)
            step = state.get('global_step', 0)
            best = state.get('best_metric', None)
            max_steps = state.get('max_steps', '?')

            last_epoch = max(last_epoch, epoch)
            last_step = max(last_step, step)
            # Compare against None: a metric of 0.0 is a valid value.
            if best is not None:
                best_per = best

            info = f"Epoch {epoch:.1f}, Step {step}/{max_steps}"
            if best is not None:
                info += f", Best PER: {best:.4f}"
            print(f"   üìÅ {cp.name}: {info}")

    # === TRAINING STATUS ===
    print("\n" + "="*50)
    print("üìä ANALISI STATO TRAINING")
    print("="*50)

    target_epochs = config['training']['num_train_epochs']

    print(f"   Ultima epoch salvata: {last_epoch}")
    print(f"   Epochs configurate: {target_epochs}")
    print(f"   Ultimo step: {last_step}")
    if best_per is not None:
        print(f"   Miglior PER: {best_per*100:.2f}%")

    if last_epoch >= target_epochs:
        # Nothing left to train with the configured number of epochs
        print(f"\n‚ö†Ô∏è TRAINING GI√Ä COMPLETATO!")
        print(f"   Il checkpoint √® a epoch {last_epoch}, target √® {target_epochs}")
        print(f"\n   OPZIONI:")
        print(f"   1. Aumenta 'num_train_epochs' nella cella 3.1 (es. {int(target_epochs + 5)})")
        print(f"   2. Elimina i checkpoint per ricominciare da zero:")
        print(f"      !rm -rf {DRIVE_OUTPUT_DIR}/checkpoint-*")
    else:
        remaining = target_epochs - last_epoch
        print(f"\n‚úÖ Training pu√≤ continuare per {remaining:.0f} epoche")
        print(f"   (da epoch {last_epoch} a {target_epochs})")
else:
    print("‚ùå Nessun checkpoint - Training partir√† da zero")

## 4. Training

### 🗑️ Pulizia Checkpoint Corrotti (Se Necessario)

Esegui questa cella SOLO se:
- La diagnostica (cella precedente) ha rilevato un modello rotto
- Vuoi ricominciare il training da zero
- Hai checkpoint con PER > 90%

In [None]:
# 3.3 Delete corrupted checkpoints and final_model
import shutil
from pathlib import Path

# WARNING: set to True to actually delete
DELETE_CORRUPTED = False

if not DELETE_CORRUPTED:
    print("‚ÑπÔ∏è Pulizia disabilitata. Imposta DELETE_CORRUPTED=True per eliminare")
else:
    drive_path = Path(DRIVE_OUTPUT_DIR)

    if not drive_path.exists():
        print(f"‚ÑπÔ∏è Directory non trovata: {drive_path}")
    else:
        print(f"üóëÔ∏è Eliminazione contenuto: {drive_path}")

        # Remove every checkpoint-* directory
        checkpoints = list(drive_path.glob("checkpoint-*"))
        for cp in (entry for entry in checkpoints if entry.is_dir()):
            shutil.rmtree(cp)
            print(f"   ‚úì Eliminato {cp.name}")

        # Remove the exported final model, if there is one
        final_model = drive_path / "final_model"
        if final_model.exists():
            shutil.rmtree(final_model)
            print(f"   ‚úì Eliminato final_model")

        # Anything else in the folder is left untouched
        print(f"\n‚úÖ Pulizia completata!")
        print(f"   Il training ripartir√† da zero alla prossima esecuzione")

In [None]:
# 4.1 Avvia Training
import os
from pathlib import Path

# Silenzia log
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# === OPZIONI ===
RESUME = "auto"  # "auto", True, False

# Rileva checkpoint
drive_path = Path(DRIVE_OUTPUT_DIR)
existing_checkpoints = []
if drive_path.exists():
    existing_checkpoints = sorted([
        d for d in drive_path.iterdir() 
        if d.is_dir() and d.name.startswith("checkpoint-")
    ])

# Determina resume
if RESUME == "auto":
    do_resume = len(existing_checkpoints) > 0
elif RESUME:
    do_resume = True
else:
    do_resume = False

print("="*60)
print("üöÄ AVVIO TRAINING")
print("="*60)
print(f"üìä Dataset: {DATASET_CSV}")
print(f"üìÅ Output: {DRIVE_OUTPUT_DIR}")
print(f"üîÑ Resume: {do_resume}")
if existing_checkpoints:
    print(f"üìç Ultimo: {existing_checkpoints[-1].name}")
print("="*60)

# Comando
cmd = f"python scripts/03_train.py --config configs/training_config_colab.yaml --data-csv {DATASET_CSV}"
if do_resume:
    cmd += " --resume"

!{cmd}

## 5. Valutazione

In [None]:
# 5.1 Plot the training curves
import json
import matplotlib.pyplot as plt
from pathlib import Path


def _checkpoint_order(path):
    """Sort key for 'checkpoint-<step>' folders: numeric step, then name.

    Sorting by name alone is lexicographic, so the "last" checkpoint could be
    an older one (e.g. 'checkpoint-900' after 'checkpoint-1000').
    """
    suffix = path.name.rsplit('-', 1)[-1]
    step = int(suffix) if suffix.isdigit() else -1
    return (step, path.name)


# Locate trainer_state.json: final model first, then the output root
state_path = None
for loc in [
    Path(DRIVE_OUTPUT_DIR) / 'final_model' / 'trainer_state.json',
    Path(DRIVE_OUTPUT_DIR) / 'trainer_state.json',
]:
    if loc.exists():
        state_path = loc
        break

# Otherwise fall back to the most recent checkpoint
if not state_path:
    checkpoints = sorted(
        (
            d for d in Path(DRIVE_OUTPUT_DIR).iterdir()
            if d.is_dir() and d.name.startswith("checkpoint-")
        ),
        key=_checkpoint_order,
    ) if Path(DRIVE_OUTPUT_DIR).exists() else []
    if checkpoints:
        state_path = checkpoints[-1] / 'trainer_state.json'

if state_path and state_path.exists():
    with open(state_path) as f:
        state = json.load(f)

    log_history = state.get('log_history', [])

    # Extract (step, value) pairs for each metric. Training-loss entries are
    # the ones that carry 'loss' but not 'eval_loss'.
    train_loss = [(h['step'], h['loss']) for h in log_history if 'loss' in h and 'eval_loss' not in h]
    eval_loss = [(h['step'], h['eval_loss']) for h in log_history if 'eval_loss' in h]
    eval_per = [(h['step'], h['eval_per']) for h in log_history if 'eval_per' in h]
    grad_norm = [(h['step'], h['grad_norm']) for h in log_history if 'grad_norm' in h and h.get('grad_norm') is not None]

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    if train_loss:
        steps, losses = zip(*train_loss)
        axes[0,0].plot(steps, losses, 'b-', alpha=0.7)
        axes[0,0].set_xlabel('Step')
        axes[0,0].set_ylabel('Loss')
        axes[0,0].set_title('Training Loss')
        axes[0,0].grid(True, alpha=0.3)

    if eval_loss:
        steps, losses = zip(*eval_loss)
        axes[0,1].plot(steps, losses, 'r-o')
        axes[0,1].set_xlabel('Step')
        axes[0,1].set_ylabel('Eval Loss')
        axes[0,1].set_title('Validation Loss')
        axes[0,1].grid(True, alpha=0.3)

    if eval_per:
        steps, pers = zip(*eval_per)
        # PER is stored as a fraction; plot it as a percentage
        axes[1,0].plot(steps, [p*100 for p in pers], 'g-o')
        axes[1,0].set_xlabel('Step')
        axes[1,0].set_ylabel('PER (%)')
        axes[1,0].set_title('Phoneme Error Rate')
        axes[1,0].grid(True, alpha=0.3)

    if grad_norm:
        steps, norms = zip(*grad_norm)
        axes[1,1].plot(steps, norms, 'm-', alpha=0.7)
        axes[1,1].set_xlabel('Step')
        axes[1,1].set_ylabel('Gradient Norm')
        axes[1,1].set_title('Gradient Norm (check for explosion)')
        # Reference line at the clipping threshold used in the training config
        axes[1,1].axhline(y=1.0, color='r', linestyle='--', label='max_grad_norm')
        axes[1,1].legend()
        axes[1,1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(f'{DRIVE_OUTPUT_DIR}/training_curves.png', dpi=150)
    plt.show()

    if eval_per:
        best_per = min(pers)
        print(f"\nüèÜ Migliore PER: {best_per*100:.2f}%")

    # Check for gradient explosion
    if grad_norm:
        max_norm = max(norms)
        if max_norm > 10:
            print(f"\n‚ö†Ô∏è ATTENZIONE: Gradient norm max={max_norm:.2f} - possibile instabilit√†!")
        else:
            print(f"\n‚úÖ Gradient norm stabile (max={max_norm:.2f})")
else:
    print("‚ùå trainer_state.json non trovato - training non ancora completato?")

In [None]:
# 5.2 Pick a few samples for a quick test (prefer originals, accept anything)
# Imported here like in the other cells, so this cell does not depend on an
# earlier cell having imported pandas.
import pandas as pd

df = pd.read_csv(DATASET_CSV)
if 'source' in df.columns:
    original_df = df[df['source'] == 'original']
    if len(original_df) >= 5:
        test_samples = original_df.sample(5, random_state=42)
    elif len(original_df) > 0:
        test_samples = original_df  # Use every original sample available
    else:
        # No 'original' rows: try the non-augmented sources
        non_aug_sources = ['speechocean', 'phonemeref']
        non_aug_df = df[df['source'].isin(non_aug_sources)]
        if len(non_aug_df) >= 5:
            test_samples = non_aug_df.sample(5, random_state=42)
        else:
            # Last resort: any rows, at most five
            test_samples = df.sample(min(5, len(df)), random_state=42)
else:
    # No source information: any rows, at most five
    test_samples = df.sample(min(5, len(df)), random_state=42)

In [None]:
# 5.3 Formal evaluation on the test set
from pathlib import Path
import subprocess


def _checkpoint_order(path):
    """Sort key for 'checkpoint-<step>' folders: numeric step, then name.

    Sorting by name alone is lexicographic, so the fallback below could pick
    an older checkpoint (e.g. 'checkpoint-900' over 'checkpoint-1000').
    """
    suffix = path.name.rsplit('-', 1)[-1]
    step = int(suffix) if suffix.isdigit() else -1
    return (step, path.name)


# Prefer the exported final model; otherwise use the most recent checkpoint
model_path = f'{DRIVE_OUTPUT_DIR}/final_model'
if not Path(model_path).exists():
    output_root = Path(DRIVE_OUTPUT_DIR)
    # A missing output folder simply means there are no checkpoints.
    checkpoints = sorted(
        (
            d for d in output_root.iterdir()
            if d.is_dir() and d.name.startswith("checkpoint-")
        ),
        key=_checkpoint_order,
    ) if output_root.exists() else []
    if checkpoints:
        model_path = str(checkpoints[-1])

print(f"üìä Valutazione modello: {model_path}")

# An argument list (no shell) keeps paths with spaces intact
result = subprocess.run([
    "python", "scripts/04_evaluate.py",
    "--model-path", model_path,
    "--test-csv", DATASET_CSV,
    "--audio-base", "."
], capture_output=False)

if result.returncode != 0:
    print("‚ö†Ô∏è Errore nella valutazione")

## 6. Salvataggio

In [None]:
# 6.1 List what was saved on Drive
from pathlib import Path

print("="*60)
print("üìÅ CONTENUTO SU GOOGLE DRIVE")
print("="*60)
print(f"Cartella: {DRIVE_OUTPUT_DIR}")
print("-"*60)

drive_path = Path(DRIVE_OUTPUT_DIR)
if drive_path.exists():
    for item in sorted(drive_path.iterdir()):
        if item.is_dir():
            # Count regular files only; rglob("*") also yields sub-directories,
            # which would inflate the number reported as "files".
            n_files = sum(1 for entry in item.rglob("*") if entry.is_file())
            print(f"  üìÅ {item.name}/ ({n_files} files)")
        else:
            size_mb = item.stat().st_size / 1e6
            print(f"  üìÑ {item.name} ({size_mb:.1f} MB)")

    final_model = drive_path / "final_model"
    if final_model.exists():
        print("\n‚úÖ Modello finale presente!")
    else:
        print("\n‚ö†Ô∏è Modello finale non trovato")
else:
    print("‚ùå Cartella non trovata")

In [None]:
# 6.2 Build a zip archive for download
import os

FINAL_MODEL = f'{DRIVE_OUTPUT_DIR}/final_model'
# NOTE: this rebinds ZIP_PATH, which cell 1.3 uses for the project archive.
ZIP_PATH = f'{DRIVE_OUTPUT_DIR}/final_model.zip'

if os.path.exists(FINAL_MODEL):
    # Zip from inside the model folder so archive entries are relative to it.
    !cd {FINAL_MODEL} && zip -r {ZIP_PATH} .
    print(f"\n‚úÖ Zip creato: {ZIP_PATH}")
    !ls -lh {ZIP_PATH}
else:
    print("‚ùå Modello finale non trovato")

---
## 🎉 Fine

Il modello è salvato su Google Drive:
- `final_model/` - Modello trainato
- `final_model.zip` - Per download rapido
- `training_curves.png` - Grafici
- `checkpoint-*/` - Checkpoint intermedi