# 🌍 XLS-R Training (Modello Multilingua)

Questo notebook addestra **XLS-R** (wav2vec2-xls-r-300m), un modello multilingua pre-addestrato su 128 lingue.

**Perché XLS-R?**
- Pre-training su 128 lingue → varietà fonetica maggiore
- Complementa WavLM (focalizzato su inglese)
- Ottimo per speaker non-nativi

**⚠️ IMPORTANTE:**
- XLS-R è un modello grande (300M parametri)
- Richiede ~12GB VRAM (usa T4 o migliore)
- Training più lento di WavLM (~2x)

**🔬 Per l'Ensemble:** Usa il notebook `colab_ensemble.ipynb` dopo aver trainato sia WavLM che XLS-R.

## 1. Setup Ambiente

In [None]:
# 1.1 Verifica GPU
!nvidia-smi

import torch
print(f"\n{'='*50}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA disponibile: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"VRAM: {vram:.1f} GB")
    
    if vram < 12:
        print("\n‚ö†Ô∏è ATTENZIONE: XLS-R richiede ~12GB VRAM")
        print("   Potrebbe essere necessario ridurre batch_size")

In [None]:
# 1.2 Mount Google Drive
# Makes Drive available under /content/drive; later cells read the project
# zip from it and write the training output directory to it.
from google.colab import drive
drive.mount('/content/drive')
print("‚úÖ Drive montato")

In [None]:
# 1.3 Estrai progetto da zip
import os
import zipfile
from pathlib import Path

ZIP_PATH = '/content/drive/MyDrive/phonemeRef.zip'
EXTRACT_PATH = '/content/DeepLearning-Phoneme'

if not os.path.exists(ZIP_PATH):
    raise FileNotFoundError(f"‚ùå File non trovato: {ZIP_PATH}\nCarica phonemeRef.zip su Google Drive")

print(f"üì¶ Estrazione {ZIP_PATH}...")
with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall('/content/')

# Trova cartella estratta
extracted = [f for f in os.listdir('/content/') if os.path.isdir(f'/content/{f}') and 'Phoneme' in f]
if extracted:
    EXTRACT_PATH = f'/content/{extracted[0]}'

os.chdir(EXTRACT_PATH)
print(f"‚úÖ Progetto in: {EXTRACT_PATH}")
!ls -la

In [None]:
# 1.4 Install dependencies
# Quiet (-q) pip installs run through the notebook shell; torchcodec is
# installed in a separate pip invocation.
!pip install -q transformers datasets evaluate jiwer accelerate soundfile librosa pyyaml tqdm audiomentations
!pip install -q torchcodec
print("\n‚úÖ Dipendenze installate")

## 2. Preparazione Dataset

**⚠️ IMPORTANTE:** Usa lo stesso dataset di WavLM per l'ensemble!

In [None]:
# 2.1 Load and inspect the dataset
import pandas as pd
from pathlib import Path

# Candidate CSVs in priority order (MUST be the same one used for WavLM!)
DATASET_OPTIONS = [
    'data/processed/combined_augmented.csv',
    'data/processed/combined_dataset.csv',
    'data/processed/phonemeref_processed.csv',
]

# First candidate that exists on disk, or None when none of them does.
DATASET_CSV = next(
    (candidate for candidate in DATASET_OPTIONS if Path(candidate).exists()),
    None,
)
if DATASET_CSV is None:
    raise FileNotFoundError("‚ùå Nessun dataset trovato!")

df = pd.read_csv(DATASET_CSV)

print(f"üìä Dataset: {DATASET_CSV}")
print(f"   Samples: {len(df):,}")
print(f"\n‚ö†Ô∏è IMPORTANTE: Usa lo stesso dataset di WavLM per l'ensemble!")
print(f"\n=== Distribuzione ===")

# The per-source breakdown is shown only when the CSV has a 'source' column.
if 'source' in df.columns:
    source_counts = df['source'].value_counts()
    print(source_counts)

In [None]:
# 2.2 Check IPA quality
import pandas as pd
import re

df = pd.read_csv(DATASET_CSV)
ipa = df['ipa_clean']

# 0. Missing transcriptions. Empty CSV cells are read as NaN, and NaN slips
#    through every check below (na=False, and NaN < 2 evaluates to False),
#    so they must be flagged explicitly.
missing_mask = ipa.isna()

# 1. Invalid IPA: the whole value is a bracketed placeholder such as [word]
placeholder_mask = ipa.str.contains(r'^\[.*\]$', regex=True, na=False)

# 2. Dictionary-style annotations leaked into the transcription
annotation_mask = ipa.str.contains(
    r'adj\.|n\.|v\.|adv\.|interj\.|for \d|unstressed|stressed|esp\.|also|Brit\.|;',
    regex=True, na=False
)

# 3. Transcriptions shorter than two characters
short_mask = ipa.str.len() < 2

invalid_mask = missing_mask | placeholder_mask | annotation_mask | short_mask
invalid_count = int(invalid_mask.sum())
# Guard the percentage against an empty CSV.
invalid_pct = 100 * invalid_count / len(df) if len(df) else 0.0

print(f"üîç Analisi qualit√† IPA:")
print(f"   Totale samples: {len(df):,}")
print(f"   IPA invalidi: {invalid_count:,} ({invalid_pct:.1f}%)")

if invalid_count > 0:
    # Persist the filtered rows and point the following cells at the new CSV.
    df_clean = df[~invalid_mask].copy()
    DATASET_CLEAN = 'data/processed/phonemeref_clean.csv'
    df_clean.to_csv(DATASET_CLEAN, index=False)
    print(f"\n‚úÖ Dataset pulito: {len(df_clean):,} samples")
    DATASET_CSV = DATASET_CLEAN
else:
    print("\n‚úÖ Tutti gli IPA sono validi!")

In [None]:
# 2.3 Fix paths and drop rows whose audio file is missing
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Re-read whichever CSV DATASET_CSV currently points to.
df = pd.read_csv(DATASET_CSV)

def fix_path(path_str):
    """Rewrite an audio path so it is relative to the project root.

    The value is converted to ``str`` and backslashes become forward slashes.
    The first matching rule then decides the result:

    1. already starts with ``data/``: returned as is;
    2. starts with ``audio/``: prefixed with ``data/raw/phonemeref_data/``;
    3. contains ``/audio/``: everything from the first ``/audio/`` onward is
       appended to ``data/raw/phonemeref_data``;
    4. contains ``data/``: everything before its first occurrence is dropped;
    5. otherwise the normalized string is returned unchanged.
    """
    normalized = str(path_str).replace('\\', '/')

    if normalized.startswith('data/'):
        return normalized
    if normalized.startswith('audio/'):
        return 'data/raw/phonemeref_data/' + normalized

    audio_at = normalized.find('/audio/')
    if audio_at != -1:
        return 'data/raw/phonemeref_data' + normalized[audio_at:]

    data_at = normalized.find('data/')
    if data_at != -1:
        return normalized[data_at:]

    return normalized

# Normalize every audio path with fix_path.
df['audio_path'] = df['audio_path'].apply(fix_path)

# Check that each audio file exists on disk
print("üîç Verifica esistenza file audio...")
existing_mask = pd.Series(
    [Path(audio_file).exists() for audio_file in tqdm(df['audio_path'], total=len(df))],
    index=df.index,
)

n_present = existing_mask.sum()
n_missing = (~existing_mask).sum()
print(f"\nüìä File esistenti: {n_present:,} / File mancanti: {n_missing:,}")

# Keep only the rows whose audio file was found.
if n_missing > 0:
    df = df[existing_mask].copy()
    print(f"‚úÖ Rimossi {n_missing} samples")

# Save the result and point the following cells at it.
DATASET_FINAL = 'data/processed/phonemeref_ready.csv'
df.to_csv(DATASET_FINAL, index=False)
print(f"\n‚úÖ Dataset pronto: {DATASET_FINAL} ({len(df):,} samples)")
DATASET_CSV = DATASET_FINAL

In [None]:
# 2.4 Check vocab.json
import json
from pathlib import Path

vocab_path = Path('data/processed/vocab.json')

# Stop immediately when the vocabulary file is missing.
if not vocab_path.exists():
    raise FileNotFoundError("‚ùå vocab.json non trovato!")

vocab = json.loads(vocab_path.read_text(encoding='utf-8'))
print(f"üìä Vocab: {len(vocab)} simboli")
print(f"‚úÖ Stesso vocab.json di WavLM - output allineati per ensemble")

## 3. Configurazione Training

In [None]:
# 3.1 Training configuration (tuned for XLS-R)
import yaml
import os

DRIVE_OUTPUT_DIR = '/content/drive/MyDrive/phoneme_xlsr'
# Where the YAML config is written; the training cell passes this same
# relative path to the training script via --config.
CONFIG_PATH = 'configs/training_config_xlsr.yaml'

config = {
    'seed': 42,
    'model': {
        'name': 'facebook/wav2vec2-xls-r-300m',
        'freeze_feature_encoder': True
    },
    'data': {
        'csv_path': DATASET_CSV,
        'vocab_path': 'data/processed/vocab.json',
        'audio_base_path': '.',
        'val_size': 0.05,
        'test_size': 0.05,
        'sampling_rate': 16000
    },
    'training': {
        'output_dir': DRIVE_OUTPUT_DIR,
        'num_train_epochs': 10,
        'per_device_train_batch_size': 4,
        'per_device_eval_batch_size': 4,
        'gradient_accumulation_steps': 4,
        'learning_rate': 3e-5,
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'fp16': True,
        'gradient_checkpointing': True,
    }
}

os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)
# The config folder may be absent from the extracted archive; create it so
# the write below cannot fail on a missing directory.
os.makedirs(os.path.dirname(CONFIG_PATH), exist_ok=True)

with open(CONFIG_PATH, 'w') as f:
    yaml.dump(config, f, default_flow_style=False)

print(f"üìÅ Output: {DRIVE_OUTPUT_DIR}")
print(f"üìä Dataset: {DATASET_CSV}")
print(f"üî¢ Epochs: {config['training']['num_train_epochs']}")
print(f"üì¶ Batch: {config['training']['per_device_train_batch_size']} x {config['training']['gradient_accumulation_steps']}")

In [None]:
# 3.2 Check for existing checkpoints
from pathlib import Path
import json


def _checkpoint_step(path):
    """Return the integer after the last '-' in the directory name, or -1.

    Assumes directories are named ``checkpoint-<step>`` (Hugging Face Trainer
    convention, to be confirmed against the training script); names with a
    non-numeric suffix sort first.
    """
    suffix = path.name.rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


output_dir = Path(DRIVE_OUTPUT_DIR)
checkpoints = []

if output_dir.exists():
    # Sort by numeric step: a plain name sort would place
    # "checkpoint-1000" before "checkpoint-500".
    checkpoints = sorted(
        (
            d for d in output_dir.iterdir()
            if d.is_dir() and d.name.startswith("checkpoint-")
        ),
        key=lambda d: (_checkpoint_step(d), d.name),
    )

print(f"üìÅ Output: {output_dir}")
if checkpoints:
    print(f"‚úÖ {len(checkpoints)} checkpoint trovati")
    # Summarize only the three most recent checkpoints.
    for cp in checkpoints[-3:]:
        state_file = cp / "trainer_state.json"
        if state_file.exists():
            with open(state_file) as f:
                state = json.load(f)
            # best_metric can be absent or null; format it as a number only
            # when it is one, otherwise show the '?' placeholder.
            best_metric = state.get('best_metric')
            if isinstance(best_metric, (int, float)):
                per_text = f"{best_metric:.4f}"
            else:
                per_text = '?'
            print(f"   üìÅ {cp.name}: Epoch {state.get('epoch', '?')}, PER {per_text}")
else:
    print("‚ùå Nessun checkpoint - Training da zero")

## 4. Training

In [None]:
# 4.1 Avvia Training XLS-R
import os
from pathlib import Path

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

RESUME = "auto"  # True, False, o "auto"

drive_path = Path(DRIVE_OUTPUT_DIR)
existing_checkpoints = sorted([
    d for d in drive_path.iterdir() 
    if d.is_dir() and d.name.startswith("checkpoint-")
]) if drive_path.exists() else []

do_resume = len(existing_checkpoints) > 0 if RESUME == "auto" else bool(RESUME)

print("="*60)
print("üöÄ AVVIO TRAINING XLS-R")
print("="*60)
print(f"üìä Dataset: {DATASET_CSV}")
print(f"üìÅ Output: {DRIVE_OUTPUT_DIR}")
print(f"üîÑ Resume: {do_resume}")
print("="*60)

cmd = f"python scripts/training/train_xlsr.py --config configs/training_config_xlsr.yaml --data-csv {DATASET_CSV}"
if do_resume:
    cmd += " --resume"

!{cmd}

## 5. Valutazione

In [None]:
# 5.1 Plot the training curves
import json
import matplotlib.pyplot as plt
from pathlib import Path


def _checkpoint_step(path):
    """Return the integer after the last '-' in the directory name, or -1.

    Assumes ``checkpoint-<step>`` directory names (Hugging Face Trainer
    convention, to be confirmed against the training script).
    """
    suffix = path.name.rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Look for trainer_state.json next to the final model first, then at the
# root of the output directory.
state_path = None
for loc in [
    Path(DRIVE_OUTPUT_DIR) / 'final_model_xlsr' / 'trainer_state.json',
    Path(DRIVE_OUTPUT_DIR) / 'trainer_state.json',
]:
    if loc.exists():
        state_path = loc
        break

if not state_path:
    # Fall back to the latest checkpoint. Sort by numeric step: a plain name
    # sort would rank "checkpoint-500" after "checkpoint-1000" and pick an
    # older state file.
    checkpoints = sorted(
        (
            d for d in Path(DRIVE_OUTPUT_DIR).iterdir()
            if d.is_dir() and d.name.startswith("checkpoint-")
        ),
        key=lambda d: (_checkpoint_step(d), d.name),
    ) if Path(DRIVE_OUTPUT_DIR).exists() else []
    if checkpoints:
        state_path = checkpoints[-1] / 'trainer_state.json'

if state_path and state_path.exists():
    with open(state_path) as f:
        state = json.load(f)

    log_history = state.get('log_history', [])
    # Training entries carry 'loss' but no 'eval_loss'; evaluation entries
    # are recognized by the 'eval_per' key.
    train_loss = [(h['step'], h['loss']) for h in log_history if 'loss' in h and 'eval_loss' not in h]
    eval_per = [(h['step'], h['eval_per']) for h in log_history if 'eval_per' in h]

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    if train_loss:
        steps, losses = zip(*train_loss)
        axes[0].plot(steps, losses, 'b-', alpha=0.7)
        axes[0].set_title('Training Loss')
        axes[0].grid(True)

    if eval_per:
        steps, pers = zip(*eval_per)
        # eval_per is scaled by 100 for display as a percentage.
        axes[1].plot(steps, [p*100 for p in pers], 'g-o')
        axes[1].set_title('PER (%)')
        axes[1].grid(True)
        print(f"üèÜ Best PER: {min(pers)*100:.2f}%")

    plt.tight_layout()
    plt.savefig(f'{DRIVE_OUTPUT_DIR}/training_curves.png', dpi=150)
    plt.show()
else:
    print("‚ùå trainer_state.json non trovato")

In [None]:
# 5.2 Valutazione su SpeechOcean762
from pathlib import Path

MODEL_PATH = f"{DRIVE_OUTPUT_DIR}/final_model_xlsr"

if Path(MODEL_PATH).exists():
    print(f"üî¨ Valutazione modello XLS-R: {MODEL_PATH}")
    !python scripts/evaluation/evaluate_speechocean.py --model-path {MODEL_PATH}
else:
    print(f"‚ö†Ô∏è Modello non trovato: {MODEL_PATH}")
    print("   Esegui prima il training!")

## 6. Salvataggio Finale

In [None]:
# 6.1 Inspect the output folder on Drive
from pathlib import Path

print(f"üìÅ CONTENUTO: {DRIVE_OUTPUT_DIR}")
print("-"*50)

drive_path = Path(DRIVE_OUTPUT_DIR)
if drive_path.exists():
    for item in sorted(drive_path.iterdir()):
        if item.is_dir():
            # Count regular files only: rglob("*") also yields
            # subdirectories, which would inflate the "files" figure.
            n_files = sum(1 for p in item.rglob("*") if p.is_file())
            print(f"  üìÅ {item.name}/ ({n_files} files)")
        else:
            size_mb = item.stat().st_size / 1e6
            print(f"  üìÑ {item.name} ({size_mb:.1f} MB)")

    if (drive_path / "final_model_xlsr").exists():
        print("\n‚úÖ Modello finale presente!")
    else:
        print("\n‚ö†Ô∏è Modello finale non trovato")
else:
    # Say so explicitly instead of printing an empty listing.
    print(f"Cartella di output non trovata: {DRIVE_OUTPUT_DIR}")

In [None]:
# 6.2 Crea zip per download
import os

FINAL_MODEL = f'{DRIVE_OUTPUT_DIR}/final_model_xlsr'
ZIP_PATH = f'{DRIVE_OUTPUT_DIR}/final_model_xlsr.zip'

if os.path.exists(FINAL_MODEL):
    !cd {FINAL_MODEL} && zip -r {ZIP_PATH} .
    print(f"\n‚úÖ Zip creato: {ZIP_PATH}")
    !ls -lh {ZIP_PATH}
else:
    print("‚ùå Modello finale non trovato")

---
## 🎉 Fine

Modello salvato su Google Drive:
- `final_model_xlsr/` - Modello trainato
- `final_model_xlsr.zip` - Per download

**Prossimo passo:** Usa `colab_ensemble.ipynb` per Late Fusion con WavLM!