# 🌍 XLS-R Training (Modello Multilingua)

Questo notebook addestra **XLS-R** (wav2vec2-xls-r-300m), un modello multilingua pre-addestrato su 128 lingue.

**Perché XLS-R?**
- Pre-training su 128 lingue → varietà fonetica maggiore
- Complementa WavLM (focalizzato su inglese)
- Ottimo per speaker non-nativi

**⚠️ IMPORTANTE:**
- XLS-R è un modello grande (300M parametri)
- Richiede ~12GB VRAM (usa T4 o migliore)
- Training più lento di WavLM (~2x)

## 1. Setup Ambiente

In [None]:
# 1.1 Verifica GPU
!nvidia-smi

import torch
print(f"\n{'='*50}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA disponibile: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"VRAM: {vram:.1f} GB")
    
    if vram < 12:
        print("\n‚ö†Ô∏è ATTENZIONE: XLS-R richiede ~12GB VRAM")
        print("   Potrebbe essere necessario ridurre batch_size")

In [None]:
# 1.2 Mount Google Drive
# Later cells read the project zip from /content/drive/MyDrive and write
# checkpoints and final models back under the same mount point.
from google.colab import drive
drive.mount('/content/drive')
print("‚úÖ Drive montato")

In [None]:
# 1.3 Estrai progetto da zip
import os
import zipfile
from pathlib import Path

ZIP_PATH = '/content/drive/MyDrive/phonemeRef.zip'
EXTRACT_PATH = '/content/DeepLearning-Phoneme'

if not os.path.exists(ZIP_PATH):
    raise FileNotFoundError(f"‚ùå File non trovato: {ZIP_PATH}\nCarica phonemeRef.zip su Google Drive")

print(f"üì¶ Estrazione {ZIP_PATH}...")
with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall('/content/')

# Trova cartella estratta
extracted = [f for f in os.listdir('/content/') if os.path.isdir(f'/content/{f}') and 'Phoneme' in f]
if extracted:
    EXTRACT_PATH = f'/content/{extracted[0]}'

os.chdir(EXTRACT_PATH)
print(f"‚úÖ Progetto in: {EXTRACT_PATH}")
!ls -la

In [None]:
# 1.4 Install dependencies
# -q keeps pip's output short; versions are not pinned, so whatever pip
# resolves at run time is what gets installed.
!pip install -q transformers datasets evaluate jiwer accelerate soundfile librosa pyyaml tqdm
print("\n‚úÖ Dipendenze installate")

## 2. Preparazione Dataset

In [None]:
# 2.1 Load the dataset
import pandas as pd
from pathlib import Path

# Candidate CSVs in priority order (MUST be the same one used for WavLM!)
DATASET_OPTIONS = [
    'data/processed/combined_augmented.csv',
    'data/processed/combined_dataset.csv',
    'data/processed/phonemeref_processed.csv',
]

# First candidate that exists relative to the current directory, else None.
DATASET_CSV = next(
    (candidate for candidate in DATASET_OPTIONS if Path(candidate).exists()),
    None,
)

if DATASET_CSV is None:
    raise FileNotFoundError("‚ùå Nessun dataset trovato!")

df = pd.read_csv(DATASET_CSV)
n_samples = len(df)
print(f"üìä Dataset: {DATASET_CSV}")
print(f"   Samples: {n_samples:,}")
print("\n‚ö†Ô∏è IMPORTANTE: Usa lo stesso dataset di WavLM per l'ensemble!")

# The per-source breakdown is only shown when the CSV has a 'source' column.
has_source_column = 'source' in df.columns
if has_source_column:
    print("\nüìä Distribuzione:")
    print(df['source'].value_counts())

In [None]:
# 2.2 Check vocab.json
import json

# CRITICAL: XLS-R must use the same vocab as WavLM.
# NOTE(review): this cell only checks that the file exists and parses as JSON;
# it does not compare it against the WavLM vocabulary.
vocab_path = Path('data/processed/vocab.json')
if not vocab_path.exists():
    raise FileNotFoundError("‚ùå vocab.json non trovato!")

with vocab_path.open(encoding='utf-8') as f:
    vocab = json.load(f)

print(f"üìä Vocab: {len(vocab)} simboli")
# Show the first ten keys as a quick sanity check of the symbol set.
sample_symbols = list(vocab.keys())[:10]
print(f"   Esempio: {sample_symbols}")
print("\n‚úÖ Stesso vocab.json di WavLM - output allineati per ensemble")

## 3. Configurazione Training

In [None]:
# 3.1 Training configuration (tuned for XLS-R)
# Builds the config dict, writes it to configs/training_config_xlsr.yaml and
# prints a short summary. Requires DATASET_CSV from the dataset cell.
import yaml
import os

# === MAIN CONFIGURATION ===
# Checkpoints and the final model are written under this Drive folder.
DRIVE_OUTPUT_DIR = '/content/drive/MyDrive/phoneme_xlsr'

# XLS-R is larger, so the batch size is kept small
config = {
    'seed': 42,
    'model': {
        'name': 'facebook/wav2vec2-xls-r-300m',
        'freeze_feature_encoder': True
    },
    'data': {
        'csv_path': DATASET_CSV,
        'vocab_path': 'data/processed/vocab.json',
        'audio_base_path': '.',
        'val_size': 0.05,
        'test_size': 0.05,
        'sampling_rate': 16000
    },
    'training': {
        'output_dir': DRIVE_OUTPUT_DIR,
        'num_train_epochs': 10,
        # Reduced batch size for XLS-R (300M parameters)
        'per_device_train_batch_size': 4,
        'per_device_eval_batch_size': 4,
        'gradient_accumulation_steps': 4,  # Effective batch: 4*4=16
        'dataloader_num_workers': 0,
        'dataloader_pin_memory': False,
        'learning_rate': 3e-5,
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'optim': 'adamw_torch',
        'max_grad_norm': 1.0,
        'fp16': True,
        'bf16': False,
        'eval_strategy': 'epoch',
        'save_strategy': 'epoch',
        'save_total_limit': 2,  # Fewer checkpoints (large model)
        'load_best_model_at_end': True,
        'metric_for_best_model': 'per',
        'greater_is_better': False,
        'logging_steps': 100,
        'disable_tqdm': False,
        'group_by_length': True,
        # Gradient checkpointing to save VRAM
        'gradient_checkpointing': True,
    }
}

os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)

# Save the config. The target folder is created first so the write does not
# fail with FileNotFoundError when the project ships without a configs/ dir.
config_path = 'configs/training_config_xlsr.yaml'
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False)

# Summary of the values most likely to be tweaked between runs.
train_cfg = config['training']
per_device_batch = train_cfg['per_device_train_batch_size']
accumulation_steps = train_cfg['gradient_accumulation_steps']

print("="*60)
print("üìã CONFIGURAZIONE XLS-R (300M)")
print("="*60)
print(f"üìÅ Output: {DRIVE_OUTPUT_DIR}")
print(f"üìä Dataset: {DATASET_CSV}")
print(f"üî¢ Epochs: {train_cfg['num_train_epochs']}")
print(f"üì¶ Batch: {per_device_batch} x {accumulation_steps} = {per_device_batch * accumulation_steps}")
print(f"üìà LR: {train_cfg['learning_rate']}")
print(f"üíæ Gradient Checkpointing: {train_cfg['gradient_checkpointing']}")
print("="*60)

In [None]:
# 3.2 Check for existing checkpoints
# Lists the "checkpoint-*" folders under DRIVE_OUTPUT_DIR and prints the three
# most recent ones.
from pathlib import Path

output_dir = Path(DRIVE_OUTPUT_DIR)


def _checkpoint_step(path):
    """Return the numeric suffix of a ``checkpoint-<step>`` folder, or -1."""
    suffix = path.name[len("checkpoint-"):]
    return int(suffix) if suffix.isdecimal() else -1


checkpoints = []

if output_dir.exists():
    # Sort by numeric step: a plain path sort is lexicographic and would put
    # checkpoint-1000 before checkpoint-200, so [-3:] would not be the latest.
    # The name is a tie-breaker for folders without a numeric suffix.
    checkpoints = sorted(
        (
            d for d in output_dir.iterdir()
            if d.is_dir() and d.name.startswith("checkpoint-")
        ),
        key=lambda d: (_checkpoint_step(d), d.name),
    )

print(f"üìÅ Output: {output_dir}")
if checkpoints:
    print(f"‚úÖ {len(checkpoints)} checkpoint trovati")
    for cp in checkpoints[-3:]:
        print(f"   üìÅ {cp.name}")
else:
    print("‚ùå Nessun checkpoint - Training partir√† da zero")

## 4. Training

In [None]:
# 4.1 Avvia Training XLS-R
import os
from pathlib import Path

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# === OPZIONI ===
RESUME = "auto"

drive_path = Path(DRIVE_OUTPUT_DIR)
existing_checkpoints = []
if drive_path.exists():
    existing_checkpoints = sorted([
        d for d in drive_path.iterdir() 
        if d.is_dir() and d.name.startswith("checkpoint-")
    ])

if RESUME == "auto":
    do_resume = len(existing_checkpoints) > 0
else:
    do_resume = bool(RESUME)

print("="*60)
print("üöÄ AVVIO TRAINING XLS-R (wav2vec2-xls-r-300m)")
print("="*60)
print(f"üìä Dataset: {DATASET_CSV}")
print(f"üìÅ Output: {DRIVE_OUTPUT_DIR}")
print(f"üîÑ Resume: {do_resume}")
print("\n‚ö†Ô∏è XLS-R √® un modello grande - training pi√π lento (~2x WavLM)")
print("="*60)

# Comando
cmd = f"python scripts/train_xlsr.py --config configs/training_config_xlsr.yaml --data-csv {DATASET_CSV}"
if do_resume:
    cmd += " --resume"

!{cmd}

## 5. Valutazione

In [None]:
# 5.1 Valutazione su SpeechOcean762
MODEL_PATH = f"{DRIVE_OUTPUT_DIR}/final_model_xlsr"

if Path(MODEL_PATH).exists():
    print(f"üî¨ Valutazione modello XLS-R: {MODEL_PATH}")
    !python scripts/05_evaluate_speechocean.py --model-path {MODEL_PATH}
else:
    print(f"‚ö†Ô∏è Modello non trovato: {MODEL_PATH}")
    print("   Esegui prima il training!")

## 6. Late Fusion (Ensemble)

In [None]:
# 6.1 Check that both models exist before attempting late fusion
from pathlib import Path

WAVLM_PATH = '/content/drive/MyDrive/phoneme_wavlm_weighted/final_model_weighted'
XLSR_PATH = "{}/final_model_xlsr".format(DRIVE_OUTPUT_DIR)

wavlm_exists = Path(WAVLM_PATH).exists()
xlsr_exists = Path(XLSR_PATH).exists()

# Marker printed next to each path: present vs missing.
status_icon = {True: '‚úÖ', False: '‚ùå'}

print("üìä Stato modelli per Ensemble:")
print(f"   WavLM Weighted: {status_icon[wavlm_exists]} {WAVLM_PATH}")
print(f"   XLS-R:          {status_icon[xlsr_exists]} {XLSR_PATH}")

both_ready = wavlm_exists and xlsr_exists
if not both_ready:
    print("\n‚ö†Ô∏è Addestra entrambi i modelli prima del fusion!")
else:
    print("\nüéâ Entrambi i modelli pronti per Late Fusion!")

In [None]:
# 6.2 Run late fusion
# Calls scripts/evaluate_fusion.py on the two trained models with a single
# fusion weight. Nothing is run unless both model folders exist.
from pathlib import Path

WAVLM_PATH = '/content/drive/MyDrive/phoneme_wavlm_weighted/final_model_weighted'
XLSR_PATH = f"{DRIVE_OUTPUT_DIR}/final_model_xlsr"

# Weight for WavLM (0.5 = plain average, 0.6 = favours WavLM)
FUSION_WEIGHT = 0.5

if Path(WAVLM_PATH).exists() and Path(XLSR_PATH).exists():
    print("üî¨ Late Fusion Evaluation")
    print(f"   Peso WavLM: {FUSION_WEIGHT}")
    print(f"   Peso XLS-R: {1-FUSION_WEIGHT}")
    !python scripts/evaluate_fusion.py \
        --model-a {WAVLM_PATH} \
        --model-b {XLSR_PATH} \
        --weight {FUSION_WEIGHT}
else:
    print("‚ö†Ô∏è Uno o entrambi i modelli mancano")
    print("   Esegui prima i training separati!")

In [None]:
# 6.3 Ottimizza peso fusion
from pathlib import Path

WAVLM_PATH = '/content/drive/MyDrive/phoneme_wavlm_weighted/final_model_weighted'
XLSR_PATH = f"{DRIVE_OUTPUT_DIR}/final_model_xlsr"

if Path(WAVLM_PATH).exists() and Path(XLSR_PATH).exists():
    print("üîç Ricerca peso ottimale...\n")
    
    for w in [0.3, 0.4, 0.5, 0.6, 0.7]:
        print(f"{'='*60}")
        print(f"Peso WavLM: {w}, XLS-R: {1-w}")
        print(f"{'='*60}")
        !python scripts/evaluate_fusion.py \
            --model-a {WAVLM_PATH} \
            --model-b {XLSR_PATH} \
            --weight {w} \
            --quiet
        print()
else:
    print("‚ö†Ô∏è Modelli mancanti")

## 7. Salvataggio Finale

In [None]:
# 7.1 Copy the final model to its permanent Drive location
import shutil
from pathlib import Path

# NOTE(review): despite its name, LOCAL_MODEL is built from DRIVE_OUTPUT_DIR;
# if that points at Drive this is a Drive-to-Drive copy - confirm intent.
LOCAL_MODEL = "{}/final_model_xlsr".format(DRIVE_OUTPUT_DIR)
DRIVE_FINAL = '/content/drive/MyDrive/phoneme_models/xlsr'

source_dir = Path(LOCAL_MODEL)
if not source_dir.exists():
    print(f"‚ö†Ô∏è Modello non trovato: {LOCAL_MODEL}")
else:
    # Make sure the parent folder exists; copytree creates the leaf itself and,
    # with dirs_exist_ok=True, copies into it if it is already there.
    target_parent = Path(DRIVE_FINAL).parent
    target_parent.mkdir(parents=True, exist_ok=True)
    shutil.copytree(LOCAL_MODEL, DRIVE_FINAL, dirs_exist_ok=True)
    print(f"‚úÖ Modello XLS-R copiato su: {DRIVE_FINAL}")