# 🧠 WavLM Weighted Layer Sum Training

Questo notebook addestra WavLM con **Weighted Layer Sum**, un'architettura SOTA che combina tutti gli hidden states del Transformer con pesi apprendibili (12 layer in WavLM Base; 24 in `microsoft/wavlm-large`, il modello configurato in questo notebook).

**Vantaggi:**
- Layer bassi: informazioni acustiche (formanti, pitch)
- Layer alti: informazioni fonetiche/semantiche
- Pesi apprendibili: il modello impara la combinazione ottimale

## 1. Setup Ambiente

In [None]:
# 1.1 Verifica GPU
!nvidia-smi

import torch
print(f"\n{'='*50}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA disponibile: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# 1.2 Mount Google Drive
from google.colab import drive

# Later cells read and write paths below this mount point
# (e.g. '/content/drive/MyDrive/...').
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
print("‚úÖ Drive montato")

In [None]:
# 1.3 Estrai progetto da zip
import os
import zipfile
from pathlib import Path

ZIP_PATH = '/content/drive/MyDrive/phonemeRef.zip'
EXTRACT_PATH = '/content/DeepLearning-Phoneme'

if not os.path.exists(ZIP_PATH):
    raise FileNotFoundError(f"‚ùå File non trovato: {ZIP_PATH}\nCarica phonemeRef.zip su Google Drive")

print(f"üì¶ Estrazione {ZIP_PATH}...")
with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall('/content/')

# Trova cartella estratta
extracted = [f for f in os.listdir('/content/') if os.path.isdir(f'/content/{f}') and 'Phoneme' in f]
if extracted:
    EXTRACT_PATH = f'/content/{extracted[0]}'

os.chdir(EXTRACT_PATH)
print(f"‚úÖ Progetto in: {EXTRACT_PATH}")
!ls -la

In [None]:
# 1.4 Install dependencies
# Shell escape: installs the packages listed below with pip in quiet mode (-q).
!pip install -q transformers datasets evaluate jiwer accelerate soundfile librosa pyyaml tqdm
print("\n‚úÖ Dipendenze installate")

## 2. Preparazione Dataset

In [None]:
# 2.1 Load the dataset
import pandas as pd
from pathlib import Path

# Candidate CSV files in priority order; the first one that exists is used.
DATASET_OPTIONS = [
    'data/processed/combined_augmented.csv',
    'data/processed/combined_dataset.csv',
    'data/processed/phonemeref_processed.csv',
]

# Paths are relative to the current working directory.
DATASET_CSV = next(
    (candidate for candidate in DATASET_OPTIONS if Path(candidate).exists()),
    None,
)

if DATASET_CSV is None:
    raise FileNotFoundError("‚ùå Nessun dataset trovato!")

df = pd.read_csv(DATASET_CSV)
print(f"üìä Dataset: {DATASET_CSV}")
print(f"   Samples: {len(df):,}")

# The per-source breakdown is only printed when the CSV has a 'source' column.
has_source_column = 'source' in df.columns
if has_source_column:
    print(f"\nüìä Distribuzione:")
    print(df['source'].value_counts())

In [None]:
# 2.2 Check vocab.json
import json

vocab_path = Path('data/processed/vocab.json')

# Guard clause: stop right away when the vocabulary file is missing.
if not vocab_path.exists():
    raise FileNotFoundError("‚ùå vocab.json non trovato!")

vocab = json.loads(vocab_path.read_text(encoding='utf-8'))

# Show the vocabulary size and its first ten keys as a quick sanity check.
first_symbols = list(vocab.keys())[:10]
print(f"üìä Vocab: {len(vocab)} simboli")
print(f"   Esempio: {first_symbols}")

## 3. Configurazione Training

In [None]:
# 3.1 Configuration (tuned for a Tesla T4)
#
# Builds the training configuration dict, writes it to
# configs/training_config_weighted.yaml and prints a short summary.
# Relies on DATASET_CSV being defined by the dataset-loading cell.
import yaml
import os

# === MAIN CONFIGURATION ===
# Checkpoints and the final model are written to this Google Drive folder.
DRIVE_OUTPUT_DIR = '/content/drive/MyDrive/phoneme_wavlm_weighted'

config = {
    'seed': 42,
    'model': {
        'name': 'microsoft/wavlm-large',
        'freeze_feature_encoder': True
    },
    'data': {
        'csv_path': DATASET_CSV,
        'vocab_path': 'data/processed/vocab.json',
        'audio_base_path': '.',
        'val_size': 0.05,
        'test_size': 0.05,
        'sampling_rate': 16000
    },
    'training': {
        'output_dir': DRIVE_OUTPUT_DIR,
        'num_train_epochs': 10,
        'per_device_train_batch_size': 8,
        'per_device_eval_batch_size': 8,
        'gradient_accumulation_steps': 2,
        'dataloader_num_workers': 0,
        'dataloader_pin_memory': False,
        'learning_rate': 3e-5,
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'optim': 'adamw_torch',
        'max_grad_norm': 1.0,
        'fp16': True,
        'bf16': False,
        'eval_strategy': 'epoch',
        'save_strategy': 'epoch',
        'save_total_limit': 3,
        'load_best_model_at_end': True,
        'metric_for_best_model': 'per',
        'greater_is_better': False,
        'logging_steps': 100,
        'disable_tqdm': False,
        'group_by_length': True,
    }
}

os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)

# Save the config. Make sure the target folder exists first so the write does
# not fail when the extracted project ships without a configs/ directory.
os.makedirs('configs', exist_ok=True)
with open('configs/training_config_weighted.yaml', 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False)

print("="*60)
print("üìã CONFIGURAZIONE WAVLM WEIGHTED (LARGE)")
print("="*60)
print(f"üìÅ Output: {DRIVE_OUTPUT_DIR}")
print(f"üìä Dataset: {DATASET_CSV}")
print(f"üî¢ Epochs: {config['training']['num_train_epochs']}")
print(f"üì¶ Batch: {config['training']['per_device_train_batch_size']} x {config['training']['gradient_accumulation_steps']}")
print(f"üìà LR: {config['training']['learning_rate']}")
print("="*60)

In [None]:
# 3.2 Check for existing checkpoints
#
# Lists the "checkpoint-*" folders inside DRIVE_OUTPUT_DIR and prints the
# three most recent ones.
from pathlib import Path
import json


def _checkpoint_step(path):
    """Return the numeric suffix of a ``checkpoint-<N>`` folder, or -1 if absent."""
    suffix = path.name.rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


output_dir = Path(DRIVE_OUTPUT_DIR)
checkpoints = []

if output_dir.exists():
    # Sort by numeric step: a plain lexicographic sort would place
    # "checkpoint-1000" before "checkpoint-500", so the slice below would not
    # show the latest checkpoints. The name is a deterministic tie-breaker.
    checkpoints = sorted(
        (
            d for d in output_dir.iterdir()
            if d.is_dir() and d.name.startswith("checkpoint-")
        ),
        key=lambda d: (_checkpoint_step(d), d.name),
    )

print(f"üìÅ Output: {output_dir}")
if checkpoints:
    print(f"‚úÖ {len(checkpoints)} checkpoint trovati")
    for cp in checkpoints[-3:]:
        print(f"   üìÅ {cp.name}")
else:
    print("‚ùå Nessun checkpoint - Training partir√† da zero")

## 4. Training

In [None]:
# 4.1 Avvia Training con script train_weighted.py
import os
from pathlib import Path

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# === OPZIONI ===
RESUME = "auto"

drive_path = Path(DRIVE_OUTPUT_DIR)
existing_checkpoints = []
if drive_path.exists():
    existing_checkpoints = sorted([
        d for d in drive_path.iterdir() 
        if d.is_dir() and d.name.startswith("checkpoint-")
    ])

if RESUME == "auto":
    do_resume = len(existing_checkpoints) > 0
else:
    do_resume = bool(RESUME)

print("="*60)
print("üöÄ AVVIO TRAINING WAVLM WEIGHTED (LARGE)")
print("="*60)
print(f"üìä Dataset: {DATASET_CSV}")
print(f"üìÅ Output: {DRIVE_OUTPUT_DIR}")
print(f"üîÑ Resume: {do_resume}")
print("="*60)

# Comando
cmd = f"python scripts/train_weighted.py --config configs/training_config_weighted.yaml --data-csv {DATASET_CSV}"
if do_resume:
    cmd += " --resume"

!{cmd}

## 5. Valutazione

In [None]:
# 5.1 Valutazione su SpeechOcean762
MODEL_PATH = f"{DRIVE_OUTPUT_DIR}/final_model_weighted"

if Path(MODEL_PATH).exists():
    print(f"üî¨ Valutazione modello: {MODEL_PATH}")
    !python scripts/05_evaluate_speechocean.py --model-path {MODEL_PATH}
else:
    print(f"‚ö†Ô∏è Modello non trovato: {MODEL_PATH}")
    print("   Esegui prima il training!")

In [None]:
# 5.2 Layer-weight analysis (which layers matter most)
#
# Loads the saved weights of the final model, softmax-normalizes the
# `layer_weights` entry and prints one bar per layer. This is a best-effort
# diagnostic: any failure is reported and does not stop the notebook.
import torch
import torch.nn.functional as F
from pathlib import Path

MODEL_PATH = f"{DRIVE_OUTPUT_DIR}/final_model_weighted"

try:
    # Load the saved weights on CPU so no GPU memory is needed for inspection.
    checkpoint = torch.load(f"{MODEL_PATH}/pytorch_model.bin", map_location='cpu')

    # Prefer the exact key; otherwise accept a module-prefixed variant such as
    # "model.layer_weights" (the prefix depends on how the model was wrapped).
    weights_key = None
    if 'layer_weights' in checkpoint:
        weights_key = 'layer_weights'
    else:
        weights_key = next(
            (
                k for k in checkpoint
                if isinstance(k, str) and k.endswith('.layer_weights')
            ),
            None,
        )

    if weights_key is not None:
        weights = checkpoint[weights_key]
        # torch.as_tensor avoids the copy-construct warning that
        # torch.tensor(<tensor>) emits; float32 keeps softmax well-defined.
        raw = torch.as_tensor(weights, dtype=torch.float32).detach()
        normalized = F.softmax(raw, dim=0)

        print("üìä LAYER WEIGHTS (dopo training)")
        print("="*50)
        for i, w in enumerate(normalized.tolist()):
            # Bar length is proportional to the weight (50 chars == weight 1.0).
            bar = "‚ñà" * int(w * 50)
            print(f"Layer {i:2d}: {w:.4f} {bar}")

        print(f"\nüìä Layer pi√π importante: {normalized.argmax().item()}")
    else:
        print("‚ö†Ô∏è layer_weights non trovato nel checkpoint")
except Exception as e:
    print(f"‚ö†Ô∏è Errore caricamento: {e}")

## 6. Salvataggio Finale

In [None]:
# 6.1 Copy the final model to Drive
import shutil
from pathlib import Path

LOCAL_MODEL = f"{DRIVE_OUTPUT_DIR}/final_model_weighted"
DRIVE_FINAL = '/content/drive/MyDrive/phoneme_models/wavlm_weighted'

source_dir = Path(LOCAL_MODEL)
target_dir = Path(DRIVE_FINAL)

if not source_dir.exists():
    print(f"‚ö†Ô∏è Modello non trovato: {LOCAL_MODEL}")
else:
    # Create the parent folder first; copytree then merges into the
    # destination if it already exists (dirs_exist_ok=True).
    target_dir.parent.mkdir(parents=True, exist_ok=True)
    shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
    print(f"‚úÖ Modello copiato su: {DRIVE_FINAL}")