# 🚪 Gated Fusion Trainer — 3 modelli (3 combinazioni)

Questo notebook addestra **Gated Fusion** per tutte le **3 coppie possibili** di modelli.

**Differenza da Early Fusion:**
- Early Fusion: concat 2048D → CTC head
- **Gated Fusion**: gate = σ(W·[h_a, h_b]) → h_fused = gate·h_a + (1-gate)·h_b → CTC head

**Vantaggi Gated Fusion:**
- Il gate impara automaticamente quale backbone è migliore per ogni contesto
- Meno parametri trainabili (~46K vs ~88K)
- Interpretabilità: analizzando il gate si capisce quale modello contribuisce di più

**Run da eseguire:** (1,2), (1,3), (2,3) = 3 training separati

In [None]:
import os, sys, subprocess
from pathlib import Path

def detect_environment():
    """Identify the runtime this notebook is executing in.

    Returns:
        'colab' when the COLAB_GPU environment variable is set or the
        ``google.colab`` module is already loaded; 'kaggle' when the current
        working directory contains '/kaggle' or KAGGLE_KERNEL_RUN_TYPE is set;
        'local' otherwise. Colab is checked first, so it wins when markers of
        both platforms are present.
    """
    env_vars = os.environ

    on_colab = 'COLAB_GPU' in env_vars or 'google.colab' in sys.modules
    if on_colab:
        return 'colab'

    on_kaggle = '/kaggle' in os.getcwd() or 'KAGGLE_KERNEL_RUN_TYPE' in env_vars
    if on_kaggle:
        return 'kaggle'

    return 'local'


# Resolved once here; the cleanup cell at the end of the notebook reads ENV.
ENV = detect_environment()
print(f'üñ•Ô∏è Ambiente: {ENV.upper()}')

In [None]:
# Install dependencies + clone repo
pkgs = [
    'transformers>=4.38',
    'datasets>=2.18',
    'evaluate',
    'jiwer',
    'soundfile',
    'librosa',
    'safetensors',
    'accelerate',
    'tqdm',
    'pyyaml',
    'pandas',
]

# Best-effort install: a failure must not abort the notebook (the packages may
# already be available), but it is reported instead of being silently dropped.
pip_result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', *pkgs], check=False)
if pip_result.returncode != 0:
    print(f'WARNING: pip install exited with code {pip_result.returncode}; some dependencies may be missing')

import torch
print(f'üî• PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'üìä GPU: {torch.cuda.get_device_name(0)}')
    print(f'üíæ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

# Clone repo
IS_KAGGLE = Path('/kaggle').exists()
# Setting DL_PHONEME_SKIP_CLONE to 1/true/yes disables the clone step.
SKIP_CLONE = str(os.environ.get('DL_PHONEME_SKIP_CLONE', '')).strip().lower() in ('1', 'true', 'yes')
REPO_URL = 'https://github.com/maurocarlu/pronuncIAtion.git'
# On Kaggle the project lives in the clone target; elsewhere the current
# directory is taken to be the project root.
PROJECT_DIR = Path('/kaggle/working/pronuncIAtion') if IS_KAGGLE else Path.cwd()

if IS_KAGGLE and (not SKIP_CLONE) and REPO_URL:
    if not PROJECT_DIR.exists():
        print('Cloning repo:', REPO_URL)
        clone_result = subprocess.run(['git', 'clone', REPO_URL, str(PROJECT_DIR)], check=False)
        if clone_result.returncode != 0:
            # Still best-effort, but make the failure visible right where it happens.
            print(f'WARNING: git clone exited with code {clone_result.returncode}')
    else:
        print('Repo gi√† presente:', PROJECT_DIR)

if PROJECT_DIR.exists():
    # Later cells launch the training script through a relative path and the
    # project root is put on sys.path, so both must point at PROJECT_DIR.
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, str(PROJECT_DIR))
else:
    print(f'WARNING: PROJECT_DIR does not exist ({PROJECT_DIR}); staying in the current directory, relative script paths may not resolve')
print('CWD:', os.getcwd())

## ⚙️ Configuration

Configura:
1. Path ai **3 modelli** da combinare
2. Path dataset e vocab
3. Parametri training

Le 3 coppie verranno create automaticamente: (1,2), (1,3), (2,3)

In [None]:
import zipfile
import shutil

# ====== Kaggle: model paths ======
KAGGLE_MODELS_PATH = Path('/kaggle/input/late-fusion/LateFusion')
EXTRACT_DIR = Path('/kaggle/working/gated_fusion_models_extracted')
# Create the scratch directory only when the Kaggle filesystem is present.
# On a local machine, creating /kaggle/... at the filesystem root usually fails
# with PermissionError and would abort this cell before the local fallback
# below is ever reached.
if Path('/kaggle').exists():
    EXTRACT_DIR.mkdir(parents=True, exist_ok=True)

# Prefer the Kaggle input dataset; otherwise fall back to the project's backups.
# is_dir() is already False for a missing path, so no separate exists() check.
if KAGGLE_MODELS_PATH.is_dir():
    MODELS_ROOT = KAGGLE_MODELS_PATH
else:
    MODELS_ROOT = Path(PROJECT_DIR) / 'outputs' / 'backup'

print('MODELS_ROOT:', MODELS_ROOT)

# Find models: every directory that directly contains a config.json is a
# candidate. The set removes duplicates, sorted() makes the order deterministic.
candidate_model_dirs = sorted({p.parent for p in MODELS_ROOT.glob('**/config.json')})
print(f'Found {len(candidate_model_dirs)} candidate model dirs')
for p in candidate_model_dirs[:10]:
    print('  ‚úì', p)

# ====== Select 3 models ======
MODEL_DIRS = None
if len(candidate_model_dirs) == 3:
    MODEL_DIRS = [str(p) for p in candidate_model_dirs]
elif len(candidate_model_dirs) > 3:
    MODEL_DIRS = [str(p) for p in candidate_model_dirs[-3:]]
    # The automatic choice is arbitrary (last 3 in sorted order): say so loudly.
    print(
        f'WARNING: {len(candidate_model_dirs)} candidate model dirs found; '
        'using the last 3 in sorted order. Set MODEL_DIRS manually to override.'
    )

# To force the selection manually, uncomment:
# MODEL_DIRS = [
#   str(Path(MODELS_ROOT) / 'hubert_large' / 'final_model_hubert'),
#   str(Path(MODELS_ROOT) / 'wavLM_large' / 'final_model_wavlm_large'),
#   str(Path(MODELS_ROOT) / 'wavlm_base' / 'final_model'),
# ]

# With fewer than 3 candidates nothing was selected above, so stop here.
assert MODEL_DIRS is not None and len(MODEL_DIRS) == 3, (
    'Imposta MODEL_DIRS manualmente: nello zip/directory ci sono !=3 cartelle modello.'
)

print('\n‚úÖ Selected MODEL_DIRS:')
for i, p in enumerate(MODEL_DIRS, start=1):
    p = Path(p)
    print(f'  Model {i}: {p.name} | exists={p.exists()}')

# Short display names: the last path component of each model directory.
MODEL_NAMES = [Path(p).name for p in MODEL_DIRS]
print('MODEL_NAMES:', MODEL_NAMES)

# The 3 possible pairs, as zero-based indices into MODEL_DIRS.
MODEL_PAIRS = [(0,1), (0,2), (1,2)]
print('\nPairs to train:', [(a+1,b+1) for a,b in MODEL_PAIRS])

In [None]:
# ====== Dataset and training config ======
DATA_INPUT = Path('/kaggle/input/pronunciation-data/data')
DATA_TARGET = Path(PROJECT_DIR).joinpath('data')

# On Kaggle, expose the input dataset as <project>/data through a symlink.
# Any failure is reported and the notebook carries on.
if Path('/kaggle').exists() and DATA_INPUT.exists():
    try:
        if not DATA_TARGET.exists():
            os.symlink(str(DATA_INPUT), str(DATA_TARGET))
            print('‚úì data symlink creato')
    except Exception as err:
        print('‚ö†Ô∏è Symlink fallito:', err)

# Checkpoint root: Kaggle working area when available, project outputs otherwise.
if Path('/kaggle').exists():
    DRIVE_BACKUP = '/kaggle/working/checkpoints'
else:
    DRIVE_BACKUP = str(PROJECT_DIR / 'outputs')
os.makedirs(DRIVE_BACKUP, exist_ok=True)

# Hyper-parameters can be overridden through GATEDFUSION_* environment variables.
CONFIG = dict(
    csv_path=f'{PROJECT_DIR}/data/processed/combined_augmented.csv',
    vocab_path=f'{PROJECT_DIR}/data/processed/vocab.json',
    audio_base=str(PROJECT_DIR),
    output_base=f'{DRIVE_BACKUP}/gated_fusion_pairs',
    epochs=int(os.environ.get('GATEDFUSION_EPOCHS', '5')),
    batch_size=int(os.environ.get('GATEDFUSION_BATCH', '2')),
    gradient_accumulation=int(os.environ.get('GATEDFUSION_ACCUM', '8')),
    learning_rate=float(os.environ.get('GATEDFUSION_LR', '1e-4')),
)

# Echo the configuration; entries whose key mentions 'path' or 'base' are also
# checked for existence on disk.
print('\nüìã Training CONFIG:')
for k, v in CONFIG.items():
    if not ('path' in k or 'base' in k):
        print(f'  ‚Ä¢ {k}: {v}')
        continue
    status = '‚úì' if os.path.exists(str(v)) else '‚úó'
    print(f'  {status} {k}: {v}')

## 🚀 Training Gated Fusion (3 coppie)

Per ogni coppia, il modello apprende un **gate dinamico** che pesa i due backbone.

**Formula:**
```
gate = σ(W · [h_a; h_b])      # [batch, time, 1] ∈ [0, 1]
h_fused = gate * h_a + (1-gate) * h_b
logits = CTC_head(h_fused)
```

Esegui in ordine: (1,2), (1,3), (2,3)

In [None]:
# Gated Fusion training: pair (1,2)
import subprocess

# Zero-based positions of the two backbones inside MODEL_DIRS / MODEL_NAMES.
a, b = MODEL_PAIRS[0]  # (0,1)

# One output folder per pair, named with the 1-based indices shown to the user.
out_dir = Path(CONFIG['output_base']).joinpath(f"pair_{a+1}_{b+1}")
out_dir.mkdir(parents=True, exist_ok=True)

# Command line for the training script, assembled flag by flag. The script
# path is relative, so it is resolved against the current working directory.
# NOTE(review): --hubert-path / --wavlm-path simply receive the first / second
# model of the pair, whatever their architecture; confirm the script does not
# rely on the architecture implied by the flag names.
cmd = [sys.executable, 'scripts/training/train_gated_fusion.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', str(out_dir)]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', str(CONFIG['batch_size'])]
cmd += ['--gradient-accumulation', str(CONFIG['gradient_accumulation'])]
cmd += ['--learning-rate', str(CONFIG['learning_rate'])]
cmd += ['--hubert-path', MODEL_DIRS[a]]
cmd += ['--wavlm-path', MODEL_DIRS[b]]

print(f'üöÄ Running Gated Fusion (1,2): {MODEL_NAMES[a]} + {MODEL_NAMES[b]}')
print(' '.join(cmd))
print()

# Output is not captured: the child process inherits stdout/stderr.
result = subprocess.run(cmd, capture_output=False)
if result.returncode == 0:
    print('‚úÖ OK')
else:
    print(f'‚ùå Exit code {result.returncode}')

In [None]:
# Gated Fusion training: pair (1,3)
import subprocess

# Zero-based positions of the two backbones inside MODEL_DIRS / MODEL_NAMES.
a, b = MODEL_PAIRS[1]  # (0,2)

# One output folder per pair, named with the 1-based indices shown to the user.
out_dir = Path(CONFIG['output_base']).joinpath(f"pair_{a+1}_{b+1}")
out_dir.mkdir(parents=True, exist_ok=True)

# Command line for the training script, assembled flag by flag. The script
# path is relative, so it is resolved against the current working directory.
# NOTE(review): --hubert-path / --wavlm-path simply receive the first / second
# model of the pair, whatever their architecture; confirm the script does not
# rely on the architecture implied by the flag names.
cmd = [sys.executable, 'scripts/training/train_gated_fusion.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', str(out_dir)]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', str(CONFIG['batch_size'])]
cmd += ['--gradient-accumulation', str(CONFIG['gradient_accumulation'])]
cmd += ['--learning-rate', str(CONFIG['learning_rate'])]
cmd += ['--hubert-path', MODEL_DIRS[a]]
cmd += ['--wavlm-path', MODEL_DIRS[b]]

print(f'üöÄ Running Gated Fusion (1,3): {MODEL_NAMES[a]} + {MODEL_NAMES[b]}')
print(' '.join(cmd))
print()

# Output is not captured: the child process inherits stdout/stderr.
result = subprocess.run(cmd, capture_output=False)
if result.returncode == 0:
    print('‚úÖ OK')
else:
    print(f'‚ùå Exit code {result.returncode}')

In [None]:
# Gated Fusion training: pair (2,3)
import subprocess

# Zero-based positions of the two backbones inside MODEL_DIRS / MODEL_NAMES.
a, b = MODEL_PAIRS[2]  # (1,2)

# One output folder per pair, named with the 1-based indices shown to the user.
out_dir = Path(CONFIG['output_base']).joinpath(f"pair_{a+1}_{b+1}")
out_dir.mkdir(parents=True, exist_ok=True)

# Command line for the training script, assembled flag by flag. The script
# path is relative, so it is resolved against the current working directory.
# NOTE(review): --hubert-path / --wavlm-path simply receive the first / second
# model of the pair, whatever their architecture; confirm the script does not
# rely on the architecture implied by the flag names.
cmd = [sys.executable, 'scripts/training/train_gated_fusion.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', str(out_dir)]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', str(CONFIG['batch_size'])]
cmd += ['--gradient-accumulation', str(CONFIG['gradient_accumulation'])]
cmd += ['--learning-rate', str(CONFIG['learning_rate'])]
cmd += ['--hubert-path', MODEL_DIRS[a]]
cmd += ['--wavlm-path', MODEL_DIRS[b]]

print(f'üöÄ Running Gated Fusion (2,3): {MODEL_NAMES[a]} + {MODEL_NAMES[b]}')
print(' '.join(cmd))
print()

# Output is not captured: the child process inherits stdout/stderr.
result = subprocess.run(cmd, capture_output=False)
if result.returncode == 0:
    print('‚úÖ OK')
else:
    print(f'‚ùå Exit code {result.returncode}')

## 🔄 Resume Training (optional)

In [None]:
# RESUME: choose which pair to continue training
PAIR_INDEX = 0  # 0 -> (1,2), 1 -> (1,3), 2 -> (2,3)

# Zero-based positions of the two backbones inside MODEL_DIRS.
a, b = MODEL_PAIRS[PAIR_INDEX]
# Same folder naming as the training cells; it is not created here.
out_dir = Path(CONFIG['output_base']).joinpath(f"pair_{a+1}_{b+1}")

# Same command line as a fresh run, plus the trailing --resume flag.
cmd = [sys.executable, 'scripts/training/train_gated_fusion.py']
cmd += ['--data-csv', CONFIG['csv_path']]
cmd += ['--vocab-path', CONFIG['vocab_path']]
cmd += ['--audio-base', CONFIG['audio_base']]
cmd += ['--output-dir', str(out_dir)]
cmd += ['--epochs', str(CONFIG['epochs'])]
cmd += ['--batch-size', str(CONFIG['batch_size'])]
cmd += ['--gradient-accumulation', str(CONFIG['gradient_accumulation'])]
cmd += ['--learning-rate', str(CONFIG['learning_rate'])]
cmd += ['--hubert-path', MODEL_DIRS[a]]
cmd += ['--wavlm-path', MODEL_DIRS[b]]
cmd += ['--resume']

print(f'üîÑ Resuming pair ({a+1},{b+1})')
# Output is not captured: the child process inherits stdout/stderr.
result = subprocess.run(cmd, capture_output=False)
if result.returncode == 0:
    print('‚úÖ OK')
else:
    print(f'‚ùå Exit code {result.returncode}')

## 📊 Check Training Output

In [None]:
# Summarise what each pair run has written so far
output_base = Path(CONFIG['output_base'])
print('üìÅ output_base:', output_base)

if not output_base.exists():
    print('‚ùå output_base non esiste (run training prima)')
else:
    # One line per pair folder: number of checkpoint-* entries and whether the
    # final model directory is present.
    for d in sorted(output_base.glob('pair_*_*')):
        ckpts = sorted(d.glob('checkpoint-*'))
        final = d.joinpath('final_model_gated_fusion')
        print('-', d.name, '| checkpoints:', len(ckpts), '| final_model:', final.exists())

## 💾 Download/Backup

In [None]:
# Create ZIP of final models
import datetime

ZIP_ALL = True  # True = zip every pair, False = only PAIR_INDEX
PAIR_INDEX = 0

output_base = Path(CONFIG['output_base'])
# Minute-resolution stamp shared by every archive created in this run.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')

# Pair folders to archive: all of them, or just the one picked by PAIR_INDEX.
if ZIP_ALL:
    targets = sorted(output_base.glob('pair_*_*'))
else:
    a, b = MODEL_PAIRS[PAIR_INDEX]
    targets = [output_base / f"pair_{a+1}_{b+1}"]

for t in targets:
    final_model = t / 'final_model_gated_fusion'
    if final_model.exists():
        zip_name = f"gated_fusion_{t.name}_{timestamp}"
        # make_archive appends '.zip' itself, so zip_path carries no extension.
        zip_path = f'/kaggle/working/{zip_name}' if Path('/kaggle').exists() else str(output_base / zip_name)
        print(f'üì¶ Creating ZIP: {zip_name}.zip')
        shutil.make_archive(zip_path, 'zip', str(final_model))
        print(f'‚úì Created: {zip_path}.zip')
    else:
        # Pairs without a final model are reported and skipped.
        print(f'‚ùå Final model not found: {final_model}')

In [None]:
# Cleanup disk (Kaggle)
if ENV == 'kaggle':
    for f in ('/root/.cache/huggingface',):
        # Skip paths that are missing or that are symlinks; only real
        # directories are removed.
        if not os.path.exists(f) or os.path.islink(f):
            continue
        shutil.rmtree(f)
        print(f'üóëÔ∏è Cleaned: {f}')