# 🔀 Early Fusion Trainer — 3 modelli (3 combinazioni)

Questo notebook (stile simile al late-fusion):
- clona la repo da GitHub (per usare gli script/file “corretti”)
- legge da Kaggle Input una cartella/zip con **3 modelli fine-tunati**
- lancia il training Early Fusion per tutte le **3 coppie possibili**: (1,2), (1,3), (2,3)

Nota: Early Fusion usa **2 backbone alla volta** → per 3 modelli ci sono 3 run separate.

In [None]:
import os, sys, zipfile, glob, re, shutil

def detect_environment():
    """Return the runtime this notebook is executing in.

    Returns:
        'colab' when the ``COLAB_GPU`` variable is set or ``google.colab`` has
        been imported; 'kaggle' when ``KAGGLE_KERNEL_RUN_TYPE`` is set or the
        working directory is ``/kaggle`` or located below it; otherwise
        'local'.
    """
    if 'COLAB_GPU' in os.environ or 'google.colab' in sys.modules:
        return 'colab'
    # Match /kaggle only as the leading path component: a plain substring
    # test would also flag local folders such as ~/projects/kaggle/demo.
    cwd = os.getcwd()
    under_kaggle_root = cwd == '/kaggle' or cwd.startswith('/kaggle/')
    if under_kaggle_root or 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
        return 'kaggle'
    return 'local'


# Resolved once; the other cells branch on this value.
ENV = detect_environment()
print(f'üñ•Ô∏è Ambiente: {ENV.upper()}')

In [None]:
# Legacy placeholder: the Colab/Kaggle/local setup now happens in the cells below.
# The notebook follows the late-fusion layout, where one cell both installs the
# dependencies and clones the repo.
if ENV == 'colab':
    print(
        'Colab: usa la cella deps+clone e poi configura i path dei modelli/dati.'
    )

In [None]:
# Legacy placeholder: the Kaggle setup now happens in the cells below.
# Only a hint is printed here, so nothing from the old template can raise
# errors (including indentation errors).
if ENV == 'kaggle':
    print(
        'Kaggle: usa la cella deps+clone e poi la cella di configurazione modelli (ZIP_DATASET_NAME / ZIP_FILENAME).'
    )

In [None]:
# Legacy placeholder: the local setup now happens in the cells below.
if ENV == 'local':
    print(
        'Local: usa la cella deps+clone e poi setta LOCAL_EARLYFUSION_MODELS_ROOT se vuoi usare modelli locali.'
    )

In [None]:
# Install deps (Kaggle/Colab/Local) + clone repo (stile late-fusion)
import os, sys, subprocess
from pathlib import Path

# Packages installed into the interpreter that runs this notebook.
pkgs = [
    'transformers>=4.38',
    'datasets>=2.18',
    'evaluate',
    'jiwer',
    'soundfile',
    'librosa',
    'safetensors',
    'accelerate',
    'tqdm',
    'pyyaml',
    'pandas',
]

# The install stays best-effort (check=False, so the cell keeps running), but a
# non-zero exit code is reported instead of being dropped silently.
pip_result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', *pkgs], check=False)
if pip_result.returncode != 0:
    print(f'WARNING: pip install exited with code {pip_result.returncode}; some dependencies may be missing.')

import torch

# Report the PyTorch build and, when CUDA is usable, the first GPU and its memory.
cuda_available = torch.cuda.is_available()
print(f'üî• PyTorch {torch.__version__}, CUDA: {cuda_available}')
if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
    print(f'üìä GPU: {gpu_name}')
    # total_memory is divided by 1e9 and labelled as GB.
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'üíæ VRAM: {vram_gb:.1f} GB')

# ---- Clone the repo (enabled by default on Kaggle) ----
IS_KAGGLE = Path('/kaggle').exists()
# DL_PHONEME_SKIP_CLONE set to 1/true/yes (case-insensitive) disables the clone.
SKIP_CLONE = str(os.environ.get('DL_PHONEME_SKIP_CLONE', '')).strip().lower() in ('1', 'true', 'yes')

REPO_URL_DEFAULT = 'https://github.com/maurocarlu/pronuncIAtion.git'
# DL_PHONEME_REPO_URL overrides the default repository URL.
REPO_URL = str(os.environ.get('DL_PHONEME_REPO_URL', REPO_URL_DEFAULT)).strip()

# On Kaggle the repo lives under /kaggle/working; elsewhere the current directory is used as-is.
PROJECT_DIR = (Path('/kaggle/working/pronuncIAtion') if IS_KAGGLE else Path.cwd())

if not IS_KAGGLE:
    print('Repo clone skipped (not running on Kaggle).')
elif SKIP_CLONE:
    print('Repo clone skipped (DL_PHONEME_SKIP_CLONE=1).')
elif not REPO_URL:
    # Previously reported as "not running on Kaggle", which hid the real cause.
    print('Repo clone skipped (DL_PHONEME_REPO_URL is empty).')
elif PROJECT_DIR.exists():
    print('Repo gi√† presente:', PROJECT_DIR)
else:
    print('Cloning repo:', REPO_URL)
    # Still best-effort (check=False), but a failed clone is now reported:
    # without the repo the later cells cannot find the training script or data paths.
    clone_result = subprocess.run(['git', 'clone', REPO_URL, str(PROJECT_DIR)], check=False)
    if clone_result.returncode != 0:
        print(f'WARNING: git clone failed with exit code {clone_result.returncode}; {PROJECT_DIR} may be missing or incomplete.')

# Enter the repo when it exists, so relative paths match the late-fusion notebook.
if PROJECT_DIR.exists():
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, str(PROJECT_DIR))
print('CWD:', os.getcwd())

## ⚙️ Configuration (simile al late-fusion)

- Configura dove si trovano i **3 modelli** in Kaggle Input (cartella o zip).
- Configura output e parametri training.
- La cella successiva creerà automaticamente le 3 run (coppie).

In [None]:
from pathlib import Path
import zipfile
import shutil
import glob
import os

# ====== Kaggle input: 3 models (directory or zip) ======
# Full Kaggle path of the models input; preferred whenever it exists.
KAGGLE_MODELS_PATH = Path('/kaggle/input/late-fusion/LateFusion')

# Alternative lookup by dataset name + entry name (as in the late-fusion notebook),
# both overridable through environment variables.
KAGGLE_INPUT_DATASET_DIR = '/kaggle/input'
ZIP_DATASET_NAME = os.environ.get('KAGGLE_EARLYFUSION_MODELS_DATASET', 'late-fusion')
ZIP_FILENAME = os.environ.get('KAGGLE_EARLYFUSION_MODELS_NAME', 'LateFusion')  # directory or .zip
ZIP_PATH = Path(KAGGLE_INPUT_DATASET_DIR) / ZIP_DATASET_NAME / ZIP_FILENAME

# Priority: use the full path directly when it exists.
MODELS_INPUT = KAGGLE_MODELS_PATH if KAGGLE_MODELS_PATH.exists() else ZIP_PATH

# Only the path is fixed here; the directory is created right before extraction.
# Creating it eagerly would, off Kaggle, either fail for lack of permissions or
# create /kaggle itself, which makes the Path('/kaggle').exists() check below
# succeed on a non-Kaggle machine and trip the assertion.
EXTRACT_DIR = Path('/kaggle/working/early_fusion_models_extracted')

print('MODELS_INPUT:', MODELS_INPUT)
print('Exists:', MODELS_INPUT.exists())

if Path('/kaggle').exists():
    assert MODELS_INPUT.exists(), (
        f'Percorso modelli non trovato: {MODELS_INPUT}. '
        'Controlla che il Kaggle Dataset input sia montato correttamente.'
    )

# Kaggle may mount the input directly as a DIRECTORY instead of a zip.
if MODELS_INPUT.exists() and MODELS_INPUT.is_dir():
    MODELS_ROOT = MODELS_INPUT
    print('‚ÑπÔ∏è MODELS_ROOT √® una directory (ok).')
elif MODELS_INPUT.exists():
    # Any existing non-directory input is treated as a zip archive.
    MODELS_ROOT = EXTRACT_DIR
    EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(MODELS_INPUT, 'r') as z:
        z.extractall(EXTRACT_DIR)
    print('‚úì Extracted to:', EXTRACT_DIR)
else:
    # Local fallback: LOCAL_EARLYFUSION_MODELS_ROOT, defaulting to <PROJECT_DIR>/outputs/backup.
    MODELS_ROOT = Path(os.environ.get('LOCAL_EARLYFUSION_MODELS_ROOT', str(PROJECT_DIR / 'outputs' / 'backup')))
    print('‚ÑπÔ∏è Using local MODELS_ROOT:', MODELS_ROOT)

# Collect every directory under MODELS_ROOT that holds a config.json
# (treated here as a Hugging Face model directory).
config_files = Path(MODELS_ROOT).glob('**/config.json')
candidate_model_dirs = sorted({cfg.parent for cfg in config_files})
print(f'Found {len(candidate_model_dirs)} candidate model dirs')
for candidate in candidate_model_dirs[:20]:
    print('  ‚úì', candidate)
if len(candidate_model_dirs) > 20:
    print('  ...')

# Auto-selection: with exactly 3 candidates take them all; with more, take the
# last 3 of the sorted list. With fewer than 3 nothing is selected.
# NOTE(review): the list is sorted by path, not by depth, so "last 3" does not
# necessarily mean the deepest final_model folders - check the printed selection.
if len(candidate_model_dirs) >= 3:
    MODEL_DIRS = [str(candidate) for candidate in candidate_model_dirs[-3:]]
else:
    MODEL_DIRS = None

# To force the selection manually, uncomment and edit:
# MODEL_DIRS = [
#   str(Path(MODELS_ROOT) / 'hubert_large' / 'final_model_hubert'),
#   str(Path(MODELS_ROOT) / 'wavLM_large' / 'final_model_wavlm_large'),
#   str(Path(MODELS_ROOT) / 'wav2vec2_phoneme' / 'final_model'),
# ]

assert MODEL_DIRS is not None and len(MODEL_DIRS) == 3, (
    'Imposta MODEL_DIRS manualmente: nello zip/directory ci sono !=3 cartelle modello.'
)

print('\n‚úÖ Selected MODEL_DIRS:')
for position, raw_dir in enumerate(MODEL_DIRS, start=1):
    model_path = Path(raw_dir)
    has_config = (model_path / 'config.json').exists()
    print(f'  Model {position}: {model_path} | exists={model_path.exists()} | has_config={has_config}')
    # Each selected entry must be an existing directory containing config.json.
    assert model_path.exists() and has_config

# Short names: the last path component of each selected directory.
MODEL_NAMES = [Path(raw_dir).name for raw_dir in MODEL_DIRS]
print('MODEL_NAMES:', MODEL_NAMES)

# ====== dataset/data mount (repo) ======
# Kaggle path of the dataset, exposed inside the repo as <PROJECT_DIR>/data.
DATA_INPUT = Path('/kaggle/input/pronunciation-data/data')
DATA_TARGET = Path(PROJECT_DIR) / 'data'

if Path('/kaggle').exists():
    if not DATA_INPUT.exists():
        print('‚ö†Ô∏è DATA_INPUT non esiste:', DATA_INPUT)
        print('   Se non ti serve il dataset della repo, ignora questo warning.')
    else:
        try:
            if not DATA_TARGET.is_symlink():
                # Whatever currently sits at DATA_TARGET is removed before linking.
                if DATA_TARGET.exists():
                    shutil.rmtree(DATA_TARGET)
                os.symlink(str(DATA_INPUT), str(DATA_TARGET))
                print('‚úì data symlink creato:', DATA_TARGET, '->', DATA_INPUT)
            else:
                print('‚úì data symlink gi√† presente:', DATA_TARGET)
        except Exception as link_error:
            print('‚ö†Ô∏è Symlink data non riuscito:', repr(link_error))
            print('   Provo copia (pi√π lenta / pi√π spazio)...')
            # Fallback: copy the dataset, but only when nothing exists at the target.
            try:
                if not DATA_TARGET.exists():
                    shutil.copytree(DATA_INPUT, DATA_TARGET)
                    print('‚úì data copiata in repo:', DATA_TARGET)
            except Exception as copy_error:
                print('‚ùå Copy fallback fallito:', repr(copy_error))

# ====== training config ======
# Root for checkpoints/outputs: a fixed folder on Kaggle, <PROJECT_DIR>/outputs elsewhere.
if Path('/kaggle').exists():
    DRIVE_BACKUP = '/kaggle/working/checkpoints'
else:
    DRIVE_BACKUP = str(PROJECT_DIR / 'outputs')
os.makedirs(DRIVE_BACKUP, exist_ok=True)

# Paths plus hyper-parameters; the numeric values can be overridden through
# the EARLYFUSION_* environment variables.
CONFIG = dict(
    csv_path=f'{PROJECT_DIR}/data/processed/combined_augmented.csv',
    vocab_path=f'{PROJECT_DIR}/data/processed/vocab.json',
    audio_base=str(PROJECT_DIR),
    output_base=f'{DRIVE_BACKUP}/early_fusion_pairs',
    epochs=int(os.environ.get('EARLYFUSION_EPOCHS', '5')),
    batch_size=int(os.environ.get('EARLYFUSION_BATCH', '1')),
    gradient_accumulation=int(os.environ.get('EARLYFUSION_ACCUM', '16')),
    learning_rate=float(os.environ.get('EARLYFUSION_LR', '1e-4')),
)

print('\nüìã Training CONFIG:')
for key, value in CONFIG.items():
    # Keys containing 'path' or 'base' are filesystem locations: show whether they exist.
    is_location = any(tag in key for tag in ('path', 'base'))
    if not is_location:
        print(f'  ‚Ä¢ {key}: {value}')
        continue
    mark = '‚úì' if os.path.exists(str(value)) else '‚úó'
    print(f'  {mark} {key}: {value}')

# The 3 possible pairs among 3 models (0-based indices into MODEL_DIRS).
MODEL_PAIRS = [(0, 1), (0, 2), (1, 2)]
print('\nPairs to train:', [(first + 1, second + 1) for first, second in MODEL_PAIRS])

## 🚀 Training Early Fusion (3 coppie)

Per ogni run, il modello Early Fusion concatena le feature dei **2 backbone scelti** e allena una CTC head.

Esegui in ordine:
1) Setup (deps + clone repo)
2) Config (selezione 3 modelli da Kaggle input)
3) Le 3 celle di training: (1,2), (1,3), (2,3)

Nota: se sei su Kaggle T4, inizia con `batch_size=1` e `gradient_accumulation=16`.

In [None]:
# üîÄ EARLY FUSION TRAINING ‚Äî coppia (1,2)
import shlex
import subprocess
import sys
from pathlib import Path

# Backbones for this run: 0-based indices into MODEL_DIRS.
a, b = MODEL_PAIRS[0]  # (0,1)
# One output folder per pair, named with 1-based indices (pair_1_2).
out_dir = Path(CONFIG['output_base']) / f"pair_{a+1}_{b+1}"
out_dir.mkdir(parents=True, exist_ok=True)

# The script path is relative, so the working directory must be the repo root.
cmd = [
    sys.executable, 'scripts/training/train_early_fusion.py',
    '--data-csv', CONFIG['csv_path'],
    '--vocab-path', CONFIG['vocab_path'],
    '--audio-base', CONFIG['audio_base'],
    '--output-dir', str(out_dir),
    '--epochs', str(CONFIG['epochs']),
    '--batch-size', str(CONFIG['batch_size']),
    '--gradient-accumulation', str(CONFIG['gradient_accumulation']),
    '--learning-rate', str(CONFIG['learning_rate']),
    '--model-a-path', MODEL_DIRS[a],
    '--model-b-path', MODEL_DIRS[b],
    # NOTE(review): flag semantics are defined by train_early_fusion.py - confirm there.
    '--no-weighted-backbone-b',
]

print('üöÄ Running:')
# shlex.join quotes arguments, so the printed command can be pasted into a
# shell even when a path contains spaces.
print(shlex.join(cmd))
print()

# Output is not captured; a failure is reported through the exit code only.
result = subprocess.run(cmd, capture_output=False)
print('‚úÖ OK' if result.returncode == 0 else f'‚ùå Exit code {result.returncode}')

In [None]:
# üîÄ EARLY FUSION TRAINING ‚Äî coppia (1,3)
import shlex
import subprocess
import sys
from pathlib import Path

# Backbones for this run: 0-based indices into MODEL_DIRS.
a, b = MODEL_PAIRS[1]  # (0,2)
# One output folder per pair, named with 1-based indices (pair_1_3).
out_dir = Path(CONFIG['output_base']) / f"pair_{a+1}_{b+1}"
out_dir.mkdir(parents=True, exist_ok=True)

# The script path is relative, so the working directory must be the repo root.
cmd = [
    sys.executable, 'scripts/training/train_early_fusion.py',
    '--data-csv', CONFIG['csv_path'],
    '--vocab-path', CONFIG['vocab_path'],
    '--audio-base', CONFIG['audio_base'],
    '--output-dir', str(out_dir),
    '--epochs', str(CONFIG['epochs']),
    '--batch-size', str(CONFIG['batch_size']),
    '--gradient-accumulation', str(CONFIG['gradient_accumulation']),
    '--learning-rate', str(CONFIG['learning_rate']),
    '--model-a-path', MODEL_DIRS[a],
    '--model-b-path', MODEL_DIRS[b],
    # NOTE(review): flag semantics are defined by train_early_fusion.py - confirm there.
    '--no-weighted-backbone-b',
]

print('üöÄ Running:')
# shlex.join quotes arguments, so the printed command can be pasted into a
# shell even when a path contains spaces.
print(shlex.join(cmd))
print()

# Output is not captured; a failure is reported through the exit code only.
result = subprocess.run(cmd, capture_output=False)
print('‚úÖ OK' if result.returncode == 0 else f'‚ùå Exit code {result.returncode}')

In [None]:
# üîÄ EARLY FUSION TRAINING ‚Äî coppia (2,3)
import shlex
import subprocess
import sys
from pathlib import Path

# Backbones for this run: 0-based indices into MODEL_DIRS.
a, b = MODEL_PAIRS[2]  # (1,2)
# One output folder per pair, named with 1-based indices (pair_2_3).
out_dir = Path(CONFIG['output_base']) / f"pair_{a+1}_{b+1}"
out_dir.mkdir(parents=True, exist_ok=True)

# The script path is relative, so the working directory must be the repo root.
cmd = [
    sys.executable, 'scripts/training/train_early_fusion.py',
    '--data-csv', CONFIG['csv_path'],
    '--vocab-path', CONFIG['vocab_path'],
    '--audio-base', CONFIG['audio_base'],
    '--output-dir', str(out_dir),
    '--epochs', str(CONFIG['epochs']),
    '--batch-size', str(CONFIG['batch_size']),
    '--gradient-accumulation', str(CONFIG['gradient_accumulation']),
    '--learning-rate', str(CONFIG['learning_rate']),
    '--model-a-path', MODEL_DIRS[a],
    '--model-b-path', MODEL_DIRS[b],
    # NOTE(review): flag semantics are defined by train_early_fusion.py - confirm there.
    '--no-weighted-backbone-b',
]

print('üöÄ Running:')
# shlex.join quotes arguments, so the printed command can be pasted into a
# shell even when a path contains spaces.
print(shlex.join(cmd))
print()

# Output is not captured; a failure is reported through the exit code only.
result = subprocess.run(cmd, capture_output=False)
print('‚úÖ OK' if result.returncode == 0 else f'‚ùå Exit code {result.returncode}')

## 🔄 Resume (optional)

Ogni cella di training riparte automaticamente dall’ultimo checkpoint presente dentro la relativa cartella `pair_X_Y` (lo script fa auto-resume se trova `checkpoint-*`).

Se vuoi forzare un resume manuale, usa la cella sotto e scegli la coppia.

In [None]:
# üîÑ RESUME (manuale) ‚Äî scegli quale coppia riprendere
import shlex
import subprocess
import sys
from pathlib import Path

# Which pair to resume: 0 -> (1,2), 1 -> (1,3), 2 -> (2,3)
PAIR_INDEX = 0
a, b = MODEL_PAIRS[PAIR_INDEX]
# Same per-pair folder the training cells write to (1-based indices in the name).
out_dir = Path(CONFIG['output_base']) / f"pair_{a+1}_{b+1}"
out_dir.mkdir(parents=True, exist_ok=True)

# The script path is relative, so the working directory must be the repo root.
cmd = [
    sys.executable, 'scripts/training/train_early_fusion.py',
    '--data-csv', CONFIG['csv_path'],
    '--vocab-path', CONFIG['vocab_path'],
    '--audio-base', CONFIG['audio_base'],
    '--output-dir', str(out_dir),
    '--epochs', str(CONFIG['epochs']),
    '--batch-size', str(CONFIG['batch_size']),
    '--gradient-accumulation', str(CONFIG['gradient_accumulation']),
    '--learning-rate', str(CONFIG['learning_rate']),
    '--model-a-path', MODEL_DIRS[a],
    '--model-b-path', MODEL_DIRS[b],
    # NOTE(review): flag semantics are defined by train_early_fusion.py - confirm there.
    '--no-weighted-backbone-b',
    # Presumably forces resuming from an existing checkpoint - confirm against the script.
    '--resume',
]

print('üöÄ Running (resume):')
# shlex.join quotes arguments, so the printed command can be pasted into a
# shell even when a path contains spaces.
print(shlex.join(cmd))
print()

# Output is not captured; a failure is reported through the exit code only.
result = subprocess.run(cmd, capture_output=False)
print('‚úÖ OK' if result.returncode == 0 else f'‚ùå Exit code {result.returncode}')

## 📊 Check Training Output

In [None]:
# Summarise each pair run: number of checkpoint-* entries and whether the
# final_model_early_fusion folder exists.
from pathlib import Path

output_base = Path(CONFIG['output_base'])
print('üìÅ output_base:', output_base)

if not output_base.exists():
    print('‚ùå output_base non esiste (run training prima)')
else:
    for run_dir in sorted(output_base.glob('pair_*_*')):
        checkpoint_count = sum(1 for _ in run_dir.glob('checkpoint-*'))
        has_final_model = (run_dir / 'final_model_early_fusion').exists()
        print('-', run_dir.name, '| checkpoints:', checkpoint_count, '| final_model:', has_final_model)

## 💾 Download/Backup (optional)

Crea uno zip del `final_model_early_fusion` per una coppia (o per tutte).

In [None]:
# Archive final_model_early_fusion of one pair (or of every pair) as a zip.
import datetime
from pathlib import Path
import shutil

# Set ZIP_ALL=True to archive every pair folder found under output_base.
ZIP_ALL = False
PAIR_INDEX = 0  # only used when ZIP_ALL is False

output_base = Path(CONFIG['output_base'])
# A single timestamp is shared by all archives created in this run.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')

if ZIP_ALL:
    targets = sorted(output_base.glob('pair_*_*'))
else:
    a, b = MODEL_PAIRS[PAIR_INDEX]
    targets = [output_base / f"pair_{a+1}_{b+1}"]

for pair_dir in targets:
    final_model = pair_dir / 'final_model_early_fusion'
    if final_model.exists():
        zip_name = f"early_fusion_{pair_dir.name}_{timestamp}"
        # On Kaggle the archive goes to /kaggle/working; elsewhere next to the pair folders.
        zip_path = f'/kaggle/working/{zip_name}' if Path('/kaggle').exists() else str(output_base / zip_name)
        print(f'üì¶ Creating ZIP: {zip_name}.zip')
        # make_archive receives the base name without extension and appends '.zip'.
        shutil.make_archive(zip_path, 'zip', str(final_model))
        print(f'‚úì Created: {zip_path}.zip')
    else:
        print(f'‚ùå Final model not found: {final_model}')

In [None]:
# Free disk space on Kaggle by deleting the Hugging Face cache directory.
if ENV == 'kaggle':
    for cache_dir in ('/root/.cache/huggingface',):
        # Skip missing paths, and never follow or delete a symlink.
        if not os.path.exists(cache_dir) or os.path.islink(cache_dir):
            continue
        shutil.rmtree(cache_dir)
        print(f'üóëÔ∏è {cache_dir}')