# 🔀 Early Fusion Trainer - HuBERT + WavLM Multi-Backbone

**Architettura**: Concatena feature di HuBERT Large (1024D) e WavLM Weighted (1024D) → CTC Head (2048D)

**Requisiti**: GPU con ≥20GB VRAM (A100, RTX 3090+) o batch ridotto

In [None]:
import os, sys, zipfile, glob, re, shutil

def detect_environment():
    """Identify the runtime hosting this notebook.

    Returns:
        'colab' when the COLAB_GPU variable is set or the google.colab
        module is already loaded; 'kaggle' when the working directory
        contains '/kaggle' or KAGGLE_KERNEL_RUN_TYPE is set; 'local'
        otherwise. The Colab check takes precedence.
    """
    env_vars = os.environ

    on_colab = 'COLAB_GPU' in env_vars or 'google.colab' in sys.modules
    if on_colab:
        return 'colab'

    # The working directory is only inspected once Colab has been ruled out.
    on_kaggle = '/kaggle' in os.getcwd() or 'KAGGLE_KERNEL_RUN_TYPE' in env_vars
    if on_kaggle:
        return 'kaggle'

    return 'local'

# Resolve the runtime once; the setup cells below branch on ENV.
ENV = detect_environment()
print('üñ•Ô∏è Ambiente: ' + ENV.upper())

In [None]:
# COLAB setup: mount Drive, unpack the project archive, enter the project.
if ENV == 'colab':
    from google.colab import drive
    drive.mount('/content/drive')

    # Locations on the mounted Drive and on the Colab VM.
    DRIVE_BACKUP = '/content/drive/MyDrive/phoneme_checkpoints'
    PROJECT_DIR = '/content/DeepLearning-Phoneme'
    ZIP_PATH = '/content/drive/MyDrive/DeepLearning-Phoneme.zip'

    # Stop immediately when the project archive is missing from Drive.
    if not os.path.exists(ZIP_PATH):
        raise FileNotFoundError(ZIP_PATH)
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall('/content')
    print('‚úì Extracted')

    # Make sure the checkpoint folder exists, then work from the project root
    # and put it first on the import path.
    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)

In [None]:
# KAGGLE setup: clone the repository, link the dataset, enter the project.
if ENV == 'kaggle':
    import subprocess

    PROJECT_DIR = '/kaggle/working/pronuncIAtion'
    DRIVE_BACKUP = '/kaggle/working/checkpoints'

    if not os.path.exists(PROJECT_DIR):
        # check=True surfaces a failed clone right here instead of a less
        # obvious FileNotFoundError from os.chdir() further down.
        subprocess.run(
            ['git', 'clone', 'https://github.com/maurocarlu/pronuncIAtion.git', PROJECT_DIR],
            check=True,
        )

    DATA_INPUT = '/kaggle/input/pronunciation-data/data'
    DATA_TARGET = f'{PROJECT_DIR}/data'

    if os.path.islink(DATA_TARGET):
        print('‚úì Data symlink exists')
    elif not os.path.exists(DATA_INPUT):
        # Without the attached dataset there is nothing to link to: keep
        # whatever the clone shipped rather than deleting it and leaving a
        # dangling symlink behind.
        print(f'WARNING: dataset not found at {DATA_INPUT}; leaving {DATA_TARGET} untouched')
    else:
        # Replace the data folder that came with the clone (if any) by a
        # link to the attached Kaggle dataset.
        if os.path.exists(DATA_TARGET):
            shutil.rmtree(DATA_TARGET)
        os.symlink(DATA_INPUT, DATA_TARGET)

    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)
    print(f'‚úì Kaggle ready: {PROJECT_DIR}')

In [None]:
# LOCAL setup: derive the project root from the current working directory.
if ENV == 'local':
    PROJECT_DIR = os.getcwd()
    # Step up only when the notebook was launched from the notebooks/ folder
    # itself. A substring test on the whole path would also fire for any
    # ancestor directory whose name contains "notebooks", and because this
    # cell changes the working directory, every re-run would then climb one
    # more level.
    if os.path.basename(PROJECT_DIR) == 'notebooks':
        PROJECT_DIR = os.path.dirname(PROJECT_DIR)
    DRIVE_BACKUP = f'{PROJECT_DIR}/outputs'

# Shared finalisation, executed for every environment: ensure the checkpoint
# folder exists, work from the project root and make it importable.
os.makedirs(DRIVE_BACKUP, exist_ok=True)
os.chdir(PROJECT_DIR)
sys.path.insert(0, PROJECT_DIR)
print(f'üìÅ Project: {PROJECT_DIR}')
print(f'üíæ Checkpoints: {DRIVE_BACKUP}')

In [None]:
import subprocess

# Install the Python dependencies with the kernel's own interpreter so they
# land in the environment this notebook is running in.
pip_result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'transformers', 'datasets', 'evaluate', 'jiwer', 'soundfile', 'librosa', 'safetensors'])
if pip_result.returncode != 0:
    # Deliberately non-fatal: the packages may already be installed (for
    # example in a session without network access), but do not hide it.
    print(f'WARNING: pip install exited with code {pip_result.returncode}; continuing with the packages already available')

import torch

# Report the PyTorch build and, when a GPU is visible, its name and memory.
print(f'üî• PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'üìä GPU: {torch.cuda.get_device_name(0)}')
    print(f'üíæ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

## ⚙️ Configuration

In [None]:
# Paths and hyper-parameters forwarded to the training script.
CONFIG = {
    'csv_path': f'{PROJECT_DIR}/data/processed/combined_augmented.csv',
    'vocab_path': f'{PROJECT_DIR}/data/processed/vocab.json',
    'audio_base': PROJECT_DIR,
    'output_dir': f'{DRIVE_BACKUP}/early_fusion',

    # Training parameters - tune them to the available VRAM
    'epochs': 5,
    'batch_size': 2,  # kept small because two Large models are loaded
    'gradient_accumulation': 8,
    'learning_rate': 1e-4,
}

# Echo the configuration; entries whose key contains 'path' or 'dir' also get
# an on-disk existence marker.
print('üìã Configuration:')
for key, value in CONFIG.items():
    is_location = 'path' in key or 'dir' in key
    if not is_location:
        print(f'  ‚Ä¢ {key}: {value}')
        continue
    mark = '‚úì' if os.path.exists(value) else '‚úó'
    print(f'  {mark} {key}: {value}')

## 🚀 Training Early Fusion

**Architecture**:
```
Audio → HuBERT (frozen) → 1024D ─┐
                                 ├→ concat(2048D) → CTC Head → IPA
Audio → WavLM (frozen)  → 1024D ─┘
```

**Memory Tips**:
- 20GB VRAM: batch_size=2, gradient_accumulation=8
- 16GB VRAM: batch_size=1, gradient_accumulation=16
- Uses fp16 + gradient checkpointing automatically

In [None]:
# EARLY FUSION TRAINING: run the project's training script as a child process.
import subprocess

# Command-line flag -> CONFIG key, in the order the flags are passed.
train_options = (
    ('--data-csv', 'csv_path'),
    ('--vocab-path', 'vocab_path'),
    ('--audio-base', 'audio_base'),
    ('--output-dir', 'output_dir'),
    ('--epochs', 'epochs'),
    ('--batch-size', 'batch_size'),
    ('--gradient-accumulation', 'gradient_accumulation'),
    ('--learning-rate', 'learning_rate'),
)

train_cmd = [sys.executable, 'scripts/training/train_early_fusion.py']
for flag, config_key in train_options:
    # Every argument is passed as text; the path entries are already strings.
    train_cmd += [flag, str(CONFIG[config_key])]

# The child's output is not captured by this call.
result = subprocess.run(train_cmd, capture_output=False)

if result.returncode != 0:
    print(f'\n‚ùå Training fallito con codice {result.returncode}')
else:
    print('\n‚úÖ Training completato!')

## 🔄 Resume Training (optional)

Se il training è stato interrotto, esegui questa cella per riprendere dall'ultimo checkpoint.

In [None]:
# RESUME FROM CHECKPOINT: re-launch the training script with --resume.
import subprocess
result = subprocess.run([
    sys.executable, 'scripts/training/train_early_fusion.py',
    '--data-csv', CONFIG['csv_path'],
    '--vocab-path', CONFIG['vocab_path'],
    '--audio-base', CONFIG['audio_base'],
    '--output-dir', CONFIG['output_dir'],
    '--epochs', str(CONFIG['epochs']),
    '--batch-size', str(CONFIG['batch_size']),
    # Forward the same optimisation settings as the initial training cell so
    # a resumed run does not fall back to the script's own defaults.
    # NOTE(review): assumes the script accepts these flags together with
    # --resume, as it does for a fresh run - confirm.
    '--gradient-accumulation', str(CONFIG['gradient_accumulation']),
    '--learning-rate', str(CONFIG['learning_rate']),
    '--resume',  # Resume flag
], capture_output=False)

print('‚úÖ Resume completato!' if result.returncode == 0 else f'‚ùå Errore {result.returncode}')

## 📊 Check Training Output

In [None]:
# Show what the run left in the output directory.
output_dir = CONFIG['output_dir']
if not os.path.exists(output_dir):
    print(f'‚ùå Output dir non esiste: {output_dir}')
else:
    entry_names = sorted(os.listdir(output_dir))
    print(f'üìÅ {output_dir}:')
    for name in entry_names:
        entry_path = os.path.join(output_dir, name)
        if not os.path.isdir(entry_path):
            print(f'  üìÑ {name}')
            continue
        # Add up only the files sitting directly inside the folder;
        # nested sub-folders are not descended into.
        size = 0
        for child in os.listdir(entry_path):
            child_path = os.path.join(entry_path, child)
            if os.path.isfile(child_path):
                size += os.path.getsize(child_path)
        print(f'  üìÇ {name} ({size/1e6:.1f} MB)')

## 💾 Download/Backup

In [None]:
# Package the final model as a ZIP archive (and offer it for download on Colab).
import datetime

final_model = f"{CONFIG['output_dir']}/final_model_early_fusion"

if not os.path.exists(final_model):
    print(f'‚ùå Final model not found: {final_model}')
    print('üí° Run training first or check for checkpoints')
else:
    stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')
    zip_name = f'early_fusion_final_{stamp}'

    # Folder that receives the archive, chosen per environment.
    if ENV == 'kaggle':
        zip_root = '/kaggle/working'
    elif ENV == 'colab':
        zip_root = '/content'
    else:
        zip_root = DRIVE_BACKUP
    # make_archive appends the '.zip' suffix itself, so zip_path has none.
    zip_path = f'{zip_root}/{zip_name}'

    print(f'üì¶ Creating ZIP: {zip_name}.zip')
    shutil.make_archive(zip_path, 'zip', final_model)
    print(f'‚úì Created: {zip_path}.zip')

    if ENV == 'colab':
        from google.colab import files
        files.download(f'{zip_path}.zip')

In [None]:
# Free disk space on Kaggle by deleting the cache folders listed below.
if ENV == 'kaggle':
    cache_dirs = ['/root/.cache/huggingface']
    for cache_dir in cache_dirs:
        # Skip folders that are absent; symlinks are left in place.
        if not os.path.exists(cache_dir) or os.path.islink(cache_dir):
            continue
        shutil.rmtree(cache_dir)
        print(f'üóëÔ∏è {cache_dir}')