# 🚀 Unified Trainer - Phoneme Recognition Benchmark

## Models
1. Baseline MLP (Linear Probe)
2. HuBERT Large
3. WavLM Weighted
4. XLS-R
5. **Wav2Vec2 Large**
6. **Whisper Encoder**
7. **SpeechTokenizer**
8. **Qwen2-Audio**

## 1. Setup Ambiente

In [None]:
import os, sys, zipfile, glob, re

def detect_environment():
    """Identify the runtime that is hosting this notebook.

    Returns:
        'colab' when the COLAB_GPU variable is set or the google.colab module
        is loaded, 'kaggle' when the working directory contains '/kaggle' or
        the KAGGLE_KERNEL_RUN_TYPE variable is set, and 'local' otherwise.
    """
    running_on_colab = 'COLAB_GPU' in os.environ or 'google.colab' in sys.modules
    if running_on_colab:
        return 'colab'

    running_on_kaggle = (
        '/kaggle' in os.getcwd() or 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
    )
    if running_on_kaggle:
        return 'kaggle'

    return 'local'


# Resolved once; the setup cells that follow branch on this value.
ENV = detect_environment()
print('üñ•Ô∏è Ambiente: {}'.format(ENV.upper()))

In [None]:
# COLAB setup: mount Google Drive, then unpack the project archive stored on it.
if ENV == 'colab':
    from google.colab import drive
    drive.mount('/content/drive')

    # Folder on Drive used for checkpoints.
    DRIVE_BACKUP = '/content/drive/MyDrive/phoneme_checkpoints'
    os.makedirs(DRIVE_BACKUP, exist_ok=True)

    PROJECT_DIR = '/content/DeepLearning-Phoneme'
    ZIP_PATH = '/content/drive/MyDrive/DeepLearning-Phoneme.zip'

    # Guard clause: without the archive there is nothing to extract.
    if not os.path.exists(ZIP_PATH):
        print(f'‚ùå ERROR: ZIP non trovato: {ZIP_PATH}')
        raise FileNotFoundError(ZIP_PATH)

    print(f'üì¶ Extracting project from {ZIP_PATH}...')
    with zipfile.ZipFile(ZIP_PATH, 'r') as project_archive:
        project_archive.extractall('/content')
    print('‚úì Project extracted')

    # Work from the project root and make its modules importable.
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)
    print('‚úì Colab ready')

In [None]:
# KAGGLE Setup - Clone from GitHub (public repo)
if ENV == 'kaggle':
    PROJECT_DIR = '/kaggle/working/pronuncIAtion'
    DRIVE_BACKUP = '/kaggle/working/checkpoints'
    
    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    
    if not os.path.exists(PROJECT_DIR):
        print('üì¶ Cloning from GitHub...')
        !git clone https://github.com/maurocarlu/pronuncIAtion.git $PROJECT_DIR
        print('‚úì Repository cloned')
    else:
        print('‚úì Repository gi√† presente, pulling updates...')
        !cd $PROJECT_DIR && git pull
    
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)
    
    !pip install -q speechtokenizer bitsandbytes accelerate soundfile librosa
    
    print(f'‚úì Kaggle ready')
    print(f'üìÅ Project: {os.getcwd()}')

In [None]:
# LOCAL setup: derive the project root from the current working directory.
if ENV == 'local':
    PROJECT_DIR = os.getcwd()
    # Step up one level only when the notebook is launched from the project's
    # `notebooks` folder itself. Comparing the final path component (instead of
    # searching the whole path for the substring) leaves unrelated locations
    # such as `/home/me/notebooks_archive/project` untouched.
    if os.path.basename(PROJECT_DIR) == 'notebooks':
        PROJECT_DIR = os.path.dirname(PROJECT_DIR)
    DRIVE_BACKUP = f'{PROJECT_DIR}/outputs'

# The statements below run in every environment. They use whatever values of
# PROJECT_DIR and DRIVE_BACKUP the matching setup branch assigned.
os.makedirs(DRIVE_BACKUP, exist_ok=True)
if os.path.exists(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)

print(f'üìÅ Project: {PROJECT_DIR}')
print(f'üíæ Checkpoints: {DRIVE_BACKUP}')

In [None]:
# Install dependencies
!pip install -q transformers datasets evaluate jiwer soundfile librosa
import torch
print(f'üî• PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')

---
## 2. Configuration

In [None]:
# Paths and settings interpolated into the training / evaluation commands below.
processed_dir = f'{PROJECT_DIR}/data/processed'
CONFIG = dict(
    csv_path=f'{processed_dir}/combined_augmented.csv',
    vocab_path=f'{processed_dir}/vocab.json',
    audio_base=PROJECT_DIR,
    epochs=10,
    output_base=DRIVE_BACKUP,
)

# Report, for every entry whose key contains 'path', whether the file exists.
for key, value in CONFIG.items():
    if 'path' not in key:
        continue
    marker = '‚úì' if os.path.exists(value) else '‚úó'
    print(f"{marker} {key}: {value}")

---
## 3. Resume from Checkpoint (Kaggle)

In [None]:
# Unpack the most recent checkpoint archive for resuming (run ONLY when you want to resume).
MODEL_TO_RESUME = 'wav2vec2'  # Alternatives: whisper_encoder, speechtokenizer, qwen_audio

if ENV == 'kaggle':
    backup_dir = '/kaggle/working/drive_backup'
    output_dir = f'{DRIVE_BACKUP}/{MODEL_TO_RESUME}'

    # Collect every checkpoint ZIP stored for the selected model.
    pattern = f'{backup_dir}/{MODEL_TO_RESUME}_checkpoint-*.zip'
    checkpoint_zips = glob.glob(pattern)

    if not checkpoint_zips:
        print(f'‚ùå Nessun checkpoint trovato per {MODEL_TO_RESUME}')
        print(f'   Pattern cercato: {pattern}')
    else:
        def get_step(path):
            """Return the step number found after 'checkpoint-' in path, or 0 if none."""
            found = re.search(r'checkpoint-(\d+)', path)
            if found is None:
                return 0
            return int(found.group(1))

        # Highest step first, so the head of the list is the newest archive.
        checkpoint_zips.sort(key=get_step, reverse=True)
        latest_zip = checkpoint_zips[0]
        step_num = get_step(latest_zip)

        print(f'üì¶ Trovato checkpoint: {latest_zip}')
        print(f'   Step: {step_num}')

        # Extract into <output_dir>/checkpoint-<step>.
        checkpoint_dir = f'{output_dir}/checkpoint-{step_num}'
        os.makedirs(output_dir, exist_ok=True)

        with zipfile.ZipFile(latest_zip, 'r') as checkpoint_archive:
            checkpoint_archive.extractall(checkpoint_dir)

        print(f'‚úì Checkpoint estratto in: {checkpoint_dir}')
        print('\nüöÄ Ora puoi eseguire il training con --resume')

---
## 4. Training - SOTA Models

In [None]:
# 1. WAV2VEC2 LARGE (add --resume to resume a previous run)
# Runs scripts/training/train_wav2vec2.py, passing the CSV, vocabulary and audio
# base paths from CONFIG, "<output_base>/wav2vec2" as --output-dir, the epoch
# count from CONFIG and a batch size of 4.
# NOTE(review): --resume is already present on the last line of the command;
# remove it for a run from scratch — confirm how the script handles --resume
# when no checkpoint exists.
!python scripts/training/train_wav2vec2.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/wav2vec2" \
    --epochs {CONFIG['epochs']} --batch-size 4 --resume

In [None]:
# 2. WHISPER ENCODER
# Runs scripts/training/train_whisper_encoder.py, passing the CSV, vocabulary
# and audio base paths from CONFIG, "<output_base>/whisper_encoder" as
# --output-dir, the epoch count from CONFIG and a batch size of 4.
!python scripts/training/train_whisper_encoder.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/whisper_encoder" \
    --epochs {CONFIG['epochs']} --batch-size 4

In [None]:
# 3. SPEECHTOKENIZER (Discrete)
# Runs scripts/training/train_speechtokenizer.py, passing the CSV, vocabulary
# and audio base paths from CONFIG, "<output_base>/speechtokenizer" as
# --output-dir, the epoch count from CONFIG and a batch size of 8.
!python scripts/training/train_speechtokenizer.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/speechtokenizer" \
    --epochs {CONFIG['epochs']} --batch-size 8

In [None]:
# 4. QWEN2-AUDIO (4-bit)
# Runs scripts/training/train_qwen_audio.py, passing the CSV, vocabulary and
# audio base paths from CONFIG, "<output_base>/qwen_audio" as --output-dir,
# the epoch count from CONFIG and a batch size of 2.
!python scripts/training/train_qwen_audio.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/qwen_audio" \
    --epochs {CONFIG['epochs']} --batch-size 2

---
## 5. Evaluation

In [None]:
# Name of the model to evaluate. It selects both the sub-folder of
# CONFIG['output_base'] and the `final_model_<MODEL>` directory inside it that
# is handed to the evaluation script as --model-path.
MODEL = 'wav2vec2'
!python scripts/evaluation/evaluate_speechocean.py \
    --model-path "{CONFIG['output_base']}/{MODEL}/final_model_{MODEL}"

---
## 6. Utilities

In [None]:
# Pulizia disco
if ENV == 'kaggle':
    import shutil
    folders = ['/kaggle/working/checkpoints', '/root/.cache/huggingface']
    for f in folders:
        if os.path.exists(f):
            shutil.rmtree(f)
            print(f'üóëÔ∏è Deleted: {f}')
    !df -h /kaggle/working

In [None]:
# Download checkpoints: zip each model's output folder into /kaggle/working.
if ENV == 'kaggle':
    import shutil

    exportable_models = ('wav2vec2', 'whisper_encoder', 'speechtokenizer', 'qwen_audio')
    for model_name in exportable_models:
        source_dir = f'{DRIVE_BACKUP}/{model_name}'
        # Models that have no output folder are skipped silently.
        if not os.path.exists(source_dir):
            continue
        # make_archive appends the '.zip' extension to this base name.
        archive_base = f'/kaggle/working/{model_name}_checkpoint'
        shutil.make_archive(archive_base, 'zip', source_dir)
        print(f'‚úì {model_name}: {archive_base}.zip')
    print('\nüì• Scarica gli ZIP dal pannello Output!')