# 🚀 Unified Trainer - Phoneme Recognition Benchmark

## Models
1. Baseline MLP (Linear Probe)
2. HuBERT Large
3. WavLM Weighted
4. XLS-R
5. **Wav2Vec2 Large**
6. **Whisper Encoder**
7. **SpeechTokenizer**
8. **Qwen2-Audio**

## 1. Environment Setup

In [None]:
import os, sys, zipfile

def detect_environment():
    """Identify the platform this notebook is running on.

    Returns:
        'colab' when the ``COLAB_GPU`` environment variable is set or the
        ``google.colab`` module is already loaded; 'kaggle' when the current
        working directory contains ``/kaggle`` or the
        ``KAGGLE_KERNEL_RUN_TYPE`` environment variable is set; 'local' in
        every other case.
    """
    env_vars = os.environ
    on_colab = 'google.colab' in sys.modules or 'COLAB_GPU' in env_vars
    if on_colab:
        return 'colab'
    # Colab takes precedence: the Kaggle markers are only inspected afterwards.
    if '/kaggle' in os.getcwd() or 'KAGGLE_KERNEL_RUN_TYPE' in env_vars:
        return 'kaggle'
    return 'local'


# Resolved once; the setup cells branch on this value.
ENV = detect_environment()
print(f'üñ•Ô∏è Ambiente: {ENV.upper()}')

In [None]:
# COLAB setup: mount Google Drive and unpack the project archive (no GitHub clone).
if ENV == 'colab':
    from google.colab import drive

    # Drive has to be mounted before any path under /content/drive is used.
    drive.mount('/content/drive')

    # Folder on Drive used as the checkpoint/backup location.
    DRIVE_BACKUP = '/content/drive/MyDrive/phoneme_checkpoints'
    os.makedirs(DRIVE_BACKUP, exist_ok=True)

    PROJECT_DIR = '/content/DeepLearning-Phoneme'
    # Archive on Drive that contains both the code and the data.
    ZIP_PATH = '/content/drive/MyDrive/DeepLearning-Phoneme.zip'

    # Stop early, with upload instructions, when the archive is missing.
    if not os.path.exists(ZIP_PATH):
        print(f'‚ùå ERROR: ZIP non trovato: {ZIP_PATH}')
        print('Carica DeepLearning-Phoneme.zip su Google Drive in MyDrive/')
        raise FileNotFoundError(ZIP_PATH)

    # The archive is unpacked into /content and is expected to produce
    # PROJECT_DIR; the os.chdir below fails otherwise.
    print(f'üì¶ Extracting project from {ZIP_PATH}...')
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall('/content')
    print('‚úì Project extracted')

    # Work from the project root and put it first on the import path.
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)

    print('‚úì Colab ready')
    print(f'üìÅ Working dir: {os.getcwd()}')

In [None]:
# KAGGLE Setup - Use Kaggle Dataset (NO GITHUB!)
if ENV == 'kaggle':
    import shutil
    
    # Il dataset √® in /kaggle/input/deeplearning-phoneme/DeepLearning-Phoneme
    INPUT_DIR = '/kaggle/input/deeplearning-phoneme/DeepLearning-Phoneme'
    PROJECT_DIR = '/kaggle/working/DeepLearning-Phoneme'
    DRIVE_BACKUP = '/kaggle/working/checkpoints'
    
    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    
    # Copy from input to working (Kaggle input is read-only)
    if os.path.exists(INPUT_DIR) and not os.path.exists(PROJECT_DIR):
        print(f'üì¶ Copying project from {INPUT_DIR}...')
        shutil.copytree(INPUT_DIR, PROJECT_DIR)
        print('‚úì Project copied')
    elif os.path.exists(PROJECT_DIR):
        print('‚úì Project gi√† presente in working directory')
    else:
        print('‚ùå ERROR: Dataset non trovato!')
        print(f'Path atteso: {INPUT_DIR}')
        print('Aggiungi il dataset "deeplearning-phoneme" al notebook Kaggle')
        raise FileNotFoundError(INPUT_DIR)
    
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)
    
    # Install dependencies
    !pip install -q speechtokenizer bitsandbytes accelerate
    
    print(f'‚úì Kaggle ready')
    print(f'üìÅ Project: {os.getcwd()}')
    print(f'üíæ Checkpoints: {DRIVE_BACKUP}')

In [None]:
# LOCAL setup
if ENV == 'local':
    PROJECT_DIR = os.getcwd()
    # Step up to the project root only when the notebook is launched from the
    # notebooks/ folder itself. The last path component is compared instead of
    # searching the whole path, so a project stored under e.g.
    # ~/notebooks/DeepLearning-Phoneme is no longer mistaken for that folder.
    if os.path.basename(PROJECT_DIR) == 'notebooks':
        PROJECT_DIR = os.path.dirname(PROJECT_DIR)
    DRIVE_BACKUP = f'{PROJECT_DIR}/outputs'

# Common tail for every environment: PROJECT_DIR and DRIVE_BACKUP were set
# either above or by the Colab/Kaggle setup cell.
os.makedirs(DRIVE_BACKUP, exist_ok=True)
if os.path.exists(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
    # Keep PROJECT_DIR first on the import path without stacking a duplicate
    # entry when a previous cell (or a re-run of this one) already put it there.
    if sys.path[:1] != [PROJECT_DIR]:
        sys.path.insert(0, PROJECT_DIR)

print(f'üìÅ Project: {PROJECT_DIR}')
print(f'üíæ Checkpoints: {DRIVE_BACKUP}')

In [None]:
# Install dependencies
!pip install -q transformers datasets evaluate jiwer soundfile librosa torchcodec
import torch
print(f'üî• PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')

---
## 2. Configuration

In [None]:
# Paths and settings interpolated into the training and evaluation commands.
CONFIG = dict(
    csv_path=f'{PROJECT_DIR}/data/processed/combined_augmented.csv',
    vocab_path=f'{PROJECT_DIR}/data/processed/vocab.json',
    audio_base=PROJECT_DIR,
    epochs=10,
    output_base=DRIVE_BACKUP,
)

# Print an existence check for every entry whose key contains 'path'.
for name, location in CONFIG.items():
    if 'path' not in name:
        continue
    marker = '‚úì' if os.path.exists(location) else '‚úó'
    print(f"{marker} {name}: {location}")

---
## 3. Training - Existing Models

In [None]:
# Baseline MLP
# Shell escape: runs scripts/training/train_baseline_mlp.py with the dataset
# CSV, vocabulary, audio base directory and epoch count taken from CONFIG
# (IPython expands the {...} expressions before the command is executed).
# Output directory: <output_base>/baseline_mlp.
# NOTE(review): this cell passes --csv-path while the other training cells
# pass --data-csv — confirm against the script's argument parser.
!python scripts/training/train_baseline_mlp.py \
    --csv-path "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/baseline_mlp" \
    --epochs {CONFIG['epochs']}

In [None]:
# HuBERT Large
# Shell escape: runs scripts/training/train_hubert.py with the dataset CSV,
# vocabulary, audio base directory and epoch count taken from CONFIG.
# Output directory: <output_base>/hubert_large.
# No --batch-size is passed here; presumably the script's own default applies.
!python scripts/training/train_hubert.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/hubert_large" \
    --epochs {CONFIG['epochs']}

---
## 4. Training - NEW SOTA Models

In [None]:
# 1. WAV2VEC2 LARGE
# Shell escape: runs scripts/training/train_wav2vec2.py with the dataset CSV,
# vocabulary, audio base directory and epoch count taken from CONFIG, and an
# explicit batch size of 4. Output directory: <output_base>/wav2vec2.
!python scripts/training/train_wav2vec2.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/wav2vec2" \
    --epochs {CONFIG['epochs']} --batch-size 4

In [None]:
# 2. WHISPER ENCODER
# Shell escape: runs scripts/training/train_whisper_encoder.py with the
# dataset CSV, vocabulary, audio base directory and epoch count taken from
# CONFIG, and an explicit batch size of 4.
# Output directory: <output_base>/whisper_encoder.
!python scripts/training/train_whisper_encoder.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/whisper_encoder" \
    --epochs {CONFIG['epochs']} --batch-size 4

In [None]:
# 3. SPEECHTOKENIZER (Discrete)
# Shell escape: runs scripts/training/train_speechtokenizer.py with the
# dataset CSV, vocabulary, audio base directory and epoch count taken from
# CONFIG, and an explicit batch size of 8.
# Output directory: <output_base>/speechtokenizer.
# NOTE(review): presumably needs the `speechtokenizer` package to be
# installed in the active environment — confirm.
!python scripts/training/train_speechtokenizer.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/speechtokenizer" \
    --epochs {CONFIG['epochs']} --batch-size 8

In [None]:
# 4. QWEN2-AUDIO (4-bit)
# Shell escape: runs scripts/training/train_qwen_audio.py with the dataset
# CSV, vocabulary, audio base directory and epoch count taken from CONFIG,
# and an explicit batch size of 2. Output directory: <output_base>/qwen_audio.
# NOTE(review): no quantization flag is passed on this command line, so the
# "4-bit" in the title is presumably handled inside the script — confirm.
!python scripts/training/train_qwen_audio.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/qwen_audio" \
    --epochs {CONFIG['epochs']} --batch-size 2

---
## 5. Evaluation

In [None]:
# Evaluate (replace MODEL with: wav2vec2, whisper_encoder, speechtokenizer, qwen_audio)
# MODEL selects which training output is evaluated: the evaluation script
# receives <output_base>/<MODEL>/final_model_<MODEL> as --model-path.
# NOTE(review): assumes every training script saves its final model under
# that directory name — confirm before evaluating a given model.
MODEL = 'wav2vec2'
!python scripts/evaluation/evaluate_speechocean.py \
    --model-path "{CONFIG['output_base']}/{MODEL}/final_model_{MODEL}"

---
## 6. Download Checkpoints (Kaggle only)

In [None]:
# KAGGLE: the checkpoints are in /kaggle/working/checkpoints.
# They can be downloaded from the Output panel after the commit.
if ENV == 'kaggle':
    import shutil

    # Bundle each model's output folder into its own ZIP for download.
    for model_name in ('wav2vec2', 'whisper_encoder', 'speechtokenizer', 'qwen_audio'):
        checkpoint_dir = f'{DRIVE_BACKUP}/{model_name}'
        if not os.path.exists(checkpoint_dir):
            continue  # no output folder for this model
        archive_base = f'/kaggle/working/{model_name}_checkpoint'
        # make_archive appends the '.zip' extension to the base name itself.
        shutil.make_archive(archive_base, 'zip', checkpoint_dir)
        print(f'‚úì {model_name}: {archive_base}.zip')
    print('\nüì• Scarica gli ZIP dal pannello Output!')