# 🚀 Unified Trainer - Phoneme Recognition Benchmark

In [None]:
import os, sys, zipfile, glob, re, shutil

def detect_environment():
    """Identify the runtime hosting this notebook.

    Returns:
        'colab' when the COLAB_GPU variable is set or google.colab is loaded,
        'kaggle' when the working directory contains '/kaggle' or
        KAGGLE_KERNEL_RUN_TYPE is set, and 'local' in every other case.
    """
    env_vars = os.environ
    # Colab is tested first, so it wins if both sets of markers are present.
    on_colab = 'COLAB_GPU' in env_vars or 'google.colab' in sys.modules
    if on_colab:
        return 'colab'
    on_kaggle = '/kaggle' in os.getcwd() or 'KAGGLE_KERNEL_RUN_TYPE' in env_vars
    return 'kaggle' if on_kaggle else 'local'

# Detect the runtime once; the setup cells that follow branch on this value
# ('colab', 'kaggle' or 'local').
ENV = detect_environment()
print(f'üñ•Ô∏è Ambiente: {ENV.upper()}')  # report the detected environment in upper case

In [None]:
# COLAB setup: mount Drive, unpack the project archive, switch into the project
if ENV == 'colab':
    from google.colab import drive
    drive.mount('/content/drive')

    DRIVE_BACKUP = '/content/drive/MyDrive/phoneme_checkpoints'
    PROJECT_DIR = '/content/DeepLearning-Phoneme'
    ZIP_PATH = '/content/drive/MyDrive/DeepLearning-Phoneme.zip'

    # Without the archive on Drive there is nothing to unpack, so stop here.
    if not os.path.exists(ZIP_PATH):
        raise FileNotFoundError(ZIP_PATH)

    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall('/content')
    print('‚úì Extracted')

    # Checkpoints go to Drive; the project directory becomes cwd and importable.
    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)

In [None]:
# KAGGLE setup: clone the repository if needed, link the input dataset into
# the project, then switch into the project directory.
if ENV == 'kaggle':
    PROJECT_DIR = '/kaggle/working/pronuncIAtion'
    DRIVE_BACKUP = '/kaggle/working/checkpoints'

    if not os.path.exists(PROJECT_DIR):
        import subprocess
        # check=True: a failed clone raises CalledProcessError here instead of
        # surfacing later as a confusing error from os.chdir(PROJECT_DIR).
        subprocess.run(
            ['git', 'clone', 'https://github.com/maurocarlu/pronuncIAtion.git', PROJECT_DIR],
            check=True,
        )

    DATA_INPUT = '/kaggle/input/pronunciation-data/data'
    DATA_TARGET = f'{PROJECT_DIR}/data'

    if os.path.islink(DATA_TARGET):
        print('‚úì Data symlink exists')
    elif os.path.exists(DATA_INPUT):
        # Only replace an existing data directory when the dataset input is
        # really there; otherwise we would delete it and leave a dangling link.
        if os.path.exists(DATA_TARGET):
            shutil.rmtree(DATA_TARGET)
        os.symlink(DATA_INPUT, DATA_TARGET)
    else:
        print(f'WARNING: {DATA_INPUT} not found; leaving {DATA_TARGET} untouched')

    # Checkpoint folder, working directory and import path for the project.
    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)
    print(f'‚úì Kaggle ready: {PROJECT_DIR}')

In [None]:
# LOCAL setup: use the current directory as the project root, stepping up one
# level when the notebook was started from the notebooks/ folder.
if ENV == 'local':
    PROJECT_DIR = os.getcwd()
    # Compare the last path component only. A substring test also fires on
    # unrelated components (e.g. a project folder named "my-notebooks-project"),
    # which moved PROJECT_DIR one level too high when the cell was re-run.
    if os.path.basename(PROJECT_DIR) == 'notebooks':
        PROJECT_DIR = os.path.dirname(PROJECT_DIR)
    DRIVE_BACKUP = f'{PROJECT_DIR}/outputs'

# These statements run in every environment: make sure the checkpoint folder
# exists, work from the project root, and make the project importable.
os.makedirs(DRIVE_BACKUP, exist_ok=True)
os.chdir(PROJECT_DIR)
sys.path.insert(0, PROJECT_DIR)
print(f'üìÅ Project: {PROJECT_DIR}')
print(f'üíæ Checkpoints: {DRIVE_BACKUP}')

In [None]:
# Install the Python packages used by the training scripts into the
# interpreter that runs this notebook, then report the PyTorch/CUDA status.
import subprocess
pip_result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'transformers', 'datasets', 'evaluate', 'jiwer', 'soundfile', 'librosa', 'bitsandbytes'])
# pip failures used to pass unnoticed; report them but keep going, since some
# packages may be optional on a given machine.
if pip_result.returncode != 0:
    print(f'WARNING: pip install exited with code {pip_result.returncode}; some packages may be missing')
import torch
print(f'üî• PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')

## Configuration

In [None]:
# Shared settings read by every training cell.
CONFIG = dict(
    csv_path=f'{PROJECT_DIR}/data/processed/combined_augmented.csv',
    vocab_path=f'{PROJECT_DIR}/data/processed/vocab.json',
    audio_base=PROJECT_DIR,
    epochs=10,
    output_base=DRIVE_BACKUP,
)

# Print an existence marker for each entry whose key contains 'path'.
for key, value in CONFIG.items():
    if 'path' not in key:
        continue
    mark = '‚úì' if os.path.exists(value) else '‚úó'
    print(f'{mark} {key}: {value}')

## Training - Choose Model

In [None]:
# 1. WAV2VEC2-BERT 2.0 (Recommended - Stable)
# Launches the training script with the current interpreter; the script path
# is relative to the working directory.
import subprocess

w2v2_bert_options = [
    ('--data-csv', CONFIG['csv_path']),
    ('--vocab-path', CONFIG['vocab_path']),
    ('--audio-base', CONFIG['audio_base']),
    ('--output-dir', f"{CONFIG['output_base']}/w2v2_bert"),
    ('--epochs', str(CONFIG['epochs'])),
    ('--batch-size', '8'),
]
w2v2_bert_cmd = [sys.executable, 'scripts/training/train_w2v2_bert.py']
w2v2_bert_cmd += [token for option in w2v2_bert_options for token in option]
subprocess.run(w2v2_bert_cmd)

In [None]:
# 2. MMS 1B (Massively Multilingual Speech)
# Note: Requires 16GB VRAM or use --use-4bit for smaller GPUs
# Launches the training script with the current interpreter; the script path
# is relative to the working directory.
import subprocess

mms_options = [
    ('--data-csv', CONFIG['csv_path']),
    ('--vocab-path', CONFIG['vocab_path']),
    ('--audio-base', CONFIG['audio_base']),
    ('--output-dir', f"{CONFIG['output_base']}/mms"),
    ('--epochs', str(CONFIG['epochs'])),
    ('--batch-size', '8'),
]
mms_cmd = [sys.executable, 'scripts/training/train_mms.py']
mms_cmd += [token for option in mms_options for token in option]
subprocess.run(mms_cmd)

In [None]:
# 3. WHISPER ENCODER
# Launches the training script with the current interpreter; the script path
# is relative to the working directory.
import subprocess

whisper_options = [
    ('--data-csv', CONFIG['csv_path']),
    ('--vocab-path', CONFIG['vocab_path']),
    ('--audio-base', CONFIG['audio_base']),
    ('--output-dir', f"{CONFIG['output_base']}/whisper_encoder"),
    ('--epochs', str(CONFIG['epochs'])),
    ('--batch-size', '4'),
]
whisper_cmd = [sys.executable, 'scripts/training/train_whisper_encoder.py']
whisper_cmd += [token for option in whisper_options for token in option]
subprocess.run(whisper_cmd)

In [None]:
# 4. QWEN2-AUDIO (Linear Probe - encoder frozen)
# Launches the training script with the current interpreter; the script path
# is relative to the working directory.
import subprocess

qwen_options = [
    ('--data-csv', CONFIG['csv_path']),
    ('--vocab-path', CONFIG['vocab_path']),
    ('--audio-base', CONFIG['audio_base']),
    ('--output-dir', f"{CONFIG['output_base']}/qwen_audio"),
    ('--epochs', str(CONFIG['epochs'])),
    ('--batch-size', '2'),
]
qwen_cmd = [sys.executable, 'scripts/training/train_qwen_audio.py']
qwen_cmd += [token for option in qwen_options for token in option]
subprocess.run(qwen_cmd)

In [None]:
# 5. SPEECHTOKENIZER
# Launches the training script with the current interpreter; the script path
# is relative to the working directory.
import subprocess

speechtokenizer_options = [
    ('--data-csv', CONFIG['csv_path']),
    ('--vocab-path', CONFIG['vocab_path']),
    ('--audio-base', CONFIG['audio_base']),
    ('--output-dir', f"{CONFIG['output_base']}/speechtokenizer"),
    ('--epochs', str(CONFIG['epochs'])),
    ('--batch-size', '4'),
]
speechtokenizer_cmd = [sys.executable, 'scripts/training/train_speechtokenizer.py']
speechtokenizer_cmd += [token for option in speechtokenizer_options for token in option]
subprocess.run(speechtokenizer_cmd)

In [None]:
# 6. WAV2VEC2 PHONEME (lv60-pmp - Domain Init)
# Launches the training script with the current interpreter; the script path
# is relative to the working directory. This is the only cell that passes an
# explicit learning rate.
import subprocess

wav2vec2_phoneme_options = [
    ('--data-csv', CONFIG['csv_path']),
    ('--vocab-path', CONFIG['vocab_path']),
    ('--audio-base', CONFIG['audio_base']),
    ('--output-dir', f"{CONFIG['output_base']}/wav2vec2_phoneme"),
    ('--epochs', str(CONFIG['epochs'])),
    ('--batch-size', '4'),
    ('--learning-rate', '3e-5'),
]
wav2vec2_phoneme_cmd = [sys.executable, 'scripts/training/train_wav2vec2_phoneme.py']
wav2vec2_phoneme_cmd += [token for option in wav2vec2_phoneme_options for token in option]
subprocess.run(wav2vec2_phoneme_cmd)

## Utilities

In [None]:
# Cleanup disk (Kaggle): delete the checkpoint folder and the
# /root/.cache/huggingface directory. Symlinks and missing paths are skipped.
if ENV == 'kaggle':
    removable = ('/kaggle/working/checkpoints', '/root/.cache/huggingface')
    for target in removable:
        if not os.path.exists(target) or os.path.islink(target):
            continue
        shutil.rmtree(target)
        print(f'üóëÔ∏è {target}')

In [None]:
# Download checkpoints as ZIP (Kaggle): archive every model output folder
# that exists under DRIVE_BACKUP into /kaggle/working/<model>_ckpt.zip.
if ENV == 'kaggle':
    # One entry per training cell's output folder; 'wav2vec2_phoneme' was
    # missing, so that model's checkpoints were never archived.
    for model in ['w2v2_bert', 'mms', 'whisper_encoder', 'qwen_audio', 'speechtokenizer', 'wav2vec2_phoneme']:
        p = f'{DRIVE_BACKUP}/{model}'
        # Models that were not trained have no folder and are skipped.
        if os.path.exists(p):
            shutil.make_archive(f'/kaggle/working/{model}_ckpt', 'zip', p)
            print(f'‚úì {model}')

In [None]:
# Download checkpoints via browser (Colab only).
# For each model, zip its final_model folder (or, failing that, its
# highest-numbered checkpoint-* folder) into /content and hand the archive
# to the browser download helper.
if ENV == 'colab':
    from google.colab import files
    import datetime

    # One entry per training cell's output folder; 'wav2vec2_phoneme' was
    # missing, so that model could never be downloaded.
    models_to_download = ['w2v2_bert', 'mms', 'whisper_encoder', 'qwen_audio', 'speechtokenizer', 'wav2vec2_phoneme']

    for model in models_to_download:
        model_dir = f'{DRIVE_BACKUP}/{model}'
        final_model = f'{model_dir}/final_model'

        # Prefer final_model if it exists, otherwise use the latest checkpoint
        if os.path.exists(final_model):
            source_dir = final_model
            zip_name = f'{model}_final'
        elif os.path.exists(model_dir):
            # Order checkpoints by the number after the last '-'; names with a
            # non-numeric suffix sort first (key 0).
            checkpoints = sorted(glob.glob(f'{model_dir}/checkpoint-*'),
                                 key=lambda x: int(x.split('-')[-1]) if x.split('-')[-1].isdigit() else 0)
            if not checkpoints:
                continue  # model folder exists but holds no checkpoints
            source_dir = checkpoints[-1]
            zip_name = f'{model}_{os.path.basename(source_dir)}'
        else:
            continue  # model was never trained

        # Create ZIP in /content (faster than Drive)
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')
        zip_path = f'/content/{zip_name}_{timestamp}'
        print(f'üì¶ Zipping {source_dir}...')
        # make_archive appends the '.zip' extension to the base name itself.
        shutil.make_archive(zip_path, 'zip', source_dir)
        zip_file = f'{zip_path}.zip'
        size_mb = os.path.getsize(zip_file) / (1024*1024)
        print(f'‚úì Created {zip_file} ({size_mb:.1f} MB)')

        # Trigger browser download
        print('‚¨áÔ∏è Starting download...')
        files.download(zip_file)
        print(f'‚úì {model} download complete!')