# 🚀 Unified Trainer - Phoneme Recognition Benchmark

In [None]:
import os, sys, zipfile, glob, re, shutil

def detect_environment():
    """Identify which runtime the notebook is executing in.

    Returns:
        'colab' when the ``COLAB_GPU`` environment variable is set or the
        ``google.colab`` module has already been imported; 'kaggle' when the
        working directory is ``/kaggle`` or lies below it, or when
        ``KAGGLE_KERNEL_RUN_TYPE`` is set; 'local' otherwise.
    """
    if 'COLAB_GPU' in os.environ or 'google.colab' in sys.modules:
        return 'colab'

    # Match on the path prefix instead of a substring search, so that a local
    # checkout such as ~/projects/kaggle/... is not mistaken for Kaggle.
    cwd = os.getcwd()
    under_kaggle_root = cwd == '/kaggle' or cwd.startswith('/kaggle/')
    if under_kaggle_root or 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
        return 'kaggle'
    return 'local'

# Resolve the runtime once; the setup cells that follow branch on ENV.
ENV = detect_environment()
print('üñ•Ô∏è Ambiente: {}'.format(ENV.upper()))

In [None]:
# COLAB Setup
# Mounts Google Drive, unpacks the project archive stored there into /content,
# then makes the project directory the working directory and importable.
if ENV == 'colab':
    from google.colab import drive
    drive.mount('/content/drive')

    DRIVE_BACKUP = '/content/drive/MyDrive/phoneme_checkpoints'
    PROJECT_DIR = '/content/DeepLearning-Phoneme'
    ZIP_PATH = '/content/drive/MyDrive/DeepLearning-Phoneme.zip'

    # Stop immediately when the archive is missing; nothing below can work
    # without the extracted project.
    if not os.path.exists(ZIP_PATH):
        raise FileNotFoundError(ZIP_PATH)

    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall('/content')
    print('‚úì Extracted')

    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)

In [None]:
# KAGGLE Setup - GitHub (code) + Dataset (data)
if ENV == 'kaggle':
    PROJECT_DIR = '/kaggle/working/pronuncIAtion'
    DRIVE_BACKUP = '/kaggle/working/checkpoints'
    
    # 1. Clone code from GitHub
    if not os.path.exists(PROJECT_DIR):
        print('üì¶ Cloning from GitHub...')
        !git clone https://github.com/maurocarlu/pronuncIAtion.git $PROJECT_DIR
    else:
        !cd $PROJECT_DIR && git pull
    
    # 2. Link data from Kaggle Dataset (instead of copy - saves disk space)
    DATA_INPUT = '/kaggle/input/pronunciation-data/data'
    DATA_TARGET = f'{PROJECT_DIR}/data'
    
    # Remove existing empty data folder and create symlink
    if os.path.islink(DATA_TARGET):
        print('‚úì Data symlink exists')
    elif os.path.exists(DATA_TARGET):
        print('üóëÔ∏è Removing empty data folder...')
        shutil.rmtree(DATA_TARGET)
        os.symlink(DATA_INPUT, DATA_TARGET)
        print(f'‚úì Created symlink: {DATA_TARGET} -> {DATA_INPUT}')
    elif os.path.exists(DATA_INPUT):
        os.symlink(DATA_INPUT, DATA_TARGET)
        print(f'‚úì Created symlink: {DATA_TARGET} -> {DATA_INPUT}')
    else:
        print('‚ùå Dataset not found!')
        print('Add "pronunciation-data" dataset to the notebook')
        !ls -la /kaggle/input/
    
    # Verify audio files exist
    test_audio = f'{DATA_TARGET}/augmented_focused/audio'
    if os.path.exists(test_audio):
        audio_count = len(os.listdir(test_audio))
        print(f'‚úì Audio files found: {audio_count} in augmented_focused')
    else:
        print(f'‚ùå Audio folder not found: {test_audio}')
    
    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)
    
    !pip install -q soundfile librosa
    
    print(f'\n‚úì Kaggle ready')
    print(f'üìÅ Project: {PROJECT_DIR}')

In [None]:
# LOCAL Setup
# Uses the current working directory as the project root and stores
# checkpoints in <project>/outputs.
if ENV == 'local':
    PROJECT_DIR = os.getcwd()
    # Step up one level only when the notebook was started from a directory
    # literally named "notebooks". A substring test on the whole path would
    # also fire for unrelated components (e.g. ~/notebooks_archive/project).
    if os.path.basename(PROJECT_DIR) == 'notebooks':
        PROJECT_DIR = os.path.dirname(PROJECT_DIR)
    DRIVE_BACKUP = f'{PROJECT_DIR}/outputs'

# The statements below run for every environment and rely on PROJECT_DIR and
# DRIVE_BACKUP having been set by whichever setup cell matched ENV.
os.makedirs(DRIVE_BACKUP, exist_ok=True)
os.chdir(PROJECT_DIR)
sys.path.insert(0, PROJECT_DIR)

print(f'üìÅ Project: {PROJECT_DIR}')
print(f'üíæ Checkpoints: {DRIVE_BACKUP}')

In [None]:
!pip install -q transformers datasets evaluate jiwer soundfile librosa
import torch
print(f'üî• PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')

## Configuration

In [None]:
# Shared settings interpolated into the training commands further down.
CONFIG = {
    'csv_path': f'{PROJECT_DIR}/data/processed/combined_augmented.csv',
    'vocab_path': f'{PROJECT_DIR}/data/processed/vocab.json',
    'audio_base': PROJECT_DIR,
    'epochs': 10,
    'output_base': DRIVE_BACKUP,
}

# Verify paths: only entries whose key contains "path" are checked on disk.
for k, v in CONFIG.items():
    if 'path' in k:
        print(f"{'‚úì' if os.path.exists(v) else '‚úó'} {k}: {v}")

# === DEBUG: VERIFY DATA LOADING ===
print("\nüîç DEBUG: Testing Data Loading & Audio...")
import pandas as pd
import librosa
import numpy as np

# Best-effort diagnostic: any failure is reported and the notebook continues.
try:
    # 1. Check CSV
    df = pd.read_csv(CONFIG['csv_path'])
    print(f"   CSV Samples: {len(df)}")
    print(f"   Columns: {list(df.columns)}")

    # 2. Check audio loading for up to 3 random samples. DataFrame.sample
    #    raises ValueError when asked for more rows than exist, so cap the
    #    request at the row count to keep small CSVs working.
    samples = df.sample(min(3, len(df)))
    for idx, row in samples.iterrows():
        path = row['audio_path']
        # Handle windows backslashes in path just in case
        clean_path = path.replace('\\', '/')
        full_path = f"{CONFIG['audio_base']}/{clean_path}"

        print(f"\n   Testing sample {idx}: {full_path}")
        if os.path.exists(full_path):
            # Load resampled to 16 kHz.
            y, sr = librosa.load(full_path, sr=16000)
            duration = len(y)/sr
            # Flag clips whose samples all lie within 1e-3 of zero.
            is_silent = np.allclose(y, 0, atol=1e-3)
            print(f"   ‚úì Loaded: {duration:.2f}s | SR: {sr}")
            print(f"   ‚úì Silent: {is_silent} | Range: [{y.min():.3f}, {y.max():.3f}]")
            print(f"   IPA: /{row['ipa_clean']}/")
        else:
            print(f"   ‚ùå File NOT found!")
except Exception as e:
    print(f"‚ùå DATA DEBUG FAILED: {e}")
print("\n" + "="*40)

## Training

In [None]:
# WAV2VEC2
# Runs scripts/training/train_wav2vec2.py through the IPython `!` shell escape
# (the script path is relative to the current working directory). The {...}
# parts are Python expressions that IPython substitutes into the command line
# before handing it to the shell; paths are quoted so spaces survive.
# Outputs go to <output_base>/wav2vec2.
!python scripts/training/train_wav2vec2.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/wav2vec2" \
    --epochs {CONFIG['epochs']} \
    --learning-rate 3e-4 \
    --batch-size 4

In [None]:
# WHISPER ENCODER
# Runs scripts/training/train_whisper_encoder.py through the IPython `!` shell
# escape (the script path is relative to the current working directory). The
# {...} parts are Python expressions that IPython substitutes into the command
# line before handing it to the shell. No --learning-rate is passed here, so
# the script's own default applies. Outputs go to <output_base>/whisper_encoder.
!python scripts/training/train_whisper_encoder.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/whisper_encoder" \
    --epochs {CONFIG['epochs']} --batch-size 4

In [None]:
# QWEN2-AUDIO
# Runs scripts/training/train_qwen_audio.py through the IPython `!` shell
# escape (the script path is relative to the current working directory). The
# {...} parts are Python expressions that IPython substitutes into the command
# line before handing it to the shell. Uses batch size 2 and passes no
# --learning-rate. Outputs go to <output_base>/qwen_audio.
!python scripts/training/train_qwen_audio.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/qwen_audio" \
    --epochs {CONFIG['epochs']} --batch-size 2

## Utilities

In [None]:
# Cleanup disk
if ENV == 'kaggle':
    for f in ['/kaggle/working/checkpoints', '/root/.cache/huggingface']:
        if os.path.exists(f) and not os.path.islink(f):
            shutil.rmtree(f)
            print(f'üóëÔ∏è {f}')
    !df -h /kaggle/working

In [None]:
# Download checkpoints
# Kaggle only: zip each model's checkpoint folder found under DRIVE_BACKUP
# into /kaggle/working/<model>_ckpt.zip.
if ENV == 'kaggle':
    for name in ('wav2vec2', 'whisper_encoder', 'qwen_audio'):
        ckpt_dir = f'{DRIVE_BACKUP}/{name}'
        if not os.path.exists(ckpt_dir):
            continue
        shutil.make_archive(f'/kaggle/working/{name}_ckpt', 'zip', ckpt_dir)
        print(f'‚úì {name}')