# 🚀 Unified Trainer - Phoneme Recognition Benchmark

## Models
1. Wav2Vec2 Large
2. Whisper Encoder
3. SpeechTokenizer
4. Qwen2-Audio

## 1. Setup Ambiente

In [None]:
import os, sys, zipfile, glob, re, shutil

def detect_environment():
    """Return which runtime the notebook is executing in.

    Returns:
        'colab'  when the COLAB_GPU environment variable is set or the
                 google.colab module has already been loaded;
        'kaggle' when the working directory contains '/kaggle' or the
                 KAGGLE_KERNEL_RUN_TYPE environment variable is set;
        'local'  otherwise.
    """
    on_colab = 'COLAB_GPU' in os.environ or 'google.colab' in sys.modules
    if on_colab:
        return 'colab'

    # Only evaluated when the Colab probe failed, same as the elif it replaces.
    on_kaggle = '/kaggle' in os.getcwd() or 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
    if on_kaggle:
        return 'kaggle'

    return 'local'

# Detect the runtime once; the setup cells that follow branch on ENV
# ('colab', 'kaggle' or 'local').
ENV = detect_environment()
print(f'üñ•Ô∏è Ambiente: {ENV.upper()}')

In [None]:
# COLAB setup: mount Google Drive, then unpack the project archive stored on it.
if ENV == 'colab':
    from google.colab import drive
    drive.mount('/content/drive')

    # Checkpoint directory on the mounted Drive.
    DRIVE_BACKUP = '/content/drive/MyDrive/phoneme_checkpoints'
    os.makedirs(DRIVE_BACKUP, exist_ok=True)

    PROJECT_DIR = '/content/DeepLearning-Phoneme'
    ZIP_PATH = '/content/drive/MyDrive/DeepLearning-Phoneme.zip'

    # Fail fast when the project archive is not on Drive.
    if not os.path.exists(ZIP_PATH):
        raise FileNotFoundError(f'ZIP non trovato: {ZIP_PATH}')

    print(f'üì¶ Extracting from {ZIP_PATH}...')
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall('/content')
    print('‚úì Extracted')

    # Work from the project root and make it importable.
    # NOTE(review): assumes the archive unpacks to PROJECT_DIR — confirm the ZIP layout.
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)
    print('‚úì Colab ready')

In [None]:
# KAGGLE Setup - GitHub (code) + Dataset (data)
if ENV == 'kaggle':
    PROJECT_DIR = '/kaggle/working/pronuncIAtion'
    DRIVE_BACKUP = '/kaggle/working/checkpoints'
    
    # === 1. Clone code from GitHub ===
    if not os.path.exists(PROJECT_DIR):
        print('üì¶ Cloning code from GitHub...')
        !git clone https://github.com/maurocarlu/pronuncIAtion.git $PROJECT_DIR
    else:
        print('‚úì Repo exists, pulling updates...')
        !cd $PROJECT_DIR && git pull
    
    # === 2. Copy data from Kaggle Dataset ===
    # Il dataset 'pronuncIAtion-data' contiene la cartella data/ zippata
    # Kaggle path: /kaggle/input/pronunciation-data/data
    DATA_INPUT = '/kaggle/input/pronunciation-data/data'
    DATA_TARGET = f'{PROJECT_DIR}/data'
    
    if os.path.exists(DATA_INPUT):
        if not os.path.exists(DATA_TARGET):
            print(f'üì¶ Copying data from dataset...')
            shutil.copytree(DATA_INPUT, DATA_TARGET)
            print('‚úì Data copied')
        else:
            print('‚úì Data folder exists')
    else:
        # Prova path alternativo (se lo ZIP √® chiamato diversamente)
        alt_paths = [
            '/kaggle/input/pronuncIAtion-data/data',
            '/kaggle/input/pronunciation-data',
        ]
        found = False
        for alt in alt_paths:
            if os.path.exists(alt):
                if not os.path.exists(DATA_TARGET):
                    shutil.copytree(alt, DATA_TARGET)
                print(f'‚úì Data from {alt}')
                found = True
                break
        if not found:
            print('‚ùå Dataset non trovato!')
            print('Aggiungi il dataset "pronunciation-data" al notebook Kaggle')
            print('Il dataset deve contenere la cartella data/')
            !ls -la /kaggle/input/
    
    os.makedirs(DRIVE_BACKUP, exist_ok=True)
    os.chdir(PROJECT_DIR)
    sys.path.insert(0, PROJECT_DIR)
    
    # Install dependencies
    !pip install -q soundfile librosa
    
    print(f'\n‚úì Kaggle ready')
    print(f'üìÅ Project: {PROJECT_DIR}')
    print(f'üìÇ Data: {DATA_TARGET}')

In [None]:
# LOCAL setup: derive the project root from the current working directory.
if ENV == 'local':
    PROJECT_DIR = os.getcwd()
    # Step up to the repository root only when the notebook was launched from
    # the `notebooks/` folder itself. Comparing the last path component (instead
    # of a substring test on the whole path) keeps this correct when an ancestor
    # directory happens to contain "notebooks" (e.g. ~/notebooks/<repo>), and
    # makes re-running the cell safe: after the chdir below, the cwd is the
    # repo root and must not be moved up a second time.
    if os.path.basename(PROJECT_DIR) == 'notebooks':
        PROJECT_DIR = os.path.dirname(PROJECT_DIR)
    DRIVE_BACKUP = f'{PROJECT_DIR}/outputs'

# The statements below are not guarded by ENV: they run for every environment,
# using the PROJECT_DIR / DRIVE_BACKUP chosen by whichever setup cell applied.
os.makedirs(DRIVE_BACKUP, exist_ok=True)
os.chdir(PROJECT_DIR)
sys.path.insert(0, PROJECT_DIR)

print(f'üìÅ Project: {PROJECT_DIR}')
print(f'üíæ Checkpoints: {DRIVE_BACKUP}')

In [None]:
# Install the packages via pip (quiet mode), then report the installed
# PyTorch version and whether CUDA is available.
!pip install -q transformers datasets evaluate jiwer soundfile librosa
import torch
print(f'üî• PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')

---
## 2. Configuration

In [None]:
# Shared settings interpolated into the training commands further down.
CONFIG = dict(
    csv_path=f'{PROJECT_DIR}/data/processed/combined_augmented.csv',
    vocab_path=f'{PROJECT_DIR}/data/processed/vocab.json',
    audio_base=PROJECT_DIR,
    epochs=10,
    output_base=DRIVE_BACKUP,
)

# Check the files: only entries whose key contains 'path' are inspected.
for key, value in CONFIG.items():
    if 'path' not in key:
        continue
    found = os.path.exists(value)
    marker = '‚úì' if found else '‚úó'
    print(f"{marker} {key}: {value}")
    # A missing CSV gets an extra hint.
    if 'csv' in key and not found:
        print('  ‚ö†Ô∏è CSV non trovato! Verifica che il dataset sia stato copiato.')

---
## 3. Resume from Checkpoint (opzionale)

In [None]:
# Extract a checkpoint to resume from (run ONLY if you have a backup).
MODEL_TO_RESUME = 'wav2vec2'

if ENV == 'kaggle':
    backup_dir = '/kaggle/working/drive_backup'
    output_dir = f'{DRIVE_BACKUP}/{MODEL_TO_RESUME}'

    # Backup archives are looked up as <model>_checkpoint-<something>.zip
    pattern = f'{backup_dir}/{MODEL_TO_RESUME}_checkpoint-*.zip'
    checkpoint_zips = glob.glob(pattern)

    if not checkpoint_zips:
        print(f'Nessun checkpoint trovato per {MODEL_TO_RESUME}')
    else:
        def get_step(path):
            """Return the number following 'checkpoint-' in *path*, or 0 if none."""
            found = re.search(r'checkpoint-(\d+)', path)
            if found is None:
                return 0
            return int(found.group(1))

        # Highest step first; the newest archive is then the head of the list.
        checkpoint_zips = sorted(checkpoint_zips, key=get_step, reverse=True)
        latest_zip = checkpoint_zips[0]
        step_num = get_step(latest_zip)

        checkpoint_dir = f'{output_dir}/checkpoint-{step_num}'
        os.makedirs(output_dir, exist_ok=True)

        # Unpack the archive into <output_dir>/checkpoint-<step>.
        with zipfile.ZipFile(latest_zip, 'r') as archive:
            archive.extractall(checkpoint_dir)

        print(f'‚úì Checkpoint {step_num} estratto')

---
## 4. Training

In [None]:
# WAV2VEC2 (add --resume to resume from a checkpoint)
# Runs the training script as a shell command; paths and epoch count come
# from CONFIG, output goes to <output_base>/wav2vec2, batch size is 4.
!python scripts/training/train_wav2vec2.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/wav2vec2" \
    --epochs {CONFIG['epochs']} --batch-size 4

In [None]:
# WHISPER ENCODER
# Runs the training script as a shell command; paths and epoch count come
# from CONFIG, output goes to <output_base>/whisper_encoder, batch size is 4.
!python scripts/training/train_whisper_encoder.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/whisper_encoder" \
    --epochs {CONFIG['epochs']} --batch-size 4

In [None]:
# QWEN2-AUDIO
# Runs the training script as a shell command; paths and epoch count come
# from CONFIG, output goes to <output_base>/qwen_audio. Unlike the other
# two training cells, this one uses batch size 2.
!python scripts/training/train_qwen_audio.py \
    --data-csv "{CONFIG['csv_path']}" \
    --vocab-path "{CONFIG['vocab_path']}" \
    --audio-base "{CONFIG['audio_base']}" \
    --output-dir "{CONFIG['output_base']}/qwen_audio" \
    --epochs {CONFIG['epochs']} --batch-size 2

---
## 5. Utilities

In [None]:
# Pulizia disco
if ENV == 'kaggle':
    for f in ['/kaggle/working/checkpoints', '/root/.cache/huggingface']:
        if os.path.exists(f):
            shutil.rmtree(f)
            print(f'üóëÔ∏è {f}')
    !df -h /kaggle/working

In [None]:
# Download checkpoints (Kaggle only): zip each model's output directory into
# /kaggle/working as <model>_ckpt.zip.
if ENV == 'kaggle':
    for model in ('wav2vec2', 'whisper_encoder', 'qwen_audio'):
        model_dir = f'{DRIVE_BACKUP}/{model}'
        # Models that have no output directory are skipped silently.
        if not os.path.exists(model_dir):
            continue
        shutil.make_archive(f'/kaggle/working/{model}_ckpt', 'zip', model_dir)
        print(f'‚úì {model}')
    print('üì• Download da Output panel')