# Fine-Tuning Parakeet RNNT 1.1B Multilingual

This notebook provides a complete pipeline for fine-tuning NVIDIA's `parakeet-rnnt-1.1b-multilingual` model using the NeMo framework.

**Reference:** [NeMo Multilang ASR Tutorial](https://github.com/NVIDIA-NeMo/NeMo/blob/main/tutorials/asr/Multilang_ASR.ipynb)

## Features
- Load pre-trained Parakeet RNNT 1.1B Multilingual model
- Prepare datasets with NeMo manifest format
- Configure model for fine-tuning with best practices
- Train with PyTorch Lightning
- Evaluate and run inference

## Requirements
- NVIDIA GPU with 16GB+ VRAM (recommended)
- Python 3.8+
- CUDA 11.8+


## 1. Environment Setup


In [None]:
# Work out where the notebook is running. The check only looks at whether
# the `google.colab` package has already been loaded into this interpreter.
import sys

IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    print("üíª Running locally")
else:
    print("üåê Running on Google Colab")
    # Mount Google Drive so manifests and outputs can live under /content/drive
    from google.colab import drive
    drive.mount('/content/drive')


In [None]:
# Install dependencies
# IMPORTANT: After running this cell, RESTART THE RUNTIME before continuing!

INSTALL_PACKAGES = True  # Set to False if already installed

if INSTALL_PACKAGES:
    print("üì¶ Installing NeMo and dependencies...")
    print("This may take 5-10 minutes...\n")
    
    # Core dependencies
    !pip install -q Cython packaging
    
    # Install NeMo toolkit with ASR support
    # Option 1: From PyPI (stable)
    !pip install -q 'nemo_toolkit[asr]'
    
    # Option 2: From source (latest features - uncomment if needed)
    # !pip install -q git+https://github.com/NVIDIA/NeMo.git#egg=nemo_toolkit[asr]
    
    # Additional dependencies
    !pip install -q soundfile librosa datasets jiwer
    
    print("\n" + "="*60)
    print("‚ö†Ô∏è  IMPORTANT: Please restart the runtime now!")
    print("   Go to: Runtime -> Restart runtime")
    print("   Then continue from the next cell.")
    print("="*60)


In [None]:
# Core imports
import os
import sys
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Union, Any
from dataclasses import dataclass, field

import numpy as np
import torch
import pytorch_lightning as pl

# NeMo imports
import nemo
import nemo.collections.asr as nemo_asr
from nemo.utils.exp_manager import exp_manager
from omegaconf import OmegaConf, open_dict

# Logging: timestamped INFO-level messages, shared by the helper functions below
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Report the versions of the main libraries
print(f"‚úì PyTorch: {torch.__version__}")
print(f"‚úì NeMo: {nemo.__version__}")
print(f"‚úì PyTorch Lightning: {pl.__version__}")

# Report the first CUDA device, or warn when none is visible
if not torch.cuda.is_available():
    print("\n‚ö†Ô∏è  No GPU detected! Training will be very slow.")
else:
    print(f"\nüéÆ GPU: {torch.cuda.get_device_name(0)}")
    _gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   Memory: {_gpu_memory_gb:.2f} GB")
    print(f"   CUDA: {torch.version.cuda}")

# Re-evaluated here because the kernel is restarted after the install cell
IN_COLAB = 'google.colab' in sys.modules


## 2. Configuration

Modify the configuration below based on your dataset and hardware.


In [None]:
@dataclass
class DataConfig:
    """Dataset, batching and augmentation settings for fine-tuning.

    This class only holds values. Later cells of the notebook copy them into
    the NeMo model config (``cfg.train_ds``, ``cfg.validation_ds``,
    ``cfg.test_ds`` and ``cfg.spec_augment``).
    """
    # Manifest paths (NeMo JSON Lines format)
    train_manifest: str = "./data/train_manifest.json"
    val_manifest: str = "./data/val_manifest.json"
    # Optional: the validation and evaluation cells skip it when it is falsy
    test_manifest: Optional[str] = "./data/test_manifest.json"
    
    # Audio settings
    sample_rate: int = 16000
    # Duration bounds; the notebook applies them to the training set only
    max_duration: float = 20.0  # seconds
    min_duration: float = 0.1   # seconds
    
    # Batch settings
    train_batch_size: int = 16
    # Also reused as the batch size of the test loader
    val_batch_size: int = 16
    # Shared by the train, validation and test loaders
    num_workers: int = 4
    
    # Spec Augmentation
    # The four values below are written to cfg.spec_augment only when
    # enable_spec_augment is True and the model config has that section.
    enable_spec_augment: bool = True
    freq_masks: int = 2
    time_masks: int = 10
    freq_width: int = 27
    time_width: float = 0.05


@dataclass
class TrainingConfig:
    """Model, optimisation and trainer settings for fine-tuning.

    This class only holds values. Later cells read them when configuring the
    model's optimizer/scheduler, the PyTorch Lightning ``Trainer`` and the
    NeMo experiment manager.
    """
    # Model
    # The "nvidia/" prefix is stripped before the name is passed to from_pretrained()
    pretrained_model: str = "nvidia/parakeet-rnnt-1.1b-multilingual"
    
    # Output
    # Used as the Trainer's default_root_dir, the exp_manager exp_dir and the
    # folder that receives final_model.nemo
    output_dir: str = "./outputs/parakeet-rnnt-finetuned"
    exp_name: str = "parakeet_rnnt_finetune"
    
    # Training hyperparameters
    max_epochs: int = 50
    learning_rate: float = 1e-4
    min_lr: float = 1e-6
    warmup_steps: int = 1000
    weight_decay: float = 1e-3
    
    # Optimizer & Scheduler (written to cfg.optim.name / cfg.optim.sched.name)
    optimizer: str = "adamw"
    scheduler: str = "CosineAnnealing"
    
    # Precision & Gradient
    precision: str = "16-mixed"
    grad_clip: float = 1.0  # passed to the Trainer as gradient_clip_val
    accumulate_grad_batches: int = 1
    
    # Checkpointing
    save_top_k: int = 3
    # NOTE(review): not read by any cell visible in this notebook
    checkpoint_every_n_epochs: int = 1
    
    # Early stopping
    # Early-stopping parameters are only added to the exp_manager config when > 0
    early_stop_patience: int = 10
    
    # Encoder freezing
    # When True the notebook calls model.encoder.freeze()
    freeze_encoder: bool = False
    
    # Device
    devices: int = 1
    accelerator: str = "gpu"
    
    # Logging
    log_every_n_steps: int = 50
    
    # Resume
    # Passed to trainer.fit() as ckpt_path; None starts a fresh run
    resume_from_checkpoint: Optional[str] = None


In [None]:
# ============================================================================
# USER SETTINGS - edit the two config objects below for your data and hardware
# ============================================================================

TRAINING_CONFIG = TrainingConfig(
    # Pre-trained checkpoint to start from
    pretrained_model="nvidia/parakeet-rnnt-1.1b-multilingual",

    # Where checkpoints, logs and the final model are written
    output_dir="./outputs/parakeet-rnnt-finetuned",
    exp_name="parakeet_finetune",

    # Optimisation schedule
    max_epochs=50,
    learning_rate=1e-4,    # Lower for small datasets (5e-5)
    warmup_steps=1000,
    weight_decay=1e-3,

    # Throughput / memory trade-offs
    precision="16-mixed",  # Use bf16-mixed if your GPU supports it
    accumulate_grad_batches=2,  # Increase for effective larger batch size

    # Encoder freezing (recommended for small datasets)
    freeze_encoder=False,  # Set True if dataset < 10 hours

    # Early stopping
    early_stop_patience=10,

    # Checkpointing
    save_top_k=3,
)

DATA_CONFIG = DataConfig(
    # Manifest locations (the test manifest is optional)
    train_manifest="./data/train_manifest.json",
    val_manifest="./data/val_manifest.json",
    test_manifest="./data/test_manifest.json",

    # Audio filtering
    sample_rate=16000,
    max_duration=20.0,  # Filter out audio longer than this
    min_duration=0.5,   # Filter out audio shorter than this

    # Batching (adjust based on GPU memory)
    train_batch_size=8,   # Reduce if OOM
    val_batch_size=8,
    num_workers=4,

    # Data augmentation
    enable_spec_augment=True,
)

# On Colab, point everything at the mounted Google Drive instead
if IN_COLAB:
    _colab_manifests = {
        "train_manifest": "/content/drive/MyDrive/data/train_manifest.json",
        "val_manifest": "/content/drive/MyDrive/data/val_manifest.json",
        "test_manifest": "/content/drive/MyDrive/data/test_manifest.json",
    }
    for _field_name, _drive_path in _colab_manifests.items():
        setattr(DATA_CONFIG, _field_name, _drive_path)
    TRAINING_CONFIG.output_dir = "/content/drive/MyDrive/outputs/parakeet-finetuned"

# Echo the effective settings
print("Configuration loaded!")
_summary_rows = (
    ("Model", TRAINING_CONFIG.pretrained_model),
    ("Train manifest", DATA_CONFIG.train_manifest),
    ("Val manifest", DATA_CONFIG.val_manifest),
    ("Output dir", TRAINING_CONFIG.output_dir),
)
for _row_label, _row_value in _summary_rows:
    print(f"  {_row_label}: {_row_value}")


## 3. Dataset Preparation

NeMo uses JSON Lines manifest files where each line is a JSON object:

```json
{"audio_filepath": "/path/to/audio.wav", "text": "transcription", "duration": 2.5, "lang": "en"}
```

**Required fields:**
- `audio_filepath`: Path to audio file
- `text`: Transcription text
- `duration`: Audio duration in seconds

**Optional fields:**
- `lang`: Language code (for multilingual models)


In [None]:
import soundfile as sf

def create_manifest_from_folder(
    audio_dir: str,
    output_manifest: str,
    transcriptions: Optional[Dict[str, str]] = None,
    transcription_file: Optional[str] = None,
    language: str = "en",
    audio_extensions: Optional[List[str]] = None
) -> str:
    """
    Create NeMo manifest from a folder of audio files.

    The folder is searched recursively. A file is written to the manifest
    only when a non-blank transcription is found for it, looked up first by
    file name and then by the full path string.

    Args:
        audio_dir: Directory containing audio files
        output_manifest: Path for output manifest file (parent folders are
            created if needed)
        transcriptions: Dict mapping filename (or path) -> transcription
        transcription_file: JSON file with {filename: transcription} mapping.
            When the file exists it takes precedence over ``transcriptions``.
        language: Language code for all samples
        audio_extensions: Audio file extensions to include. Defaults to
            ``[".wav", ".flac", ".mp3", ".ogg"]``.

    Returns:
        Path to created manifest
    """
    # Avoid a mutable default argument: build the default list per call.
    if audio_extensions is None:
        audio_extensions = [".wav", ".flac", ".mp3", ".ogg"]

    audio_dir = Path(audio_dir)
    output_manifest = Path(output_manifest)
    output_manifest.parent.mkdir(parents=True, exist_ok=True)

    # Load transcriptions
    if transcription_file:
        if Path(transcription_file).exists():
            with open(transcription_file, 'r', encoding='utf-8') as f:
                transcriptions = json.load(f)
        else:
            # Previously ignored silently, which made an empty manifest hard to diagnose.
            logger.warning(f"Transcription file not found: {transcription_file}")
    if transcriptions is None:
        transcriptions = {}

    entries = []
    skipped = 0

    for ext in audio_extensions:
        # Sorted so the manifest order does not depend on the filesystem.
        for audio_path in sorted(audio_dir.rglob(f"*{ext}")):
            # Look up the transcript first: files without one are skipped
            # without paying for (or failing on) the audio header read.
            text = (
                transcriptions.get(audio_path.name)
                or transcriptions.get(str(audio_path))
                or ""
            )
            text = text.strip() if isinstance(text, str) else ""
            if not text:
                skipped += 1
                continue

            try:
                duration = sf.info(str(audio_path)).duration
            except Exception as e:
                # Best effort: one unreadable file must not abort the scan.
                logger.warning(f"Error processing {audio_path}: {e}")
                skipped += 1
                continue

            entries.append({
                "audio_filepath": str(audio_path.absolute()),
                "text": text,
                "duration": round(duration, 3),
                "lang": language
            })

    # Write manifest (one JSON object per line)
    with open(output_manifest, 'w', encoding='utf-8') as f:
        for entry in entries:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print(f"‚úì Created manifest: {output_manifest}")
    print(f"  Samples: {len(entries)}")
    print(f"  Skipped: {skipped}")

    return str(output_manifest)


def create_manifest_from_huggingface(
    dataset_name: str,
    output_manifest: str,
    audio_output_dir: str,
    split: str = "train",
    config_name: Optional[str] = None,
    audio_column: str = "audio",
    text_column: str = "sentence",
    language: str = "en",
    max_samples: Optional[int] = None
) -> str:
    """
    Create NeMo manifest from a HuggingFace dataset.

    Each sample's audio is either written to ``audio_output_dir`` as a WAV
    file (when the audio column is a dict holding ``array`` and, optionally,
    ``sampling_rate``) or referenced in place (when the column holds a path).
    Samples without a usable transcription are skipped, matching
    ``create_manifest_from_folder``.

    Args:
        dataset_name: Dataset identifier passed to ``datasets.load_dataset``
        output_manifest: Path for output manifest file
        audio_output_dir: Directory that receives the exported WAV files
        split: Dataset split to load
        config_name: Optional dataset configuration name
        audio_column: Name of the column holding the audio
        text_column: Name of the column holding the transcription
        language: Language code written to every manifest entry
        max_samples: Optional cap on the number of samples processed

    Returns:
        Path to created manifest
    """
    from datasets import load_dataset

    output_manifest = Path(output_manifest)
    audio_output_dir = Path(audio_output_dir)
    output_manifest.parent.mkdir(parents=True, exist_ok=True)
    audio_output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading dataset: {dataset_name}...")

    # NOTE(review): trust_remote_code=True lets the dataset repository run its
    # own loading script on this machine; only use it with datasets you trust.
    load_args = [dataset_name]
    if config_name:
        load_args.append(config_name)
    dataset = load_dataset(*load_args, split=split, trust_remote_code=True)

    if max_samples:
        dataset = dataset.select(range(min(len(dataset), max_samples)))

    total = len(dataset)
    print(f"Processing {total} samples...")

    entries = []
    skipped = 0

    for idx, sample in enumerate(dataset):
        try:
            # A missing (None) or blank transcript would produce an unusable
            # manifest entry, so such samples are skipped and counted.
            text = sample[text_column]
            text = text.strip() if isinstance(text, str) else ""
            if not text:
                skipped += 1
            else:
                audio = sample[audio_column]

                if isinstance(audio, dict):
                    array = audio['array']
                    sr = audio.get('sampling_rate', 16000)

                    # Save audio file
                    audio_path = audio_output_dir / f"audio_{idx:06d}.wav"
                    sf.write(str(audio_path), array, sr)

                    duration = len(array) / sr
                else:
                    audio_path = Path(audio)
                    duration = sf.info(str(audio_path)).duration

                entries.append({
                    "audio_filepath": str(audio_path.absolute()),
                    "text": text,
                    "duration": round(duration, 3),
                    "lang": language
                })
        except Exception as e:
            # Best effort: a single bad sample must not abort the export.
            logger.warning(f"Error processing sample {idx}: {e}")
            skipped += 1

        # Progress is reported for every sample, including skipped ones.
        if (idx + 1) % 1000 == 0:
            print(f"  Processed {idx + 1}/{total} samples")

    # Write manifest (one JSON object per line)
    with open(output_manifest, 'w', encoding='utf-8') as f:
        for entry in entries:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print(f"\n‚úì Created manifest: {output_manifest}")
    print(f"  Samples: {len(entries)}")
    print(f"  Skipped: {skipped}")

    return str(output_manifest)


In [None]:
def validate_manifest(manifest_path: str) -> Dict[str, Any]:
    """Validate manifest and return statistics.

    Args:
        manifest_path: Path to a JSON Lines manifest.

    Returns:
        A dict with the keys ``total_samples``, ``total_duration_hours``,
        ``missing_files`` (entries whose ``audio_filepath`` does not exist),
        ``invalid_lines`` (non-blank lines that are not a JSON object),
        ``languages`` (entry count per ``lang`` value, ``"unknown"`` when
        absent) and ``duration_range`` (``{"min": ..., "max": ...}``; both
        are 0 when the manifest has no valid entry).

    Raises:
        FileNotFoundError: If ``manifest_path`` does not exist.
    """
    manifest_path = Path(manifest_path)

    if not manifest_path.exists():
        raise FileNotFoundError(f"Manifest not found: {manifest_path}")

    stats = {
        "total_samples": 0,
        "total_duration_hours": 0,
        "missing_files": 0,
        "invalid_lines": 0,
        "languages": {},
        "duration_range": {"min": float('inf'), "max": 0}
    }
    duration_range = stats["duration_range"]
    languages = stats["languages"]

    with open(manifest_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not entries.
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                # Count corrupt lines instead of hiding them.
                stats["invalid_lines"] += 1
                continue

            if not isinstance(entry, dict):
                # Valid JSON but not an object: cannot be a manifest entry.
                stats["invalid_lines"] += 1
                continue

            stats["total_samples"] += 1

            duration = entry.get("duration", 0)
            stats["total_duration_hours"] += duration / 3600
            duration_range["min"] = min(duration_range["min"], duration)
            duration_range["max"] = max(duration_range["max"], duration)

            lang = entry.get("lang", "unknown")
            languages[lang] = languages.get(lang, 0) + 1

            audio_path = entry.get("audio_filepath")
            if audio_path and not Path(audio_path).exists():
                stats["missing_files"] += 1

    # No valid entry seen: report 0 rather than infinity.
    if duration_range["min"] == float('inf'):
        duration_range["min"] = 0

    return stats


def print_manifest_stats(manifest_path: str):
    """Print a short, human-readable summary of a manifest.

    The numbers come from ``validate_manifest``. If that call reports the
    manifest as missing, a single "not found" line is printed instead.
    """
    try:
        summary = validate_manifest(manifest_path)
    except FileNotFoundError:
        print(f"\n‚ùå Manifest not found: {manifest_path}")
        return

    shortest = summary['duration_range']['min']
    longest = summary['duration_range']['max']
    missing = summary['missing_files']

    print(f"\nüìä Manifest: {Path(manifest_path).name}")
    print(f"   Samples: {summary['total_samples']:,}")
    print(f"   Duration: {summary['total_duration_hours']:.2f} hours")
    print(f"   Range: {shortest:.1f}s - {longest:.1f}s")
    print(f"   Languages: {summary['languages']}")
    # Only mention missing audio files when there are any
    if missing > 0:
        print(f"   ‚ö†Ô∏è  Missing files: {missing}")


In [None]:
# Optional: export train/validation manifests from a HuggingFace dataset.
# Switch the flag below to True and adjust the arguments as needed.

CREATE_FROM_HUGGINGFACE = False  # Set to True to create manifests

if CREATE_FROM_HUGGINGFACE:
    # Arguments shared by both exports (Common Voice English subset)
    _common_voice_args = dict(
        dataset_name="mozilla-foundation/common_voice_11_0",
        config_name="en",  # Language code
        audio_column="audio",
        text_column="sentence",
        language="en",
    )

    # (split, manifest path, audio folder, sample cap) for each export
    _exports = (
        ("train", "./data/train_manifest.json", "./data/audio/train", 5000),  # Limit for testing
        ("validation", "./data/val_manifest.json", "./data/audio/val", 500),
    )

    for _split, _manifest_out, _audio_out, _cap in _exports:
        create_manifest_from_huggingface(
            output_manifest=_manifest_out,
            audio_output_dir=_audio_out,
            split=_split,
            max_samples=_cap,
            **_common_voice_args,
        )


In [None]:
# Summarise the train and validation manifests, plus the test manifest when
# one is configured.
_manifests_to_check = [DATA_CONFIG.train_manifest, DATA_CONFIG.val_manifest]
if DATA_CONFIG.test_manifest:
    _manifests_to_check.append(DATA_CONFIG.test_manifest)

for _manifest in _manifests_to_check:
    print_manifest_stats(_manifest)


## 4. Load Pre-trained Model


In [None]:
# Load pre-trained Parakeet RNNT model
print(f"Loading model: {TRAINING_CONFIG.pretrained_model}")
print("This may take a few minutes for initial download...\n")

# Remove the "nvidia/" prefix so only the bare model name is passed to
# from_pretrained().
model_name = TRAINING_CONFIG.pretrained_model.replace('nvidia/', '')

try:
    # First attempt: load explicitly as an RNNT BPE model.
    model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name=model_name)
    print(f"\n‚úì Model loaded successfully!")
    # NOTE: these attribute reads sit inside the try block, so a model that
    # lacks encoder/decoder/joint also lands in the fallback below.
    print(f"  Encoder: {model.encoder.__class__.__name__}")
    print(f"  Decoder: {model.decoder.__class__.__name__}")
    print(f"  Joint: {model.joint.__class__.__name__}")
except Exception as e:
    # Fallback: retry through the generic ASRModel entry point with the same
    # name. An error raised here is not caught and stops the cell.
    print(f"\n‚ùå Error loading model: {e}")
    print("\nTrying alternative loading method...")
    model = nemo_asr.models.ASRModel.from_pretrained(model_name=model_name)


## 5. Configure Model for Fine-Tuning


In [None]:
# Update model configuration
# `cfg` is the model's own config object, so the edits below are made in
# place; open_dict() is used so that keys can be assigned freely.
cfg = model.cfg

with open_dict(cfg):
    # ==================
    # Training Data
    # ==================
    cfg.train_ds.manifest_filepath = DATA_CONFIG.train_manifest
    cfg.train_ds.batch_size = DATA_CONFIG.train_batch_size
    cfg.train_ds.num_workers = DATA_CONFIG.num_workers
    cfg.train_ds.sample_rate = DATA_CONFIG.sample_rate
    cfg.train_ds.max_duration = DATA_CONFIG.max_duration
    cfg.train_ds.min_duration = DATA_CONFIG.min_duration
    cfg.train_ds.shuffle = True
    cfg.train_ds.pin_memory = True
    
    # ==================
    # Validation Data
    # ==================
    cfg.validation_ds.manifest_filepath = DATA_CONFIG.val_manifest
    cfg.validation_ds.batch_size = DATA_CONFIG.val_batch_size
    cfg.validation_ds.num_workers = DATA_CONFIG.num_workers
    cfg.validation_ds.sample_rate = DATA_CONFIG.sample_rate
    cfg.validation_ds.shuffle = False
    cfg.validation_ds.pin_memory = True
    
    # ==================
    # Optimizer
    # ==================
    cfg.optim.name = TRAINING_CONFIG.optimizer
    cfg.optim.lr = TRAINING_CONFIG.learning_rate
    cfg.optim.weight_decay = TRAINING_CONFIG.weight_decay
    cfg.optim.betas = [0.9, 0.98]
    
    # ==================
    # Scheduler
    # ==================
    cfg.optim.sched.name = TRAINING_CONFIG.scheduler
    cfg.optim.sched.warmup_steps = TRAINING_CONFIG.warmup_steps
    cfg.optim.sched.min_lr = TRAINING_CONFIG.min_lr
    
    # ==================
    # Spec Augmentation
    # ==================
    _has_spec_augment = cfg.get('spec_augment') is not None
    if _has_spec_augment:
        if DATA_CONFIG.enable_spec_augment:
            cfg.spec_augment.freq_masks = DATA_CONFIG.freq_masks
            cfg.spec_augment.time_masks = DATA_CONFIG.time_masks
            cfg.spec_augment.freq_width = DATA_CONFIG.freq_width
            cfg.spec_augment.time_width = DATA_CONFIG.time_width
        else:
            # Turning the option off must actually disable masking rather
            # than leave the pre-trained settings in force.
            cfg.spec_augment.freq_masks = 0
            cfg.spec_augment.time_masks = 0

# The augmentation module was built when the model was loaded, so editing the
# config alone has no effect on training: rebuild it from the updated section.
if _has_spec_augment:
    model.spec_augmentation = model.from_config_dict(cfg.spec_augment)

print("‚úì Model configuration updated")

# Freeze encoder if specified
if TRAINING_CONFIG.freeze_encoder:
    print("üßä Freezing encoder layers")
    model.encoder.freeze()


In [None]:
# Setup data loaders
# Build the training and validation loaders from the dataset sections of
# `cfg` that were filled in by the configuration cell above.
print("Setting up data loaders...")
model.setup_training_data(cfg.train_ds)
model.setup_validation_data(cfg.validation_ds)
print("‚úì Data loaders ready")


## 6. Setup Trainer


In [None]:
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    LearningRateMonitor
)

# Create output directory
Path(TRAINING_CONFIG.output_dir).mkdir(parents=True, exist_ok=True)

# Callbacks owned by this notebook. Checkpointing and early stopping are
# created by the NeMo experiment manager further down.
callbacks = [
    LearningRateMonitor(logging_interval='step'),
]

# Trainer
# logger=False and enable_checkpointing=False are required here: the
# experiment manager creates its own logger and checkpoint callback and
# rejects a Trainer that already carries Lightning's defaults.
trainer = pl.Trainer(
    devices=TRAINING_CONFIG.devices,
    accelerator=TRAINING_CONFIG.accelerator,
    max_epochs=TRAINING_CONFIG.max_epochs,
    precision=TRAINING_CONFIG.precision,
    accumulate_grad_batches=TRAINING_CONFIG.accumulate_grad_batches,
    gradient_clip_val=TRAINING_CONFIG.grad_clip,
    log_every_n_steps=TRAINING_CONFIG.log_every_n_steps,
    logger=False,
    enable_checkpointing=False,
    callbacks=callbacks,
    default_root_dir=TRAINING_CONFIG.output_dir,
)

# NeMo Experiment Manager
exp_manager_config = {
    'exp_dir': TRAINING_CONFIG.output_dir,
    'name': TRAINING_CONFIG.exp_name,
    'create_checkpoint_callback': True,
    'checkpoint_callback_params': {
        'monitor': 'val_wer',
        'mode': 'min',
        'save_top_k': TRAINING_CONFIG.save_top_k,
        'save_last': True,
    },
    'create_tensorboard_logger': True,
    'create_wandb_logger': False,
}

# Add early stopping
# The parameters alone are not enough: the callback is only created when
# create_early_stopping_callback is switched on as well.
if TRAINING_CONFIG.early_stop_patience > 0:
    exp_manager_config['create_early_stopping_callback'] = True
    exp_manager_config['early_stopping_callback_params'] = {
        'monitor': 'val_wer',
        'patience': TRAINING_CONFIG.early_stop_patience,
        'min_delta': 0.001,
        'mode': 'min',
    }

exp_manager(trainer, exp_manager_config)

print("‚úì Trainer configured")
print(f"  Output: {TRAINING_CONFIG.output_dir}")
print(f"  Epochs: {TRAINING_CONFIG.max_epochs}")
print(f"  Precision: {TRAINING_CONFIG.precision}")


## 7. Train!


In [None]:
# Run the fine-tuning loop, printing the key settings first
_rule = "="*60
_effective_batch = DATA_CONFIG.train_batch_size * TRAINING_CONFIG.accumulate_grad_batches

print(_rule)
print("üöÄ STARTING FINE-TUNING")
print(_rule)
print(f"\nModel: {TRAINING_CONFIG.pretrained_model}")
print(f"Epochs: {TRAINING_CONFIG.max_epochs}")
print(f"Learning rate: {TRAINING_CONFIG.learning_rate}")
print(f"Batch size: {DATA_CONFIG.train_batch_size}")
print(f"Gradient accumulation: {TRAINING_CONFIG.accumulate_grad_batches}")
print(f"Effective batch size: {_effective_batch}")
print("\nTraining...\n")

# ckpt_path is None for a fresh run, or a checkpoint path to resume from
trainer.fit(model, ckpt_path=TRAINING_CONFIG.resume_from_checkpoint)

print("\n" + _rule)
print("‚úì Training complete!")
print(_rule)


In [None]:
# Write the fine-tuned model as a single .nemo file in the output directory
final_model_path = Path(TRAINING_CONFIG.output_dir, "final_model.nemo")
model.save_to(str(final_model_path))
print(f"\nüíæ Model saved to: {final_model_path}")


## 8. Evaluation


In [None]:
# Run the test loop, but only when a test manifest is configured and present
_test_manifest_available = bool(
    DATA_CONFIG.test_manifest and Path(DATA_CONFIG.test_manifest).exists()
)

if not _test_manifest_available:
    print("No test manifest found, skipping evaluation.")
else:
    print("Evaluating on test set...")

    # Point the model's test dataset section at our manifest; the batch size
    # and worker count are the ones used for validation.
    with open_dict(model.cfg):
        _test_ds = model.cfg.test_ds
        _test_ds.manifest_filepath = DATA_CONFIG.test_manifest
        _test_ds.batch_size = DATA_CONFIG.val_batch_size
        _test_ds.num_workers = DATA_CONFIG.num_workers

    model.setup_test_data(model.cfg.test_ds)
    test_results = trainer.test(model)

    print(f"\nüìä Test Results: {test_results}")


## 9. Inference


In [None]:
def load_finetuned_model(model_path: str):
    """Restore a saved model from ``model_path`` and prepare it for inference.

    The model is switched to eval mode and moved to the GPU when CUDA is
    available; otherwise it is returned as restored.
    """
    restored = nemo_asr.models.EncDecRNNTBPEModel.restore_from(model_path)
    restored.eval()
    if not torch.cuda.is_available():
        return restored
    return restored.cuda()


def transcribe(model_to_use, audio_paths: Union[str, List[str]], batch_size: int = 4):
    """Transcribe audio files.

    Args:
        model_to_use: Object exposing a ``transcribe(<paths>, batch_size=...)``
            method, e.g. a model returned by ``load_finetuned_model``.
        audio_paths: One audio file path, or a list of paths.
        batch_size: Number of files passed through the model at a time.

    Returns:
        Whatever ``model_to_use.transcribe`` returns for the given paths.
    """
    if isinstance(audio_paths, str):
        audio_paths = [audio_paths]
    # The paths are passed positionally on purpose: the name of the first
    # parameter of transcribe() differs between NeMo releases
    # (`paths2audio_files` in older ones, `audio` in newer ones), so the
    # keyword form fails on one side or the other.
    return model_to_use.transcribe(audio_paths, batch_size=batch_size)


In [None]:
# Example: Load and use fine-tuned model
# Uncomment to test

# model_path = "./outputs/parakeet-rnnt-finetuned/final_model.nemo"
# finetuned_model = load_finetuned_model(model_path)

# # Transcribe audio files
# audio_files = [
#     "path/to/audio1.wav",
#     "path/to/audio2.wav",
# ]
# transcriptions = transcribe(finetuned_model, audio_files)

# for audio, text in zip(audio_files, transcriptions):
#     print(f"{Path(audio).name}: {text}")


## 10. Tips & Best Practices

### Dataset Size Recommendations

| Dataset Size | Learning Rate | Freeze Encoder | Epochs |
|--------------|--------------|----------------|--------|
| < 10 hours | 5e-5 | Yes | 50-100 |
| 10-100 hours | 1e-4 | First 5 epochs | 30-50 |
| > 100 hours | 3e-4 | No | 20-30 |

### Memory Optimization

If you run out of GPU memory:
1. Reduce `train_batch_size` (and `val_batch_size`) in `DataConfig`
2. Increase `accumulate_grad_batches`
3. Reduce `max_duration`
4. Use `precision="16-mixed"`

### Multilingual Fine-Tuning

For multilingual datasets:
1. Include `lang` field in manifest entries
2. Balance samples across languages
3. Consider language-specific augmentation

### Manifest Format

```json
{"audio_filepath": "/path/audio.wav", "text": "hello world", "duration": 2.5, "lang": "en"}
{"audio_filepath": "/path/audio2.wav", "text": "bonjour monde", "duration": 3.1, "lang": "fr"}
```


In [None]:
# Closing summary: where the results are and what to do next
_rule = "="*60
print("\n" + _rule)
print("üéâ Fine-tuning pipeline complete!")
print(_rule)

print(f"\nOutput directory: {TRAINING_CONFIG.output_dir}")
print(f"Model file: {TRAINING_CONFIG.output_dir}/final_model.nemo")

print("\nNext steps:")
for _step in (
    "  1. Review TensorBoard logs for training curves",
    "  2. Test model on your evaluation data",
    "  3. Export model for deployment",
):
    print(_step)
