# Speech Recognition Model Training
## Based on ChaosLingua System Architecture - Panelist 1
### Implements Fine-tuned Whisper for Romanian Speech-to-Text

In [None]:
# Install PyTorch with CUDA support (Kaggle GPU)
%pip install torch torchvision torchaudio
%pip install transformers datasets evaluate accelerate
%pip install librosa soundfile --quiet
%pip install protobuf sentencepiece tiktoken --quiet

In [None]:
# Setup HuggingFace API access
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook: anything committed here is
# readable by everyone who can see the file (any token that was previously
# pasted into this cell should be revoked on the HuggingFace settings page).
# The token is taken from the HF_TOKEN environment variable when it is set,
# otherwise it is requested interactively without being echoed.
hf_api_key = os.environ.get("HF_TOKEN") or getpass("HuggingFace access token: ")
login(token=hf_api_key)

In [None]:
import pandas as pd
import requests
import librosa
import soundfile as sf
import numpy as np
from datasets import Dataset, Audio

def load_asr_dataset_simple(dataset_name, split='train', config='default'):
    """Load one split of a HuggingFace Hub dataset into a single DataFrame.

    The Hub API is asked for the parquet files of
    ``dataset_name`` / ``config`` / ``split``; every returned URL is read with
    ``pd.read_parquet`` and the chunks are concatenated. Loading is
    best-effort: problems are printed and reported through a ``None`` return
    value instead of an exception, so a caller looping over several datasets
    keeps going.

    Args:
        dataset_name: Hub dataset id, e.g. ``"org/name"``.
        split: Split name to fetch (``'train'`` by default).
        config: Dataset configuration (subset) name. Defaults to
            ``'default'``, which is what this function always used before the
            parameter existed.

    Returns:
        A ``pandas.DataFrame`` with the rows of every parquet file that could
        be read, or ``None`` when the API call fails, the response is not a
        list, or no file could be read.
    """
    api_url = f"https://huggingface.co/api/datasets/{dataset_name}/parquet/{config}/{split}"
    print(f"üîó Loading: {dataset_name} ({split})")

    try:
        # Get parquet URLs from API
        response = requests.get(api_url, timeout=30)
        if response.status_code != 200:
            print(f"‚ùå API failed: {response.status_code}")
            return None

        parquet_urls = response.json()
        # Anything other than a list (e.g. an error object) would otherwise be
        # iterated as if its items were file URLs.
        if not isinstance(parquet_urls, list):
            print(f"‚ùå Unexpected API response: {type(parquet_urls).__name__}")
            return None
        print(f"üìÅ Found {len(parquet_urls)} parquet file(s)")

        # Load each parquet file; a file that fails is reported and skipped
        dfs = []
        for i, parquet_url in enumerate(parquet_urls, start=1):
            print(f"  Loading file {i}: {parquet_url}")
            try:
                df_chunk = pd.read_parquet(parquet_url)
            except Exception as e:
                print(f"    ‚ùå Failed: {e}")
                continue
            dfs.append(df_chunk)
            print(f"    ‚úÖ {len(df_chunk)} rows")

        if not dfs:
            print("‚ùå No files loaded successfully")
            return None

        # Combine all chunks
        final_df = pd.concat(dfs, ignore_index=True)
        print(f"üéâ SUCCESS: {len(final_df)} rows, {len(final_df.columns)} columns")
        return final_df

    except Exception as e:
        # Deliberate catch-all: network/JSON errors must not abort the caller's loop
        print(f"‚ùå Error: {e}")
        return None

# ASR corpora to fetch, as (Hub dataset id, list of splits) pairs
asr_datasets_config = [
    ('espnet/yodas2', ['train', 'validation']),  # Primary ASR dataset
    ('qmeeus/vp-er-10l', ['train', 'test'])       # Voice processing features
]

# dataset id -> {split name -> DataFrame}; only splits that loaded are stored
loaded_asr_datasets = {}

for dataset_name, splits in asr_datasets_config:
    print("\n" + "=" * 60)
    print(f"üì¶ Dataset: {dataset_name}")

    dataset_splits = {}
    for split in splits:
        df = load_asr_dataset_simple(dataset_name, split)
        if df is None:
            # The loader already reported the failure; move on to the next split
            continue

        dataset_splits[split] = df

        # Short report for the split that was just loaded
        print(f"\nüìä {split.upper()} split:")
        print(f"   Shape: {df.shape}")
        print(f"   Columns: {df.columns.tolist()}")
        print(f"   Memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

        print("\nüìã Sample Data:")
        print(df.head(2))

    # Datasets for which nothing loaded are left out entirely
    if dataset_splits:
        loaded_asr_datasets[dataset_name] = dataset_splits

    print("=" * 60)

print("\nüèÜ RESULTS:")
print(f"   Successfully loaded: {len(loaded_asr_datasets)} ASR datasets")

# Per-split row counts, accumulating the grand total along the way
total_rows = 0
for name, splits_dict in loaded_asr_datasets.items():
    print(f"\n   {name}:")
    for split, df in splits_dict.items():
        print(f"      {split}: {len(df):,} rows")
        total_rows += len(df)

print(f"\n   TOTAL: {total_rows:,} speech recognition examples! üî•")

In [None]:
# Convert DataFrames to HuggingFace Dataset format and preprocess
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Dataset

# Whisper processor for the base checkpoint, set up for Romanian transcription
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-base",
    language="romanian",
    task="transcribe",
)

# Process datasets for ASR training
def prepare_asr_dataset(df, audio_column='audio', text_column='text'):
    """Reduce a raw ASR DataFrame to a two-column ``audio`` / ``text`` frame.

    When a requested column is absent, a fixed list of alternative names is
    tried in order and the first one present in ``df`` is used instead.

    Args:
        df: Source DataFrame; it is not modified.
        audio_column: Preferred name of the audio column.
        text_column: Preferred name of the transcript column.

    Returns:
        A copy of the two selected columns renamed to ``audio`` and ``text``,
        or ``None`` when either column cannot be found.
    """
    available = df.columns
    print(f"Available columns: {available.tolist()}")

    def _resolve(requested, fallbacks):
        # Keep the requested name when it exists; otherwise take the first
        # fallback that exists, or leave the name unchanged if none does.
        if requested in available:
            return requested
        return next((name for name in fallbacks if name in available), requested)

    audio_column = _resolve(audio_column, ('path', 'file', 'audio_path', 'file_path'))
    text_column = _resolve(text_column, ('sentence', 'transcript', 'transcription', 'label'))

    print(f"Using audio column: {audio_column}")
    print(f"Using text column: {text_column}")

    # Give up when either column is still missing after the fallback search
    if audio_column not in available or text_column not in available:
        print("‚ùå Could not find proper audio/text columns")
        return None

    simplified_df = df[[audio_column, text_column]].copy()
    simplified_df.columns = ['audio', 'text']
    return simplified_df

# Prepared frames keyed by "<dataset id>_<split>"; splits that could not be
# prepared are simply absent
all_asr_data = {}

for dataset_name, splits_dict in loaded_asr_datasets.items():
    print("\n" + "=" * 60)
    print(f"üì¶ Processing ASR: {dataset_name}")

    for split, df in splits_dict.items():
        prepared_df = prepare_asr_dataset(df)
        if prepared_df is None:
            continue

        key = "_".join((dataset_name, split))
        all_asr_data[key] = prepared_df
        print(f"   {split}: {len(df)} rows ‚Üí {len(prepared_df)} prepared rows")

    print("=" * 60)

def _frames_for_split(prepared, split):
    """Return the prepared frames whose key ends with ``_<split>``.

    Keys have the form ``"<dataset id>_<split>"``. Matching on the suffix
    (rather than on a substring) keeps a dataset id that happens to contain
    "train" or "test" from being assigned to the wrong split.
    """
    suffix = f"_{split}"
    return [frame for key, frame in prepared.items() if key.endswith(suffix)]


def _combine_frames(frames, label, lead=""):
    """Concatenate ``frames`` and report the size; return ``None`` if empty.

    ``label`` is the human-readable split name used in the printed message and
    ``lead`` is an optional prefix (e.g. a newline) for that message.
    """
    if not frames:
        print(f"{lead}‚ö†Ô∏è  No {label} data available")
        return None
    combined = pd.concat(frames, ignore_index=True)
    print(f"{lead}üìä Combined {label} data: {len(combined)}")
    return combined


# Combine train splits for training
train_dfs = _frames_for_split(all_asr_data, "train")
combined_train = _combine_frames(train_dfs, "training", lead="\n")

# Combine validation splits for validation
val_dfs = _frames_for_split(all_asr_data, "validation")
combined_val = _combine_frames(val_dfs, "validation")

# Combine test splits for testing
test_dfs = _frames_for_split(all_asr_data, "test")
combined_test = _combine_frames(test_dfs, "test")

def _as_audio_dataset(frame):
    """Turn a combined DataFrame into a HuggingFace Dataset with a 16 kHz Audio column.

    ``None`` is passed through unchanged so that a missing split stays ``None``.
    """
    if frame is None:
        return None
    dataset = Dataset.from_pandas(frame)
    # Cast audio column to Audio feature
    return dataset.cast_column("audio", Audio(sampling_rate=16000))


# Convert to HuggingFace Dataset
train_dataset = _as_audio_dataset(combined_train)
val_dataset = _as_audio_dataset(combined_val)
test_dataset = _as_audio_dataset(combined_test)

def prepare_dataset(batch):
    """Convert one example into Whisper model inputs.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["text"]``, runs both through the module-level ``processor`` and
    returns the first element of each result.

    Returns:
        A dict with ``"input_features"`` (from the audio) and ``"labels"``
        (token ids of the text).
    """
    waveform = batch["audio"]

    # Audio -> input features
    encoded_audio = processor(
        waveform["array"],
        sampling_rate=waveform["sampling_rate"],
        return_tensors="pt",
    )
    input_features = encoded_audio.input_features[0]

    # Transcript -> label token ids
    encoded_text = processor(text=batch["text"], return_tensors="pt")
    labels = encoded_text.input_ids[0]

    return {"input_features": input_features, "labels": labels}

# Run the Whisper preprocessing over every split that is available; a split
# that is missing (or empty) leaves its tokenized counterpart as None.
tokenized_train = (
    train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names)
    if train_dataset
    else None
)
if tokenized_train is not None:
    print(f"\n‚úÖ Tokenized train dataset: {len(tokenized_train)} examples")

tokenized_val = (
    val_dataset.map(prepare_dataset, remove_columns=val_dataset.column_names)
    if val_dataset
    else None
)
if tokenized_val is not None:
    print(f"‚úÖ Tokenized validation dataset: {len(tokenized_val)} examples")

tokenized_test = (
    test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)
    if test_dataset
    else None
)
if tokenized_test is not None:
    print(f"‚úÖ Tokenized test dataset: {len(tokenized_test)} examples")

In [None]:
# Initialize Whisper model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

# Pin generation to Romanian transcription so that generate() (used during
# evaluation and prediction) does not fall back to language auto-detection.
model.generation_config.language = "romanian"
model.generation_config.task = "transcribe"

# Language/task are driven by the settings above, so no decoder ids are forced
# and no tokens are suppressed.
model.generation_config.forced_decoder_ids = None
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [None]:
import torch

# Report which accelerator (if any) PyTorch can see, then the PyTorch version
if not torch.cuda.is_available():
    print("‚ö†Ô∏è No GPU detected - check your Kaggle accelerator settings!")
else:
    print(f"‚úÖ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   CUDA version: {torch.version.cuda}")
    # total_memory is divided by 1024**3 to print it in GB
    print(f"   GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

print(f"PyTorch version: {torch.__version__}")

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Training configuration for the Whisper fine-tune.
training_args = Seq2SeqTrainingArguments(
    output_dir="./speech_recognition_model",
    # Evaluation schedule. The cell used to pass both `eval_strategy="epoch"`
    # and the deprecated alias `evaluation_strategy="steps"`; the alias is
    # rejected by releases that removed it and overrode `eval_strategy` on the
    # ones that still accepted it. Only the current name is passed now, with
    # the step-based schedule that `eval_steps=500` was written for.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,     # effective train batch of 8 * 4 per device
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,        # evaluate on generated token sequences
    # Mixed precision needs a CUDA device; `torch` is imported in the GPU-check
    # cell above. Falls back to full precision when no GPU is present.
    fp16=torch.cuda.is_available(),
    report_to="none",
    logging_steps=100,
)

In [None]:
# Initialize Trainer
from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Batch Whisper examples, padding audio features and labels separately.

    Each example is a dict with ``"input_features"`` and ``"labels"``. Label
    sequences differ in length from example to example, so they have to be
    padded before they can be stacked into one tensor; the default collator
    does not do that.

    Attributes are documented inline below.
    """

    # Processor exposing `.feature_extractor` and `.tokenizer`
    processor: Any
    # Id of the token the decoder starts from
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Audio features: pad/stack through the feature extractor
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Labels: pad through the tokenizer, then mark padding with -100 so
        # the loss ignores those positions
        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )

        # Drop a leading start token if every label sequence carries one; the
        # model prepends it again when it builds the decoder inputs
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=DataCollatorSpeechSeq2SeqWithPadding(
        processor=processor,
        decoder_start_token_id=model.config.decoder_start_token_id,
    ),
    processing_class=processor,
)

In [None]:
# Train model
# Starts the training run on the `trainer` object; the value returned by
# train() is not stored.
trainer.train()

In [None]:
# Evaluate using WER (Word Error Rate)
import evaluate
import numpy as np

# Word Error Rate metric from the `evaluate` library
wer_metric = evaluate.load("wer")

if tokenized_test is not None:
    print("üîç Evaluating on test set...")
    results = trainer.predict(tokenized_test)

    # Predictions gathered across batches may be padded with -100, which is
    # not a valid token id; swap it for the pad token before decoding.
    pred_ids = np.asarray(results.predictions)
    pred_ids = np.where(pred_ids != -100, pred_ids, processor.tokenizer.pad_token_id)

    # Decode predictions
    predictions = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Get references from test dataset (same row order as tokenized_test)
    references = combined_test["text"].tolist()

    # Calculate WER
    wer_score = wer_metric.compute(predictions=predictions, references=references)
    print(f"\nüìä Test Results:")
    print(f"   Word Error Rate (WER): {wer_score:.4f}")
    # WER can exceed 1.0 (more errors than reference words); clamp so the
    # derived accuracy is never reported as negative.
    print(f"   Accuracy: {max(0.0, 1 - wer_score) * 100:.2f}%")

    # Show some examples
    print(f"\nüìù Sample Transcriptions:")
    for i in range(min(5, len(predictions))):
        print(f"   Reference: {references[i]}")
        print(f"   Predicted: {predictions[i]}")
        print()
else:
    print("‚ö†Ô∏è No test dataset available for evaluation")

In [None]:
# Write the model and the processor into the same directory
for artifact in (model, processor):
    artifact.save_pretrained("speech_recognition_model")
print("‚úÖ Speech recognition model saved successfully!")