# System Dependencies and Additional Packages
This cell updates the system packages and installs ffmpeg for audio processing, along with additional Python packages including datasets, transformers, torchaudio, evaluate, jiwer, torchcodec, and tensorboard. It also upgrades transformers and accelerate to their latest versions.

In [1]:

!apt-get update -y
!apt-get install -y ffmpeg
!pip install datasets transformers torchaudio evaluate jiwer torchcodec tensorboard scikit-learn accelerate
!pip install --upgrade transformers accelerate


Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease               
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease                 
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease               
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 108 not upgraded.
[0m

# Imports and Configuration Setup
This cell imports all necessary libraries for data processing, audio handling, and machine learning. It sets up environment variables to disable tokenizers parallelism warnings and defines configuration parameters including the number of processors and sampling rate. It also creates the base directory for the training data.

In [2]:
import os
import subprocess
import tarfile
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from datasets import load_dataset, Features, Value, Audio
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from concurrent.futures import ProcessPoolExecutor, as_completed

# Set environment variable to disable tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Konfigurasi umum
num_proc = os.cpu_count()
SAMPLING_RATE = 16000

# Direktori kerja
base_dir = Path("/workspace/data/audio_train/librivox-indonesia")
base_dir.mkdir(parents=True, exist_ok=True)


# Download Dataset Files
This cell downloads the necessary dataset files from Hugging Face, including training and test audio archives, and their corresponding metadata CSV files compressed with gzip.

In [3]:
!wget -nc https://huggingface.co/datasets/indonesian-nlp/librivox-indonesia/resolve/main/data/audio_train.tgz -P /workspace/data/
!wget -nc https://huggingface.co/datasets/indonesian-nlp/librivox-indonesia/resolve/main/data/audio_test.tgz -P /workspace/data/
!wget -nc https://huggingface.co/datasets/indonesian-nlp/librivox-indonesia/resolve/main/data/metadata_train.csv.gz -P /workspace/data/
!wget -nc https://huggingface.co/datasets/indonesian-nlp/librivox-indonesia/resolve/main/data/metadata_test.csv.gz -P /workspace/data/

File ‚Äò/workspace/data/audio_train.tgz‚Äô already there; not retrieving.

File ‚Äò/workspace/data/audio_test.tgz‚Äô already there; not retrieving.

File ‚Äò/workspace/data/metadata_train.csv.gz‚Äô already there; not retrieving.

File ‚Äò/workspace/data/metadata_test.csv.gz‚Äô already there; not retrieving.



# Extract Audio Archives
This cell extracts the downloaded tar.gz archives containing the training and test audio files to the specified directory structure.

In [None]:
# Ekstrak audio
for archive in ["/workspace/data/audio_train.tgz", "/workspace/data/audio_test.tgz"]:
    with tarfile.open(archive, "r:gz") as tar:
        tar.extractall(path="/workspace/data/audio_train")

# Audio Format Conversion
This cell converts all audio files from their original formats to WAV format with mono channel and 16kHz sampling rate using ffmpeg. It processes the files in parallel for efficiency and updates the metadata CSV to reflect the new file paths. The conversion ensures compatibility with the Whisper model's audio processing requirements.

In [None]:
import os
import pandas as pd
import subprocess
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

# --- Paths ---
audio_base_path = "/workspace/data/audio_train/librivox-indonesia"
output_base = os.path.join(audio_base_path, "converted_wav")
os.makedirs(output_base, exist_ok=True)

# üìå Metadata asli
metadata_path = "/workspace/data/metadata_train.csv.gz"
df = pd.read_csv(metadata_path)

# --- Worker function for parallel ffmpeg ---
def convert_to_wav(rel_path):
    if pd.isna(rel_path):
        return None

    src_path = os.path.join(audio_base_path, rel_path)
    if not os.path.exists(src_path):
        return None

    out_rel_path = os.path.splitext(rel_path)[0] + ".wav"
    out_path = os.path.join(output_base, out_rel_path)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # ‚úÖ Skip jika sudah ada
    if os.path.exists(out_path):
        return os.path.relpath(out_path, audio_base_path)

    cmd = ["ffmpeg", "-y", "-i", src_path, "-ac", "1", "-ar", "16000", out_path]
    subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    return os.path.relpath(out_path, audio_base_path)

# --- Run parallel conversion ---
new_paths = []
with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = {executor.submit(convert_to_wav, rel_path): rel_path for rel_path in df["path"].astype(str)}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Converting to WAV"):
        new_paths.append(future.result())

# --- Update metadata ke versi WAV ---
df["path"] = new_paths
new_meta_path = os.path.join(audio_base_path, "metadata_train_wav.csv")
df.to_csv(new_meta_path, index=False)

print(f"\n‚úÖ Conversion done. Metadata saved: {new_meta_path}")
print(f"üìÇ Converted audio saved under: {output_base}")

# Cleanup Original Audio Files
This cell removes the original MP3 files after conversion to WAV format to save disk space and avoid confusion between different audio formats.

In [6]:

import glob

def cleanup_mp3(split_dir):
    print(f"üóëÔ∏è Cleaning MP3 in {split_dir} ...")
    removed = 0
    for mp3 in glob.glob(os.path.join(split_dir, "**", "*.mp3"), recursive=True):
        try:
            os.remove(mp3)
            removed += 1
        except:
            pass
    print(f"‚úÖ Removed {removed} MP3 files")

cleanup_mp3(audio_base_path)


üóëÔ∏è Cleaning MP3 in /workspace/data/audio_train/librivox-indonesia ...
‚úÖ Removed 7815 MP3 files


# Dataset Preparation for Minangkabau Language
This cell loads the converted metadata, filters the dataset to include only Minangkabau language samples, adds absolute file paths, and converts the pandas DataFrame to a Hugging Face Dataset object for model training.

In [7]:
import os
import pandas as pd
from datasets import Dataset

audio_base_path = "/workspace/data/audio_train/librivox-indonesia"
metadata_path = os.path.join(audio_base_path, "metadata_train_wav.csv")

# Baca metadata & filter hanya Minangkabau
df = pd.read_csv(metadata_path)
df = df[df["language"] == "min"]

# Tambah absolute path
df["full_path"] = df["path"].apply(lambda p: os.path.join(audio_base_path, p))

# Convert ke HuggingFace Dataset
ds_minang = Dataset.from_pandas(df)
print(ds_minang[0])

{'path': 'converted_wav/train/sundanese/universal-declaration-of-human-rights/human_rights_un_sun_brc_0113.wav', 'language': 'min', 'reader': 3232, 'sentence': 'manusia sadonyo lahia ka dunia mambao hak hak dan kamardekaan mandasar nan samo dan indak dapek dipisahkan', 'full_path': '/workspace/data/audio_train/librivox-indonesia/converted_wav/train/sundanese/universal-declaration-of-human-rights/human_rights_un_sun_brc_0113.wav', '__index_level_0__': 140}


# Load Whisper Model and Processor
This cell loads the pre-trained OpenAI Whisper Small model and its processor from Hugging Face. The model is moved to GPU (CUDA) for faster training and inference.

In [13]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Pastikan model ke GPU
model = model.to("cuda")


# Dataset Preprocessing and Training Setup
This cell performs comprehensive setup for fine-tuning the Whisper model: loads the model and processor configured for Minangkabau language transcription, prepares the dataset by splitting into train/validation sets, applies preprocessing to convert audio waveforms to model inputs, and sets up the data collator for sequence-to-sequence training.

In [17]:
import os
import pandas as pd
import torchaudio
from datasets import Dataset, Audio
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from sklearn.model_selection import train_test_split
import torch

# --- Config ---
audio_base_path = "/workspace/data/audio_train/librivox-indonesia"
metadata_path = os.path.join(audio_base_path, "metadata_train_wav.csv")
SAMPLING_RATE = 16000
model_name = "openai/whisper-small"

# --- Load model & processor ---
processor = WhisperProcessor.from_pretrained(model_name, language="minangkabau", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Move generation config parameters to avoid warnings
model.generation_config.max_length = 448
model.generation_config.suppress_tokens = [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362]
model.generation_config.begin_suppress_tokens = [220, 50257]

model = model.to("cuda")

# --- STEP 1: Load metadata ---
df = pd.read_csv(metadata_path)
df["full_path"] = df["path"].apply(lambda p: os.path.join(audio_base_path, p))

# Fokus hanya Minangkabau
df = df[df["language"] == "min"].reset_index(drop=True)

# Split train/validation
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert ke HF Dataset
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

# --- STEP 2: Preprocessing function ---
def prepare_dataset(batch):
    # load audio
    waveform, sr = torchaudio.load(batch["full_path"])
    if sr != SAMPLING_RATE:
        waveform = torchaudio.functional.resample(waveform, sr, SAMPLING_RATE)
    waveform = waveform.mean(dim=0)  # mono

    # process
    inputs = processor(
        audio=waveform.numpy(),
        sampling_rate=SAMPLING_RATE,
        text=batch["sentence"]
    )
    return {
        "input_features": inputs["input_features"][0],
        "labels": inputs["labels"]
    }

train_ds = train_ds.map(prepare_dataset)
val_ds = val_ds.map(prepare_dataset)

# --- STEP 3: Data Collator (fixed for tokenized labels) ---
@torch.no_grad()
def data_collator(features):
    # Separate inputs and labels for clarity
    input_features = [{"input_features": feature["input_features"]} for feature in features]
    labels = [feature["labels"] for feature in features]

    # Use feature extractor for audio inputs
    batch = processor.feature_extractor.pad(input_features, return_tensors="pt")

    # Convert labels to tensors and pad them manually
    # Labels are already tokenized, so we need to pad them as integers
    max_length = max(len(label) for label in labels)
    
    padded_labels = []
    attention_masks = []
    
    for label in labels:
        # Pad with -100 (ignored in loss calculation)
        padded_label = label + [-100] * (max_length - len(label))
        attention_mask = [1] * len(label) + [0] * (max_length - len(label))
        
        padded_labels.append(padded_label)
        attention_masks.append(attention_mask)
    
    # Convert to tensors
    labels_tensor = torch.tensor(padded_labels, dtype=torch.long)
    attention_mask = torch.tensor(attention_masks, dtype=torch.long)
    
    # Replace padding tokens with -100 for loss calculation
    labels_tensor = labels_tensor.masked_fill(attention_mask.ne(1), -100)

    # Trim BOS token if present
    if (labels_tensor[:, 0] == processor.tokenizer.bos_token_id).all().cpu().item():
        labels_tensor = labels_tensor[:, 1:]

    batch["labels"] = labels_tensor

    return batch

Map:   0%|          | 0/122 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

# Training Arguments Configuration
This cell configures the training arguments for the Seq2SeqTrainer, including batch sizes, evaluation and save intervals, mixed precision training (FP16), and TensorBoard logging. It removes deprecated arguments to ensure compatibility with the transformers library version.

In [18]:
# --- STEP 4: Trainer ---
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-minang-checkpoints",
    per_device_train_batch_size=16,  # Reduced batch size for stability
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,   # Increased to maintain effective batch size
    num_train_epochs=3,
    fp16=True,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=50,
    eval_steps=500,                  
    save_steps=500,                  
    report_to="tensorboard"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,  
    processing_class=processor,  # Updated from tokenizer to processing_class
)

# Model Training Execution
This cell initiates the fine-tuning process of the Whisper model on the Minangkabau language dataset using the configured trainer. The training will run for the specified number of epochs with periodic evaluation and checkpoint saving.

In [19]:
# --- STEP 5: Training üöÄ ---
trainer.train()

Step,Training Loss




TrainOutput(global_step=12, training_loss=5.78022829691569, metrics={'train_runtime': 59.0148, 'train_samples_per_second': 6.202, 'train_steps_per_second': 0.203, 'total_flos': 1.0562225651712e+17, 'train_loss': 5.78022829691569, 'epoch': 3.0})

# Model Testing and Evaluation
This section loads the test dataset and evaluates the fine-tuned Whisper model's performance on unseen Minangkabau audio samples. It processes the test audio files and compares the model's transcriptions with the ground truth.

In [20]:
# --- Prepare Test Data ---
import os
import pandas as pd
import tarfile

# Extract test audio if not already extracted
test_archive = "/workspace/data/audio_test.tgz"
if os.path.exists(test_archive):
    print("üìÇ Extracting test audio files...")
    with tarfile.open(test_archive, "r:gz") as tar:
        tar.extractall(path="/workspace/data/audio_train")
    print("‚úÖ Test audio extracted")

# Load test metadata
test_metadata_path = "/workspace/data/metadata_test.csv.gz"
test_df = pd.read_csv(test_metadata_path)

# Filter for Minangkabau language only
test_df_min = test_df[test_df["language"] == "min"].reset_index(drop=True)
print(f"üìä Total test samples: {len(test_df)}")
print(f"üéØ Minangkabau test samples: {len(test_df_min)}")

# Add full paths
audio_base_path = "/workspace/data/audio_train/librivox-indonesia"
test_df_min["full_path"] = test_df_min["path"].apply(lambda p: os.path.join(audio_base_path, p))

# Show sample
print(f"\nüìù Sample test data:")
print(test_df_min[["sentence", "path"]].head(3))

üìÇ Extracting test audio files...
‚úÖ Test audio extracted
üìä Total test samples: 754
üéØ Minangkabau test samples: 20

üìù Sample test data:
                                            sentence  \
0                       katahui bana hak hak awak ko   
1  dan kabebasan dari raso takuik dan dari kakura...   
2                           supayo manjadi kanyataan   

                                                path  
0  test/minangkabau/universal-declaration-of-huma...  
1  test/minangkabau/universal-declaration-of-huma...  
2  test/minangkabau/universal-declaration-of-huma...  


In [21]:
# --- Convert Test Audio to WAV (if needed) ---
import subprocess
from tqdm import tqdm

def convert_test_audio_to_wav(df, output_base):
    """Convert test audio files to WAV format for consistency"""
    os.makedirs(output_base, exist_ok=True)
    
    converted_paths = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Converting test audio"):
        rel_path = row["path"]
        src_path = os.path.join(audio_base_path, rel_path)
        
        # Skip if source doesn't exist
        if not os.path.exists(src_path):
            converted_paths.append(None)
            continue
            
        # Create WAV output path
        out_rel_path = os.path.splitext(rel_path)[0] + ".wav"
        out_path = os.path.join(output_base, out_rel_path)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        
        # Skip if already converted
        if os.path.exists(out_path):
            converted_paths.append(os.path.relpath(out_path, audio_base_path))
            continue
            
        # Convert to WAV
        cmd = ["ffmpeg", "-y", "-i", src_path, "-ac", "1", "-ar", "16000", out_path]
        result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        
        if result.returncode == 0:
            converted_paths.append(os.path.relpath(out_path, audio_base_path))
        else:
            converted_paths.append(None)
    
    return converted_paths

# Convert test audio
test_output_base = os.path.join(audio_base_path, "converted_wav")
test_df_min["wav_path"] = convert_test_audio_to_wav(test_df_min, test_output_base)
test_df_min["wav_full_path"] = test_df_min["wav_path"].apply(
    lambda p: os.path.join(audio_base_path, p) if p else None
)

# Remove rows where conversion failed
test_df_clean = test_df_min.dropna(subset=["wav_full_path"]).reset_index(drop=True)
print(f"‚úÖ Successfully converted {len(test_df_clean)} test audio files")

Converting test audio: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:01<00:00, 12.60it/s]

‚úÖ Successfully converted 20 test audio files





In [24]:
# --- Test Model Inference ---
import torchaudio
from transformers import pipeline

def transcribe_audio(audio_path, model, processor):
    """Transcribe a single audio file using the fine-tuned model"""
    try:
        # Load audio
        waveform, sr = torchaudio.load(audio_path)
        if sr != SAMPLING_RATE:
            waveform = torchaudio.functional.resample(waveform, sr, SAMPLING_RATE)
        waveform = waveform.mean(dim=0)  # Convert to mono
        
        # Process audio
        inputs = processor(
            audio=waveform.numpy(),
            sampling_rate=SAMPLING_RATE,
            return_tensors="pt"
        )
        
        # Move to GPU
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
        
        # Generate transcription
        with torch.no_grad():
            generated_ids = model.generate(
                inputs["input_features"],
                max_length=448,
                num_beams=1,
                do_sample=False
            )
        
        # Decode transcription
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return transcription
        
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None

# Test on a small sample first (5 files)
test_sample = test_df_clean.head(20).copy()
print("üé§ Testing model on sample audio files...")

predictions = []
ground_truths = []

for idx, row in test_sample.iterrows():
    print(f"Processing {idx+1}/20: {os.path.basename(row['wav_full_path'])}")
    
    # Get prediction
    prediction = transcribe_audio(row["wav_full_path"], model, processor)
    ground_truth = row["sentence"]
    
    predictions.append(prediction)
    ground_truths.append(ground_truth)
    
    print(f"  Ground Truth: {ground_truth}")
    print(f"  Prediction:   {prediction}")
    print("  " + "="*50)

print(f"\n‚úÖ Completed testing on {len(test_sample)} samples")

üé§ Testing model on sample audio files...
Processing 1/5: human_rights_un_min_sd_0009.wav
  Ground Truth: katahui bana hak hak awak ko
  Prediction:    ‡§ï‡§æ‡§§‡§æ hui banahak hak awak
Processing 2/5: human_rights_un_min_sd_0016.wav
  Ground Truth: dan kabebasan dari raso takuik dan dari kakurangan
  Prediction:    dan kababasan dari raso takuik dan dari kakurangan
Processing 3/5: human_rights_un_min_sd_0028.wav
  Ground Truth: supayo manjadi kanyataan
  Prediction:    supayo manjadi kanyataan
Processing 4/5: human_rights_un_min_sd_0039.wav
  Ground Truth: indak ado pambedaan   umpamonyo pambedaan ras
  Prediction:    indak ado pambedaan umpamonyo pambedaan ras
Processing 5/5: human_rights_un_min_sd_0044.wav
  Ground Truth: indak diadokan pambedaan badasar kadudukan politik
  Prediction:    indak diadokan pambedaan badasar kadudukan politik
Processing 6/5: human_rights_un_min_sd_0046.wav
  Ground Truth: baiak babantuak negara mardeka
  Prediction:    baiyak babantuak nagara mardeka


In [25]:
# --- Calculate Evaluation Metrics ---
try:
    import jiwer
    
    # Filter out None predictions
    valid_pairs = [(pred, gt) for pred, gt in zip(predictions, ground_truths) if pred is not None]
    
    if valid_pairs:
        valid_predictions, valid_ground_truths = zip(*valid_pairs)
        
        # Convert tuples to lists for jiwer
        valid_predictions = list(valid_predictions)
        valid_ground_truths = list(valid_ground_truths)
        
        # Calculate Word Error Rate (WER)
        wer_score = jiwer.wer(valid_ground_truths, valid_predictions)
        
        # Calculate Character Error Rate (CER)
        cer_score = jiwer.cer(valid_ground_truths, valid_predictions)
        
        print("üìä EVALUATION RESULTS:")
        print("="*50)
        print(f"üìù Number of samples: {len(valid_pairs)}")
        print(f"üéØ Word Error Rate (WER): {wer_score:.4f} ({wer_score*100:.2f}%)")
        print(f"üî§ Character Error Rate (CER): {cer_score:.4f} ({cer_score*100:.2f}%)")
        print(f"‚úÖ Word Accuracy: {(1-wer_score)*100:.2f}%")
        
        # Show detailed comparison
        print(f"\nüìã DETAILED COMPARISON:")
        print("="*50)
        for i, (pred, gt) in enumerate(valid_pairs):
            print(f"Sample {i+1}:")
            print(f"  Ground Truth: {gt}")
            print(f"  Prediction:   {pred}")
            
            # Calculate individual metrics
            individual_wer = jiwer.wer([gt], [pred])
            individual_cer = jiwer.cer([gt], [pred])
            print(f"  WER: {individual_wer:.4f}, CER: {individual_cer:.4f}")
            print()
            
    else:
        print("‚ùå No valid predictions to evaluate")
        
except ImportError:
    print("‚ö†Ô∏è jiwer not available. Install with: pip install jiwer")
    print("Showing basic comparison instead...")
    
    for i, (pred, gt) in enumerate(zip(predictions, ground_truths)):
        if pred is not None:
            print(f"Sample {i+1}:")
            print(f"  Ground Truth: {gt}")
            print(f"  Prediction:   {pred}")
            print(f"  Match: {'‚úÖ' if pred.lower().strip() == gt.lower().strip() else '‚ùå'}")
            print()
            
except Exception as e:
    print(f"‚ö†Ô∏è Error calculating metrics: {e}")
    print("Showing basic comparison instead...")
    
    for i, (pred, gt) in enumerate(zip(predictions, ground_truths)):
        if pred is not None:
            print(f"Sample {i+1}:")
            print(f"  Ground Truth: {gt}")
            print(f"  Prediction:   {pred}")
            print(f"  Match: {'‚úÖ' if pred.lower().strip() == gt.lower().strip() else '‚ùå'}")
            print()

üìä EVALUATION RESULTS:
üìù Number of samples: 20
üéØ Word Error Rate (WER): 4.8857 (488.57%)
üî§ Character Error Rate (CER): 2.8723 (287.23%)
‚úÖ Word Accuracy: -388.57%

üìã DETAILED COMPARISON:
Sample 1:
  Ground Truth: katahui bana hak hak awak ko
  Prediction:    ‡§ï‡§æ‡§§‡§æ hui banahak hak awak
  WER: 0.6667, CER: 0.3214

Sample 2:
  Ground Truth: dan kabebasan dari raso takuik dan dari kakurangan
  Prediction:    dan kababasan dari raso takuik dan dari kakurangan
  WER: 0.1250, CER: 0.0200

Sample 3:
  Ground Truth: supayo manjadi kanyataan
  Prediction:    supayo manjadi kanyataan
  WER: 0.0000, CER: 0.0000

Sample 4:
  Ground Truth: indak ado pambedaan   umpamonyo pambedaan ras
  Prediction:    indak ado pambedaan umpamonyo pambedaan ras
  WER: 0.0000, CER: 0.0444

Sample 5:
  Ground Truth: indak diadokan pambedaan badasar kadudukan politik
  Prediction:    indak diadokan pambedaan badasar kadudukan politik
  WER: 0.0000, CER: 0.0000

Sample 6:
  Ground Truth: baiak baba

In [26]:
# --- Improved Model Testing with Better Generation Parameters ---
def transcribe_audio_improved(audio_path, model, processor):
    """Transcribe with improved generation parameters to avoid repetition"""
    try:
        # Load audio
        waveform, sr = torchaudio.load(audio_path)
        if sr != SAMPLING_RATE:
            waveform = torchaudio.functional.resample(waveform, sr, SAMPLING_RATE)
        waveform = waveform.mean(dim=0)  # Convert to mono
        
        # Process audio
        inputs = processor(
            audio=waveform.numpy(),
            sampling_rate=SAMPLING_RATE,
            return_tensors="pt"
        )
        
        # Move to GPU
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
        
        # Generate transcription with improved parameters
        with torch.no_grad():
            generated_ids = model.generate(
                inputs["input_features"],
                max_length=200,  # Reduced from 448
                min_length=1,
                num_beams=3,     # Increased from 1
                do_sample=False,
                early_stopping=True,
                no_repeat_ngram_size=3,  # Prevent repetition
                repetition_penalty=1.2,   # Penalize repetition
                length_penalty=1.0,
                bad_words_ids=[[50257]],  # Avoid problematic tokens
                forced_decoder_ids=None,
                temperature=1.0
            )
        
        # Decode transcription
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return transcription.strip()
        
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None

# Test with improved parameters on first 5 samples only
test_sample_improved = test_df_clean.head(5).copy()
print("üé§ Re-testing model with improved generation parameters...")

improved_predictions = []
improved_ground_truths = []

for idx, row in test_sample_improved.iterrows():
    print(f"Processing {idx+1}/5: {os.path.basename(row['wav_full_path'])}")
    
    # Get prediction with improved parameters
    prediction = transcribe_audio_improved(row["wav_full_path"], model, processor)
    ground_truth = row["sentence"]
    
    improved_predictions.append(prediction)
    improved_ground_truths.append(ground_truth)
    
    print(f"  Ground Truth: {ground_truth}")
    print(f"  Prediction:   {prediction}")
    print("  " + "="*50)

print(f"\n‚úÖ Completed improved testing on {len(test_sample_improved)} samples")

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üé§ Re-testing model with improved generation parameters...
Processing 1/5: human_rights_un_min_sd_0009.wav
  Ground Truth: katahui bana hak hak awak ko
  Prediction:   upadamahek punyal hak takua diketuikan jo
Processing 2/5: human_rights_un_min_sd_0016.wav
  Ground Truth: dan kabebasan dari raso takuik dan dari kakurangan
  Prediction:   dan kamabasani da di raso tak kakui nan dari kaku
Processing 3/5: human_rights_un_min_sd_0028.wav
  Ground Truth: supayo manjadi kanyataan
  Prediction:   supayao manzadikan yataan
Processing 4/5: human_rights_un_min_sd_0039.wav
  Ground Truth: indak ado pambedaan   umpamonyo pambedaan ras
  Prediction:   indak ado pangbedaan unpamonyo pampedakan rasia
Processing 5/5: human_rights_un_min_sd_0044.wav
  Ground Truth: indak diadokan pambedaan badasar kadudukan politik
  Prediction:   indak diredokan pembedaan bada sar kadudukan politiku tuan dikambahai urang dan cinta

‚úÖ Completed improved testing on 5 samples


In [27]:
# --- Calculate Improved Evaluation Metrics ---
try:
    import jiwer
    
    # Filter out None predictions from improved results
    valid_pairs_improved = [(pred, gt) for pred, gt in zip(improved_predictions, improved_ground_truths) if pred is not None and pred.strip() != ""]
    
    if valid_pairs_improved:
        valid_predictions_improved, valid_ground_truths_improved = zip(*valid_pairs_improved)
        
        # Convert tuples to lists for jiwer
        valid_predictions_improved = list(valid_predictions_improved)
        valid_ground_truths_improved = list(valid_ground_truths_improved)
        
        # Calculate Word Error Rate (WER)
        wer_score_improved = jiwer.wer(valid_ground_truths_improved, valid_predictions_improved)
        
        # Calculate Character Error Rate (CER)
        cer_score_improved = jiwer.cer(valid_ground_truths_improved, valid_predictions_improved)
        
        print("üìä IMPROVED EVALUATION RESULTS:")
        print("="*60)
        print(f"üìù Number of valid samples: {len(valid_pairs_improved)}")
        print(f"üéØ Word Error Rate (WER): {wer_score_improved:.4f} ({wer_score_improved*100:.2f}%)")
        print(f"üî§ Character Error Rate (CER): {cer_score_improved:.4f} ({cer_score_improved*100:.2f}%)")
        print(f"‚úÖ Word Accuracy: {(1-wer_score_improved)*100:.2f}%")
        print(f"üìà Character Accuracy: {(1-cer_score_improved)*100:.2f}%")
        
        # Show detailed comparison
        print(f"\nüìã DETAILED COMPARISON (IMPROVED):")
        print("="*60)
        for i, (pred, gt) in enumerate(valid_pairs_improved):
            print(f"Sample {i+1}:")
            print(f"  Ground Truth: {gt}")
            print(f"  Prediction:   {pred}")
            
            # Calculate individual metrics
            individual_wer = jiwer.wer([gt], [pred])
            individual_cer = jiwer.cer([gt], [pred])
            print(f"  üìä WER: {individual_wer:.4f}, CER: {individual_cer:.4f}")
            
            # Show if it's a good prediction
            if individual_wer < 0.3:
                print(f"  ‚úÖ Good transcription!")
            elif individual_wer < 0.6:
                print(f"  ‚ö†Ô∏è Moderate accuracy")
            else:
                print(f"  ‚ùå Needs improvement")
            print()
            
    else:
        print("‚ùå No valid predictions to evaluate in improved results")
        
except Exception as e:
    print(f"‚ö†Ô∏è Error calculating improved metrics: {e}")
    print("Showing basic comparison instead...")
    
    for i, (pred, gt) in enumerate(zip(improved_predictions, improved_ground_truths)):
        if pred is not None and pred.strip() != "":
            print(f"Sample {i+1}:")
            print(f"  Ground Truth: {gt}")
            print(f"  Prediction:   {pred}")
            print(f"  Match: {'‚úÖ' if pred.lower().strip() == gt.lower().strip() else '‚ùå'}")
            print()

üìä IMPROVED EVALUATION RESULTS:
üìù Number of valid samples: 5
üéØ Word Error Rate (WER): 1.0000 (100.00%)
üî§ Character Error Rate (CER): 0.4619 (46.19%)
‚úÖ Word Accuracy: 0.00%
üìà Character Accuracy: 53.81%

üìã DETAILED COMPARISON (IMPROVED):
Sample 1:
  Ground Truth: katahui bana hak hak awak ko
  Prediction:   upadamahek punyal hak takua diketuikan jo
  üìä WER: 0.8333, CER: 0.8571
  ‚ùå Needs improvement

Sample 2:
  Ground Truth: dan kabebasan dari raso takuik dan dari kakurangan
  Prediction:   dan kamabasani da di raso tak kakui nan dari kaku
  üìä WER: 0.8750, CER: 0.3400
  ‚ùå Needs improvement

Sample 3:
  Ground Truth: supayo manjadi kanyataan
  Prediction:   supayao manzadikan yataan
  üìä WER: 1.0000, CER: 0.1667
  ‚ùå Needs improvement

Sample 4:
  Ground Truth: indak ado pambedaan   umpamonyo pambedaan ras
  Prediction:   indak ado pangbedaan unpamonyo pampedakan rasia
  üìä WER: 0.6667, CER: 0.2000
  ‚ùå Needs improvement

Sample 5:
  Ground Truth: indak 

# Model Analysis and Improvement Suggestions
This section analyzes the model performance and provides insights into potential improvements. The results show that while the model has learned some Minangkabau patterns, it needs more training or different hyperparameters to achieve better accuracy.

In [28]:
# --- Performance Analysis ---
print("üîç PERFORMANCE ANALYSIS:")
print("="*60)

# Calculate basic statistics from the improved results
if 'valid_pairs_improved' in locals() and valid_pairs_improved:
    print(f"\nüìä Current Model Performance:")
    print(f"   ‚Ä¢ Word Error Rate: {wer_score_improved*100:.1f}%")
    print(f"   ‚Ä¢ Character Error Rate: {cer_score_improved*100:.1f}%") 
    print(f"   ‚Ä¢ Word Accuracy: {(1-wer_score_improved)*100:.1f}%")
    print(f"   ‚Ä¢ Character Accuracy: {(1-cer_score_improved)*100:.1f}%")
    
    print(f"\nüéØ Accuracy Interpretation:")
    if wer_score_improved > 0.8:
        print("   ‚ùå High WER (>80%): Model needs significant improvement")
    elif wer_score_improved > 0.5:
        print("   ‚ö†Ô∏è Moderate WER (50-80%): Model shows some learning but needs refinement")
    elif wer_score_improved > 0.3:
        print("   üî∂ Fair WER (30-50%): Model is learning patterns well")
    else:
        print("   ‚úÖ Good WER (<30%): Model performing well")
        
    print(f"\nüî§ Character-level Analysis:")
    if cer_score_improved < 0.3:
        print("   ‚úÖ Good character accuracy - model understands phonetics")
    elif cer_score_improved < 0.6:
        print("   üî∂ Moderate character accuracy - some phonetic understanding")
    else:
        print("   ‚ùå Poor character accuracy - limited phonetic learning")

print(f"\nüìà Training Insights:")
print("   ‚Ä¢ Short training (3 epochs, 12 steps) - likely underfitted")
print("   ‚Ä¢ Model shows some Minangkabau patterns but needs more exposure")
print("   ‚Ä¢ Character-level performance better than word-level suggests partial learning")

print(f"\nüõ†Ô∏è Improvement Strategies:")
print("   1. Increase training epochs (5-10 epochs)")
print("   2. Lower learning rate for fine-grained learning")
print("   3. Increase dataset size if possible")
print("   4. Use data augmentation (speed/pitch variations)")
print("   5. Fine-tune generation parameters further")
print("   6. Consider using Whisper-base instead of small for better capacity")

üîç PERFORMANCE ANALYSIS:

üìä Current Model Performance:
   ‚Ä¢ Word Error Rate: 100.0%
   ‚Ä¢ Character Error Rate: 46.2%
   ‚Ä¢ Word Accuracy: 0.0%
   ‚Ä¢ Character Accuracy: 53.8%

üéØ Accuracy Interpretation:
   ‚ùå High WER (>80%): Model needs significant improvement

üî§ Character-level Analysis:
   üî∂ Moderate character accuracy - some phonetic understanding

üìà Training Insights:
   ‚Ä¢ Short training (3 epochs, 12 steps) - likely underfitted
   ‚Ä¢ Model shows some Minangkabau patterns but needs more exposure
   ‚Ä¢ Character-level performance better than word-level suggests partial learning

üõ†Ô∏è Improvement Strategies:
   1. Increase training epochs (5-10 epochs)
   2. Lower learning rate for fine-grained learning
   3. Increase dataset size if possible
   4. Use data augmentation (speed/pitch variations)
   5. Fine-tune generation parameters further
   6. Consider using Whisper-base instead of small for better capacity


In [29]:
# --- Save Model for Future Use ---
print("üíæ SAVING FINE-TUNED MODEL:")
print("="*50)

try:
    # Save the fine-tuned model and processor
    model_save_path = "./whisper-minang-final"
    
    print(f"Saving model to: {model_save_path}")
    model.save_pretrained(model_save_path)
    processor.save_pretrained(model_save_path)
    
    print("‚úÖ Model and processor saved successfully!")
    print(f"üìÅ Location: {model_save_path}")
    
    # Save a summary of the training results
    results_summary = {
        "model_name": "whisper-small-minangkabau",
        "training_epochs": 3,
        "training_steps": 12,
        "final_loss": 5.78,
        "test_samples": len(valid_pairs_improved) if 'valid_pairs_improved' in locals() else 0,
        "word_error_rate": f"{wer_score_improved*100:.2f}%" if 'wer_score_improved' in locals() else "N/A",
        "character_error_rate": f"{cer_score_improved*100:.2f}%" if 'cer_score_improved' in locals() else "N/A",
        "word_accuracy": f"{(1-wer_score_improved)*100:.2f}%" if 'wer_score_improved' in locals() else "N/A",
        "character_accuracy": f"{(1-cer_score_improved)*100:.2f}%" if 'cer_score_improved' in locals() else "N/A"
    }
    
    import json
    with open(f"{model_save_path}/training_results.json", "w") as f:
        json.dump(results_summary, f, indent=2)
    
    print("üìä Training results summary saved!")
    
    # Instructions for loading the model later
    print(f"\nüîÑ To load this model later, use:")
    print(f"```python")
    print(f"from transformers import WhisperProcessor, WhisperForConditionalGeneration")
    print(f"processor = WhisperProcessor.from_pretrained('{model_save_path}')")
    print(f"model = WhisperForConditionalGeneration.from_pretrained('{model_save_path}')")
    print(f"```")
    
except Exception as e:
    print(f"‚ùå Error saving model: {e}")
    print("The model is still available in memory for this session.")

üíæ SAVING FINE-TUNED MODEL:
Saving model to: ./whisper-minang-final
‚úÖ Model and processor saved successfully!
üìÅ Location: ./whisper-minang-final
üìä Training results summary saved!

üîÑ To load this model later, use:
```python
from transformers import WhisperProcessor, WhisperForConditionalGeneration
processor = WhisperProcessor.from_pretrained('./whisper-minang-final')
model = WhisperForConditionalGeneration.from_pretrained('./whisper-minang-final')
```
