<a href="https://colab.research.google.com/github/mbk9889/Speech-to-text/blob/main/fine_tuning_Whisper_turbo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install dependencies
# `-q` keeps pip output quiet. transformers, datasets, librosa and soundfile are
# imported by this notebook; accelerate and peft are installed here but are not
# imported directly in the cells shown.
!pip install -q transformers datasets accelerate peft librosa soundfile

In [None]:
# Step 2: Import libraries
import os
import librosa
import soundfile as sf
from datasets import load_dataset
import torch
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

In [None]:
# Step 3: Define model and processor
model_id = "openai/whisper-large-v3-turbo"  # Using the turbo version
processor = WhisperProcessor.from_pretrained(model_id)

# Train on the GPU when one is present, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the weights in the default FP32 precision. Mixed-precision training
# (fp16 in the training arguments) keeps FP32 master weights and casts on the
# fly; loading the weights themselves in FP16 makes the gradient scaler fail
# with "Attempting to unscale FP16 gradients", and FP16 weights are poorly
# supported on CPU.
model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
)
model.to(device)

In [None]:
# Step 4: Prepare your custom dataset
# Each split is read from its own CSV file. Every CSV is expected to provide:
#   - "audio_filepath": full path to the audio file
#   - "transcript":     the ground-truth text for that audio
data_files = dict(
    train="data/train.csv",
    test="data/test.csv",
)
dataset = load_dataset("csv", data_files=data_files)


In [None]:
# Step 5: Load and resample audio files
def load_audio(example):
    """Attach the decoded waveform of one dataset row to that row.

    Reads the file named by ``example["audio_filepath"]`` with librosa,
    requesting a 16 kHz sampling rate, and stores the result under
    ``"audio_array"`` and ``"sampling_rate"``.

    Args:
        example: A single dataset row containing ``"audio_filepath"``.

    Returns:
        The same row, extended with the two new keys.
    """
    waveform, rate = librosa.load(example["audio_filepath"], sr=16000)
    example["audio_array"] = waveform
    example["sampling_rate"] = rate
    return example

# Run load_audio over the dataset so each row gains the "audio_array" and
# "sampling_rate" entries that load_audio writes.
dataset = dataset.map(load_audio)

In [None]:
# Step 6: Prepare dataset samples for training
def prepare_sample(batch):
    """Turn one loaded example into the tensors-to-be used for training.

    Args:
        batch: A single row holding ``"audio_array"``, ``"sampling_rate"``
            and ``"transcript"``.

    Returns:
        The same row with ``"input_features"`` (first element of the
        processor output for the audio) and ``"labels"`` (token ids of the
        transcript) added.
    """
    # Audio -> input features (log-Mel spectrogram per the original notes).
    features = processor(batch["audio_array"], sampling_rate=batch["sampling_rate"])
    # Transcript -> token ids; no padding is requested at this stage.
    token_ids = processor.tokenizer(batch["transcript"]).input_ids

    batch["input_features"] = features.input_features[0]
    batch["labels"] = token_ids
    return batch

In [None]:
# Apply prepare_sample and drop the raw columns it consumed, leaving only the
# entries that prepare_sample adds.
remove_columns = [
    "audio_filepath",
    "transcript",
    "audio_array",
    "sampling_rate",
]
dataset = dataset.map(function=prepare_sample, remove_columns=remove_columns)


In [None]:
# Step 7: Define TrainingArguments
import inspect

# Newer transformers releases call this argument `eval_strategy`, while older
# ones only accept `evaluation_strategy`. The install step does not pin a
# version, so use whichever name the installed class exposes.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy"
    in inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,  # evaluate by generating text
    num_train_epochs=3,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=1e-5,
    # Mixed precision needs a GPU; enabling it unconditionally breaks the
    # CPU fallback that the model-setup cell explicitly allows for.
    fp16=torch.cuda.is_available(),
    **{_eval_strategy_key: "steps"},
)

In [None]:
# Step 8: Create a data collator
from dataclasses import dataclass
from typing import Any


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad audio features and label ids separately for Whisper training.

    DataCollatorForSeq2Seq has no ``feature_extractor`` argument and pads
    through the tokenizer only, so it cannot batch ``input_features``. This
    collator pads the audio side with the feature extractor and the text side
    with the tokenizer.

    Attributes:
        processor: Object exposing ``feature_extractor`` and ``tokenizer``.
        decoder_start_token_id: Token id the model prepends when decoding.
    """

    processor: Any
    decoder_start_token_id: int

    def __call__(self, features):
        """Collate a list of ``{"input_features", "labels"}`` rows into a batch."""
        # Audio side: delegate batching to the feature extractor.
        audio_rows = [{"input_features": row["input_features"]} for row in features]
        batch = self.processor.feature_extractor.pad(audio_rows, return_tensors="pt")

        # Text side: pad label ids to the longest sequence in the batch.
        label_rows = [{"input_ids": row["labels"]} for row in features]
        padded = self.processor.tokenizer.pad(label_rows, return_tensors="pt")

        # Replace padding positions with -100 so the loss ignores them.
        labels = padded["input_ids"].masked_fill(padded["attention_mask"].ne(1), -100)

        # If every sequence already begins with the decoder start token, drop
        # it here; the model adds it again when it builds decoder inputs.
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
# Step 9: Initialize Trainer
# Gather the constructor arguments first so each piece is easy to inspect.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # The feature extractor is handed over through the `tokenizer` slot.
    # NOTE(review): the original note says this is used for padding
    # input_features - confirm against the Trainer documentation.
    tokenizer=processor.feature_extractor,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [None]:
# Step 10: Fine-tune the model
# Starts training with the trainer built above; epochs, evaluation/checkpoint
# intervals and learning rate all come from `training_args`.
trainer.train()

In [None]:
# Step 11: Save the fine-tuned model and processor
# Both are written to the same directory so they can be loaded back together.
save_dir = "./whisper-finetuned"
trainer.save_model(save_dir)
processor.save_pretrained(save_dir)