# **Install Dependencies**

In [None]:
!pip install datasets==4.4.1
!pip install transformers==4.57.1
!pip install accelerate==1.11.0
!pip install bitsandbytes==0.48.2
!pip install peft==0.18.0
!pip install evaluate==0.4.6
!pip install jiwer==4.0.0
!pip install tensorboard==2.19.0
!pip install gradio==5.49.1

In [None]:
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install torchcodec

# **PyTorch GPU Setup and Memory Management**

In [1]:
import os


os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
import torch
import gc

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

def clear_gpu():
    torch.clear_autocast_cache()
    torch.cuda.ipc_collect()
    torch.cuda.empty_cache()
    gc.collect()

if device=="gpu":
    clear_gpu()

Device: cuda


In [4]:
import multiprocessing

def optimal_workers():
    num_cpus = multiprocessing.cpu_count()
    num_gpus = torch.cuda.device_count()
    optimal_value = min(num_cpus, num_gpus*4) if num_gpus else num_cpus - 1
    return optimal_value

num_cpus = optimal_workers()
num_cpus

4

# **Load Dataset**

In [5]:
from datasets import load_dataset, Audio

dataset = load_dataset("MightyStudent/Egyptian-ASR-MGB-3")
dataset = dataset.cast_column("audio", Audio(decode=False))
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 116
    })
})

In [6]:
dataset["train"][0]["sentence"]

'دلوقتي الموضوع simple خالص الخيوط تجميلية تقنية قوية جداً جداً هي بقالها فترة على فكرة موجودة بس حالياً هنا زاء الرقبة هنا ... هي هو الخيوط بتعمل الشكل اللي إحنا شايفينه ده الترهل اللي موجود ف الرقبة بيتشد بالشكل ده طبعاً واضح الخطين اللي كنتي بتتكلمي عليهم من شوية والخطين اللي تحت اللي بيبقوا المريونات اللي مديني الـ sad face )'

# **Loading Whisper V3 Components**

In [7]:
model_id = "openai/whisper-large-v3"
language="Arabic"
task="transcribe"

In [8]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor


feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)

tokenizer = WhisperTokenizer.from_pretrained(model_id,
                                             language=language,
                                             task=task)

processor = WhisperProcessor.from_pretrained(model_id,
                                             language=language,
                                             task=task)

2025-11-19 14:31:00.063105: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763562660.086307     771 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763562660.093307     771 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [9]:
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [10]:
del dataset

In [11]:
train_dataset = train_dataset.select(range(150))
test_dataset = test_dataset.select(range(5))

# **Preparing Audio Batches for Whisper**

In [12]:
import io
import soundfile as sf

def prepare_batch(batch):
    audio_arrays = []
    for audio in batch["audio"]:
        arr, sr = sf.read(io.BytesIO(audio["bytes"]))
        audio_arrays.append(arr)
    
    batch["input_features"] = processor(
        audio_arrays, sampling_rate=16000, return_tensors="pt"
    ).input_features

    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch


train_dataset = train_dataset.map(prepare_batch, batched=True, batch_size=64)
test_dataset = test_dataset.map(prepare_batch, batched=True, batch_size=64)

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

# **Quantization & LoRA Config Setup**

In [13]:
with_quantization_config = True
with_lora_peft = True

In [14]:
from transformers import WhisperForConditionalGeneration
from transformers import BitsAndBytesConfig


bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_use_double_quant=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.float16)

attn_implementation = "eager"

# **Load Whisper Model (with 4-bit QLoRA Support)**

In [15]:
model = WhisperForConditionalGeneration.from_pretrained(model_id,
                                                        quantization_config= bnb_config if with_quantization_config else None,
                                                        attn_implementation=attn_implementation,
                                                        device_map="auto")

model.generation_config.pad_token_id = tokenizer.pad_token_id

# **Find Target Linear Layers for LoRA Injection**

In [16]:
import bitsandbytes as bnb


def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit 
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if "lm_head" in lora_module_names: 
            lora_module_names.remove("lm_head")

    return list(lora_module_names)

# **Apply QLoRA PEFT to the Whisper Model**

In [17]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


if with_lora_peft:
    target_modules = find_all_linear_names(model)

    qlora_config = LoraConfig(r=128, 
                            lora_alpha=32,  
                            target_modules=target_modules,  
                            lora_dropout=0.05,  
                            bias="none")
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, qlora_config)
    model.print_trainable_parameters()



trainable params: 230,686,720 || all params: 1,774,177,280 || trainable%: 13.0025


# **Configure Whisper's Generation Settings**

In [18]:
model.generation_config.language = language
model.generation_config.task = task
model.generation_config.forced_decoder_ids = None

# **Custom Data Collator for Speech → Text (Seq2Seq)**

In [19]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [20]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor,
                                                     decoder_start_token_id=model.config.decoder_start_token_id)

# **Computing WER (Word Error Rate) for Whisper Predictions**

In [21]:
import evaluate

metric = evaluate.load("wer")

In [22]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# **Training Whisper with Seq2SeqTrainer**

In [23]:
from transformers import Seq2SeqTrainingArguments


output_dir = "Whisper-large-V3-EG"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=5,
    num_train_epochs=1,
    gradient_checkpointing=True,
    fp16=True,
    bf16=False,
    eval_strategy="steps",
    predict_with_generate=True,
    generation_max_length=1024,
    eval_accumulation_steps=1,
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,

)

In [24]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor
)

In [25]:
processor.save_pretrained(training_args.output_dir)

[]

In [26]:
trainer.train()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
10,0.7749,0.63506,40.996169
20,0.6541,0.628933,40.996169
30,0.6885,0.622592,40.996169
40,0.6663,0.617332,40.229885
50,0.8181,0.613239,40.229885
60,0.7335,0.610425,40.229885
70,0.7506,0.609047,39.08046


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


TrainOutput(global_step=75, training_loss=0.7211265945434571, metrics={'train_runtime': 1282.737, 'train_samples_per_second': 0.117, 'train_steps_per_second': 0.058, 'total_flos': 5.89350076416e+17, 'train_loss': 0.7211265945434571, 'epoch': 1.0})

# **Uploading Whisper Model & Processor to Hugging Face Hub**

In [None]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_token = user_secrets.get_secret("Whisper-EG")

In [None]:
HF_token

In [None]:
from huggingface_hub import login

login(token=HF_token)

In [None]:

model.save_pretrained("Whisper-large-V3-EG")
processor.save_pretrained("Whisper-large-V3-EG")

from huggingface_hub import login, create_repo

model.push_to_hub("KhaledHelmy/Whisper-large-V3-EG", private=True, token=HF_TOKEN)
processor.push_to_hub("KhaledHelmy/Whisper-large-V3-EG", private=True, token=HF_TOKEN)

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

model.save_pretrained("Whisper-large-V3-EG")
processor.save_pretrained("Whisper-large-V3-EG")


model.push_to_hub("Whisper-large-V3-EG", private=True)
processor.push_to_hub("Whisper-large-V3-EG", private=True)

# **Testing Fine-Tuned Model on Sample Audio**

In [27]:
processor = WhisperProcessor.from_pretrained("KhaledHelmy/Whisper-large-V3-EG")
model = WhisperForConditionalGeneration.from_pretrained("KhaledHelmy/Whisper-large-V3-EG")

for i in range(2):
    sample = test_dataset[i]
    audio_array, sr = sf.read(io.BytesIO(sample['audio']['bytes']))

    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features

    with torch.no_grad():
        predicted_ids = model.generate(inputs)
 
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
    print(f"Example {i+1}")
    print("Ground truth:", sample["sentence"])
    print("Prediction  :", transcription)
    print("-"*50)

adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/923M [00:00<?, ?B/s]

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


Example 1
Ground truth: إن تنزل تريزيجيه وتوسع الملعب بالشكل دوت مش في صالح المنتخب المصري وح وخصوصاً إن ما كانش فيه زيادة من تحت كوكا الوحيد بين أربع لاعيبة محمد صلاح بعيد على الطرف التاني ف تريزيجيه ما كانش لاقي دور الدور الوحيد كان بتاع رمضان صبحي صلحه كوبر ولكن ط ال ال الاصلاح كان متأخر جداً وأنا في رأيي يمكن للمهارة الفردية والامكانيات ورغبة لاعيبة المنتخب المصري لأحراز هدف هي اللي عدلت الكفة لسيت لا طريقة من كوبر
Prediction  :  أن تنزل تريزيجي وتوسع الملعب بالشكل ده هو مش بصالح المنتخب المصري وخصوصا أنه ما كانش فيه زيادة من تحت كوكة الوحيد بين أربع لعيبة محمد صلاح بيد على الطرف الثاني فتريزيجي ما كانش لأي دور الدور الوحيد كان بتعرمض عن صفحة صلحوا كوكة الوحيد كان متأخر جدا وانا في رأيي يمكن للمهارة الفردية والإمكانيات ورغبة لعيبة المنتخب المصري لأحرز هدف هي اللي عدلت الكافة ليس لا طريقة من كوكة
--------------------------------------------------
Example 2
Ground truth: لفات كمان طالعة موضة دلوقتي أشكال كتير جداً منها هتليق عليها ولا لا إحنا النهارده هنحاول شوية نساعدكوا تختاروا الل