# Some installations

In [None]:
!pip install datasets
!pip install jiwer



In [None]:
model_path = "openai/whisper-small"

# Preparing the dataset ( Egyptian dialect )

In [None]:
from datasets import load_dataset
dataset = load_dataset("MightyStudent/Egyptian-ASR-MGB-3",
                        split="train",
                        trust_remote_code = True,
                        streaming = True
                       )
print(dataset)

IterableDataset({
    features: ['audio', 'sentence'],
    num_shards: 2
})


In [None]:
dataset.info

DatasetInfo(description='', citation='', homepage='', license='', features={'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='egyptian-asr-mgb-3', config_name='default', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=955602824, num_examples=1159, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=920687996, post_processing_size=None, dataset_size=955602824, size_in_bytes=None)

In [None]:
train_dataset = dataset.take(1000)
test_dataset = dataset.take(100)

train_dataset.info

DatasetInfo(description='', citation='', homepage='', license='', features={'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='egyptian-asr-mgb-3', config_name='default', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=955602824, num_examples=1159, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=920687996, post_processing_size=None, dataset_size=955602824, size_in_bytes=None)

# Testing Whisper-medium without fine tuning

In [None]:
# import torch
# from transformers import pipeline

# pipe = pipeline("automatic-speech-recognition", model = model_path)

# from jiwer import wer

# def translate(audio):
#     outputs = pipe(audio, generate_kwargs={
#             "task": "translate",
#             "language": "ar",
#             "max_new_tokens": 256
#         })
#     return outputs["text"]

# def transcribe(audio):
#     outputs = pipe(audio, generate_kwargs={
#             "task": "transcribe",
#             "language": "ar",
#             "max_new_tokens": 256
#         })
#     return outputs["text"]

# def compare_transcriptions(whisper_transcription, ground_truth):
#     print("Whisper Transcription:", whisper_transcription)
#     print("Ground -------- Truth:", ground_truth)
#     error_rate = wer(ground_truth, whisper_transcription)
#     print(f"Word Error Rate: {error_rate * 100}%")


# # Create an iterator to stream the dataset
# dataset_iter = iter(test_dataset)

# sample = next(dataset_iter)
# whisper_transcription = transcribe(sample["audio"].copy())
# ground_truth = sample.get("sentence", "No transcription available")
# compare_transcriptions(whisper_transcription, ground_truth)



# The error rate without fine tuning is 32%. let's try with fine tuning



# Fine tuning

In [None]:
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

In [None]:
from huggingface_hub import login

login(token="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")

In [None]:
dataset_iter = iter(train_dataset)
print(next(dataset_iter)['audio'])

{'path': 'comedy_78_first_12min_part_0.wav', 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.26385498,
       -0.27554321, -0.29098511]), 'sampling_rate': 16000}


In [None]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained(model_path, language="Arabic", task="transcribe")


In [None]:
def prepare_dataset(batch):
    # Since 'audio' is a list of audio samples in a batch,
    # we process each sample individually.
    input_features = []
    labels = []

    for audio in batch["audio"]:
        # compute log-Mel input features from input audio array
        input_features.append(processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0])

    # encode target text to label ids  (Outside the loop as sentence is directly accessible)
    labels = processor.tokenizer(batch["sentence"]).input_ids

    batch["input_features"] = input_features
    batch["labels"] = labels
    return batch

In [None]:

train_dataset = train_dataset.map(
    prepare_dataset,
    remove_columns=train_dataset.column_names,  # Remove raw data after processing
    batched=True,
    batch_size=32,
    # batch_writer_size=50,

    )

test_dataset = test_dataset.map(
    prepare_dataset,
    remove_columns=test_dataset.column_names,
    batched=True,
    batch_size=8,
)

In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(model_path)

model.generation_config.language = "arabic"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None


In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [None]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


In [None]:
!pip install evaluate
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}




In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./Gemyu/Whisper_EG_dialect",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    disable_tqdm=False,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=50,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)


In [None]:
from transformers import Seq2SeqTrainer,ProgressCallback

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor,
)

trainer.add_callback(ProgressCallback)
trainer.train()

  0%|          | 0/1000 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,Wer
100,0.9771,1.137155,80.86318
200,0.7052,0.812497,69.86363
300,0.4916,0.618867,59.913083
400,0.3305,0.373892,38.16874
500,0.1662,0.272844,32.429192
600,0.1234,0.182906,24.846396
700,0.07,0.086665,13.067586
800,0.0404,0.082111,14.04166


{'loss': 1.1089, 'grad_norm': 8.783940315246582, 'learning_rate': 9.200000000000001e-07, 'epoch': 0.05}
{'loss': 0.9771, 'grad_norm': 7.961747169494629, 'learning_rate': 1.9200000000000003e-06, 'epoch': 1.04}
{'eval_loss': 1.1371546983718872, 'eval_wer': 80.86317997901993, 'eval_runtime': 109.8975, 'eval_samples_per_second': 0.91, 'eval_steps_per_second': 0.118, 'epoch': 1.04}




{'loss': 0.8349, 'grad_norm': 5.514706611633301, 'learning_rate': 2.92e-06, 'epoch': 2.02}
{'loss': 0.7052, 'grad_norm': 5.644146919250488, 'learning_rate': 3.920000000000001e-06, 'epoch': 3.01}
{'eval_loss': 0.8124969005584717, 'eval_wer': 69.86362955192567, 'eval_runtime': 97.5546, 'eval_samples_per_second': 1.025, 'eval_steps_per_second': 0.133, 'epoch': 3.01}
{'loss': 0.5719, 'grad_norm': 6.128654956817627, 'learning_rate': 4.92e-06, 'epoch': 3.06}
{'loss': 0.4916, 'grad_norm': 4.33698034286499, 'learning_rate': 5.92e-06, 'epoch': 4.05}
{'eval_loss': 0.6188666224479675, 'eval_wer': 59.91308257155702, 'eval_runtime': 96.1283, 'eval_samples_per_second': 1.04, 'eval_steps_per_second': 0.135, 'epoch': 4.05}
{'loss': 0.4205, 'grad_norm': 4.141900539398193, 'learning_rate': 6.92e-06, 'epoch': 5.04}
{'loss': 0.3305, 'grad_norm': 4.3865437507629395, 'learning_rate': 7.92e-06, 'epoch': 6.02}
{'eval_loss': 0.3738919794559479, 'eval_wer': 38.16873969728758, 'eval_runtime': 95.5735, 'eval_samp

In [None]:
kwargs = {
    "dataset_tags": "MightyStudent/Egyptian-ASR-MGB-3",
    "dataset": "Egyptian-ASR",
    "dataset_args": "config: ar, split: train",
    "language": "ar",
    "model_name": "Whisper_EG_dialect",
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
}


In [None]:
trainer.push_to_hub(**kwargs)

# Testing and deploying the endpoint

In [8]:
# !pip install sounddevice
# !pip install wavio
# !pip install vocode
# !pip install librosa

from transformers import pipeline

from huggingface_hub import login

import soundfile as sf  # For handling audio files

import librosa

login(token="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")

pipe = pipeline("automatic-speech-recognition", model = "Gemyu/Whisper_EG_dialect")


def test_transcription(audio_path):
    # Load audio file (make sure it's in 16kHz mono format)
    audio_input, sample_rate = librosa.load(audio_path, mono=True, sr=16000) # Load audio, convert to mono, and resample to 16kHz

      # Remove forced_decoder_ids from generation_config
    pipe.model.generation_config.forced_decoder_ids = None
    # Perform transcription
    result = pipe({"raw": audio_input, "sampling_rate": sample_rate})
    return result["text"]

test_transcription("/content/Recording (4).wav")

Device set to use cpu


' I am a model, I am good at work I am a magician and magician, I am a'