# **Fine Tuned Model : Access On Hugging Face Model Hub [Click](https://huggingface.co/Praveendecode/finetuned-whishper-small-marathi)**

# **GPU Info Check**

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

#Script: https://github.com/praveendecode/Voice_AI/blob/main/Source/Fine_Tuning_Whisper_OpenAI_Small.ipynb
print("An example flow is in comments")
# his code checks if the environment is connected to a GPU by using the nvidia-smi command.

# If the command fails or contains the word "failed," it prints "Not connected to a GPU." Otherwise, it prints the GPU informatio

Wed Apr 30 14:56:07 2025       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.57       Driver Version: 515.57       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:02:00.0 Off |                  N/A |
| 22%   28C    P8    15W / 250W |     18MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:03:00.0 Off |                  N/A |
| 22%   30C    P8    17W / 250W |      1MiB / 11264MiB |      0%      Default |
|       

In [2]:
import os ,torch

if torch.cuda.is_available() :
  os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" #it was prviously the 0 for one GPU 
  print('yes')
  print(f"No of GPUs Available:  {torch.cuda.device_count()}")

# This code checks if a GPU is available. If it is, it sets the CUDA_VISIBLE_DEVICES environment variable to "0" and prints "yes."
# This is a way to specify which GPU to use when multiple GPUs are available

yes
No of GPUs Available:  4


In [3]:
import warnings as w
w.filterwarnings('ignore')

## Load Dataset

In [4]:
#Again
import librosa
# Function to load audio
def load_audio_file(file_path):
    # Load audio using librosa
    try:
        audio, sr = librosa.load(file_path, sr=None)  # sr=None preserves the original sampling rate
        return audio, sr
    except Exception as e:
        print(f"Error loading audio file {file_path}: {e}")
        return None, None

# Function to load transcription from the .tsv file
def load_transcription(file_path):
    # Load the transcription from the tsv file
    if not os.path.exists(file_path):
        print(f"Warning: transcription file does not exist: {file_path}")
        return []
   
    with open(file_path, 'r') as f:
        transcriptions = f.readlines()
   
    transcription_data = []
    for line in transcriptions:
        parts = line.strip().split('\t')  # Assuming the tsv is tab-delimited
        if len(parts) == 2:  # Ensure there are exactly two parts: ID and sentence
            id_, sentence = parts
            transcription_data.append({
                "id": id_,
                "sentence": sentence
            })
        else:
            print(f"Warning: Skipping line in transcription file (invalid format): {line}")
   
    return transcription_data

# Function to create the example dictionary for each speaker
def create_example_dict(speaker_dir):
    # Example of the speaker_dir: /train/speaker1 or /test/speaker1
    speaker_name = os.path.basename(speaker_dir)
    print(f"Processing speaker: {speaker_name}")
   
    # Paths to transcription and audio files
    transcription_file = os.path.join(speaker_dir, f"{speaker_name}.tsv")
    audio_dir = os.path.join(speaker_dir, speaker_name)
   
    # Load transcriptions
    transcriptions = load_transcription(transcription_file)
    if not transcriptions:
        print(f"Warning: No transcriptions found for {speaker_name}")
   
    examples = []
    for transcription in transcriptions:
        # Use the ID from the transcription as the filename for the corresponding .wav file
        audio_file = os.path.join(audio_dir, f"{transcription['sentence']}.wav")  # Match audio file using transcription ID
        print(f"Looking for audio file: {audio_file}")
       
        if os.path.exists(audio_file):
            audio, sr = load_audio_file(audio_file)
            if audio is not None:
                examples.append({
                    "audio": audio,
                    "id": transcription['sentence'],  # Correctly assign sentence here
                    "speaker": speaker_name,
                    "path": audio_file,
                    "sentence": transcription['id'],  # ID corresponds to the .wav filename
                })
            else:
                print(f"Error loading audio: {audio_file}")
        else:
            print(f"Audio file not found: {audio_file}")
   
    return examples

# Function to load the dataset from train and test directories
def load_dataset_from_directories(base_dir):
    """
    Split the dataset into train and test using a speaker-based split.
    """
    # Get list of speaker directories in train and test
    data_dict = {"train": [], "test": []}
   
    for split in ['train', 'test']:
        split_dir = os.path.join(base_dir, split)
        if not os.path.isdir(split_dir):
            print(f"Warning: {split_dir} does not exist")
            continue
       
        # Iterate over speaker directories
        for speaker_dir in os.listdir(split_dir):
            speaker_path = os.path.join(split_dir, speaker_dir)
            if os.path.isdir(speaker_path):  # Only consider directories (speakers)
                print(f"Processing {speaker_path}....")

                # Create dataset entries for this speaker
                examples = create_example_dict(speaker_path)
                if examples:
                    data_dict[split].extend(examples)
                else:
                    print(f"No valid examples found for speaker {speaker_path}")
   
    # Convert to Hugging Face Dataset
    if data_dict['train'] and data_dict['test']:
        train_dataset = Dataset.from_list(data_dict['train'])
        test_dataset = Dataset.from_list(data_dict['test'])
        return DatasetDict({
            "train": train_dataset,
            "test": test_dataset
        })
    else:
        print("No data loaded for train or test splits.")
        return None
# Example usage
from datasets import Dataset, DatasetDict
base_dir = '/data3/sharif/Datasets/BaltiSpeechDataset'  # Directory where train/test dirs are located
dataset = load_dataset_from_directories(base_dir)

# Example of what you get in the dataset
if dataset:
    print("Dataset loaded successfully!")
    print(dataset)
else:
    print("Dataset is empty.")

Processing /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi....
Processing speaker: Nabi
Looking for audio file: /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/ID.wav
Audio file not found: /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/ID.wav
Looking for audio file: /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav
Looking for audio file: /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/157nabi.wav
Looking for audio file: /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/158nabi.wav
Looking for audio file: /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/159nabi.wav
Looking for audio file: /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/160nabi.wav
Looking for audio file: /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/161nabi.wav
Looking for audio file: /data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/162nabi.wav
Looking for audio file: /data3/sharif/Datasets/BaltiSpeechDataset/train/N

In [5]:
common_voice = dataset

In [6]:
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'id', 'speaker', 'path', 'sentence'],
        num_rows: 8120
    })
    test: Dataset({
        features: ['audio', 'id', 'speaker', 'path', 'sentence'],
        num_rows: 894
    })
})


In [7]:
print(f"Train-set Len:  {len(common_voice['train'])}")
print(f"Test -set Len:  {len(common_voice['test'])}")

print("Before preprocessing:",common_voice['train']['sentence'][120]) 


Train-set Len:  8120
Test -set Len:  894
Before preprocessing: غبول


In [8]:
from Dataset_Preprocessing import preprocess_dataset

common_voice = preprocess_dataset(common_voice)


DatasetDict({
    train: Dataset({
        features: ['audio', 'id', 'speaker', 'path', 'sentence'],
        num_rows: 8120
    })
    test: Dataset({
        features: ['audio', 'id', 'speaker', 'path', 'sentence'],
        num_rows: 894
    })
})


Filter: 100%|██████████| 8120/8120 [06:05<00:00, 22.20 examples/s]
Filter: 100%|██████████| 894/894 [00:35<00:00, 25.28 examples/s]
Map: 100%|██████████| 8120/8120 [00:08<00:00, 926.92 examples/s] 
Map: 100%|██████████| 894/894 [00:00<00:00, 1562.64 examples/s]
Map: 100%|██████████| 8120/8120 [00:07<00:00, 1060.76 examples/s]
Map: 100%|██████████| 894/894 [00:00<00:00, 1922.59 examples/s]
Map: 100%|██████████| 8120/8120 [00:07<00:00, 1057.62 examples/s]
Map: 100%|██████████| 894/894 [00:00<00:00, 1876.80 examples/s]
Map: 100%|██████████| 8120/8120 [00:07<00:00, 1060.28 examples/s]
Map: 100%|██████████| 894/894 [00:00<00:00, 1494.50 examples/s]


In [9]:
print("After preprocessing :",common_voice['train']['sentence'][120]) 


After preprocessing : غبول


In [10]:
print("sentence after preprocessing:",common_voice['train']['sentence'][55]) 
print("id after preprocessing:",common_voice['train']['id'][55])
print("speaker after preprocessing:",common_voice['train']['speaker'][55]) 
print("path after preprocessing:",common_voice['train']['path'][55])

sentence after preprocessing: اشی
id after preprocessing: 48
speaker after preprocessing: Ishrat Abbas
path after preprocessing: /data3/sharif/Datasets/BaltiSpeechDataset/train/Ishrat Abbas/Ishrat Abbas/48.wav


In [11]:
# Load the PT Whisper model 

model_name_or_path = "/data3/sharif/Datasets/openai_whisper_tiny"   #whisper_tiny
language = "Urdu"
language_abbr = "ur"
task = "transcribe"

In [12]:
# from transformers import WhisperTokenizer
# from datasets import Audio, load_dataset,DatasetDict

# print("==================== 🎵 Pre-Audio 🎵 ===========================")
# tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)

# input_str = common_voice["train"][0]["sentence"] 
# labels = tokenizer(input_str).input_ids
# decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
# decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

# print(f"Input:                 {input_str}")
# print(f"Decoded w/ special:    {decoded_with_special}")
# print(f"Decoded w/out special: {decoded_str}")
# print(f"Are equal:             {input_str == decoded_str}")

# common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
# print(common_voice["train"][0])

In [13]:
print("sentence After preprocessing:",common_voice['train']['sentence'][55]) 
print("id After preprocessing:",common_voice['train']['id'][55])
print("speaker After preprocessing:",common_voice['train']['speaker'][55]) 
print("path After preprocessing:",common_voice['train']['path'][55])

sentence After preprocessing: اشی
id After preprocessing: 48
speaker After preprocessing: Ishrat Abbas
path After preprocessing: /data3/sharif/Datasets/BaltiSpeechDataset/train/Ishrat Abbas/Ishrat Abbas/48.wav


In [14]:
common_voice["test"] = common_voice["test"].remove_columns(['speaker','audio'])
common_voice["train"] = common_voice["train"].remove_columns(['speaker','audio'])


In [15]:
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['id', 'path', 'sentence'],
        num_rows: 8120
    })
    test: Dataset({
        features: ['id', 'path', 'sentence'],
        num_rows: 894
    })
})


In [16]:
# model_name_or_path = "/data3/sharif/Datasets/openai_whisper_tiny"   #whisper_tiny
# model_name_or_path = "/data3/sharif/Datasets/openai_whisper_base"   #whisper_Base
model_name_or_path = "/data3/sharif/Datasets/openai_whisper_tiny"   #whisper_tiny
# model_name_or_path = "/data3/sharif/Datasets/openai_whisper-medium"   #whisper_Medium

language = "Urdu"
language_abbr = "ur"
task = "transcribe"

# dataset_name = "/data3/sharif/Datasets/Urdu_Kathbath_Data/ur"  # Update with your actual dataset path

train_audio_dir = os.path.join(base_dir, 'train')  # Path to train audio folder
test_audio_dir = os.path.join(base_dir, 'test')  # Path to test audio folder

print(task +" "+ language +"Finetuning the whisper small: " +model_name_or_path)
print("Dataset Used is : " +base_dir)

transcribe UrduFinetuning the whisper small: /data3/sharif/Datasets/openai_whisper_tiny
Dataset Used is : /data3/sharif/Datasets/BaltiSpeechDataset


In [17]:
print(f"Train-set Len:  {len(common_voice['train'])}")
print(f"Test -set Len:  {len(common_voice['test'])}")

Train-set Len:  8120
Test -set Len:  894


In [18]:
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['id', 'path', 'sentence'],
        num_rows: 8120
    })
    test: Dataset({
        features: ['id', 'path', 'sentence'],
        num_rows: 894
    })
})


In [19]:
print(common_voice["train"][0])

{'id': '156nabi', 'path': '/data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav', 'sentence': 'کوشو'}


# **Prepare Feature Extractor , Tokenizer , Data**

* A feature extractor which pre-processes the raw audio-inputs
* The model which performs the sequence-to-sequence mapping
* A tokenizer which post-processes the model outputs to text format

### **Feature Extractor**


In [20]:
from transformers import WhisperFeatureExtractor
import torchaudio

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)
feature_extractor

WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

### **Tokenizer**

In [21]:
from transformers import WhisperTokenizer
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


WhisperTokenizer(name_or_path='/data3/sharif/Datasets/openai_whisper_tiny', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscript|>', '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>', '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>', '<|he|>', '<|uk|>', '<|el|>', '<|ms|>', '<|cs|>', '<|ro|>', '<|da|>', '<|hu|>', '<|ta|>', '<|no|>', '<|th|>', '<|ur|>', '<|hr|>', '<|bg|>', '<|lt|>', '<|la|>', '<|mi|>', '<|ml|>', '<|cy|>', '<|sk|>', '<|te|>', '<|fa|>', '<|lv|>', '<|bn|>', '<|sr|>', '<|az|>', '<|sl|>', '<|kn|>', '<|et|>', '<|mk|>', '<|br|>', '<|eu|>', '<|is|>', '<|hy|>', '<|ne|>', '<|mn|>', '<|bs|>', '<|kk|>', '<|sq|>', '<|sw|>', '<|gl|>', '<|



### **Processor**

In [22]:
from transformers import WhisperProcessor
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)
processor

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


WhisperProcessor:
- feature_extractor: WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: WhisperTokenizer(name_or_path='/data3/sharif/Datasets/openai_whisper_tiny', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscript|>', '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>', '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>

**Load the Audio File:**

In [23]:
print(common_voice["train"][0])

{'id': '156nabi', 'path': '/data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav', 'sentence': 'کوشو'}


In [24]:
print("Train Samples:",common_voice['train']['path'][54]) 
# print("Train Samples:",common_voice['train']['audio'][55]) 
# print("train Samples:",common_voice['train']['sentence'][55]) 


Train Samples: /data3/sharif/Datasets/BaltiSpeechDataset/train/Ishrat Abbas/Ishrat Abbas/47.wav


In [25]:
import os
# Define the base dataset directory
dataset_name = base_dir

# Helper function to trace audio path from the subfolders
def get_audio_path(row, base_dir=dataset_name):
    return row['path']

# Apply the function to create the new 'audio' column for both train and test datasets
common_voice["train"] = common_voice["train"].add_column("audio", [get_audio_path(row) for row in common_voice["train"]])
common_voice["test"] = common_voice["test"].add_column("audio", [get_audio_path(row) for row in common_voice["test"]])

# Print a sample from the updated dataset to verify
print(common_voice["train"][0])

{'id': '156nabi', 'path': '/data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav', 'sentence': 'کوشو', 'audio': '/data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav'}


In [26]:
common_voice["train"][0]["path"]

'/data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav'

In [27]:
common_voice["train"][0]["sentence"]

'کوشو'

In [28]:
common_voice["train"][0]["audio"]

'/data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav'

In [29]:
from datasets import Audio

print("==================== 🎵 Pre-Audio 🎵 ===========================")

input_str = common_voice["train"][0]["sentence"] 
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))


Input:                 کوشو
Decoded w/ special:    <|startoftranscript|><|ur|><|transcribe|><|notimestamps|>کوشو<|endoftext|>
Decoded w/out special: کوشو
Are equal:             True


In [30]:
from transformers import WhisperFeatureExtractor
import torchaudio
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)
feature_extractor 

WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [31]:
from transformers import WhisperTokenizer
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


WhisperTokenizer(name_or_path='/data3/sharif/Datasets/openai_whisper_tiny', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscript|>', '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>', '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>', '<|he|>', '<|uk|>', '<|el|>', '<|ms|>', '<|cs|>', '<|ro|>', '<|da|>', '<|hu|>', '<|ta|>', '<|no|>', '<|th|>', '<|ur|>', '<|hr|>', '<|bg|>', '<|lt|>', '<|la|>', '<|mi|>', '<|ml|>', '<|cy|>', '<|sk|>', '<|te|>', '<|fa|>', '<|lv|>', '<|bn|>', '<|sr|>', '<|az|>', '<|sl|>', '<|kn|>', '<|et|>', '<|mk|>', '<|br|>', '<|eu|>', '<|is|>', '<|hy|>', '<|ne|>', '<|mn|>', '<|bs|>', '<|kk|>', '<|sq|>', '<|sw|>', '<|gl|>', '<|

In [32]:
from transformers import WhisperProcessor
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)
processor

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


WhisperProcessor:
- feature_extractor: WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: WhisperTokenizer(name_or_path='/data3/sharif/Datasets/openai_whisper_tiny', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscript|>', '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>', '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>

In [33]:
import librosa

audio_path =  common_voice['train'][0]['audio']['path']
print(audio_path)
raw_audio, sampling_rate  = librosa.load(audio_path, sr=16000)

/data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav


In [34]:
raw_audio, sampling_rate 

(array([0.        , 0.        , 0.        , ..., 0.00066827, 0.00061445,
        0.00070939], dtype=float32),
 16000)

In [35]:
inputs = processor(raw_audio, sampling_rate=sampling_rate, return_tensors="pt")

inputs

{'input_features': tensor([[[-0.6813, -0.6813, -0.6813,  ..., -0.6813, -0.6813, -0.6813],
         [-0.6813, -0.6813, -0.6813,  ..., -0.6813, -0.6813, -0.6813],
         [-0.6813, -0.6813, -0.6813,  ..., -0.6813, -0.6813, -0.6813],
         ...,
         [-0.6813, -0.6813, -0.6813,  ..., -0.6813, -0.6813, -0.6813],
         [-0.6813, -0.6813, -0.6813,  ..., -0.6813, -0.6813, -0.6813],
         [-0.6813, -0.6813, -0.6813,  ..., -0.6813, -0.6813, -0.6813]]])}

In [36]:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path)

In [37]:
print("Before preprocessing:",common_voice['train']['path'][151]) 
print("Before preprocessing:",common_voice['train']['sentence'][151]) 

Before preprocessing: /data3/sharif/Datasets/BaltiSpeechDataset/train/Ishrat Abbas/Ishrat Abbas/144.wav
Before preprocessing: ہرتہ


In [38]:
# Transcribe the audio using the model
with torch.no_grad():
    placeholder_token = torch.tensor([[tokenizer.pad_token_id]])  # Use a placeholder token
    logits = model(input_features=inputs['input_features'], decoder_input_ids=placeholder_token).logits
logits

tensor([[[20.9845, 22.1163, 23.4996,  ..., 17.0799, 16.1717, 16.5442]]])

In [39]:
# original text

common_voice['train'][0]['sentence']

'کوشو'

In [40]:
common_voice['train'][0]['path']

'/data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav'

In [41]:
common_voice["train"]

Dataset({
    features: ['id', 'path', 'sentence', 'audio'],
    num_rows: 8120
})

In [42]:
common_voice["test"]

Dataset({
    features: ['id', 'path', 'sentence', 'audio'],
    num_rows: 894
})

In [43]:
common_voice['train'][0]['audio']

{'path': '/data3/sharif/Datasets/BaltiSpeechDataset/train/Nabi/Nabi/156nabi.wav',
 'array': array([0.        , 0.        , 0.        , ..., 0.00066827, 0.00061445,
        0.00070939]),
 'sampling_rate': 16000}

### **Prepare Dataset**

`Now we can write a function to prepare our data ready for the model:`

* We load and resample the audio data by calling batch["audio"]
* Use the feature extractor to compute the log-Mel spectrogram input features from our 1-dimensional audio array.
* We encode the transcriptions to label ids through the use of the tokenizer.

In [44]:
from Dataset_Preprocessing import preprocess_dataset
def prepare_dataset(batch):

    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [45]:
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=1)

Map: 100%|██████████| 8120/8120 [01:49<00:00, 73.83 examples/s] 
Map: 100%|██████████| 894/894 [00:11<00:00, 77.61 examples/s] 


In [46]:
common_voice

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 8120
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 894
    })
})

In [47]:
print('After preparation :\n')

common_voice['train'][0]

After preparation :



{'input_features': [[-0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.5876158475875854,
   -0.6812796592712402,
   -0.5920029878616333,
   -0.6161938905715942,
   -0.6438862085342407,
   -0.6072126626968384,
   -0.5688134431838989,
   -0.6506913900375366,
   -0.5946283340454102,
   -0.6812796592712402,
   -0.6662133932113647,
   -0.5240029096603394,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6642025709152222,
   -0.6812796592712402,
   -0.6812796592712402,
   -0.6534663438796997,
   -0.5913763046264648,
   -0.27070462703704834,
   -0.03900027275085449,
   -0.022606968879699707,
   -0.1575857400894165,
   -0.20522332191467285,
   -0.15457475185394287,
   -0.1705249547

# **Training And Evaluation**
### **Data Collator**

In [48]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [49]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

**Compute Metrics**


In [50]:
import evaluate

metric = evaluate.load("wer")

In [51]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

### **Load a Pre-Trained Checkpoint**

In [52]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [53]:
model.config.use_cache = False
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [54]:
model.get_memory_footprint()

59006208

In [55]:
total_params = sum(p.numel() for p in model.parameters())

print(f"Total parameters in the model: {total_params/1000000} M")
# for param in model.parameters():    #to check the grad_fn ERR
#     param.requires_grad = True

Total parameters in the model: 37.76064 M


In [56]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="auto"  # This will split the model across GPUs
)

model.gradient_checkpointing_enable()


In [57]:
# from peft import get_peft_model
# import torch
# #model = get_peft_model(model, LoraConfig)
# # model = model.half()

# for module in model.modules():
#     if isinstance(module, torch.nn.Linear):
#         module.qconfig = torch.quantization.get_default_qconfig("fbgemm")

In [58]:
# from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
# # config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")

# model = get_peft_model(model, config)
# model.print_trainable_parameters()

**Dummy Model For Testing**

# **Training And Evaluation**

### **Training Arguments**

In [59]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-full-finetuned-Tiny",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=1e-5,
    warmup_steps=500,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_dir="./logs",
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    save_total_limit=2,
    predict_with_generate=True,
    generation_max_length=128,
    num_train_epochs=3,
    fp16=True,  # Use fp16 if on CUDA and supported
    gradient_checkpointing=True,  # Optional: to reduce memory
    push_to_hub=False,
    report_to="none",  # or "tensorboard", "wandb", etc.
)


In [62]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,  # your Whisper-specific collator
    tokenizer=processor.feature_extractor,  # or processor.tokenizer
)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [63]:
import time

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

start = time.time()
trainer.train()
end = time.time()
print(f"Whisper Training Time: {end - start:.2f} seconds")
print(f"Whisper Training Time: {end - start}:.2f")

Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!

In [None]:
processor.save_pretrained(training_args.output_dir)

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import torch
from jiwer import cer as jiwer_cer
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

print("================= Optimized Evaluation Loop ======================")
normalizer = BasicTextNormalizer()

# Optimized DataLoader configuration
eval_dataloader = DataLoader(
    common_voice["test"],
    batch_size=16,  # Increased from 8 for faster processing
    collate_fn=data_collator,
    num_workers=4,  # Parallel data loading
    pin_memory=True  # Faster data transfers to GPU
)

# Initialize storage for metrics
list_pred = []
list_label = []
normalized_predictions = []
normalized_references = []

model.eval()
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            # Optimized generation with single beam search
            generated_tokens = model.generate(
                input_features=batch["input_features"].to("cuda", non_blocking=True),
                decoder_input_ids=batch["labels"][:, :4].to("cuda", non_blocking=True),
                max_new_tokens=255,
                num_beams=1  # Faster decoding with minimal accuracy trade-off
            )
            
            # Post-processing
            labels = batch["labels"].cpu().numpy()
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(generated_tokens.cpu().numpy(), skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            
            # Store results
            list_pred.extend(decoded_preds)
            list_label.extend(decoded_labels)
            normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
            normalized_references.extend([normalizer(label).strip() for label in decoded_labels])
            
            metric.add_batch(
                predictions=decoded_preds,
                references=decoded_labels,
            )
    
    # Memory cleanup
    del generated_tokens, labels, batch
    torch.cuda.empty_cache()

# Calculate final metrics
wer = 100 * metric.compute()
cer = 100 * jiwer_cer(list_label, list_pred)
normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)
normalized_cer = 100 * jiwer_cer(normalized_references, normalized_predictions)

print(f"\nEvaluation Results:")
print(f"WER: {wer:.2f}%")
print(f"CER: {cer:.2f}%")
print(f"Normalized WER: {normalized_wer:.2f}%")
print(f"Normalized CER: {normalized_cer:.2f}%")

In [None]:
# Display a few samples of predictions and references
num_samples_to_display = 17  # Change this number as needed

print("\nSample Predictions and References:\n")

for i in range(min(num_samples_to_display, len(list_pred))):
    print(f"Reference : {list_label[i]}")
    print(f"Prediction: {list_pred[i]}")
    # print("-" * 70)

In [None]:
print(task +" "+ language +"Finetuning the whisper small: " +model_name_or_path)
print("Dataset Used is CommonVoice Urdu: " +dataset_name)
wer_rounded =round(wer,2)
cer_rounded =round(cer,2)
norm_wer_rounded =round(normalized_wer,2)
norm_cer_rounded =round(normalized_cer,2)
print()
print(f"{wer_rounded}")
print(f"{cer_rounded}")
print(f"{norm_wer_rounded}")
print(f"{norm_cer_rounded}")
