In [2]:
# !pip install "datasets>=2.6.1"
# !pip install git+https://github.com/huggingface/transformers
# !pip install librosa
# !pip install "evaluate>=0.3.0"
# !pip install jiwer
# !pip install gradio
# !pip install -q bitsandbytes datasets accelerate
# !pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main

In [3]:
# Authenticate against the Hugging Face Hub. The training loop further down
# calls `model.push_to_hub(...)`, which presumably needs a token with write
# access -- TODO confirm the required scope.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
# Select CUDA device index.
# NOTE: this has to run before anything initialises CUDA, otherwise the
# setting is presumably ignored -- keep this cell first after a kernel restart.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Base checkpoint used for the feature extractor, tokenizer, processor and model.
model_name_or_path = "openai/whisper-small"
# `language` is passed to the Whisper tokenizer/processor, while `language_abbr`
# selects the Common Voice configuration passed to `load_dataset`.
# NOTE(review): the two do not name the same language ("Hindi" vs. "or").
# Presumably Hindi is a deliberate stand-in because the Whisper tokenizer has
# no language token for the "or" locale -- confirm this is intended.
language = "Hindi"
language_abbr = "or"
task = "transcribe"
dataset_name = "mozilla-foundation/common_voice_17_0"

In [2]:
from datasets import Audio, DatasetDict, load_dataset

common_voice = DatasetDict()

# NOTE(review): trust_remote_code=True executes the dataset's loading script
# downloaded from the Hub; only keep it for sources you trust.
common_voice["train"] = load_dataset(dataset_name, language_abbr, split="train+validation", trust_remote_code=True)
common_voice["test"] = load_dataset(dataset_name, language_abbr, split="test", trust_remote_code=True)

# Keep only the columns used downstream. Deriving the drop list from the
# dataset itself (instead of a hard-coded list of metadata columns) keeps this
# cell working when a Common Voice release adds or removes a metadata column.
columns_to_keep = {"audio", "sentence"}
columns_to_drop = [
    column for column in common_voice["train"].column_names if column not in columns_to_keep
]
common_voice = common_voice.remove_columns(columns_to_drop)

print(common_voice)

# Re-declare the audio column at 16 kHz, the rate that is also passed to the
# feature extractor when individual examples are built.
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 2048
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 696
    })
})


In [2]:
from transformers import (
    BitsAndBytesConfig,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
)

# Audio front-end for the chosen checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)

# Tokenizer and combined processor, both configured with the same language/task.
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

# Base model; a later cell rebinds `model` to a quantised / adapter-wrapped version.
model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path)

In [4]:
def find_longest_sentence_length(dataset):
  """Find the longest sentence (by character count) in a dataset.

  Args:
    dataset: An iterable of mapping-like items that each expose a
      ``"sentence"`` string (e.g. a Hugging Face dataset split).

  Returns:
    A ``(length, sentence)`` tuple for the longest sentence. On ties the
    first sentence encountered wins. For an empty dataset, or one whose
    sentences are all empty, ``(0, "")`` is returned.
  """

  longest_sentence_length = 0
  # Initialised up front so the return statement never hits an unbound name
  # when no sentence is longer than zero characters.
  long_sentence = ""
  for item in dataset:
    sentence = item["sentence"]
    sentence_length = len(sentence)
    if sentence_length > longest_sentence_length:
      longest_sentence_length = sentence_length
      long_sentence = sentence
  return longest_sentence_length, long_sentence

longest_sentence_length, long_sentence = find_longest_sentence_length(common_voice["train"])

# The longest sentence in characters is not necessarily the longest in tokens,
# and the test split may contain longer sentences than the training split.
# Because labels are later built with truncation=True, an underestimated
# max_len would silently cut transcriptions short, so measure the token length
# of every sentence in every split. Only the text column is read here.
max_len = max(
    (
        len(tokenizer(sentence)["input_ids"])
        for split in common_voice.values()
        for sentence in split["sentence"]
    ),
    default=0,
)

In [5]:
import torch
from datasets import load_dataset
from transformers import WhisperProcessor
import librosa

class CommonVoiceDataset(torch.utils.data.Dataset):
    """Map-style dataset producing Whisper inputs and labels from dataset rows.

    Each row of the wrapped dataset must provide ``row["audio"]["array"]``
    (the waveform) and ``row["sentence"]`` (the transcription).

    Args:
        processor: Object exposing ``feature_extractor`` and ``tokenizer``
            callables (e.g. a ``WhisperProcessor``).
        model: Object exposing ``config.decoder_start_token_id``.
        dataset: Indexable collection of rows (e.g. a Hugging Face dataset).
        max_len: Length the tokenized transcription is padded/truncated to.
    """

    def __init__(self, processor, model, dataset, max_len):
        self.processor = processor
        self.dataset = dataset
        self.max_len = max_len
        # Stored for reference only; it is not used when building labels.
        self.bos_token = model.config.decoder_start_token_id

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Build one example.

        Returns:
            A dict with ``"input_features"`` (first element of the feature
            extractor's batch output) and ``"labels"`` (token ids of length
            ``max_len - 1`` with padded positions set to -100).
        """
        item = self.dataset[idx]

        # Use the processor handed to this instance rather than whatever
        # global named `processor` happens to exist in the notebook.
        input_features = self.processor.feature_extractor(
            item["audio"]["array"], sampling_rate=16000, return_tensors="pt"
        ).input_features[0]

        # Tokenize the transcription to a fixed length.
        encoded = self.processor.tokenizer(
            item["sentence"],
            padding="max_length",
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
        )

        # Padded positions become -100 so that they are ignored by the loss.
        labels = encoded["input_ids"].masked_fill(encoded["attention_mask"].ne(1), -100)
        # Take the single sequence of the batch and drop its first token
        # (presumably the start token, which the model is expected to prepend
        # itself when deriving decoder inputs from labels -- TODO confirm).
        labels = labels[0][1:]

        return {
            "input_features": input_features,
            "labels": labels,
        }

In [6]:
# Wrap both splits with identical processor/model/max_len settings.
train_dataset, test_dataset = (
    CommonVoiceDataset(
        processor=processor,
        model=model,
        dataset=common_voice[split_name],
        max_len=max_len,
    )
    for split_name in ("train", "test")
)

In [7]:
# Batch the two datasets: training batches are reshuffled, evaluation keeps
# the dataset order. Batch size is 8 for both; adjust as memory allows.
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, shuffle=False, batch_size=8)

In [8]:
import evaluate
# Word error rate; `metric.compute(predictions=..., references=...)` is used
# by the evaluation cell to score decoded model output.
metric = evaluate.load("wer")

In [17]:
# Load the base checkpoint in 8-bit and attach a previously trained adapter.
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration,  BitsAndBytesConfig

# NOTE(review): the adapter id mentions "large-v2" while the base checkpoint is
# `model_name_or_path` ("openai/whisper-small" in the config cell) -- presumably
# just a naming leftover; confirm the adapter really belongs to this base model.
peft_model_id = "Mohan-diffuser/whisper-large-v2-odia-100steps"
model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path, device_map="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)


from peft import prepare_model_for_kbit_training

# NOTE(review): this preparation step and is_trainable=True below are
# presumably only needed when training continues from the adapter; for pure
# evaluation they may be unnecessary -- confirm.
model = prepare_model_for_kbit_training(model)

model = PeftModel.from_pretrained(model, peft_model_id,low_cpu_mem_usage=False,is_trainable=True)

# The string literal below is an expression statement acting as a comment.
'''Override generation arguments - no tokens are forced as decoder outputs (see forced_decoder_ids),
no tokens are suppressed during generation (see suppress_tokens):'''
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [19]:
# List every parameter that will receive gradients, together with its dtype.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.dtype)

base_model.model.model.encoder.layers.0.self_attn.v_proj.lora_A.default.weight torch.float32
base_model.model.model.encoder.layers.0.self_attn.v_proj.lora_B.default.weight torch.float32
base_model.model.model.encoder.layers.0.self_attn.q_proj.lora_A.default.weight torch.float32
base_model.model.model.encoder.layers.0.self_attn.q_proj.lora_B.default.weight torch.float32
base_model.model.model.encoder.layers.1.self_attn.v_proj.lora_A.default.weight torch.float32
base_model.model.model.encoder.layers.1.self_attn.v_proj.lora_B.default.weight torch.float32
base_model.model.model.encoder.layers.1.self_attn.q_proj.lora_A.default.weight torch.float32
base_model.model.model.encoder.layers.1.self_attn.q_proj.lora_B.default.weight torch.float32
base_model.model.model.encoder.layers.2.self_attn.v_proj.lora_A.default.weight torch.float32
base_model.model.model.encoder.layers.2.self_attn.v_proj.lora_B.default.weight torch.float32
base_model.model.model.encoder.layers.2.self_attn.q_proj.lora_A.defaul

In [41]:
# Evaluate the current `model` on `test_dataloader`: generate transcriptions,
# decode them, and report raw and normalised word error rate.
model.config.use_cache = True  # enabled for inference; the training cell switches it off again
import gc
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader  # not used in this cell
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Decoder prompt for the configured language/task, passed to generate() below.
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
normalizer = BasicTextNormalizer()

# Accumulators over the whole test set (raw and normalised text).
predictions = []
references = []
normalized_predictions = []
normalized_references = []

model.eval()
for step, batch in enumerate(tqdm(test_dataloader)):
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            generated_tokens = (
                model.generate(
                    input_features=batch["input_features"].to("cuda"),
                    forced_decoder_ids=forced_decoder_ids,
                    max_new_tokens=255,
                )
                .cpu()
                .numpy()
            )
            labels = batch["labels"].cpu().numpy()
            # -100 marks padded label positions (see CommonVoiceDataset); swap
            # it for the pad token id so the ids can be decoded.
            labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
            decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            # NOTE(review): references are decoded from the dataset labels, which
            # were padded/truncated to max_len -- a truncated label would also
            # truncate the reference text; confirm max_len covers the test split.
            decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
            predictions.extend(decoded_preds)
            references.extend(decoded_labels)
            normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
            normalized_references.extend([normalizer(label).strip() for label in decoded_labels])

            # break


        # Drop per-batch objects before the next iteration to keep memory down.
        del generated_tokens, labels, batch
    gc.collect()
# Scores are scaled by 100, presumably to report percentages.
wer = 100 * metric.compute(predictions=predictions, references=references)
normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)
eval_metrics = {"eval/wer": wer, "eval/normalized_wer": normalized_wer}

print(f"{wer=} and {normalized_wer=}")
print(eval_metrics)

  5%|▍         | 2/44 [01:15<26:16, 37.53s/it]


KeyboardInterrupt: 

In [42]:
# Display predictions and references of the most recently evaluated batch
# (these names are left over from the evaluation loop, so that cell must have run).
decoded_preds,decoded_labels

(['ସାତ!',
  'ମା ବୁଢ଼ିକାରଇ ଶୁକାଇ ନାଳା ।',
  'ନାଉରି କି ଶୋଇଯାଇଛି?',
  'ସମସ୍ତ ପ୍ରକାର ଦାଶ ତ ଉ ଦାସ ବେବ ସାୟ ନିଶିଧ ହେବ ।',
  'ଦୁମ୍ଦୁମ୍ପୁରରେ କନ୍ୟାଟି ଘର ଯୋଗା ହେଲାଣି ।',
  'ଶୁତୁ ଉଫୁନିକ ପିଲାଟା, ଅଲଗା ଇପୋପିଆ ହୋଇ ଖଣ ରହିପାରିବୁ?',
  'ଆରେ ମୁ ଚନ୍ଦ୍ର ଉଦିଆରେ, ମୁଁ ଘର ଅନ୍ଧାର କରି ଆସି ଚୁଅ, ଆରେ ମୁ କଳାମାଣିକରେ ।',
  'ସାଆନ୍ତେ ଆପେ କହନ୍ତି, ସେ କାହାରି ଗୋଟିଏ ପଇସା ଆଣନ୍ତି ନାହିଁ ।',
  'ନ୍ୟ ।',
  'ଶୁନ ।',
  'ନ!',
  'ଅବଶ୍ୟ ଅବଶ୍ୟ, ଏ କଥାଟା କଣ କହିବାକୁ ହେପ?',
  'ଆଗକୁ ମୁଁ ଯାଇ ପାରିଲି ଲାହିଁ ।',
  'ବାହା ହେଲା ହଁ, ଢେର ଦିନଯାଏ ସନ୍ତାନ ମୁଖ ଦେଖିଲା ନାହିଁ ।',
  'ଇଡ଼େ ବଡ଼ ଜଳ କ୍ର୍ତି ଯେ କରିପାରେ ଶେକି, ଆମମାନଙ୍କ ପରି ମନ୍ଲେଷ ।',
  'ପିତା ଶୁଣି ପବନ ଲାଗିଛି ।'],
 ['ସାତ',
  'ମାଆ ବୁଢ଼ୀ କରଇ ସୁକଇଲାଣ',
  'ନାଉରି କି ଶୋଇ ଯାଇଛି?',
  'ସମସ୍ତ ପ୍ରକାର ଦାସତ୍ତ୍ୱ ଓ ଦାସବ୍ୟବସାୟ ନିଷିଦ୍ଧ ହେବ ।',
  'ଦୁମଦୁମପୁରରେ କନ୍ୟାଟି ଘରଯୋଗା ହେଲାଣି ।',
  'ତୁ ଉଛୁଣିକା ପିଲାଟା, ଅଲଗା ଏକୁଟିଆ ହୋଇ କଣ ରହି ପାରିବୁ?',
  'ଆରେ ମୋ ଚନ୍ଦ୍ର ଉଦିଆରେ, ମୋଘର ଅନ୍ଧାର କରି ଆସିଛୁ, ଆରେ ମୋ କଳାମାଣିକରେ ।',
  'ସାଆନ୍ତେ ଆପେ କହନ୍ତି ସେ କାହାରି ଗୋଟିଏ ପଇସା ଆଣନ୍ତି ନାହିଁ ।',
  'ନା',
  'ଶୂନ',
  'ନଅ',
  '"ଅବଶ୍ୟ ଅବଶ୍ୟ, ଏ କଥାଟା କଣ କହିବ

In [21]:
# Sanity check: show which device the model currently lives on.
model.device

device(type='cuda', index=0)

# Model preparation for LoRA Training

In [None]:
from peft import LoraConfig, LoraModel, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig, WhisperForConditionalGeneration

# Load the base checkpoint with 8-bit weights.
model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

# Override generation arguments: no tokens are forced as decoder outputs
# (see forced_decoder_ids) and no tokens are suppressed during generation
# (see suppress_tokens).
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

# Prepare the quantised model for training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention query/key/value projections.
config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

# Training Loop

In [21]:
device = 'cuda'
from tqdm import tqdm  # For progress bar

model.config.use_cache = False  # just for training, make it True during inference
model.train()  # Set model to training mode

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
max_steps = 1000
step = 0
eval_steps = 100
# Hub repository that receives a checkpoint at every evaluation.
peft_model_id = "Mohan-diffuser/whisper-small-odia-3000steps"

if len(train_dataloader) == 0:
    # Without batches the outer loop below could never make progress.
    raise ValueError("train_dataloader yields no batches")

progress_bar = tqdm(range(max_steps), leave=False)

# Running sum/count of training losses since the last evaluation, so that the
# reported train loss is an actual average instead of a constant 0.0.
running_loss = 0.0
running_batches = 0

# Iterate over the data for as many epochs as needed to reach max_steps.
# The condition on the outer loop matters: `break` below only leaves the
# inner `for`, so an unconditional `while True` would never terminate.
while step < max_steps:

    for batch in train_dataloader:

        optimizer.zero_grad()  # Reset gradients
        input_features, labels = batch["input_features"].to(device), batch["labels"].to(device)

        # Forward pass
        outputs = model(input_features, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        step += 1
        batch_loss = loss.item()
        running_loss += batch_loss
        running_batches += 1

        progress_bar.update(1)
        progress_bar.set_postfix(loss=batch_loss)

        if step % eval_steps == 0:

            # Checkpoint to the Hub and evaluate on the test set
            model.eval()  # Set model to evaluation mode
            model.push_to_hub(peft_model_id)
            test_loss = 0.0
            with torch.no_grad():
                # Separate names so the training batch/loss are not clobbered.
                for test_batch in test_dataloader:
                    test_features = test_batch["input_features"].to(device)
                    test_labels = test_batch["labels"].to(device)
                    test_outputs = model(test_features, labels=test_labels)
                    # Weight by batch size so the final value is a per-example mean.
                    test_loss += test_outputs.loss.item() * test_features.size(0)

            test_loss /= len(test_dataloader.dataset)

            train_loss = running_loss / running_batches
            running_loss, running_batches = 0.0, 0

            # Print step results
            print(f"step {step}/{max_steps}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

            model.train()  # Set model to training mode

        if step >= max_steps:
            break

progress_bar.close()




  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 