In [1]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
from scipy.signal import resample
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader

import os,json
from transformers import WhisperTokenizer, get_scheduler
from transformers import WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration
import wandb
import evaluate
import re
import time
from jiwer import cer

# Single place that names the accelerator used by the cells below.
device = 'cuda'

# Base Whisper checkpoint; tokenizer and feature extractor are loaded from the same id.
model_id = "openai/whisper-small"

# NOTE(review): the Odia adapter is driven with the 'bengali' language setting everywhere
# in this notebook — presumably a stand-in language token; confirm before changing it.
tokenizer = WhisperTokenizer.from_pretrained(model_id, language='bengali', task='transcribe')
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id, language='bengali', task='transcribe')
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)

from peft import LoraConfig, PeftModel, LoraModel, get_peft_model

# Attach the published adapter weights in inference-only mode (is_trainable=False).
model = PeftModel.from_pretrained(
    model,
    "Mohan-diffuser/whisper-small-odia-finetuned",
    is_trainable=False,
    device_map={"": 0},
)
model.print_trainable_parameters()

# Inference configuration: eval mode, generation cache enabled, cached GPU memory released.
model.eval()
model.config.use_cache = True
torch.cuda.empty_cache()



def down_sample_audio(audio_original, original_sample_rate, target_sample_rate=16000):
    """Resample an audio array to ``target_sample_rate``.

    Args:
        audio_original: Array-like of audio samples; its ``len()`` is taken as
            the number of samples.
        original_sample_rate: Sampling rate of ``audio_original`` in Hz.
        target_sample_rate: Desired sampling rate in Hz. Defaults to 16000,
            the rate the rest of this notebook passes to the feature extractor.

    Returns:
        The array returned by ``scipy.signal.resample`` with
        ``int(len(audio_original) * target_sample_rate / original_sample_rate)``
        samples.

    Raises:
        ValueError: If either sample rate is not strictly positive.
    """
    if original_sample_rate <= 0 or target_sample_rate <= 0:
        raise ValueError(
            f"Sample rates must be positive, got original={original_sample_rate}, "
            f"target={target_sample_rate}"
        )

    # Number of samples the clip has at the target rate (truncated towards zero).
    num_samples = int(len(audio_original) * target_sample_rate / original_sample_rate)

    return resample(audio_original, num_samples)



class whisper_training_dataset(torch.utils.data.Dataset):
    """Map-style dataset producing Whisper input features and label ids.

    Relies on the notebook-level ``model``, ``feature_extractor``, ``tokenizer``
    and ``down_sample_audio`` objects being defined before use.

    Args:
        dataset: Indexable collection (a Hugging Face dataset in this notebook)
            whose rows expose ``row['audio']['array']``,
            ``row['audio']['sampling_rate']`` and ``row['sentence']``.
        max_len: Length the tokenized transcription is padded/truncated to.
    """

    def __init__(self, dataset, max_len):
        self.max_len = max_len
        self.dataset = dataset
        # Stored for reference; not read by any method of this class.
        self.bos_token = model.config.decoder_start_token_id

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        example = self.dataset[idx]
        audio = example['audio']

        # Bring the clip to 16 kHz, then compute the model input features.
        waveform = down_sample_audio(audio["array"], audio["sampling_rate"])
        extracted = feature_extractor(waveform, sampling_rate=16000, return_tensors='pt')

        # Tokenize the transcription to a fixed length.
        encoded = tokenizer(
            example['sentence'],
            padding="max_length",
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
        )

        # Positions whose attention mask is not 1 (padding) are replaced by -100.
        is_padding = encoded['attention_mask'].ne(1)
        label_ids = encoded["input_ids"].masked_fill(is_padding, -100)

        return {
            "input_features": extracted.input_features[0],
            # Drop the batch dimension and the first token id.
            # NOTE(review): presumably the first id is the decoder start token — confirm.
            "labels": label_ids[0][1:],
        }
        

        


2025-05-13 15:23:22.093151: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747149802.102413 1437100 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747149802.105206 1437100 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-13 15:23:22.115183: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


trainable params: 0 || all params: 252,351,744 || trainable%: 0.0000


In [2]:
# from huggingface_hub import HfApi

# # Optional: create repo programmatically
# api = HfApi()
# repo_id = "Mohan-diffuser/whisper-small-odia-finetuned"  # change as needed
# api.create_repo(repo_id, private=False, exist_ok=True)

# # Push to Hub
# from transformers import WhisperForConditionalGeneration

# model.push_to_hub(repo_id)
# tokenizer.push_to_hub(repo_id)
# feature_extractor.push_to_hub(repo_id)

In [2]:
from datasets import load_dataset,concatenate_datasets

# Common Voice 17 ("or" config); the three requested splits come back in the order listed.
cv_17 = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    "or",
    split=["train", "other", "test"],
)

In [3]:
# The first two requested splits ("train" and "other") are merged for training;
# the third ("test") is kept separate for evaluation.
train_data = concatenate_datasets(list(cv_17[:2]))
test_data = cv_17[2]

In [4]:
# Peek at one training record (index 20) to see which fields a row provides.
train_data[20]


{'client_id': 'b0387daa800984e4069a5b7ee53956cb1c5f8d5a876ca0b4072260237d47d74afe21ec0267f9d9076c438d1af441fa9fada6c3a0bb2f10251b5b2ced597c5879',
 'path': '/home/mohan.dash/.cache/huggingface/datasets/downloads/extracted/d0b40a1f4c62ddf75c1b55067ff318cc79aad1b19ded2106bcf218176846f318/or_train_0/common_voice_or_21832053.mp3',
 'audio': {'path': '/home/mohan.dash/.cache/huggingface/datasets/downloads/extracted/d0b40a1f4c62ddf75c1b55067ff318cc79aad1b19ded2106bcf218176846f318/or_train_0/common_voice_or_21832053.mp3',
  'array': array([ 0.00000000e+00,  7.55790558e-17, -4.17558656e-16, ...,
         -7.13861823e-08, -3.89958615e-07, -4.04293644e-07]),
  'sampling_rate': 48000},
 'sentence': 'ଏଣେ ମହନ୍ତଙ୍କର ଯେପରି ଅକାରଣ ପଇସାଏ ନଷ୍ଟ ନ ହେବ ଖୁବ୍ ଜଗିଥାଏ ।',
 'up_votes': 3,
 'down_votes': 0,
 'age': 'thirties',
 'gender': 'male_masculine',
 'accent': 'Central,Baleswari',
 'locale': 'or',
 'segment': '',
 'variant': ''}

In [5]:

import gradio as gr
import torch
import numpy as np
from transformers import pipeline,AutoTokenizer,AutoModelForSeq2SeqLM

# Language codes and dtype shared by the translation tokenizer, model and pipeline.
SRC_LANG = "ory_Orya"
TGT_LANG = "eng_Latn"
TRANSLATION_DTYPE = torch.bfloat16

# Tokenizer from the public NLLB checkpoint, configured with the source language.
tokenizer_translation = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang=SRC_LANG,
)

# Translation weights come from a local checkpoint directory and are placed on
# `device` explicitly (there is no automatic CPU fallback here).
model_translation = AutoModelForSeq2SeqLM.from_pretrained(
    "runs_translation/odia_english_ckpt",
    torch_dtype=TRANSLATION_DTYPE,
    device_map=device,
)

# Translation pipeline wrapping the model and tokenizer above.
translator = pipeline(
    task="translation",
    model=model_translation,
    tokenizer=tokenizer_translation,
    src_lang=SRC_LANG,
    tgt_lang=TGT_LANG,
    torch_dtype=TRANSLATION_DTYPE,
)

# Assumes: model.to("cuda"), tokenizer, feature_extractor are already initialized
# Assumes: down_sample_audio() is defined

def transcribe(audio):
    """Transcribe a Gradio audio input and translate the transcription.

    Relies on the notebook-level ``model``, ``tokenizer``, ``feature_extractor``,
    ``translator``, ``device`` and ``down_sample_audio`` objects.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``. ``samples`` may be 1-D (mono) or 2-D
            with shape ``(samples, channels)``.

    Returns:
        Tuple ``(transcription, translation)``. When there is no audio (``None``
        or an empty array) the tuple is ``("No audio input.", "")``.
    """
    if audio is None:
        return "No audio input.", ""

    original_sr, audio_np = audio
    audio_np = np.asarray(audio_np, dtype=np.float32)

    # Multi-channel recordings arrive as (samples, channels); average to mono so
    # the feature extractor receives a single 1-D waveform.
    if audio_np.ndim > 1:
        audio_np = audio_np.mean(axis=1)

    if audio_np.size == 0:
        return "No audio input.", ""

    # Peak-normalise, but skip all-zero (silent) clips: dividing by a zero peak
    # would fill the waveform with NaNs.
    peak = np.max(np.abs(audio_np))
    if peak > 0:
        audio_np = audio_np / peak

    # Downsample to 16kHz
    audio_16000 = down_sample_audio(audio_np, original_sr)

    # Extract features
    input_feature = feature_extractor(
        raw_speech=audio_16000,
        sampling_rate=16000,
        return_tensors="pt"
    ).input_features

    # Generate the transcription without tracking gradients.
    with torch.no_grad():
        output = model.generate(
            input_feature.to(device), language="bengali", task="transcribe"
        )

    odia_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

    # Translate to English
    translated_text = translator(
        odia_text,
    )[0]["translation_text"]

    return odia_text, translated_text

# --- Gradio UI: one audio input, two text outputs (transcription, translation) ---
audio_input = gr.Audio(type="numpy", label="🎤 Record or Upload Audio")
text_outputs = [
    gr.Textbox(label="📝 Odia Transcription"),
    gr.Textbox(label="🌍 English Translation"),
]

app = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs=text_outputs,
    title="Whisper Odia Transcription + Translation",
    description="Record or upload Odia audio. Get the transcription in Odia and a translation in English.",
)

# Render inside the notebook. NOTE: share=True also publishes a temporary public URL.
app.launch(inline=True, share=True)


Device set to use cuda


Running on local URL:  http://127.0.0.1:7860
IMPORTANT: You are using gradio version 4.20.1, however version 4.44.1 is available, please upgrade.
--------
Running on public URL: https://53ca6e6305f6de26f2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [5]:
import torch

# Spot-check the model on the first 15 training clips, printing the reference
# sentence next to the generated transcription.
for idx in range(15):

    # Fetch the row once; indexing train_data[idx] separately for every field
    # repeats the row lookup (and audio decoding) each time.
    sample = train_data[idx]
    target = sample['sentence']
    audio_original = sample['audio']['array']
    original_sample_rate = sample['audio']['sampling_rate']

    audio_16000 = down_sample_audio(audio_original, original_sample_rate)

    input_feature = feature_extractor(raw_speech=audio_16000,
                                    sampling_rate=16000,
                                    return_tensors='pt').input_features

    with torch.no_grad():
        op = model.generate(input_feature.to(device), language='bengali', task='transcribe')

    text_pred = tokenizer.batch_decode(op, skip_special_tokens=True)[0]

    print(f'-------{idx}------')
    print(f'true : {target} \npred : {text_pred}')
    print('\n ')

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


-------0------
true : ଆଜି ମୁଁ ସର୍ପଦ୍ୱାରା କଣ ହୋଇଥାନ୍ତି । 
pred : ଆଜିମୁ ସର୍ପ ଦ୍ୱରା କୌଣ ହୋଇଥାନ୍ତି

 
-------1------
true : ଏଠାରେ ନଦୀ ବୃଦ୍ଧିର କାରଣ ହେଲା ବୃଷ୍ଟି । 
pred : ଏଠାରେ ନଦୀ ବୃଧୀର କାରଣ ହେଲା ବୃଷ୍ଟି

 
-------2------
true : ଅସଲ କଥାଟା, ରାଜୀବଟା ଭାରି ଦୁଷ୍ଟ । 
pred : ଅସୋଲ କଥାଟା ରାଜିବଟା ଭାରି ଦୁଷ୍ଟ

 


KeyboardInterrupt: 

In [11]:
# Re-run generation on whatever `input_feature` currently holds; this cell does not
# define it, so it depends on an earlier cell having been executed first.
op = model.generate(input_feature.to('cuda'), language='bengali', task='transcribe')

# Decode the first generated sequence; as the last expression it is the cell's output.
tokenizer.batch_decode(op,skip_special_tokens=True )[0]

'ସେ ଗଛ ଦିକି ମାମନରେ ପଡିଲେ'

In [9]:
# Display the tensor currently bound to `input_feature` (defined by an earlier cell).
input_feature

tensor([[[-0.7036, -0.7036, -0.7036,  ..., -0.7036, -0.7036, -0.7036],
         [-0.7036, -0.7036, -0.7036,  ..., -0.7036, -0.7036, -0.7036],
         [-0.7036, -0.7036, -0.7036,  ..., -0.7036, -0.7036, -0.7036],
         ...,
         [-0.7036, -0.7036, -0.7036,  ..., -0.7036, -0.7036, -0.7036],
         [-0.7036, -0.7036, -0.7036,  ..., -0.7036, -0.7036, -0.7036],
         [-0.7036, -0.7036, -0.7036,  ..., -0.7036, -0.7036, -0.7036]]])

In [8]:
# BATCH_SIZE is not defined in any visible cell; fall back to a default so this
# cell also works after Restart Kernel -> Run All. Tune to the available GPU memory.
try:
    BATCH_SIZE
except NameError:
    BATCH_SIZE = 16

# Wrap the held-out split; transcriptions are padded/truncated to 300 tokens.
dataset = whisper_training_dataset(dataset=test_data, max_len=300)
test_dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # evaluation only: a fixed order keeps partial/running metrics reproducible
)

In [11]:

# Character error rate (CER) of the model over `test_dataloader`.
# Inference settings are loop-invariant, so they are set once up front.
model.eval()  # evaluation mode
model.config.use_cache = True

predictions = []
references = []

for batch in tqdm(test_dataloader, total=len(test_dataloader)):

    input_features = batch["input_features"].to(device)

    # The dataset masks padding positions with -100, which is not a vocabulary id.
    # Map them back to the pad token so decoding does not depend on how the
    # tokenizer treats out-of-vocabulary ids; skip_special_tokens then drops them.
    labels = batch["labels"].clone()
    labels[labels == -100] = tokenizer.pad_token_id

    with torch.no_grad():
        generated_tokens = model.generate(input_features=input_features, language='bengali', task='transcribe')

    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    predictions.extend(decoded_preds)
    references.extend(decoded_labels)

    # Running CER over everything decoded so far (progress feedback only).
    print(f'running_cer:{cer(references, predictions) * 100}')

# Final CER over the full set, in percent.
CER = cer(references, predictions) * 100
print(f'final_cer:{CER}')

  1%|          | 1/87 [00:03<05:29,  3.83s/it]

running_cer:22.687224669603523


  2%|▏         | 2/87 [00:07<05:33,  3.92s/it]

running_cer:22.516556291390728


  3%|▎         | 3/87 [00:11<05:12,  3.73s/it]

running_cer:21.919879062736207


  5%|▍         | 4/87 [00:14<04:54,  3.54s/it]

running_cer:20.70196311719215


  6%|▌         | 5/87 [00:17<04:43,  3.46s/it]

running_cer:20.69128787878788


  7%|▋         | 6/87 [00:21<04:53,  3.63s/it]

running_cer:19.807692307692307


  8%|▊         | 7/87 [00:24<04:36,  3.45s/it]

running_cer:20.65775950668037


  9%|▉         | 8/87 [00:28<04:42,  3.57s/it]

running_cer:19.98792270531401


 10%|█         | 9/87 [00:32<04:39,  3.58s/it]

running_cer:20.11173184357542


 11%|█▏        | 10/87 [00:35<04:23,  3.42s/it]

running_cer:20.394895256441124


 13%|█▎        | 11/87 [00:39<04:26,  3.50s/it]

running_cer:20.240963855421686


 14%|█▍        | 12/87 [00:42<04:20,  3.47s/it]

running_cer:20.511284202117036


 14%|█▍        | 12/87 [00:44<04:39,  3.73s/it]


KeyboardInterrupt: 