In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
hf_token = os.environ['HF_TOKEN']

In [3]:
from transformers import (
    WhisperProcessor, 
    WhisperForConditionalGeneration
)

import librosa
from datasets import load_dataset

In [4]:
def load_audio_data(data_path, sr=16000):
    signal, _ = librosa.load(data_path, sr=sr)
    return signal


class Transcription:
    def __init__(self, model_ckpt, sr):
        self.processor = WhisperProcessor.from_pretrained(model_ckpt, language="en", task="transcribe")
        self.model = WhisperForConditionalGeneration.from_pretrained(model_ckpt)
        self.model.config.forced_decoder_ids = None

        self.sampling_rate = sr

    def transcribe(self, signal):    
        input_feature = self.processor(signal, sampling_rate=self.sampling_rate, return_tensors="pt").input_features
        # generate token ids
        predicted_ids = self.model.generate(input_feature)
        # decode token ids to text
        transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
        return transcription

In [5]:
whisper_ckpt = "openai/whisper-small"

In [6]:
data_path = "data/audio.wav"
signal = load_audio_data(data_path=data_path, sr=16000)

In [7]:
transcription = Transcription(model_ckpt=whisper_ckpt, sr=16000)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
audio_text = transcription.transcribe(signal=signal)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


In [9]:
audio_text

[' अगा जुता तेज वा नहीं तो अजाम में भी तोड़ा सा लड़ा प्राद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्रद प्']