In [5]:
from functools import lru_cache
import whisper

@lru_cache
def get_whisper_model():
  """Return the Whisper "base" model, loading it only on the first call.

  The function takes no arguments, so `lru_cache` holds a single entry:
  every later call hands back the same object instead of calling
  `whisper.load_model` again.
  """
  return whisper.load_model("base")


def transcribe_audio(audio):
  """Transcribe a single audio clip with the cached Whisper model.

  Args:
    audio: The waveform to transcribe, e.g. the value returned by
      `whisper.load_audio`.

  Returns:
    The decoded text (`result.text` from `whisper.decode`).

  Notes:
    * The input is padded/trimmed to a single 30-second window, so anything
      beyond the first 30 seconds is not transcribed.
    * The detected language code is printed to stdout as a side effect.
  """
  model = get_whisper_model()

  # pad/trim it to fit 30 seconds
  audio = whisper.pad_or_trim(audio)

  # make log-Mel spectrogram and move to the same device as the model
  mel = whisper.log_mel_spectrogram(audio).to(model.device)

  # detect the spoken language
  _, probs = model.detect_language(mel)
  print(f"Detected language: {max(probs, key=probs.get)}")

  # DecodingOptions defaults to fp16=True. Half precision is only appropriate
  # on an accelerator; on CPU fall back to FP32, mirroring what
  # whisper.transcribe() does for CPU-resident models.
  use_fp16 = model.device.type != "cpu"

  # decode the audio
  options = whisper.DecodingOptions(fp16=use_fp16)
  result = whisper.decode(model, mel, options)

  return result.text



In [6]:
# Load the sample recording via whisper.load_audio.
audio = whisper.load_audio("audio.m4a")

# Transcribe it and print the recognized text.
transcript = transcribe_audio(audio)
print(transcript)

Detected language: en
Hello, this is Manoj Pajaj and I'm testing this book.
