<a target="_blank" href="https://colab.research.google.com/github/paulwbailey/whisper-testing/blob/main/whisper-audio-example.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
! pip install git+https://github.com/openai/whisper.git > /dev/null
! pip install gradio > /dev/null

In [None]:
import whisper
import gradio as gr

In [None]:
# Name of the Whisper checkpoint to load; kept in one place so it is easy to swap.
MODEL_NAME = "medium"

# Load the checkpoint once; later cells use the module-level `model`.
model = whisper.load_model(MODEL_NAME)

# Report which device the loaded model is on.
print(f"Executing on {model.device}")

In [None]:
def process_audio(audio):
    """Transcribe and translate a recording with the module-level Whisper ``model``.

    The spoken language is detected from a single fixed-length window
    (``whisper.pad_or_trim`` output), while the complete, untrimmed waveform
    is decoded twice: once with ``task="transcribe"`` and once with
    ``task="translate"``.

    Args:
        audio: Path to an audio file. ``None`` or an empty string is
            tolerated, e.g. when a live interface fires with no recording.

    Returns:
        A 3-tuple of strings ``(transcription, translation, language_details)``.
        All three are empty strings when no audio was supplied.
    """
    # A live interface can invoke the callback with nothing recorded;
    # whisper.load_audio would fail on that, so bail out early.
    if not audio:
        return "", "", ""

    waveform = whisper.load_audio(audio)

    # Only the language-detection input is padded/trimmed to the fixed window.
    # The full waveform is kept for transcription so long recordings are not
    # cut off after the first window.
    detection_window = whisper.pad_or_trim(waveform)
    # Use the model's own mel-bin count so checkpoints that differ from the
    # library default still get a matching spectrogram.
    mel = whisper.log_mel_spectrogram(
        detection_window, n_mels=model.dims.n_mels
    ).to(model.device)

    # detect the spoken language: pick the code with the highest probability
    _, probs = model.detect_language(mel)
    language = max(probs, key=probs.get)
    prob = probs[language]
    language_str = whisper.tokenizer.LANGUAGES.get(language)

    # decode the full recording: once as-is, once translated
    transcription = model.transcribe(waveform, task="transcribe", language=language)["text"]
    translation = model.transcribe(waveform, task="translate", language=language)["text"]

    details = f"The detected language is {language_str} with {prob:.2%} confidence."
    return transcription, translation, details

In [None]:
# Input: a microphone recording handed to the callback as a file path.
# NOTE: `source=` is the Gradio 3.x keyword; Gradio 4+ expects `sources=[...]`.
mic_input = gr.Audio(source="microphone", type="filepath")

# Outputs, in the same order as the tuple returned by process_audio.
result_fields = [
    gr.Textbox(label="Transcription"),
    gr.Textbox(label="Translation"),
    gr.Text(label="Language Details"),
]

demo = gr.Interface(
    title='Multilingual Translator',
    fn=process_audio,
    inputs=[mic_input],
    outputs=result_fields,
    live=True,
    allow_flagging='never',
)

# launch() returns three values, unpacked here under the notebook's existing names.
app, local, external = demo.launch(inline=False, debug=False, show_api=False)