<a target="_blank" href="https://colab.research.google.com/github/paulwbailey/whisper-testing/blob/main/whisper-video-caption.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
! pip install git+https://github.com/openai/whisper.git > /dev/null
! pip install gradio > /dev/null

In [None]:
import gradio as gr 
import os
import subprocess

import whisper
from whisper.utils import write_vtt

In [None]:
# Load the Whisper "medium" checkpoint once at module level; `process` reuses
# this `model` object for every transcription/translation request.
model = whisper.load_model("medium")
# Report the device the loaded model reports via `model.device`.
print(f"Executing on {model.device}")

In [None]:
def video2mp3(video_file, output_ext="mp3"):
    """Convert ``video_file`` to an audio file next to it using ffmpeg.

    Args:
        video_file: Path of the media file to convert.
        output_ext: Extension of the file to produce; a leading dot is
            tolerated (``"mp3"`` and ``".mp3"`` are equivalent).

    Returns:
        The path of the converted file: ``video_file`` with its extension
        replaced by ``output_ext``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found
            (raised by ``subprocess``).
    """
    stem, _ = os.path.splitext(video_file)
    audio_file = f"{stem}.{output_ext.lstrip('.')}"

    # The input already has the requested extension: ffmpeg cannot write its
    # output over its own input, so there is nothing to convert.
    if os.path.abspath(audio_file) == os.path.abspath(video_file):
        return audio_file

    completed = subprocess.run(
        ["ffmpeg", "-y", "-i", video_file, audio_file],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        # Keep stderr so a failure can be reported instead of silently ignored.
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )
    if completed.returncode != 0:
        last_lines = (completed.stderr or "").strip().splitlines()[-3:]
        raise RuntimeError(
            f"ffmpeg failed to convert {video_file!r} "
            f"(exit code {completed.returncode}): " + " | ".join(last_lines)
        )
    return audio_file

In [None]:
def _escape_ffmpeg_filter_value(value):
    """Escape ``value`` for use as an option value inside an ffmpeg ``-vf`` filtergraph."""
    # Level 1: characters that are special inside a single filter option value.
    for char in ("\\", "'", ":"):
        value = value.replace(char, "\\" + char)
    # Level 2: characters that are special to the filtergraph parser itself.
    for char in ("\\", "'", "[", "]", ",", ";"):
        value = value.replace(char, "\\" + char)
    return value


def process(input_video, operation):
    """Caption ``input_video`` with subtitles produced by the Whisper model.

    The audio track is extracted with ``video2mp3``, passed to the module-level
    ``model`` with ``task=operation``, written out as a ``.vtt`` file next to
    the audio file, and finally burned into a copy of the video with ffmpeg.

    Args:
        input_video: Path of the video file to caption.
        operation: Value forwarded to ``model.transcribe`` as ``task``
            (the callers in this notebook pass ``"transcribe"`` or
            ``"translate"``).

    Returns:
        Path of the subtitled video (``<audio stem>_subtitled.mp4``).

    Raises:
        ValueError: If no input video was supplied.
        RuntimeError: If ffmpeg fails to render the subtitled video.
    """
    if not input_video:
        raise ValueError("No input video was provided.")

    audio_file = video2mp3(input_video)

    options = dict(beam_size=5, best_of=5)
    result = model.transcribe(audio_file, task=operation, **options)

    # splitext only strips the final extension, so dots elsewhere in the path
    # (directories, "my.clip.mp4") no longer truncate the output names.
    stem, _ = os.path.splitext(audio_file)
    subtitle = stem + ".vtt"
    output_video = stem + "_subtitled.mp4"

    # Write the captions to exactly the path ffmpeg is told to read below.
    with open(subtitle, "w", encoding="utf-8") as vtt:
        write_vtt(result["segments"], file=vtt)

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; -y overwrites the output of an earlier run.
    completed = subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", input_video,
            "-vf", "subtitles=" + _escape_ffmpeg_filter_value(subtitle),
            output_video,
        ],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )
    if completed.returncode != 0:
        last_lines = (completed.stderr or "").strip().splitlines()[-3:]
        raise RuntimeError(
            f"ffmpeg failed to render subtitles onto {input_video!r} "
            f"(exit code {completed.returncode}): " + " | ".join(last_lines)
        )

    return output_video

In [None]:
def translate_video(input_video):
    """Run ``process`` on ``input_video`` with the ``"translate"`` operation.

    Returns whatever ``process`` returns for that video.
    """
    captioned_video = process(input_video, "translate")
    return captioned_video

In [None]:
def transcribe_video(input_video):
    """Run ``process`` on ``input_video`` with the ``"transcribe"`` operation.

    Returns whatever ``process`` returns for that video.
    """
    captioned_video = process(input_video, "transcribe")
    return captioned_video

In [None]:
# Build the Gradio UI: an input video next to an output video, with one button
# per operation. Only layout/components available in both Gradio 3.x and 4.x
# are used: `gr.Box`, `.style()` and `gr.Video(source=...)` were removed in
# Gradio 4, and the install cell does not pin a Gradio version.
# NOTE(review): without `source`, Gradio 4+ may also offer webcam capture in
# addition to upload -- confirm that is acceptable.
block = gr.Blocks(title="Multilingual Caption")
with block:
    with gr.Group():
        with gr.Row():
            inp_video = gr.Video(label="Input Video")
            op_video = gr.Video()
        btn_transcribe = gr.Button("Transcribe")
        btn_translate = gr.Button("Translate")
    # Both buttons read the uploaded video and write the captioned result to
    # the same output player.
    btn_transcribe.click(transcribe_video, inputs=[inp_video], outputs=[op_video])
    btn_translate.click(translate_video, inputs=[inp_video], outputs=[op_video])
# Launch the app without inline embedding, debug mode, or the API docs link.
app, local, external = block.launch(inline=False, debug=False, show_api=False)