In [None]:
# Install dependencies.
# NOTE: the leading `!` is a notebook shell escape (IPython/Colab), not plain
# Python syntax, so these two lines only work when run inside a notebook cell.
!pip install --upgrade pip
!pip install faster-whisper ctranslate2 gradio

# Import libraries
from faster_whisper import WhisperModel
import gradio as gr

# Global model cache: holds the most recently loaded WhisperModel instance.
# Stays None until load_model() succeeds; transcribe() checks for None and
# reports "Model not loaded" instead of running.
global_model = None

def load_model(use_hf_url, hf_url, model_size, device, compute_type):
    """
    Load a Faster-Whisper model into the module-level ``global_model`` slot.

    Args:
        use_hf_url: When truthy, a non-blank ``hf_url`` takes priority over
            ``model_size``.
        hf_url: Custom model reference. A ``huggingface.co`` URL is reduced to
            its ``owner/repo`` id; any other non-blank string is passed through
            unchanged after trimming whitespace. ``None`` is treated as blank.
        model_size: Predefined model name, used when no custom reference applies.
        device: Forwarded as ``WhisperModel(device=...)``.
        compute_type: Forwarded as ``WhisperModel(compute_type=...)``.

    Returns:
        A human-readable status string: "✅ Loaded model: ..." on success, or
        "❌ Failed to load ..." followed by the exception text on failure.
        On failure ``global_model`` keeps whatever value it had before.
    """
    global global_model

    # Tolerate None (e.g. a cleared textbox) instead of crashing on .strip().
    custom_ref = (hf_url or "").strip() if use_hf_url else ""

    # The UI invites "model name or URL", but the value is handed straight to
    # WhisperModel, which is documented to take a size name, a Hub repo id or a
    # local path. Reduce a pasted Hub URL (possibly with /tree/main etc.
    # appended) to its "owner/repo" id.
    lowered = custom_ref.lower()
    for prefix in ("https://huggingface.co/", "http://huggingface.co/", "huggingface.co/"):
        if lowered.startswith(prefix):
            path_parts = [part for part in custom_ref[len(prefix):].split("/") if part]
            custom_ref = "/".join(path_parts[:2])
            break

    # Custom reference takes priority; fall back to the predefined size.
    model_id = custom_ref or model_size

    # Instantiate the model (downloads CTranslate2 weights if needed).
    # Broad catch is deliberate: this is the UI boundary and every failure
    # should surface as a status message rather than an unhandled error.
    try:
        global_model = WhisperModel(model_id, device=device, compute_type=compute_type)
    except Exception as e:
        return f"❌ Failed to load {model_id}: {e}"
    return f"✅ Loaded model: {model_id} on {device} ({compute_type})"


def transcribe(audio, beam_size, language, vad_filter, word_timestamps):
    """
    Transcribe an audio file with the model currently held in ``global_model``.

    Args:
        audio: Path to the audio file as a string, or an object exposing the
            path as ``.name``. ``None``/empty means nothing was provided.
        beam_size: Beam width for decoding; coerced to ``int`` before use.
        language: Language code to force. Blank, whitespace-only or ``None``
            results in ``language=None`` being passed to the model.
        vad_filter: Forwarded as ``vad_filter=...``.
        word_timestamps: Forwarded as ``word_timestamps=...``.

    Returns:
        A ``(status, transcript)`` tuple of strings. On success ``status`` is
        the detected-language line and ``transcript`` has one
        "[start -> end] text" line per segment. On any failure ``status`` is a
        "❌ ..." message and ``transcript`` is empty.
    """
    if global_model is None:
        return "❌ Model not loaded", ""
    # Nothing uploaded: without this guard the `.name` access below would
    # raise AttributeError on None.
    if not audio:
        return "❌ No audio provided", ""

    audio_path = audio if isinstance(audio, str) else audio.name
    forced_language = (language or "").strip() or None

    try:
        segments, info = global_model.transcribe(
            audio_path,
            beam_size=int(beam_size),
            language=forced_language,
            vad_filter=vad_filter,
            word_timestamps=word_timestamps,
        )
        # Consume the segments inside the try block: faster-whisper documents
        # `segments` as a lazy generator, so decoding errors can surface
        # during iteration rather than in the transcribe() call itself.
        transcript = [
            f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}"
            for segment in segments
        ]
    except Exception as e:
        # UI boundary: report the failure in the status box, matching the
        # error style used by load_model().
        return f"❌ Transcription failed: {e}", ""

    full_text = "\n".join(transcript)
    lang_detect = f"Detected language: {info.language} (p={info.language_probability:.2f})"
    return lang_detect, full_text

# Assemble the Gradio interface: controls in the left column, results on the right.
with gr.Blocks() as demo:
    gr.Markdown(value="# 🦙 Faster-Whisper Transcription Playground")

    with gr.Row():
        # Left column: model selection followed by transcription settings.
        with gr.Column(scale=1):
            # --- model selection ---
            use_hf_url = gr.Checkbox(value=False, label="Use custom Hugging Face model URL")
            hf_url = gr.Textbox(
                label="Hugging Face model name or URL",
                placeholder="e.g. username/my-whisper-ct2",
                interactive=True,
            )
            model_size = gr.Dropdown(
                label="Predefined Model Size",
                choices=["small", "medium", "large", "large-v2", "large-v3", "distil-large-v3"],
                value="large-v3",
                interactive=True,
            )
            device = gr.Radio(label="Device", choices=["cuda", "cpu"], value="cuda")
            compute_type = gr.Radio(
                label="Compute Type",
                choices=["float16", "int8", "int8_float16", "float32"],
                value="float16",
            )
            load_btn = gr.Button(value="Load Model")
            load_status = gr.Textbox(label="Model Status", interactive=False)

            # --- transcription settings ---
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            beam_size = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Beam Size")
            language = gr.Textbox(
                label="Force Language (e.g., 'en')",
                placeholder="Leave empty for auto-detect",
            )
            vad_filter = gr.Checkbox(value=False, label="VAD Filter (remove silence)")
            word_timestamps = gr.Checkbox(value=False, label="Word-level Timestamps")
            transcribe_btn = gr.Button(value="Transcribe")

        # Right column (twice as wide): outputs of transcribe().
        with gr.Column(scale=2):
            out_lang = gr.Textbox(label="Language Detection")
            out_text = gr.Textbox(label="Transcript", lines=20)

    # Wire the buttons to their handlers; the input lists follow each
    # handler's parameter order.
    load_btn.click(
        load_model,
        inputs=[use_hf_url, hf_url, model_size, device, compute_type],
        outputs=[load_status],
    )
    transcribe_btn.click(
        transcribe,
        inputs=[audio_input, beam_size, language, vad_filter, word_timestamps],
        outputs=[out_lang, out_text],
    )

# Start the app only when this file is the entry module (as in a Colab cell).
if __name__ == "__main__":
    demo.launch(
        share=True,
        debug=True,
        server_port=7860,
        server_name="0.0.0.0",
    )


Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting faster-whisper
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2
  Downloading ctranslate2-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-15.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from o

model.bin:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

vocabulary.json: 0.00B [00:00, ?B/s]