In [None]:
# 1. Install necessary packages in Colab
# The leading "!" makes the notebook run this line as a shell command, and
# "-q" keeps pip's own output quiet. gradio, faster-whisper and soundfile are
# imported directly by the next cell; torch, torchaudio and ctranslate2 are
# not imported by name in this notebook.
!pip install -q gradio faster-whisper torch torchaudio soundfile ctranslate2

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import gradio as gr
import numpy as np
from faster_whisper import WhisperModel
import time
import os
import soundfile as sf # Better for saving numpy arrays to WAV

# 2. Load the faster-whisper model a single time, at cell execution.
# The settings below are the GPU-first choice; if the GPU load raises, they are
# overwritten with the CPU fallback values so later log lines report what is
# actually in use. `model` ends up as None when both attempts fail.
model_size = "large-v3"
device_type = "cuda"  # Nvidia GPU runtime in Colab
compute_type = "float16"  # half precision for GPU inference

print(f"Loading model: {model_size} on {device_type} with {compute_type}...")
try:
    model = WhisperModel(model_size, device=device_type, compute_type=compute_type)
except Exception as gpu_error:
    print(f"ERROR: Failed to load model on GPU: {gpu_error}")
    print("Attempting to load on CPU...")
    # Second attempt: quantized int8 weights on the CPU
    device_type, compute_type = "cpu", "int8"
    try:
        model = WhisperModel(model_size, device=device_type, compute_type=compute_type)
    except Exception as cpu_error:
        print(f"ERROR: Failed to load model on CPU as well: {cpu_error}")
        # Nothing could be loaded; the transcription function checks for this
        model = None
    else:
        print("Model loaded successfully on CPU (int8).")
else:
    print("Model loaded successfully on GPU.")

# 3. Define the function that Gradio will call
def transcribe_microphone_input(audio_input):
    """
    Transcribe a Gradio microphone recording using the loaded faster-whisper model.

    Args:
        audio_input: ``(sample_rate, numpy_data)`` tuple as produced by a
            ``gr.Audio(type="numpy")`` input, or ``None`` if nothing was recorded.

    Returns:
        str: The transcription, or a human-readable status/error message.
        Failures are reported in the returned text rather than raised, so
        they appear in the output textbox.
    """
    if model is None:
        return "ERROR: Whisper model failed to load. Cannot transcribe."

    if audio_input is None:
        return "No audio recorded. Click the microphone, record, and click stop."

    # Gradio microphone input provides a tuple: (sample_rate, numpy_data)
    sample_rate, audio_data = audio_input

    # A zero-length recording has no peak to normalize by (np.max would raise)
    if audio_data.size == 0:
        return "No audio recorded. Click the microphone, record, and click stop."

    print(f"Received audio: Sample Rate={sample_rate}, Duration={len(audio_data)/sample_rate:.2f}s")

    # Ensure audio data is float32, as expected by many audio processing libs/models
    audio_data = audio_data.astype(np.float32)

    # Peak-normalize. An all-zero (silent) buffer has peak 0; dividing by it
    # would fill the signal with NaN, so report "no speech" without running
    # the model.
    peak = np.max(np.abs(audio_data))
    if peak == 0:
        return "(No speech detected or empty transcription)"
    audio_data = audio_data / peak

    # Save the numpy array to a temporary WAV file that faster-whisper can read
    temp_filename = f"temp_audio_{time.time_ns()}.wav"
    try:
        try:
            print(f"Saving temporary audio to {temp_filename}...")
            sf.write(temp_filename, audio_data, sample_rate)
            print("Temporary file saved.")
        except Exception as e:
            print(f"Error saving temporary audio file: {e}")
            return f"ERROR: Could not save temporary audio file: {e}"

        try:
            print(f"Starting transcription using {model_size} ({device_type}, {compute_type})...")
            start_time = time.time()
            # You can add vad_filter=True here for Voice Activity Detection if desired
            segments, info = model.transcribe(temp_filename, beam_size=5)

            # Joining the segments is what actually consumes them, so it must
            # stay inside the try and before the temp file is removed.
            transcription_text = " ".join(segment.text for segment in segments).strip()

            duration = time.time() - start_time
            print(f"Transcription complete in {duration:.2f} seconds.")
            print(f"Detected language: {info.language} (Prob: {info.language_probability:.2f})")
            print(f"Transcription Result: '{transcription_text}'")
        except Exception as e:
            print(f"Error during transcription: {e}")
            return f"ERROR: Transcription failed: {e}"

        return transcription_text or "(No speech detected or empty transcription)"
    finally:
        # Clean up the temporary file on every exit path, including a failed
        # or partial write above.
        try:
            os.remove(temp_filename)
            print(f"Removed temporary file: {temp_filename}")
        except FileNotFoundError:
            pass  # the file was never created
        except OSError as e_del:
            print(f"Warning: Could not remove temporary file {temp_filename}: {e_del}")

# 4. Build the Gradio UI around the transcription function, then serve it
print("Setting up Gradio interface...")
iface = gr.Interface(
    title="Faster-Whisper Transcription (via Microphone)",
    description="Click the microphone icon below, record your speech, press stop, and the transcription will appear.",
    fn=transcribe_microphone_input,
    # Microphone-only audio input, handed to the function as numpy data
    inputs=gr.Audio(label="Record Audio Here", type="numpy", sources=["microphone"]),
    # The function's returned string is shown in a plain text box
    outputs=gr.Textbox(label="Transcription"),
    # Not live: the function runs once, after the user stops recording,
    # rather than on near-real-time chunks.
    live=False,
)

print("Launching Gradio interface...")
# share=True requests a public link (useful for Colab); debug=True gives more logs
iface.launch(share=True, debug=True)

Loading model: large-v3 on cuda with float16...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

vocabulary.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Model loaded successfully on GPU.
Setting up Gradio interface...
Launching Gradio interface...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://27370e18799157d6e9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Received audio: Sample Rate=44100, Duration=53.94s
Saving temporary audio to temp_audio_1743420786725799537.wav...
Temporary file saved.
Starting transcription using large-v3 (cuda, float16)...
Transcription complete in 4.42 seconds.
Detected language: en (Prob: 0.98)
Transcription Result: 'hello hello can you hear me can you listen to this audio if you can then  please please transcribe this this will be so much help I cannot even tell you  how helpful this will be so yeah I'm going to submitting this now I'm going  to be submitting this now anyway two hours and 30 minutes left hopefully I  get another 30 minutes of session in and one more hour so three hours it's a  target for today anyway I will stop at around 42 seconds I'll blabber for  another three seconds like one two three anyway yeah transcribe this bye'
Removed temporary file: temp_audio_1743420786725799537.wav
Keyboard interruption in main thread... closing server.


KeyboardInterrupt: 