In [None]:
# Install required packages
!pip install gradio nemo_toolkit[asr] torch torchaudio librosa soundfile

import gradio as gr
import nemo.collections.asr as nemo_asr
import torch
import torchaudio
import librosa
import soundfile as sf
import os
import tempfile
import re
from typing import Optional, Tuple
import warnings
warnings.filterwarnings("ignore")

class ParakeetASRInterface:
    """Hold one NeMo ASR model at a time and expose load/transcribe helpers.

    The public methods return ``(str, str)`` pairs because each is wired to a
    pair of Gradio output components.
    """

    # Sample rate (Hz) every input is resampled to before transcription.
    TARGET_SAMPLE_RATE = 16000

    # UI decoder label -> ``decoder_type`` value passed to
    # ``change_decoding_strategy``.  NeMo's hybrid models name the transducer
    # (TDT) branch "rnnt"; "tdt" is not an accepted value there
    # (per the NeMo ASR API docs -- confirm against the installed version).
    _DECODER_TYPES = {"TDT": "rnnt", "CTC": "ctc"}

    # Captures "<owner>/<repo>" from a Hugging Face model URL, stopping at the
    # next path segment, query string, or fragment.
    _HF_URL_PATTERN = re.compile(r"https://huggingface\.co/([^/]+/[^/?#]+)")

    def __init__(self):
        self.current_model = None
        self.current_model_name = None

    def extract_model_name_from_url(self, url_or_name: str) -> str:
        """Return the ``owner/name`` model id for a Hugging Face URL or bare name.

        Args:
            url_or_name: Either a ``https://huggingface.co/...`` URL or a model
                name. Surrounding whitespace is ignored in both cases.

        Returns:
            The model id extracted from the URL, or the stripped input when it
            is not a Hugging Face URL.

        Raises:
            ValueError: If the input is empty, or is a Hugging Face URL that
                does not contain an ``owner/name`` pair.
        """
        cleaned = (url_or_name or "").strip()
        if not cleaned:
            raise ValueError("Model URL or name must not be empty")

        if not cleaned.startswith("https://huggingface.co/"):
            # Assume it is already a model name.
            return cleaned

        match = self._HF_URL_PATTERN.match(cleaned)
        if match is None:
            raise ValueError("Invalid Hugging Face URL format")
        return match.group(1)

    def load_model(self, model_url_or_name: str) -> Tuple[str, str]:
        """Load an ASR model via ``ASRModel.from_pretrained`` and keep it on ``self``.

        Args:
            model_url_or_name: Hugging Face URL or model name.

        Returns:
            ``(status_message, "")``. The second element is always empty; it
            exists so the method can feed two output components. Failures are
            reported through the status message rather than raised.
        """
        try:
            model_name = self.extract_model_name_from_url(model_url_or_name)

            # Nothing to do when the requested model is the one already held.
            if self.current_model is not None and self.current_model_name == model_name:
                return f"✅ Model '{model_name}' is already loaded and ready!", ""

            self._unload_model()

            print(f"🔄 Loading model '{model_name}'...")
            model = nemo_asr.models.ASRModel.from_pretrained(model_name=model_name)

            if torch.cuda.is_available():
                model = model.cuda()
                device_info = "GPU"
            else:
                device_info = "CPU"

            # Publish only once loading and device placement both succeeded, so
            # a failure part-way never leaves a half-initialised model behind.
            self.current_model = model
            self.current_model_name = model_name
            return f"✅ Successfully loaded '{model_name}' on {device_info}!", ""

        except Exception as e:
            return f"❌ Error loading model: {str(e)}", ""

    def transcribe_audio(self, audio_input, decoder_type: str = "TDT") -> Tuple[str, str]:
        """Transcribe one audio clip with the currently loaded model.

        Args:
            audio_input: A path to an audio file, or a ``(sample_rate, samples)``
                tuple. ``None`` is reported as a missing-input error.
            decoder_type: ``"TDT"`` (default) or ``"CTC"``, case-insensitive.
                Applied only when the model's ``change_decoding_strategy``
                accepts a ``decoder_type`` argument.

        Returns:
            ``(transcript, status_message)``. On any error the transcript is
            ``""`` and the status message describes the problem.
        """
        if self.current_model is None:
            return "", "❌ Please load a model first!"
        if audio_input is None:
            return "", "❌ Please provide an audio file!"

        requested = decoder_type or "TDT"
        temp_paths = []  # every temp file created below, removed in ``finally``
        try:
            if isinstance(audio_input, tuple):
                sample_rate, raw_samples = audio_input
                audio_path = self._new_temp_wav(temp_paths)
                sf.write(audio_path, raw_samples, sample_rate)
            else:
                # Assume it is a file path.
                audio_path = audio_input

            # Resample to mono at the target rate and hand the model a WAV file.
            samples, _ = librosa.load(audio_path, sr=self.TARGET_SAMPLE_RATE, mono=True)
            processed_audio_path = self._new_temp_wav(temp_paths)
            sf.write(processed_audio_path, samples, self.TARGET_SAMPLE_RATE)

            if self._select_decoder(requested):
                decoder_desc = f"{requested} decoder"
            else:
                # The model offers no decoder switch; report that honestly.
                decoder_desc = "the model's default decoder"

            print(f"🔄 Transcribing audio using {decoder_desc}...")
            transcriptions = self.current_model.transcribe([processed_audio_path])
            result = self._extract_text(transcriptions)

            return result, f"✅ Transcription completed using {decoder_desc}!"

        except Exception as e:
            return "", f"❌ Error during transcription: {str(e)}"

        finally:
            for path in temp_paths:
                try:
                    os.unlink(path)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not mask
                    # the transcription result or error.
                    pass

    def _unload_model(self) -> None:
        """Drop the held model (if any) and release cached GPU memory."""
        if self.current_model is None:
            return
        # Reset to None instead of ``del`` so the attributes keep existing even
        # if the next load attempt fails.
        self.current_model = None
        self.current_model_name = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _select_decoder(self, decoder_type: str) -> bool:
        """Switch the loaded model to ``decoder_type`` when it supports switching.

        Returns:
            ``True`` if ``change_decoding_strategy`` was called, ``False`` when
            the model has no such method or the method takes no
            ``decoder_type`` parameter.
        """
        import inspect  # function-scope: not part of the file's import block

        switch = getattr(self.current_model, "change_decoding_strategy", None)
        if switch is None:
            return False

        # Inspect the signature rather than assume: passing ``decoder_type`` to
        # a method that does not declare it raises TypeError.
        try:
            parameters = inspect.signature(switch).parameters
        except (TypeError, ValueError):
            return False
        if "decoder_type" not in parameters:
            return False

        target = self._DECODER_TYPES.get((decoder_type or "").upper(), "rnnt")
        switch(decoder_type=target)
        return True

    @staticmethod
    def _extract_text(transcriptions) -> str:
        """Return the first transcript as text, or a placeholder when empty."""
        if not transcriptions:
            return "No transcription generated"

        first = transcriptions[0]
        # NOTE(review): some NeMo releases appear to return a
        # (best_hypotheses, all_hypotheses) pair for transducer models; unwrap
        # one level so the text, not a list repr, is returned.
        if isinstance(first, (list, tuple)):
            if not first:
                return "No transcription generated"
            first = first[0]

        if hasattr(first, "text"):
            return first.text
        return str(first)

    @staticmethod
    def _new_temp_wav(registry: list) -> str:
        """Create an empty ``.wav`` temp file, record it in ``registry``, return its path."""
        # Close the handle before anyone writes to the path by name.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            path = tmp_file.name
        registry.append(path)
        return path

# Module-level ASR wrapper instance. The Gradio event handlers registered in
# create_gradio_interface() are bound to its load_model / transcribe_audio
# methods, so the loaded model is held on this one object.
asr_interface = ParakeetASRInterface()

# Define the Gradio interface
def create_gradio_interface():
    """Build the Gradio Blocks UI for loading a model and transcribing audio.

    The layout has a model-selection row, an audio/decoder/transcript row, a
    row of example-model shortcut buttons, and a notes footer. Button clicks
    are routed to the module-level ``asr_interface`` instance.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(
        title="🦜 Parakeet ASR - Hugging Face Models",
        theme=gr.themes.Soft()
    ) as interface:

        gr.Markdown("""
        # 🦜 Parakeet Automatic Speech Recognition

        Use NVIDIA's Parakeet models from Hugging Face for speech recognition!

        **Instructions:**
        1. Enter a Hugging Face model URL or model name (e.g., `nvidia/parakeet-tdt_ctc-0.6b-ja`)
        2. Click "Load Model"
        3. Upload or record audio
        4. Choose decoder type (TDT or CTC)
        5. Click "Transcribe"
        """)

        # --- Model selection -------------------------------------------------
        with gr.Row():
            with gr.Column(scale=2):
                model_input = gr.Textbox(
                    label="🤗 Hugging Face Model URL or Name",
                    placeholder="nvidia/parakeet-tdt_ctc-0.6b-ja",
                    value="nvidia/parakeet-tdt_ctc-0.6b-ja",
                    info="Enter the full HF URL or just the model name"
                )

                load_btn = gr.Button("🔄 Load Model", variant="primary")

                model_status = gr.Textbox(
                    label="Model Status",
                    interactive=False,
                    placeholder="No model loaded"
                )

        # --- Audio input and transcription output ----------------------------
        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    label="🎤 Audio Input",
                    type="filepath",
                    format="wav"
                )

                decoder_choice = gr.Radio(
                    choices=["TDT", "CTC"],
                    value="TDT",
                    label="🔧 Decoder Type",
                    info="TDT is generally faster, CTC might be more accurate"
                )

                transcribe_btn = gr.Button("🎯 Transcribe Audio", variant="secondary")

            with gr.Column(scale=1):
                transcription_output = gr.Textbox(
                    label="📝 Transcription Result",
                    lines=6,
                    placeholder="Transcription will appear here...",
                    interactive=False
                )

                transcribe_status = gr.Textbox(
                    label="Transcription Status",
                    interactive=False,
                    placeholder="Ready for transcription"
                )

        # --- Example model shortcuts -----------------------------------------
        # The buttons below only copy a model name into the model field; the
        # model itself is loaded by the "Load Model" button, and the text says so.
        gr.Markdown("""
        ## 📚 Example Parakeet Models

        Click on any model name to copy it into the model field above, then click "Load Model":
        """)

        example_models = [
            "nvidia/parakeet-tdt_ctc-0.6b-ja",
            "nvidia/parakeet-ctc-0.6b",
            "nvidia/parakeet-tdt-1.1b",
            "nvidia/parakeet-ctc-1.1b"
        ]

        with gr.Row():
            for model in example_models:
                shortcut_btn = gr.Button(
                    model.split("/")[-1],
                    size="sm"
                )
                # Bind ``model`` as a default argument so each button keeps its
                # own name instead of the loop's final value.
                shortcut_btn.click(
                    lambda m=model: m,
                    outputs=model_input
                )

        # --- Event handlers --------------------------------------------------
        # load_model returns a 2-tuple whose second element is always "", so a
        # hidden textbox absorbs it to keep the output count matched.
        load_aux_output = gr.Textbox(visible=False)
        load_btn.click(
            fn=asr_interface.load_model,
            inputs=[model_input],
            outputs=[model_status, load_aux_output]
        )

        transcribe_btn.click(
            fn=asr_interface.transcribe_audio,
            inputs=[audio_input, decoder_choice],
            outputs=[transcription_output, transcribe_status]
        )

        # Footer
        gr.Markdown("""
        ---

        **Notes:**
        - Models are loaded from Hugging Face and may take time to download initially
        - Audio is automatically resampled to 16kHz mono as required by Parakeet models
        - TDT decoder is typically faster but CTC might provide better accuracy in some cases
        - Make sure you have sufficient GPU memory for larger models

        **Supported formats:** WAV, MP3, FLAC, M4A
        """)

    return interface

# Script entry point: build the UI and serve it.
if __name__ == "__main__":
    # Launch settings chosen for running inside Colab.
    launch_options = {
        "share": True,              # create a public link
        "debug": True,              # show detailed error messages
        "server_name": "0.0.0.0",   # allow external connections
        "server_port": 7860,        # default Gradio port
        "show_error": True,
    }

    interface = create_gradio_interface()
    interface.launch(**launch_options)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ef581e8300053e27e6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


🔄 Loading model 'nvidia/parakeet-tdt_ctc-0.6b-ja'...


parakeet-tdt_ctc-0.6b-ja.nemo:   0%|          | 0.00/2.49G [00:00<?, ?B/s]

[NeMo I 2025-07-13 21:56:28 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 3072 tokens


[NeMo W 2025-07-13 21:56:32 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: null
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 30.0
    min_duration: 0.1
    is_tarred: true
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    use_lhotse: true
    use_bucketing: true
    batch_duration: 600
    quadratic_duration: 15
    num_buckets: 30
    bucket_duration_bins:
    - 1.94375
    - 2.55687
    - 3.08312
    - 3.57138
    - 3.98812
    - 4.36069
    - 4.72375
    - 5.078
    - 5.434
    - 5.78994
    - 6.15175
    - 6.5175
    - 6.8885
    - 7.26075
    - 7.6495
    - 8.05538
    - 8.48038
    - 8.92763
    - 9.40019
    - 9.90275
    - 10.44594
    - 11.04794


[NeMo I 2025-07-13 21:56:32 nemo_logging:393] PADDING: 0
[NeMo I 2025-07-13 21:56:38 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.05, 'omega': 0.1}
[NeMo I 2025-07-13 21:56:38 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.05, 'omega': 0.1}
[NeMo I 2025-07-13 21:56:39 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.05, 'omega': 0.1}
[NeMo I 2025-07-13 21:56:52 nemo_logging:393] Model EncDecHybridRNNTCTCBPEModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--parakeet-tdt_ctc-0.6b-ja/snapshots/44edb27eea9317daf89333e75eb830db4b1cc298/parakeet-tdt_ctc-0.6b-ja.nemo.
[NeMo I 2025-07-13 21:57:35 nemo_logging:393] No `decoding_cfg` passed when changing decoding strategy, using intern

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][NeMo W 2025-07-13 21:57:37 nemo_logging:405] CTC decoding strategy 'greedy' is slower than 'greedy_batch', which implements the same exact interface. Consider changing your strategy to 'greedy_batch' for a free performance improvement.
Transcribing: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it]


[NeMo I 2025-07-13 22:02:00 nemo_logging:393] No `decoding_cfg` passed when changing decoding strategy, using internal config
[NeMo I 2025-07-13 22:02:00 nemo_logging:393] Changed decoding strategy of the CTC decoder to 
    strategy: greedy
    preserve_alignments: null
    compute_timestamps: null
    word_seperator: ' '
    segment_seperators:
    - .
    - '!'
    - '?'
    segment_gap_threshold: null
    ctc_timestamp_type: all
    batch_dim_index: 0
    greedy:
      preserve_alignments: false
      compute_timestamps: false
      preserve_frame_confidence: false
      confidence_method_cfg:
        name: entropy
        entropy_type: tsallis
        alpha: 0.33
        entropy_norm: exp
        temperature: DEPRECATED
    beam:
      beam_size: 4
      search_type: default
      preserve_alignments: false
      compute_timestamps: false
      return_best_hypothesis: true
      beam_alpha: 1.0
      beam_beta: 0.0
      kenlm_path: null
      flashlight_cfg:
        lexicon_path:

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]