In [1]:
# Install required libraries
!pip install git+https://github.com/openai/whisper.git
!pip install transformers torch
!pip install gradio
!pip install deep_translator
!pip install gtts

# Import libraries
import gradio as gr
import whisper
from transformers import pipeline
from deep_translator import GoogleTranslator
from gtts import gTTS
import torch

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-woh2qhg9
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-woh2qhg9
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [3]:
# Speech-to-text: Whisper, "base" checkpoint
whisper_model = whisper.load_model("base")

# Text translation: the M2M100 (418M) checkpoint served through the
# transformers "translation" pipeline. Device 0 is used when CUDA is
# available, otherwise -1.
translator = pipeline(
    "translation",
    model="facebook/m2m100_418M",
    device=0 if torch.cuda.is_available() else -1,
)

# UI language name -> language code used for gTTS and the fallback translator
LANGUAGE_CODES = {
    "Urdu": "ur",
    "Hindi": "hi",
    "French": "fr",
    "Spanish": "es",
    "German": "de",
    "Chinese": "zh",
    "Arabic": "ar",
    "Russian": "ru",
}

# UI language name -> M2M100 target-language code (same languages as above)
M2M100_CODES = {
    "Urdu": "ur",
    "Hindi": "hi",
    "French": "fr",
    "Spanish": "es",
    "German": "de",
    "Chinese": "zh",
    "Arabic": "ar",
    "Russian": "ru",
}

Device set to use cuda:0


In [6]:
# Function to Split Text into Chunks (for long text)
def split_text(text, max_length=500):
    """Split ``text`` into chunks of at most ``max_length`` characters.

    Chunks are broken at whitespace so that words stay intact; callers that
    re-join the chunks with a single space therefore get the original words
    back instead of words with a space inserted in the middle. A single token
    longer than ``max_length`` has no whitespace to break on and is hard-split
    at ``max_length`` characters.

    Args:
        text: The text to split. Runs of whitespace are collapsed.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of non-empty chunks, each at most ``max_length`` characters
        long. Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    current = ""
    for word in text.split():
        # Oversized token: flush what we have, then slice it at fixed offsets.
        while len(word) > max_length:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_length])
            word = word[max_length:]

        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_length:
            # +1 accounts for the joining space.
            current = f"{current} {word}"
        else:
            chunks.append(current)
            current = word

    if current:
        chunks.append(current)
    return chunks

# Function to Translate Text with the M2M100 pipeline
def translate_text(text, target_lang):
    """Translate English ``text`` into ``target_lang`` using ``translator``.

    Args:
        text: English text to translate.
        target_lang: Language name looked up in ``M2M100_CODES``; names that
            are not in the table fall back to the code ``"ur"``.

    Returns:
        The translations of the 500-character chunks of ``text``, joined with
        single spaces.
    """
    tgt = M2M100_CODES.get(target_lang, "ur")
    pieces = []
    # Translate chunk by chunk to keep each pipeline input short.
    for segment in split_text(text, 500):
        outputs = translator(segment, src_lang="en", tgt_lang=tgt)
        pieces.append(outputs[0]['translation_text'])
    return " ".join(pieces)

# Fallback Translation with GoogleTranslator (used when M2M100 fails)
def fallback_translate(text, target_lang):
    """Translate English ``text`` into ``target_lang`` via ``GoogleTranslator``.

    Args:
        text: English text to translate.
        target_lang: Language name looked up in ``LANGUAGE_CODES``; names that
            are not in the table fall back to the code ``"ur"``.

    Returns:
        The translated chunks joined with single spaces. Chunks for which the
        translator returns nothing are skipped.
    """
    # LANGUAGE_CODES holds gTTS-style codes. deep_translator's Google backend
    # lists Chinese as "zh-CN"/"zh-TW" rather than the bare "zh", so remap it.
    google_code_overrides = {"zh": "zh-CN"}
    target_code = LANGUAGE_CODES.get(target_lang, "ur")
    target_code = google_code_overrides.get(target_code, target_code)

    # Stay safely below the 5000-character per-request limit instead of
    # sending chunks that sit exactly on it.
    chunks = split_text(text, 4500)

    # One translator instance is enough; it does not depend on the chunk.
    google = GoogleTranslator(source="en", target=target_code)
    translated_chunks = []
    for chunk in chunks:
        translated = google.translate(chunk)
        if translated:
            translated_chunks.append(translated)
    return " ".join(translated_chunks)

# Function to Generate Audio from Text
def generate_audio(text, language_code):
    """Synthesize ``text`` to an MP3 file with gTTS and return its path.

    Each call writes to its own uniquely named temporary file. A single fixed
    filename would be shared by every request of the (publicly shared) app, so
    overlapping requests could overwrite or read each other's audio.

    Args:
        text: Text to speak.
        language_code: gTTS language code (e.g. ``"ur"``, ``"fr"``).

    Returns:
        Path of the generated ``.mp3`` file. The file is not deleted by this
        function.

    Raises:
        Whatever ``gTTS`` raises on construction or while saving; a partially
        written temporary file is removed before the error propagates.
    """
    import os
    import tempfile

    tts = gTTS(text=text, lang=language_code, slow=False)

    # Reserve a unique path, then close the handle so gTTS can write to it.
    with tempfile.NamedTemporaryFile(
        prefix="translated_audio_", suffix=".mp3", delete=False
    ) as handle:
        audio_file = handle.name

    try:
        tts.save(audio_file)
    except Exception:
        # Don't leave an empty/partial file behind on failure.
        if os.path.exists(audio_file):
            os.remove(audio_file)
        raise
    return audio_file

# Process Audio Function
def process_audio(audio_file, target_language):
    """Transcribe an audio file, translate the text, and synthesize speech.

    Steps: Whisper transcription -> M2M100 translation (Google Translate as a
    fallback when that raises) -> gTTS audio of the translated text.

    Failures in a later step do not discard the results of earlier steps: if
    translation fails entirely the transcription is still returned, and if
    speech synthesis fails the transcription and translation are still
    returned with no audio.

    Args:
        audio_file: Path of the audio file to process; falsy means no input.
        target_language: Language name as listed in ``LANGUAGE_CODES``.

    Returns:
        A ``(transcription, translated_text, translated_audio)`` tuple.
        ``translated_audio`` is a file path, or ``None`` when no audio was
        produced. When input is missing or no speech is detected, the first
        element carries a warning message and the other two are ``""`` and
        ``None``.
    """
    if not audio_file:
        return "⚠️ No audio input provided.", "", None

    # Transcribe with Whisper
    result = whisper_model.transcribe(audio_file)
    transcription = result.get("text", "").strip()

    if not transcription:
        return "⚠️ No speech detected in the input.", "", None

    # Translate the text with M2M100, falling back to Google Translate
    try:
        translated_text = translate_text(transcription, target_language)
    except Exception as e:
        print(f"Translation with M2M100 failed: {e}, falling back to Google Translate")
        try:
            translated_text = fallback_translate(transcription, target_language)
        except Exception as fallback_error:
            # Both translators failed: keep the transcription for the user.
            print(f"Fallback translation failed: {fallback_error}")
            return transcription, "⚠️ Translation failed.", None

    # gTTS cannot speak empty text, so stop here rather than let it raise.
    if not translated_text or not translated_text.strip():
        return transcription, "⚠️ Translation failed.", None

    # Generate audio from translated text
    target_code = LANGUAGE_CODES.get(target_language, "ur")
    try:
        translated_audio = generate_audio(translated_text, target_code)
    except Exception as e:
        # Speech synthesis is best-effort; the text results are still useful.
        print(f"Audio generation failed: {e}")
        translated_audio = None

    return transcription, translated_text, translated_audio

In [8]:
# Gradio callback: route the chosen input to the processing pipeline
def gradio_interface(input_choice, audio_file, microphone_input, target_language):
    """Run ``process_audio`` on the input that matches ``input_choice``.

    Args:
        input_choice: ``"Upload Audio File"`` or ``"Record Voice"``.
        audio_file: Value of the upload component.
        microphone_input: Value of the recording component.
        target_language: Language name passed through to ``process_audio``.

    Returns:
        A ``(transcription, translated_text, translated_audio)`` tuple, or a
        warning tuple when ``input_choice`` is neither known option.
    """
    # Guard clause: anything other than the two known choices is rejected.
    if input_choice not in ("Upload Audio File", "Record Voice"):
        return "⚠️ Please select an input method.", "", None

    selected_audio = audio_file if input_choice == "Upload Audio File" else microphone_input

    text_out, translation_out, audio_out = process_audio(selected_audio, target_language)
    return text_out, translation_out, audio_out

# Show only the input component that matches the selected input method
def toggle_inputs(input_choice):
    """Return visibility updates for the upload and recording components.

    Args:
        input_choice: ``"Upload Audio File"`` or ``"Record Voice"``.

    Returns:
        A pair of ``gr.update`` objects, first for the upload component and
        second for the recording component. Any other choice hides both.
    """
    show_upload = input_choice == "Upload Audio File"
    # The `not show_upload` guard keeps the two flags mutually exclusive.
    show_recorder = (not show_upload) and input_choice == "Record Voice"
    return gr.update(visible=show_upload), gr.update(visible=show_recorder)

# Gradio UI
# Components are declared in the order they should appear on the page.
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Whisper + LLaMA Speech Translator 🌍")
    gr.Markdown("Upload an audio file or record your voice, select a language, and get transcription + translation + translated audio.")

    # Radio button to choose input method
    input_choice = gr.Radio(choices=["Upload Audio File", "Record Voice"], label="🎙️ Choose Input Method", value="Upload Audio File")

    # Audio and Microphone components; only the one matching the radio
    # selection is visible (see toggle_inputs)
    audio_file = gr.Audio(type="filepath", label="📂 Upload an MP3/WAV file", visible=True)
    microphone_input = gr.Audio(type="filepath", interactive=True, label="🎤 Record Your Voice", visible=False)  # Removed 'source'

    # Dropdown for language selection.
    # Without a preselected value the callback receives None and the pipeline
    # silently falls back to Urdu; preselecting "Urdu" makes that default
    # visible to the user.
    target_language = gr.Dropdown(
        choices=list(LANGUAGE_CODES.keys()),
        value="Urdu",
        label="🌍 Select Translation Language",
    )

    # Output components
    transcription = gr.Textbox(label="📝 Transcription (English)")
    translation = gr.Textbox(label="🌎 Translation")
    translated_audio = gr.Audio(label="🎧 Translated Audio", type="filepath")

    # Button to submit
    submit_button = gr.Button("Submit")

    # Toggle input components based on user choice
    input_choice.change(
        toggle_inputs,
        inputs=input_choice,
        outputs=[audio_file, microphone_input]
    )

    # Process the input when the submit button is clicked
    submit_button.click(
        gradio_interface,
        inputs=[input_choice, audio_file, microphone_input, target_language],
        outputs=[transcription, translation, translated_audio]
    )

# Launch Gradio App (share=True also exposes it through a public URL)
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d85dac9c54b5a420aa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


