# Complete speech-to-speech translation pipeline: from one language to another


In [None]:
import whisper
import speech_recognition as sr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import os
from Language_map import LANGUAGE_MAP 

In [None]:
def transcribe_from_mic(language):
    """Record one utterance from the microphone and transcribe it with Whisper.

    Args:
        language: Human-readable language name (e.g. ``"kannada"``). It is
            lower-cased and looked up in ``LANGUAGE_MAP`` to obtain the
            Whisper language code; names without a ``"whisper"`` entry fall
            back to ``"en"``.

    Returns:
        The transcription, i.e. the ``"text"`` field of Whisper's result.

    Exceptions raised by ``Recognizer.listen`` (for example when no speech
    starts within the 10-second timeout) propagate to the caller.
    """
    import tempfile  # local import: only this function needs it

    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print(f"Speak Now ({language})...")
        audio = recognizer.listen(source, timeout=10, phrase_time_limit=15)
    # The microphone is released here, before the (slow) model load below.

    # Use a unique temporary file instead of a fixed "temp.wav" in the working
    # directory, so an existing file is never clobbered, concurrent runs do
    # not collide, and nothing is left behind afterwards.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    try:
        with os.fdopen(fd, "wb") as wav_file:
            wav_file.write(audio.get_wav_data())

        # NOTE: the "large" checkpoint is reloaded on every call; callers that
        # transcribe repeatedly may want to cache the model themselves.
        model = whisper.load_model("large")
        whisper_code = LANGUAGE_MAP.get(language.lower(), {}).get("whisper", "en")
        result = model.transcribe(wav_path, language=whisper_code)
    finally:
        os.remove(wav_path)
    return result["text"]

In [None]:
def translate_text(text, src_language, tgt_language, model_dir):
    """Translate ``text`` between two languages with a seq2seq model.

    Args:
        text: The text to translate.
        src_language: Human-readable name of the language ``text`` is in.
        tgt_language: Human-readable name of the language to translate into.
        model_dir: Path or identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.

    Both language names are lower-cased and looked up in ``LANGUAGE_MAP``
    under the ``"nllb"`` key; names without an entry fall back to
    ``"eng_Latn"``.

    Returns:
        The decoded translation with special tokens stripped, or ``""`` when
        ``text`` is empty or contains only whitespace.
    """
    # Nothing to translate (e.g. the recogniser heard only silence): skip the
    # expensive model load and avoid generating output from empty input.
    if not text or not text.strip():
        return ""

    src_code = LANGUAGE_MAP.get(src_language.lower(), {}).get("nllb", "eng_Latn")
    tgt_code = LANGUAGE_MAP.get(tgt_language.lower(), {}).get("nllb", "eng_Latn")

    # Tell the tokenizer which language the input is in. Without this the
    # source code was computed but never used, so every input was encoded
    # with the tokenizer's default source language.
    tokenizer = AutoTokenizer.from_pretrained(model_dir, src_lang=src_code)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

    inputs = tokenizer(text, return_tensors="pt")
    translated = model.generate(
        **inputs,
        # Language codes are ordinary vocabulary tokens; looking the id up via
        # convert_tokens_to_ids avoids the `lang_code_to_id` mapping, which is
        # deprecated/removed in recent transformers releases.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
    )
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [None]:
def speak_text(text, language):
    """Synthesize ``text`` with gTTS, save it as ``output.mp3`` and play it.

    Args:
        text: The text to speak. Empty or whitespace-only text is skipped
            with a warning.
        language: Human-readable language name. It is lower-cased and looked
            up in ``LANGUAGE_MAP`` under the ``"gtts"`` key; a missing entry
            defaults to ``"en"``, and an entry explicitly set to ``None``
            falls back to ``"en"`` with a warning.

    Returns:
        None. The audio is written to ``output.mp3`` in the current working
        directory and handed to the platform's default opener.
    """
    import sys  # local import: only needed for platform detection below

    # There is nothing to synthesize (e.g. the transcription was empty), so
    # skip the TTS request rather than sending empty text to gTTS.
    if not text or not text.strip():
        print("Warning: no text to speak, skipping speech synthesis.")
        return

    gtts_code = LANGUAGE_MAP.get(language.lower(), {}).get("gtts", "en")
    if gtts_code is None:
        print(f"Warning: gTTS does not support language '{language}', using English instead.")
        gtts_code = "en"
    tts = gTTS(text, lang=gtts_code)
    tts.save("output.mp3")

    # os.name is "posix" on both Linux and macOS, so it cannot tell them
    # apart; check sys.platform to pick `open` on macOS and `xdg-open` on
    # other non-Windows systems.
    if os.name == "nt":
        os.system("start output.mp3")
    elif sys.platform == "darwin":
        os.system("open output.mp3")
    else:
        os.system("xdg-open output.mp3")

In [None]:
if __name__ == "__main__":
    # Demo configuration. The model path is a placeholder: point it at a
    # local quantized NLLB checkpoint (e.g. one from the
    # NLLB-200-distilled-600 directory).
    model_path = "path/to/quantized-nllb"
    tgt_language = "spanish"  # language the translation is spoken in
    src_language = "kannada"  # language expected from the microphone

    # Stage 1: speech -> text.
    input_text = transcribe_from_mic(language=src_language)
    print(f"Transcribed text ({src_language}):", input_text)

    # Stage 2: text -> translated text.
    translated_text = translate_text(
        text=input_text,
        src_language=src_language,
        tgt_language=tgt_language,
        model_dir=model_path,
    )
    print(f"Translated text ({tgt_language}):", translated_text)

    # Stage 3: translated text -> speech.
    speak_text(text=translated_text, language=tgt_language)