In [11]:
import speech_recognition as sr
import pyaudio
import os
from openai import OpenAI
from gtts import gTTS
import pygame
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


Speech Recognition

In [12]:

recognizer = sr.Recognizer()

def speech_to_text():
    with sr.Microphone() as source:
        print("Adjusting for ambient noise...")
        recognizer.adjust_for_ambient_noise(source, duration=1) 
        print("Please speak...")
        try:
            audio_data = recognizer.listen(source, timeout=5) 
            print("Recognizing with PocketSphinx...")
            text = recognizer.recognize_sphinx(audio_data)
            print("Recognized Text: " + text)
        except sr.UnknownValueError:
            print("Could not understand the audio.")
        except Exception as e:
            print(f"Error: {e}")
        return text


LLM

In [13]:
def text_to_meaning(input):
    monster_api_key = os.getenv('MONSTER_API_KEY')

    generation_model_name: str
    temperature: float = 0.9
    top_p = 0.9
    max_tokens: int = 2048
    stream: bool = False  
    llm_name: str = "Meta-Llama"

    monster_client = OpenAI(
        base_url="https://llm.monsterapi.ai/v1/",
        api_key=monster_api_key
    )

    monster_ai_model_name = {
        "Google-Gemma": "google/gemma-2-9b-it",
        "Mistral": "mistralai/Mistral-7B-Instruct-v0.2",
        "Microsoft-Phi": "microsoft/Phi-3-mini-4k-instruct",
        "Meta-Llama": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    }
    message = [
                {"role": "system", "content": "You are an AI assistant. Kindly answer the following question:"},
                {"role": "user", "content": input}
            ]

    response = monster_client.chat.completions.create( model=monster_ai_model_name[llm_name], messages=message, temperature=temperature, top_p=top_p, max_tokens=max_tokens, stream=False)
    return (response.choices[0].message.content)

Text to Speech

In [14]:
def speak_output(output):
    language = "en"
    myobj = gTTS(text=output,lang = language,slow = False)
    myobj.save("welcome.mp3")
    pygame.mixer.init()
    pygame.mixer.music.load("welcome.mp3")
    pygame.mixer.music.play()

Main Function

In [16]:
# orchestrator

input_text = speech_to_text()
# input_text = "Whats full form of AI"
output_text = text_to_meaning(input_text)
# speak_output(output_text)

with gr.Blocks(theme=gr.themes.Glass()) as interface:
    gr.Markdown("## Speech 2 Speech LLM Application")
    gr.Markdown("_Developed for Tensorgo_")
    
    with gr.Tab("🎤 Ask Your Question"):
        input_button = gr.Button("🎙️ Start Listening")
        output_textbox = gr.Textbox(label="AI Response")
        
        input_button.click(
            fn=lambda: text_to_meaning(speech_to_text()),
            outputs=output_textbox,
        )
    
    with gr.Tab("ℹ️ About"):
        gr.Markdown("This application converts spoken questions into text, processes them using an LLM, and speaks the response back.")
        gr.Markdown("**Features:**")
        gr.Markdown("- Speech-to-Text Recognition")
        gr.Markdown("- Natural Language Processing with an LLM")
        gr.Markdown("- Text-to-Speech Audio Response")
    
    with gr.Tab("⚙ Settings"):
        model_dropdown = gr.Dropdown(
            label="Select AI Model",
            choices=["Google-Gemma", "Mistral", "Microsoft-Phi", "Meta-Llama"],
            value="Meta-Llama",
        )
        gr.Markdown("Adjust settings as needed.")
    

interface.launch()

Adjusting for ambient noise...
Please speak...
Recognizing with PocketSphinx...
Recognized Text: hello
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 4.16.0, however version 4.44.1 is available, please upgrade.
--------
Adjusting for ambient noise...
Please speak...
Recognizing with PocketSphinx...
Recognized Text: plenty misspoke
