<a href="https://colab.research.google.com/github/nadiralam/voicechatbot/blob/main/Voice_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install -q openai-whisper

In [10]:
!pip install -q gradio

In [11]:
!pip install -q gtts


In [12]:
# Imports
import google.generativeai as genai
import whisper
import gradio as gr
import numpy as np
from gtts import gTTS
from PIL import Image
import datetime
import os
import torch




In [13]:
# Gemini API key: fetched at runtime through google.colab.userdata under the
# name 'Gemini_key', so the key itself is never written into the notebook.
# NOTE(review): presumably this requires a Colab secret called 'Gemini_key'
# to be defined and shared with this notebook -- confirm before running.
from google.colab import userdata
GENAI_API_KEY = userdata.get('Gemini_key')

# Hand the key to the google.generativeai client used by the Gemini calls.
genai.configure(api_key=GENAI_API_KEY)


In [14]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

Using torch 2.6.0+cu124 (cuda)


In [15]:
## Load the Whisper model
# The "medium" checkpoint is placed on DEVICE; the first run downloads the
# weights (about 1.42 GB according to the progress bar output of this cell).

model = whisper.load_model("medium", device=DEVICE)

100%|█████████████████████████████████████| 1.42G/1.42G [00:17<00:00, 88.5MiB/s]


In [16]:
# Logger
# One log file per kernel session, named after the session start time.
# The timestamp is built from digits, '-' and '_' only: the default
# str(datetime) form contains ':' which is not a legal character in
# Windows file names.
tstamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logfile = f'{tstamp}_log.txt'


def writehistory(text):
    """Append one entry to the session log file.

    Args:
        text: Message to record. It is formatted with str(), so non-string
            values are accepted; a trailing newline is added.
    """
    # Append mode keeps earlier entries; UTF-8 so non-ASCII prompts survive.
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(f'{text}\n')



In [17]:
# Whisper transcription
def transcribe(audio):
    """Transcribe a recorded audio file with the loaded Whisper model.

    Args:
        audio: Path to an audio file, or None / '' when nothing was recorded.

    Returns:
        str: The decoded text, or '' when no audio was supplied.
    """
    # Always return a str. The caller interpolates the result into the Gemini
    # prompt, so the previous 3-tuple return ('', '', None) ended up verbatim
    # in the prompt and in the "Speech to Text" box.
    if not audio:
        return ''

    waveform = whisper.load_audio(audio)
    # Whisper decodes a fixed 30-second window: shorter clips are padded,
    # longer clips are cut off at 30 seconds.
    waveform = whisper.pad_or_trim(waveform)
    # Take the mel-bin count from the model instead of relying on the default
    # of 80, so swapping in a checkpoint with a different value keeps working.
    mel = whisper.log_mel_spectrogram(waveform, n_mels=model.dims.n_mels).to(model.device)
    # DecodingOptions defaults to fp16=True, which is only supported on GPU;
    # request fp32 when the model lives on the CPU.
    options = whisper.DecodingOptions(fp16=(model.device.type == "cuda"))
    result = whisper.decode(model, mel, options)
    return result.text



In [18]:
# Gemini image+text prompt
def img2txt(prompt_text, image_path):
    """Ask Gemini to answer a text prompt about an image.

    Args:
        prompt_text: The instruction/question sent alongside the image.
        image_path: Path to the image file, or None / '' when none was given.

    Returns:
        str: Gemini's text reply; "No image provided." when image_path is
        empty; or "Error: <message>" when opening the image or calling the
        API fails.
    """
    if not image_path:
        return "No image provided."
    writehistory(f"Input prompt: {prompt_text}")

    try:
        # Opening the image inside the try block turns an unreadable or
        # corrupt file into an "Error: ..." reply instead of an uncaught
        # exception; the context manager closes the file handle afterwards.
        with Image.open(image_path) as img:
            # Named `gemini` so it does not shadow the global Whisper `model`.
            gemini = genai.GenerativeModel("gemini-1.5-flash")
            response = gemini.generate_content([prompt_text, img])
            return response.text
    except Exception as e:
        # Best-effort by design: the UI shows the failure text to the user.
        return f"Error: {str(e)}"



In [19]:
# Text-to-speech
def text_to_speech(text, file_path="Temp3.mp3"):
    """Synthesize English speech for `text` with gTTS and save it as an MP3.

    Args:
        text: The text to speak.
        file_path: Destination of the MP3 file. The default is a fixed name
            in the working directory, so each call overwrites the last one.
            NOTE(review): concurrent users of the shared app would overwrite
            each other's file -- consider passing a unique path per request.

    Returns:
        str | None: `file_path` after the audio has been written, or None
        when `text` is empty/whitespace-only and there is nothing to speak.
    """
    # gTTS refuses empty input ("No text to speak"); skip synthesis instead
    # of raising so the caller can simply show no audio.
    if not text or not str(text).strip():
        return None
    audioobj = gTTS(text=text, lang='en', slow=False)
    audioobj.save(file_path)
    return file_path



In [20]:
# Final pipeline
def process_inputs(audio_path, image_path):
    """Run the full voice -> Gemini -> voice pipeline for one request.

    Args:
        audio_path: Path to the recorded prompt, or None when nothing was
            recorded.
        image_path: Path to the uploaded image, or None when none was given.

    Returns:
        tuple: (speech_text, img_text, audio_reply_path) -- the transcribed
        prompt, Gemini's text reply, and the path of the spoken reply (None
        when there is no reply text to speak).
    """
    # Without a recording there is nothing to transcribe; keep speech_text a
    # plain string so it can be interpolated into the prompt and displayed.
    speech_text = transcribe(audio_path) if audio_path else ""
    prompt = f"Act as an expert in imagery descriptive analysis. Respond to the prompt: {speech_text}"
    img_text = img2txt(prompt, image_path)
    # Only synthesize speech when there is something to say; an empty reply
    # would make the TTS step fail.
    if img_text and img_text.strip():
        audio_reply_path = text_to_speech(img_text)
    else:
        audio_reply_path = None
    return speech_text, img_text, audio_reply_path



In [21]:
# Gradio UI
# Input widgets: a microphone recording and an uploaded image, both handed to
# process_inputs as file paths.
voice_input = gr.Audio(sources=["microphone"], type="filepath")
image_input = gr.Image(type="filepath")

# Output widgets, in the order process_inputs returns its three values.
transcript_box = gr.Textbox(label="Speech to Text")
reply_box = gr.Textbox(label="Gemini Output")
reply_audio = gr.Audio(label="Gemini Audio Response")

app = gr.Interface(
    fn=process_inputs,
    inputs=[voice_input, image_input],
    outputs=[transcript_box, reply_box, reply_audio],
    title="Gemini + Whisper Voice Chatbot",
    description="Speak your prompt, upload an image, and get image analysis powered by Gemini.",
)



In [None]:
# Start the Gradio app. With debug=True the cell keeps running so errors and
# logs stay visible in the notebook (set debug=False to release the cell).
# On a hosted notebook Gradio enables share=True automatically and prints a
# temporary public URL; pass share=False explicitly to turn that off.
app.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://838eb585e929701ab2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
