In [1]:
import sys, platform, time, os
# Log the interpreter build and OS so the outputs below can be tied to a setup.
for label, value in (
    ("Python:", sys.version),
    ("OS:", platform.platform()),
):
    print(label, value)


Python: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]
OS: Windows-11-10.0.26100-SP0


In [10]:
!pip install --quiet sounddevice soundfile numpy SpeechRecognition pyttsx3 google-generativeai
# Optional (better STT):
# !pip install --quiet openai-whisper torch --index-url https://download.pytorch.org/whl/cpu


In [88]:
import sounddevice as sd, soundfile as sf, numpy as np

# Capture settings: sample rate in Hz, clip length in seconds, output path.
SAMPLERATE = 16000
DURATION = 6
WAV_FILE = "input.wav"

print("Recording… (start speaking)")
frame_count = int(DURATION * SAMPLERATE)
audio = sd.rec(
    frame_count,
    samplerate=SAMPLERATE,
    channels=1,
    dtype="float32",
)
sd.wait()  # return only once the capture has finished

# Apply a fixed 1.8x gain, then clamp back into the [-1, 1] float range.
boosted = audio * 1.8
audio = np.clip(boosted, -1.0, 1.0)

sf.write(WAV_FILE, audio, SAMPLERATE)
print("Saved:", WAV_FILE)


Recording… (start speaking clearly)
Saved: input.wav


In [97]:
import speech_recognition as sr

recognizer = sr.Recognizer()

# record() captures the file as-is. Ambient-noise calibration is intentionally
# not run here: it only tunes the energy threshold that listen() relies on, and
# on a file source any audio it reads is consumed before record() can see it.
with sr.AudioFile(WAV_FILE) as source:
    audio_data = recognizer.record(source)

# Start from an empty transcript so every failure path leaves `user_text`
# defined for the cells that follow.
user_text = ""
try:
    user_text = recognizer.recognize_google(audio_data, language="en-US")
    print("You said:", user_text)
except sr.UnknownValueError:
    # The service answered but could not make out any speech.
    print("Couldn’t understand the audio.")
except sr.RequestError as e:
    # The request to the recognition service itself failed.
    print("Google SR request error:", e)


You said: I am Soumya email Soumya at Gmail.com


In [98]:
 import google.generativeai as genai
import time

USE_GEMINI = True  # False for mock reply, use true when use gemini key
GEMINI_MODEL = "models/gemini-1.5-pro-latest"

GEMINI_API_KEY = os.getenv("", "")   
if USE_GEMINI and not GEMINI_API_KEY:
    print("⚠️ No Gemini API key found. Switching to mock.")
    USE_GEMINI = False
else:
    genai.configure(api_key=GEMINI_API_KEY)

def ai_reply_from_gemini(text):
    """Ask the configured Gemini model to answer ``text`` and time the call.

    Args:
        text: Transcribed user utterance, embedded verbatim in the prompt.

    Returns:
        A ``(reply, latency_ms)`` tuple. ``reply`` is the response text, or a
        ``"(fallback) I heard: ..."`` string when any step of the call raised.
        ``latency_ms`` is the elapsed time in milliseconds in both cases.
    """
    prompt = f"You are a concise voice agent. User said: {text}"
    # perf_counter() is monotonic and high resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    try:
        model = genai.GenerativeModel(GEMINI_MODEL)
        resp = model.generate_content(prompt)
        reply = getattr(resp, "text", str(resp))
    except Exception as e:
        # Deliberate best-effort: a failed model call must not break the voice
        # loop, so report the error and echo the input instead.
        reply = f"(fallback) I heard: {text}"
        print("Gemini error:", e)
    latency = (time.perf_counter() - t0) * 1000
    return reply, latency

def ai_reply_mock(text, delay_s=0.15):
    """Return a canned echo reply after a simulated model delay.

    Args:
        text: Transcribed user utterance to echo back.
        delay_s: Seconds to sleep to imitate model latency. Defaults to 0.15,
            the value that was previously hard-coded.

    Returns:
        A ``(reply, latency_ms)`` tuple, where ``reply`` is
        ``"(mock) You said: <text>"`` and ``latency_ms`` is the measured
        elapsed time in milliseconds.
    """
    # perf_counter() is the monotonic clock intended for measuring intervals.
    t0 = time.perf_counter()
    time.sleep(delay_s)
    latency = (time.perf_counter() - t0) * 1000
    return f"(mock) You said: {text}", latency

# Nothing was transcribed: skip the model call and use a placeholder reply.
if not user_text.strip():
    ai_text, model_latency_ms = "(no input)", 0
else:
    # Both backends share the (text) -> (reply, latency_ms) contract.
    reply_fn = ai_reply_from_gemini if USE_GEMINI else ai_reply_mock
    ai_text, model_latency_ms = reply_fn(user_text)
    print("AI reply:", ai_text)
    print(f"Model latency: {model_latency_ms:.1f} ms")


⚠️ No Gemini API key found. Switching to mock.
AI reply: (mock) You said: I am Soumya email Soumya at Gmail.com
Model latency: 150.2 ms


In [99]:
import pyttsx3

# Speak the current reply aloud. `engine` is kept at notebook scope because the
# timing cell further down reuses the same object.
engine = pyttsx3.init()
# Speaking rate; presumably words per minute per the pyttsx3 docs — confirm.
engine.setProperty('rate', 185)   
print("Speaking:", ai_text)
# say() hands the text to the engine; runAndWait() is then called to process it
# (presumably blocking until playback finishes — confirm against pyttsx3 docs).
engine.say(ai_text)
engine.runAndWait()


Speaking: (mock) You said: I am Soumya email Soumya at Gmail.com


In [100]:
import re, json

form = {"name": "", "email": ""}

# Lowercase once so every pattern below can be written in lowercase only.
txt = (user_text or "").lower()

# Name extraction: capture the single word that follows "my name is" / "i am".
# The leading \b stops the trigger phrase from matching inside a longer word.
m = re.search(r"\b(?:my name is|i am)\s+([a-z]+)", txt)
if m:
    form["name"] = m.group(1).title()

# Email extraction: prefer a literal address (user@host.tld) when one exists.
m = re.search(r"([a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,})", txt)
if m:
    form["email"] = m.group(1)
else:
    # Spoken form such as "email soumya at gmail": only the listed providers
    # are recognised, and the address is rebuilt with a ".com" suffix.
    m = re.search(
        r"\b(?:my email is|email)\s+([a-z0-9._%+-]+)\s*(?:at)?\s*(gmail|yahoo|outlook)",
        txt,
    )
    if m:
        form["email"] = f"{m.group(1)}@{m.group(2)}.com"

print("Form after extraction:")
print(json.dumps(form, indent=2))


Form after extraction:
{
  "name": "Soumya",
  "email": "soumya@gmail.com"
}


In [94]:
# Time one model call followed by speaking the current reply.
# perf_counter() is monotonic and high resolution, which makes it the right
# clock for intervals; time.time() can shift if the system clock is adjusted.
t_start = time.perf_counter()
# Only the duration of this call matters, so the reply is discarded. A named
# throwaway is used instead of `_`, which IPython reserves for the last output.
timing_reply = (ai_reply_from_gemini if USE_GEMINI else ai_reply_mock)(user_text)[0]
t_model_end = time.perf_counter()

tts_start = time.perf_counter()
engine.say(ai_text)
engine.runAndWait()
t_end = time.perf_counter()

# Convert seconds to milliseconds for reporting.
model_ms  = (t_model_end - t_start) * 1000
tts_ms    = (t_end - tts_start) * 1000
total_ms  = (t_end - t_start) * 1000

print(f"Model: {model_ms:.1f} ms | TTS: {tts_ms:.1f} ms | Total: {total_ms:.1f} ms")


Model: 151.0 ms | TTS: 181.1 ms | Total: 332.3 ms


In [101]:
# Resolve the component labels up front so the template stays easy to read.
asr_label = "SpeechRecognition" if "speech_recognition" in sys.modules else "Whisper"
llm_label = f"Gemini {GEMINI_MODEL}" if USE_GEMINI else "Mock reply"

# Markdown-formatted summary built from the latencies measured earlier.
report = f"""
# Performance Report (Notebook)

- ASR: {asr_label}
- LLM: {llm_label}
- TTS: pyttsx3 (offline)
- Latencies:
  - Model: ~{model_ms:.0f} ms
  - TTS: ~{tts_ms:.0f} ms
  - Total: ~{total_ms:.0f} ms

## Why not <500 ms?
- This is single-shot. Streaming STT/LLM/TTS would be needed.
"""
print(report)



# Performance Report (Notebook)

- ASR: SpeechRecognition
- LLM: Mock reply
- TTS: pyttsx3 (offline)
- Latencies:
  - Model: ~151 ms
  - TTS: ~181 ms
  - Total: ~332 ms

## Why not <500 ms?
- This is single-shot. Streaming STT/LLM/TTS would be needed.

