In [1]:
# ==============================================
# Real-Time Speech-to-Text System (Online + Offline)
# Fully Working in Jupyter Notebook
# ==============================================

# ----------------------------
# 1. Install required libraries
# ----------------------------
!pip install SpeechRecognition vosk pandas sounddevice soundfile

# ----------------------------
# 2. Imports
# ----------------------------
import os
import json
import tempfile
import wave
import sounddevice as sd
import soundfile as sf
import numpy as np
import pandas as pd
import speech_recognition as sr
from vosk import Model, KaldiRecognizer
import subprocess

# ----------------------------
# 3. Load Vosk Model
# ----------------------------
VOSK_MODEL_PATH = "vosk-model-small-en-us-0.15"  # download and unzip this model first

# Always bind `vosk_model` so later code can test it for None; previously the
# name was left undefined when the model directory was missing, which surfaced
# later as a NameError far away from the real cause.
vosk_model = None
if os.path.exists(VOSK_MODEL_PATH):
    vosk_model = Model(VOSK_MODEL_PATH)
else:
    print("⚠️ Please download Vosk model from https://alphacephei.com/vosk/models "
          "and unzip it as 'vosk-model-small-en-us-0.15'")

# ----------------------------
# 4. Record audio from mic
# ----------------------------
def record_audio(duration=5, fs=16000):
    """Record mono 16-bit audio through sounddevice and save it as a temporary WAV file.

    Args:
        duration: Recording length in seconds.
        fs: Sample rate in Hz, used both for recording and for the written file.

    Returns:
        Path of the WAV file. The file is created with ``delete=False`` and is
        not removed by this function, so the caller owns its cleanup.

    Raises:
        Whatever ``sounddevice`` or ``soundfile`` raise; if writing fails the
        partially created temporary file is removed before re-raising.
    """
    print(f"🎤 Speak something... (Recording for {duration} seconds)")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='int16')
    sd.wait()  # block until the recording buffer has been filled

    # Reserve a unique path, then close the handle immediately: leaving it open
    # leaks a file descriptor, and on Windows a NamedTemporaryFile cannot be
    # reopened by name while it is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        wav_path = tmp_file.name

    try:
        sf.write(wav_path, audio, fs)
    except Exception:
        # Do not leave an empty/corrupt file behind when the write fails.
        os.remove(wav_path)
        raise

    print(f"Recording saved to {wav_path}")
    return wav_path

# ----------------------------
# 5. Google Speech API (Online)
# ----------------------------
def recognize_google(filename):
    """Transcribe an audio file online via ``Recognizer.recognize_google``.

    Args:
        filename: Path to an audio file that ``sr.AudioFile`` can open.

    Returns:
        The recognized text on success. If the speech is unintelligible or the
        request to the service fails, a human-readable warning string is
        returned instead of raising.

    Raises:
        Errors from opening or reading ``filename`` are not caught here and
        propagate to the caller.
    """
    recognizer = sr.Recognizer()

    # Load the whole clip into memory and close the file before the network
    # call, so the file handle is not held open for the duration of the request.
    with sr.AudioFile(filename) as source:
        audio_data = recognizer.record(source)

    try:
        print("🔍 Recognizing with Google API...")
        text = recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "⚠️ Could not understand audio. Speak more clearly."
    except sr.RequestError:
        return "⚠️ Google API unavailable. Check your internet."

    print("✅ Speech successfully converted to text!")
    return text

# ----------------------------
# 6. Vosk Recognition (Offline)
# ----------------------------
def recognize_vosk(filename):
    """Transcribe an audio file offline with Vosk, decoding it through ffmpeg.

    ffmpeg converts ``filename`` to 16 kHz mono signed 16-bit little-endian PCM
    on its stdout; that stream is fed to a ``KaldiRecognizer`` in 4000-byte
    chunks.

    Args:
        filename: Path to an audio file passed to ffmpeg as its input.

    Returns:
        The recognized text, or a warning string when the model is not loaded,
        ffmpeg exits with an error, nothing was recognized, or any other
        exception occurs. Exceptions are reported in the returned string
        rather than raised.
    """
    # The notebook binds `vosk_model` only when the model directory exists, so
    # look it up defensively and fail with a clear message before starting any
    # external process.
    model = globals().get("vosk_model")
    if model is None:
        return "⚠️ Vosk error: model is not loaded (check VOSK_MODEL_PATH)."

    try:
        print("🔍 Recognizing with Vosk (offline)...")
        # Build the recognizer first: if this fails, no ffmpeg process has been
        # spawned yet and nothing can be orphaned.
        rec = KaldiRecognizer(model, 16000)
        ffmpeg_cmd = [
            "ffmpeg", "-loglevel", "quiet", "-i", filename,
            "-ar", "16000", "-ac", "1", "-f", "s16le", "-",
        ]
        segments = []
        # As a context manager, Popen closes the pipe and waits for ffmpeg on
        # exit (also when an exception is raised), so no zombie is left behind.
        with subprocess.Popen(ffmpeg_cmd, stdout=subprocess.PIPE) as process:
            while True:
                data = process.stdout.read(4000)
                if not data:
                    break
                if rec.AcceptWaveform(data):
                    segments.append(json.loads(rec.Result()).get("text", ""))

        # "-loglevel quiet" hides ffmpeg's own diagnostics, so the exit status
        # is the only signal that decoding failed (e.g. unreadable input).
        if process.returncode != 0:
            return f"⚠️ Vosk error: ffmpeg exited with code {process.returncode}"

        segments.append(json.loads(rec.FinalResult()).get("text", ""))
        # Skip empty segments so the joined text has no doubled spaces.
        result_text = " ".join(segment for segment in segments if segment).strip()
        if not result_text:
            return "⚠️ Could not understand audio. Speak more clearly."
        print("✅ Speech successfully converted to text!")
        return result_text
    except Exception as e:
        return f"⚠️ Vosk error: {e}"

# ----------------------------
# 7. Compare Methods
# ----------------------------
def compare_methods(filename):
    """Run both recognizers on the same file and tabulate their outputs.

    The Google recognizer is invoked first, then the Vosk recognizer.

    Args:
        filename: Path of the audio file handed to both recognizers.

    Returns:
        A DataFrame with a "Method" column naming each recognizer and an
        "Output" column holding whatever string that recognizer returned.
    """
    google_output = recognize_google(filename)
    vosk_output = recognize_vosk(filename)
    return pd.DataFrame(
        {
            "Method": ["Google API", "Vosk (Offline)"],
            "Output": [google_output, vosk_output],
        }
    )

# ----------------------------
# 8. Run Everything
# ----------------------------
audio_file = record_audio(duration=5)  # Record 5 seconds
comparison_df = compare_methods(audio_file)

print("\n=== Comparison Table ===")
# `display` is provided by the IPython/Jupyter kernel and renders the frame as a rich table.
display(comparison_df)

# ----------------------------
# 9. Observations / Notes
# ----------------------------
print("\n📌 Observations:")
print("- Google API is more accurate with noisy or fast speech but requires internet.")
print("- Vosk works offline but may struggle with unclear or soft speech.")
print("- For future improvements, you can integrate Whisper (OpenAI) for better offline performance.")




LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from vosk-model-small-en-us-0.15/graph/HCLr.fst vosk-model-small-en-us-0.15/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo vosk-model-small-en-us-0.15/graph/phones/word_boundary.int


🎤 Speak something... (Recording for 5 seconds)
Recording saved to /var/folders/q0/f4378byd6qdfkx4nnwfw_rk80000gn/T/tmpobvl72nc.wav
🔍 Recognizing with Google API...
✅ Speech successfully converted to text!
🔍 Recognizing with Vosk (offline)...
✅ Speech successfully converted to text!

=== Comparison Table ===


Unnamed: 0,Method,Output
0,Google API,hello
1,Vosk (Offline),hello



📌 Observations:
- Google API is more accurate with noisy or fast speech but requires internet.
- Vosk works offline but may struggle with unclear or soft speech.
- For future improvements, you can integrate Whisper (OpenAI) for better offline performance.
