# STT -> LLM -> TTS Test Space

pipeline and stack:

* STT: coqui tts, vosk (which one?)
* LLM: ollama, langchain
* TTS: coqui tts
* AUDIO I/O: pyaudio, sounddevice

### Audio I/O Testing

In [2]:
# audio IO - pyaudio test (playback - sample)
import wave
import sys
import pyaudio

chunksize = 1024
f = 'wav_training/to_output.wav'

with wave.open(f, 'rb') as wf:
    # Instantiate PyAudio and initialize PortAudio system resources (1)
    p = pyaudio.PyAudio()

    # Open stream (2)
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)

    # Play samples from the wave file (3)
    while len(data := wf.readframes(chunksize)):
        stream.write(data)

    # Close stream (4)
    stream.close()

    # Release PortAudio system resources (5)
    p.terminate()

In [3]:
# audio IO - pyaudio test (record)
import wave
import sys
import pyaudio
import math

chunksize = 1024
f = 'record.wav'
seconds = 5
rate = 44100
channels = 1
form = pyaudio.paInt16

# Instantiate PyAudio and initialize PortAudio system resources (1)
p = pyaudio.PyAudio()

# Open steam (2)
stream = p.open(format=form,
                channels=channels,
                rate=rate,
                input=True,
                frames_per_buffer=chunksize)

# instantiate frames container
print ("recording started")
recordframes = []

# record w/ logic for seconds
for i in range(0, math.ceil(rate / chunksize * seconds)):
    data = stream.read(chunksize)
    recordframes.append(data)
print ("recording stopped")
stream.stop_stream()

# Close stream (4)
stream.close()

# Release PortAudio system resources (5)
p.terminate()

# wav file
wf = wave.open(f, 'wb')
wf.setnchannels(channels)
wf.setsampwidth(p.get_sample_size(form))
wf.setframerate(rate)
wf.writeframes(b''.join(recordframes))
wf.close()

recording started
recording stopped


In [4]:
# audio IO - pyaudio test (playback - sample)
import wave
import sys
import pyaudio

chunksize = 1024
f = 'record.wav'

with wave.open(f, 'rb') as wf:
    # Instantiate PyAudio and initialize PortAudio system resources (1)
    p = pyaudio.PyAudio()

    # Open steam (2)
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)

    # Play samples from the wave file (3)
    while len(data := wf.readframes(chunksize)):
        stream.write(data)

    # Close stream (4)
    stream.close()

    # Release PortAudio system resources (5)
    p.terminate()

### Speech to Text Testing

In [1]:
from vosk import Model, KaldiRecognizer
import pyaudio
import json

In [2]:
# Load the Vosk model
model_path = "models/stt_model/vosk-model-small-en-us-0.15"
# model_path = "models/stt_model/vosk-model-en-us-0.22"
model = Model(model_path)

In [3]:
# Initialize the recognizer with the model and sample rate
recognizer = KaldiRecognizer(model, 16000) # 16000 is the sample rate of the model

In [4]:
# Setup PyAudio for audio input
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=8192)
stream.start_stream()

print("Listening...")

# Speech recognition loop
while True:
    data = stream.read(4096, exception_on_overflow=False)
    if recognizer.AcceptWaveform(data):
        result = json.loads(recognizer.Result())
        print("You:", result["text"])
    else:
        # Optional: print partial results during recognition
        # partial_result = json.loads(recognizer.PartialResult())
        # print("Partial:", partial_result["partial"])
        pass

Listening...
You: hey i'm trying to small also want to make it for with local before on little smartphones your sleep but like it's pretty good it should be you're my voice now i'm you know have options exit minute
You: oh it's python and for ask is the package amusing to be with speech that it knows when i finish it sense middletown homicide at that moment
You: it's not very good it's like it's it's it's
You: first woman like it's far from me
You: cooper
You: it's for the bike is far from me right now and like i'm like i said i'm using the that small version of their mother so it it it's a little limited it's capabilities but at least the all this was about twenty minutes appoint if it's it's a really simple baggage
You: 
You: fruit sugar i mean abusing lama for one thought that was my models local you
You: and it just got a
You: interface and on all and
You: 
You: 
You: oh really
You: 
You: 
You: interesting
You: that's really interesting of i'm trying to develop a small enough think

KeyboardInterrupt: 

### Voice Synthesis Testing

In [5]:
import torch
from TTS.api import TTS
from datetime import date 

script = 'Hey fryman, pass me the peanut butter'

# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Text to speech to a file
tts.tts_to_file(text=script, speaker_wav="wav_training/p1.wav", language="en", file_path=f"wav_sample/test_p1_{date.today().strftime('%Y%m%d%H%M%S')}.wav")

 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts
 > Text splitted to sentences.
['Hey fryman, pass me the peanut butter']
 > Processing time: 2.366482734680176
 > Real-time factor: 0.6703615660290067


'wav_sample/test_p1_20250319000000.wav'

### Voice Synthesis Testing - Fine Tuned XTTSv2 Model

In [1]:
# model
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from IPython.display import Audio, display

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# input fs
CONFIG_PATH = "models/tts_model/XTTS/run/training/GPT_XTTS_Carl_20251002-October-03-2025_11+12AM-37eb147/config.json"
TOKENIZER_PATH = "models/tts_model/XTTS/run/training/XTTS_v2_original_model_files/vocab.json"
XTTS_CHECKPOINT = "models/tts_model/XTTS/run/training/GPT_XTTS_Carl_20251002-October-03-2025_11+12AM-37eb147/best_model.pth"
SPEAKER_REFERENCE = "wav/input/Carl_Voice_Dataset/wavs/carl_0001.wav"

In [3]:
# instantiate model - only needs to run once!

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
print("Loading fine-tuned XTTS model manually...")

try:
    print("Loading model...")
    config = XttsConfig()
    config.load_json(CONFIG_PATH)
    model = Xtts.init_from_config(config)
    model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENIZER_PATH, use_deepspeed=False)
    model.cuda()
    
    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
    print("\n✅ Model and Speaker Latents loaded successfully!")

except Exception as e:
    print(f"\n❌ Model Loading Failed: {e}")

Using device: cuda
Loading fine-tuned XTTS model manually...
Loading model...
Computing speaker latents...

✅ Model and Speaker Latents loaded successfully!


In [7]:
# audio generation

NEW_CARL_TEXT = input()
OUTPUT_WAV_PATH = f"wav/output/test_XTTSv2_20251002/output_{NEW_CARL_TEXT[0:10]}.wav" 

# --- Inference ---
print(f"Generating: '{NEW_CARL_TEXT[:50]}...'")

try:
    # Use the loaded model and pre-calculated latents for fast inference
    out = model.inference(
        NEW_CARL_TEXT,
        "en",
        gpt_cond_latent,
        speaker_embedding,
        temperature=0.7, # Add custom parameters here
    )
    
    torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)
    print(f"✅ Audio saved as {OUTPUT_WAV_PATH}")
    display(Audio(out['wav'], rate=24000))

except Exception as e:
    print(f"❌ Inference failed: {e}")

 Get out of my friggin' pool


Generating: 'Get out of my friggin' pool...'
✅ Audio saved as output_Get out of.wav


### LLM Instantiation Testing

In [1]:
# instantiate ollama - is this necessary when running win app?
import os
os.system('ollama run carl_20250927')

0

In [2]:
from ollama import chat
from ollama import ChatResponse

In [3]:
# demo example - https://github.com/ollama/ollama-python

msg = input('Speak to the Carl: ')

response: ChatResponse = chat(model='carl_20250927', messages=[
  {
    'role': 'user',
    'content': msg,
  },
])
print(response['message']['content'])
# or access fields directly from the response object
print(response.message.content)

Speak to the Carl:  Hey Carl. How u doing?


Ugh, what's it to you? I'm doin' great, just tryin' to enjoy my day in peace before some moron like you comes along and ruins it. What the hell do you want? Can't you see I'm watchin' TV here? Now's not a good time, okay? This don't matter, but I gotta work from home today and I got better things to do than chat with... (sigh) ...you.
Ugh, what's it to you? I'm doin' great, just tryin' to enjoy my day in peace before some moron like you comes along and ruins it. What the hell do you want? Can't you see I'm watchin' TV here? Now's not a good time, okay? This don't matter, but I gotta work from home today and I got better things to do than chat with... (sigh) ...you.


In [4]:
type(response)

ollama._types.ChatResponse

In [4]:
response

ChatResponse(model='carl_20250927', created_at='2025-09-27T17:13:05.7013036Z', done=True, done_reason='stop', total_duration=2087382600, load_duration=92825100, prompt_eval_count=359, prompt_eval_duration=484504000, eval_count=97, eval_duration=1509164900, message=Message(role='assistant', content="Ugh, what's it to you? I'm doin' great, just tryin' to enjoy my day in peace before some moron like you comes along and ruins it. What the hell do you want? Can't you see I'm watchin' TV here? Now's not a good time, okay? This don't matter, but I gotta work from home today and I got better things to do than chat with... (sigh) ...you.", images=None, tool_calls=None))

In [5]:
response['message']

Message(role='assistant', content="Ugh, what's it to you? I'm doin' great, just tryin' to enjoy my day in peace before some moron like you comes along and ruins it. What the hell do you want? Can't you see I'm watchin' TV here? Now's not a good time, okay? This don't matter, but I gotta work from home today and I got better things to do than chat with... (sigh) ...you.", images=None, tool_calls=None)

In [7]:
# stop the carl
os.system('ollama stop carl_20250927')

0