In [1]:
import threading
import asyncio
import queue
import time
import sounddevice as sd
import numpy as np
import torch
import warnings
import os
import sys
from transformers import AutoTokenizer
from parler_tts import ParlerTTSForConditionalGeneration
from openai import OpenAI

# Silence Python warnings: the environment variable is set for anything that
# reads PYTHONWARNINGS later, and the filter applies to this kernel right away.
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings("ignore")

# SECURITY: the API key is a secret and must not live in the notebook source.
# It is read from the environment instead. The key that used to be committed
# here should be treated as leaked and revoked.
SAMBANOVA_API_KEY = os.environ.get("SAMBANOVA_API_KEY")
if not SAMBANOVA_API_KEY:
    raise RuntimeError(
        "Set the SAMBANOVA_API_KEY environment variable before running this notebook."
    )

# OpenAI-compatible client pointed at the SambaNova endpoint; used by
# stream_llm_to_chunks() to stream chat completions.
client = OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=SAMBANOVA_API_KEY,
)

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Tokenizer for the text to be spoken (used by tts_worker).
TOKENIZER = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")

# Two independent copies of the same checkpoint, one per TTS worker thread.
MODEL_1, MODEL_2 = (
    ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(DEVICE)
    for _ in range(2)
)

# The voice description is tokenised once, with the tokenizer named by the
# model's text-encoder config, and reused for every generate() call.
DESC_TOKENIZER = AutoTokenizer.from_pretrained(MODEL_1.config.text_encoder._name_or_path)
DESCRIPTION = "Rohit's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
DESC_INPUTS = DESC_TOKENIZER(DESCRIPTION, return_tensors="pt").to(DEVICE)
DESC_INPUT_IDS, DESC_ATTN_MASK = DESC_INPUTS.input_ids, DESC_INPUTS.attention_mask

# Sample rate handed to sounddevice when playing generated audio.
SAMPLING_RATE = MODEL_1.config.sampling_rate
# NOTE(review): the recorded cell output shows many ALSA "underrun occurred"
# messages; they may be related to this low-latency setting - confirm.
sd.default.latency = 'low'

# Upper bound for the growing text-chunk size used by stream_llm_to_chunks.
MAX_CHUNK_CHARS = 10000

async def async_playback(queue1, queue2, done_event, latency_event, poll_timeout=0.5):
    """Play audio chunks from two asyncio queues, alternating between them.

    The coroutine first waits on the queue whose turn it is. If nothing arrives
    within ``poll_timeout`` seconds it tries the other queue for the same
    duration. After each played chunk the turn passes to the queue that was
    NOT just used.

    Playback itself (``sd.play`` + ``sd.wait``) is blocking, so it is run in
    the loop's default executor. The event loop therefore stays free to run
    other coroutines (for example the ``put`` calls scheduled by the TTS
    workers) while a chunk is playing.

    Args:
        queue1: asyncio queue that is polled first.
        queue2: asyncio queue that is polled second.
        done_event: object with ``is_set()``; once it is set and both queues
            are empty after a full poll round, the coroutine returns.
        latency_event: optional object with ``set()``; it is set once, right
            before the first chunk is played. Pass ``None`` to disable.
        poll_timeout: seconds to wait on each queue before trying the other
            one (default 0.5, the previously hard-coded value).

    Cancellation ends the loop quietly instead of propagating.

    NOTE(review): the fallback to the other queue after a timeout can play
    chunks out of their original order when the expected chunk is slow to
    arrive - confirm whether this is acceptable.
    """
    def _play_blocking(samples):
        # Both calls block the calling thread until the chunk has been played.
        sd.play(samples, samplerate=SAMPLING_RATE)
        sd.wait()

    loop = asyncio.get_running_loop()
    next_from_queue1 = True
    latency_measured = False

    while True:
        try:
            current_queue = queue1 if next_from_queue1 else queue2
            try:
                audio = await asyncio.wait_for(current_queue.get(), timeout=poll_timeout)
            except asyncio.TimeoutError:
                # Nothing on the expected queue yet: give the other one a chance.
                current_queue = queue2 if next_from_queue1 else queue1
                try:
                    audio = await asyncio.wait_for(current_queue.get(), timeout=poll_timeout)
                except asyncio.TimeoutError:
                    if done_event.is_set() and queue1.empty() and queue2.empty():
                        break
                    continue

            if latency_event is not None and not latency_measured:
                latency_event.set()
                latency_measured = True

            # Off-load the blocking playback so the event loop keeps running.
            await loop.run_in_executor(None, _play_blocking, audio)

            # Next turn belongs to the queue that was not just used.
            next_from_queue1 = current_queue is not queue1

        except asyncio.CancelledError:
            break

    print("Playback finished.")

def tts_worker(text_queue, audio_queue, model, loop):
    """Synthesise speech for text chunks and hand the audio to the event loop.

    Runs until ``None`` is read from ``text_queue``. For every other item the
    text is tokenised with ``TOKENIZER``, ``model.generate`` is called with the
    pre-tokenised voice description, and the result is converted to a 1-D
    float32 NumPy array, peak-normalised (divided by its max absolute value
    when that is non-zero), and put on ``audio_queue`` via
    ``asyncio.run_coroutine_threadsafe`` on ``loop``.

    Args:
        text_queue: thread-safe queue of text chunks; ``None`` is the stop
            sentinel.
        audio_queue: asyncio queue owned by ``loop`` that receives the audio.
        model: object whose ``generate`` returns a tensor of samples.
        loop: the asyncio event loop on which ``audio_queue`` is used.

    Empty or whitespace-only chunks and empty model outputs are skipped. A
    failure while processing one chunk is reported on stderr and the worker
    moves on to the next chunk, so the text queue keeps being drained and the
    stop sentinel is still honoured.
    """
    async def put_audio(audio):
        await audio_queue.put(audio)

    while True:
        text = text_queue.get()
        if text is None:
            break
        if not text.strip():
            # Nothing to say; do not spend a model call on it.
            continue

        try:
            prompt_inputs = TOKENIZER(text, return_tensors="pt").to(DEVICE)
            with torch.no_grad():
                speech_output = model.generate(
                    input_ids=DESC_INPUT_IDS,
                    attention_mask=DESC_ATTN_MASK,
                    prompt_input_ids=prompt_inputs.input_ids,
                    prompt_attention_mask=prompt_inputs.attention_mask
                )
            # Always end up with a flat float32 vector, whatever the output rank.
            audio = np.atleast_1d(speech_output.cpu().numpy().squeeze())
            audio = audio.reshape(-1).astype(np.float32)
            if audio.size == 0:
                continue
            max_val = np.max(np.abs(audio))
            if max_val > 0:
                audio = audio / max_val
        except Exception as exc:
            # Deliberately broad: one bad chunk must not kill the worker,
            # otherwise its text queue is never drained again.
            print(f"TTS worker skipped a chunk: {exc!r}", file=sys.stderr)
            continue

        asyncio.run_coroutine_threadsafe(put_audio(audio), loop)

def stream_llm_to_chunks(prompt, q1, q2, small_chunk=20, large_chunk=60):
    """Stream a chat completion and distribute its text over two queues.

    The streamed text is echoed to stdout as it arrives and cut into chunks by
    character count. Chunks go to ``q1`` and ``q2`` alternately, starting with
    ``q1``. Two chunk sizes (``small_chunk`` and ``large_chunk``) are used in
    turn, and each size is doubled after it has been used, capped at
    ``MAX_CHUNK_CHARS``. Whatever text is left when the stream ends is sent as
    a final chunk. ``None`` is then put on both queues as an end marker.

    Args:
        prompt: user message sent to the model.
        q1: first thread-safe output queue.
        q2: second thread-safe output queue.
        small_chunk: initial size, in characters, of the 1st, 3rd, ... chunk.
        large_chunk: initial size, in characters, of the 2nd, 4th, ... chunk.

    The end markers are sent even when the request or the stream raises, so
    consumers blocked on the queues can always terminate; the exception itself
    still propagates to the caller. Chunks that are empty after stripping
    whitespace are dropped.
    """
    buffer = ""
    toggle = 0
    max_queue_size = 10
    # A non-positive size would never shrink the buffer and loop forever.
    chunk_sizes = [max(1, small_chunk), max(1, large_chunk)]
    chunk_index = 0

    try:
        response = client.chat.completions.create(
            model="Meta-Llama-3.2-3B-Instruct",
            messages=[
                {"role": "system", "content": "You are a Hindi-only assistant. Keep responses short."},
                {"role": "user", "content": prompt}
            ],
            stream=True
        )

        for chunk in response:
            # Some stream events carry no choices at all; skip them.
            if not chunk.choices:
                continue
            content = getattr(chunk.choices[0].delta, "content", None)
            if not content:
                continue

            buffer += content
            print(content, end="", flush=True)

            # Re-read the size on every pass: it changes with chunk_index.
            while len(buffer) >= chunk_sizes[chunk_index % 2]:
                size = chunk_sizes[chunk_index % 2]
                chunk_text = buffer[:size].strip()
                buffer = buffer[size:]
                if not chunk_text:
                    # Only whitespace: nothing to speak, keep the turn as is.
                    continue

                target_queue = q1 if toggle == 0 else q2
                # Simple back-pressure: wait while the consumer is far behind.
                while target_queue.qsize() >= max_queue_size:
                    time.sleep(0.01)

                target_queue.put(chunk_text)
                toggle = 1 - toggle
                chunk_sizes[chunk_index % 2] = min(size * 2, MAX_CHUNK_CHARS)
                chunk_index += 1

        tail = buffer.strip()
        if tail:
            (q1 if toggle == 0 else q2).put(tail)
    finally:
        # End markers for both consumers, also on failure.
        q1.put(None)
        q2.put(None)

class _TimestampedEvent(threading.Event):
    """threading.Event that remembers the wall-clock time of its first set()."""

    def __init__(self):
        super().__init__()
        self.set_at = None

    def set(self):
        if self.set_at is None:
            self.set_at = time.time()
        super().set()


def speak_from_prompt(prompt: str, small_chunk: int = 10, large_chunk: int = 70, measure_latency: bool = False):
    """Send ``prompt`` to the LLM and speak the streamed answer.

    Pipeline wiring done here:
      * a private asyncio event loop runs in its own thread;
      * two ``tts_worker`` threads (one per model) read text chunks from two
        thread-safe queues and push audio onto two asyncio queues;
      * ``async_playback`` runs on the private loop and plays the audio;
      * ``stream_llm_to_chunks`` runs in the calling thread and feeds the
        text queues.

    Args:
        prompt: user message for the LLM.
        small_chunk: initial character size passed to ``stream_llm_to_chunks``.
        large_chunk: second initial character size passed to it.
        measure_latency: when true, report the time from just before the LLM
            request until playback signals its first chunk.

    The call blocks until the workers and playback have finished. The loop
    thread and worker threads are shut down even if streaming fails, in which
    case the exception is re-raised after cleanup. The calling thread's
    current asyncio event loop is left untouched.
    """
    async def _make_audio_queues():
        # Created on the private loop so the queues belong to that loop on
        # every Python version, without calling asyncio.set_event_loop().
        return asyncio.Queue(), asyncio.Queue()

    Q1, Q2 = queue.Queue(), queue.Queue()
    done_event = threading.Event()
    latency_event = _TimestampedEvent() if measure_latency else None

    loop = asyncio.new_event_loop()
    loop_thread = threading.Thread(target=loop.run_forever)
    loop_thread.start()

    try:
        A1, A2 = asyncio.run_coroutine_threadsafe(_make_audio_queues(), loop).result()

        workers = [
            threading.Thread(target=tts_worker, args=(Q1, A1, MODEL_1, loop)),
            threading.Thread(target=tts_worker, args=(Q2, A2, MODEL_2, loop)),
        ]
        for worker in workers:
            worker.start()

        start_time = time.time()
        playback_future = asyncio.run_coroutine_threadsafe(
            async_playback(A1, A2, done_event, latency_event), loop
        )

        try:
            stream_llm_to_chunks(prompt, Q1, Q2, small_chunk, large_chunk)
        finally:
            # Guarantee the stop markers even if streaming failed before
            # sending them; a surplus marker on these local queues is harmless.
            Q1.put(None)
            Q2.put(None)
            for worker in workers:
                worker.join()
            # All synthesis is finished: playback may stop once it has
            # drained both audio queues.
            done_event.set()
            playback_future.result()
    finally:
        loop.call_soon_threadsafe(loop.stop)
        loop_thread.join()
        loop.close()

    # Use the moment playback signalled its first chunk, not "now" (which is
    # after the whole answer has been played). If nothing was ever played the
    # event was never set, so there is no latency to report and nothing to
    # wait for.
    if latency_event is not None and latency_event.set_at is not None:
        latency = latency_event.set_at - start_time
        print(f"\nSpeech synthesis complete.\n⏱️ First audio latency: {latency:.2f} seconds")
    else:
        print("\nSpeech synthesis complete.")


Flash attention 2 is not installed
ParlerTTSForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
ParlerTTSForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `genera

In [6]:
# Demo run: ask for a short story and report the first-audio latency.
speak_from_prompt(
    prompt="give me short story",
    small_chunk=10,
    large_chunk=20,
    measure_latency=True,
)

एक छोटे से गाँव में एक लड़का रहता था। वह अपने दादाजी की दुकान में काम करता था। एक दिन, वह एक पुरानी किताब ढूंढता है जिसमें एक रहस्यमय पत्र है। उस पत्र में एक सुंदर लड़की का नाम लिखा है जो गाँव से 100 किलोमीटर दूर रहती है।

लड़का उस पत्र को पढ़ता है और सोचता है कि वह उस लड़की को ढूंढने की कोशिश करेगा। वह उस पत्र के निर्देशों का पालन करता है और एक सुंदर यात्रा शुरू करता है।

ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun o

KeyboardInterrupt: 

ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred
ALSA lib pcm.c:8740:(snd_pcm_recover) underrun occurred


Playback finished.


In [None]:
from RealtimeSTT import AudioToTextRecorder
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='multiprocessing')
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

if __name__ == '__main__':
    # Voice loop: every transcribed utterance is passed to speak_from_prompt,
    # which answers it through the LLM + TTS pipeline.
    print("Wait until it says 'speak now'")
    recorder = AudioToTextRecorder(model="medium",device="cpu",allowed_latency_limit=30,language="en",spinner=True)
    try:
        while True:
            recorder.text(speak_from_prompt)
    except KeyboardInterrupt:
        # Interrupting the kernel is the only way out of the loop; leave quietly.
        print("\nStopping recorder.")
    finally:
        # Release the recorder's resources however the loop ended.
        recorder.shutdown()

Wait until it says 'speak now'




⠧ speak nowनमस्ते! कैसे हैं आप?



⠹ speak nowPlayback finished.

Speech synthesis complete.
Playback finished.म है सेवा।
⠧ speak now
Speech synthesis complete.
⠋ speak nowing

In [5]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
from transformers import AutoTokenizer
from threading import Thread

# Use the first CUDA GPU when torch can see one, otherwise fall back to the
# CPU so the cell still runs on machines without CUDA. Use "mps" for Mac.
torch_device = "cuda:0" if torch.cuda.is_available() else "cpu"
# bfloat16 is kept for the GPU path; the CPU fallback uses float32.
torch_dtype = torch.bfloat16 if torch_device.startswith("cuda") else torch.float32
model_name = "parler-tts/parler-tts-mini-v1"

# need to set padding max length
# NOTE(review): max_length is not used anywhere in the visible cells - confirm
# whether it is still needed.
max_length = 50

# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = ParlerTTSForConditionalGeneration.from_pretrained(
    model_name,
).to(torch_device, dtype=torch_dtype)

# Audio sample rate and codec frame rate, both read from the audio encoder
# config; generate() uses them to size and describe the streamed chunks.
sampling_rate = model.audio_encoder.config.sampling_rate
frame_rate = model.audio_encoder.config.frame_rate

def generate(text, description, play_steps_in_s=0.5):
  """Stream synthesised speech for ``text`` as a generator of audio chunks.

  ``model.generate`` runs in a background thread and feeds a
  ``ParlerTTSStreamer``; this generator iterates over the streamer and yields
  each chunk as soon as it is available.

  Args:
    text: the text to speak.
    description: the voice description used to condition the model.
    play_steps_in_s: requested chunk length in seconds; it is converted to a
      number of steps with the module-level ``frame_rate``.

  Yields:
    ``(sampling_rate, new_audio)`` tuples, where ``new_audio`` is the chunk
    produced by the streamer. Iteration stops at the first empty chunk or
    when the streamer is exhausted.

  The background thread is joined once streaming has ended, so no generation
  thread is left running after the generator is exhausted.
  """
  play_steps = int(frame_rate * play_steps_in_s)
  streamer = ParlerTTSStreamer(model, device=torch_device, play_steps=play_steps)
  # tokenization
  inputs = tokenizer(description, return_tensors="pt").to(torch_device)
  prompt = tokenizer(text, return_tensors="pt").to(torch_device)
  # create generation kwargs
  generation_kwargs = dict(
    input_ids=inputs.input_ids,
    prompt_input_ids=prompt.input_ids,
    attention_mask=inputs.attention_mask,
    prompt_attention_mask=prompt.attention_mask,
    streamer=streamer,
    do_sample=True,
    temperature=1.0,
    min_new_tokens=10,
  )
  # run generation in the background so chunks can be consumed while it works
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
  thread.start()
  # iterate over chunks of audio
  for new_audio in streamer:
    if new_audio.shape[0] == 0:
      break
    print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 4)} seconds")
    yield sampling_rate, new_audio
  # streaming is over; wait for the generation thread to finish
  thread.join()


# Example: stream one sentence and inspect every chunk as it arrives.
text = "This is a test of the streamer class"
description = "Jon's talking really fast."

# Requested length of each streamed chunk, in seconds.
chunk_size_in_s = 0.5

for (sampling_rate, audio_chunk) in generate(text, description, play_steps_in_s=chunk_size_in_s):
  # The chunk is ready to be used here: stream it, save it, play it, ...
  # This demo only prints its shape.
  print(audio_chunk.shape)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.51G [00:00<?, ?B/s]

Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.51.3",
  "use_cache": true,
  "vocab_size": 32128
}

Config of the audio_encoder: <class 'parler_tts.dac_wrapper.modeling_dac.DACModel'> is overwritten by shared au

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Sample of length: 0.329 seconds
(14507,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.1122 seconds
(4949,)


In [2]:
# Run the ASR.py script in a separate Python process via the shell.
# NOTE(review): ASR.py is not part of this notebook; it is assumed to sit in
# the kernel's working directory - confirm.
!python ASR.py

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.75it/s][0m[0m
Wait until it says 'speak now'[0m
ALSA lib pcm_dsnoop.c:567:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1000:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2721:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2721:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2721:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
Cannot connect to server socket err = No such file or directory
Cannot connect to server request channel
jack server is not running or cannot be started
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
Cannot connect to server socket err = No such file or directory
Cannot connect to server request channel
jack server is not running or cannot be started
JackShmReadWritePtr::~JackShmReadWritePtr - Init not