## Transformers API

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from IPython.display import Audio
import soundfile as sf
import torch

from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

In [2]:
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"

model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    dtype=torch.float16,
    device_map="auto",
    attn_implementation="flash_attention_2"
).to("cuda:0")

processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'interleaved', 'mrope_interleaved'}
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'interleaved'}
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [25]:
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this audio"},
            {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"}
        ],
    },
]

USE_AUDIO_IN_VIDEO = False

text = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=False
)
audios, images, videos = process_mm_info(
    conversation,
    use_audio_in_video=USE_AUDIO_IN_VIDEO
)
inputs = processor(
    text=text, 
    audio=audios, 
    images=images, 
    videos=videos, 
    return_tensors="pt", 
    padding=True, 
    use_audio_in_video=USE_AUDIO_IN_VIDEO
)
inputs = inputs.to(model.device).to(model.dtype)

text_ids, audio = model.generate(
    **inputs, 
    speaker="Chelsie", # ["Chelsie", "Ethan", "Aiden"]
    thinker_return_dict_in_generate=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
    return_audio=False
)

out_text = processor.batch_decode(
    text_ids.sequences[:, inputs["input_ids"].shape[1] :],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)

print(out_text[0])
# Audio(audio.detach().cpu()[0], rate=24000, autoplay=True)

	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


The audio contains the sound of a person coughing. There are three distinct, sharp coughs in succession, followed by a brief pause and then a fourth, slightly longer cough. The coughs sound dry and forceful, suggesting the person may have a cold, irritation in the throat, or another respiratory condition. The recording is clear and close-miked, with no background noise or other sounds present.


## Realtime Check

In [26]:
# Generate from thinker

input_ids = inputs["input_ids"]
thinker_kwargs = {
    "max_new_tokens": 1024,
    "eos_token_id": 151645,
    "output_hidden_states": True,
    "return_dict_in_generate": True
}
for key, value in inputs.items():
    if key == "feature_attention_mask":
        thinker_kwargs[key] = value
    elif key in ("input_features", "attention_mask"):
        thinker_kwargs[key] = value

thinker_result = model.thinker.generate(
    input_ids,
    **thinker_kwargs
)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


In [27]:
out_text = processor.batch_decode(
    thinker_result.sequences[:, input_ids.shape[1] :],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)

print(out_text[0])

The audio contains the sound of a person coughing. There are three distinct, sharp coughs in succession, followed by a brief pause and then a fourth, slightly longer cough. The coughs sound dry and forceful, suggesting the person may have a cold, irritation in the throat, or another respiratory condition. The recording is clear and close-miked, with no background noise or other sounds present.


In [50]:
thinker_result.hidden_states[0][0].shape

torch.Size([1, 51, 2048])

In [78]:
# thinker_result.hidden_states -> (token_pos, layer_pos, [batch, token, dim])

# get the text embeddings and hidden embeddings from the thinker model
thinker_embed = torch.cat([
    hidden_states[0]
    for hidden_states in thinker_result.hidden_states
], dim=1).to(input_ids.device) # [1, t, d]
thinker_hidden = torch.cat([
    hidden_states[model.config.talker_config.accept_hidden_layer]
    for hidden_states in thinker_result.hidden_states
], dim=1).to(input_ids.device) # [1, t, d]

im_start_indexes = torch.cat(
    (
        torch.nonzero(input_ids[0] == model.config.im_start_token_id).squeeze(),
        torch.tensor([thinker_result.sequences.shape[-1]], device=input_ids.device, dtype=input_ids.dtype),
    ),
    dim=-1,
)  # Shape [n_starts + 1]; Take batch 0 since batched inference is not supported here.

multimodal_mask = (
    (thinker_result.sequences == model.config.thinker_config.audio_token_id) |
    (thinker_result.sequences == model.config.thinker_config.image_token_id) |
    (thinker_result.sequences == model.config.thinker_config.video_token_id)
).to(input_ids.device)  # [1 t] # fmt: skip

talker_special_tokens = torch.tensor(
    [[
        model.config.tts_bos_token_id,
        model.config.tts_eos_token_id,
        model.config.tts_pad_token_id
    ]],
    device=input_ids.device,
    dtype=input_ids.dtype,
)

tts_bos_embed, tts_eos_embed, tts_pad_embed = (
    model.talker.text_projection(model.thinker.get_input_embeddings()(talker_special_tokens))
    .to(input_ids.device)
    .chunk(3, dim=1)
)

In [76]:
model.talker.text_projection(model.thinker.get_input_embeddings()(talker_special_tokens)).shape

torch.Size([1, 3, 1024])

In [63]:
torch.tensor([thinker_result.sequences.shape[-1]], device=input_ids.device, dtype=input_ids.dtype)

tensor([133], device='cuda:0')

In [66]:
thinker_result.sequences.shape[-1]

133

In [61]:
input_ids.shape

torch.Size([1, 51])

In [36]:
model.thinker

Qwen3OmniMoeThinkerForConditionalGeneration(
  (audio_tower): Qwen3OmniMoeAudioEncoder(
    (positional_embedding): SinusoidsPositionEmbedding()
    (layers): ModuleList(
      (0-31): 32 x Qwen3OmniMoeAudioEncoderLayer(
        (self_attn): Qwen3OmniMoeAudioAttention(
          (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((1280,), eps=1e-05, element

## vLLM Inference

In [2]:
import os
os.environ['VLLM_USE_V1']='0'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch

from vllm import LLM, SamplingParams
from transformers import Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

from ipywebrtc import CameraStream, AudioRecorder
import soundfile as sf
from librosa import load
import io

INFO 11-19 16:49:23 [__init__.py:244] Automatically detected platform cuda.


In [5]:
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"

# llm = LLM(
#     model=MODEL_PATH, trust_remote_code=True, gpu_memory_utilization=0.95,
#     limit_mm_per_prompt={'image': 0, 'video': 0, 'audio': 1},
#     max_num_seqs=64,
#     max_model_len=16384,
#     dtype=torch.float16,
#     enable_chunked_prefill=True
# )

processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


ERROR! Session/line number was not unique in database. History logging moved to new session 220


In [3]:
camera = CameraStream(
    constraints={
    'facing_mode': 'user',
    'audio': True,
    'video': False
})

recorder = AudioRecorder(
    stream=camera,
    format="webm",
    recording=True,
    filename="audio_stream",
    autosave=True,
)
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), autosave=True, filename='audio_stream', recording=True, s…

In [6]:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Summarize this audio in a single sentence"},
            {"type": "audio", "audio": "msagi.mp3"},
        ], 
    }
]

sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.95,
    top_k=20,
    max_tokens=16384,
)

text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
audios, images, videos = process_mm_info(messages, use_audio_in_video=False)

inputs = {
    'prompt': text,
    'multi_modal_data': {},
    "mm_processor_kwargs": {
        "use_audio_in_video": True,
    },
}

if images is not None:
    inputs['multi_modal_data']['image'] = images
if videos is not None:
    inputs['multi_modal_data']['video'] = videos
if audios is not None:
    inputs['multi_modal_data']['audio'] = [audios[0][:16000 * 10]]

[src/libmpg123/id3.c:process_comment():587] error: No comment text / valid description?


In [7]:
audios

[array([0., 0., 0., ..., 0., 0., 0.], shape=(85140330,), dtype=float32)]

In [5]:
from tqdm.auto import tqdm

ttfts = []

for t in tqdm(range(1, 600, 10)):

    inputs['multi_modal_data']['audio'] = [audios[0][:16000 * t]]
    
    outputs = llm.generate(
        [inputs],
        sampling_params=sampling_params,
        use_tqdm=False
    )
    
    metrics = outputs[0].metrics
    ttft = metrics.first_token_time - metrics.arrival_time
    ttfts.append(ttft * 1000)
    # print(f"TTFT: {ttft * 1000: .3f} ms")

  0%|          | 0/60 [00:00<?, ?it/s]

ValueError: Attempted to assign 5213 = 5213 multimodal tokens to 5107 placeholders

In [None]:
import matplotlib.pyplot as plt

times = list(range(1, 600, 10))
plt.plot(times, ttfts)
plt.title("Qwen3Moe Omni TTFT wt max_num_seq=64")
plt.ylabel("TTFT (ms)")
plt.xlabel("Input Audio Length (s)")
plt.show()

In [13]:
num_tokens = len(outputs[0].outputs[0].token_ids)
metrics = outputs[0].metrics
time_in_queue = metrics.time_in_queue
time_to_first_token = metrics.first_token_time - metrics.first_scheduled_time
e2e_latency = metrics.finished_time - metrics.first_scheduled_time
time_per_output_token = e2e_latency / num_tokens
inter_token_latency = (metrics.last_token_time - metrics.first_token_time) / (num_tokens - 1)

print(f"Time to first token: {time_to_first_token*1000} ms")
print(f"E2E latency: {e2e_latency*1000} ms")
print(f"Time per output token: {time_per_output_token*1000} ms")
print(f"Inter-token latency: {inter_token_latency*1000} ms")
print(f"Time in queue: {time_in_queue*1000} ms")

Time to first token: 96.8024730682373 ms
E2E latency: 7201.478719711304 ms
Time per output token: 10.229373181408102 ms
Inter-token latency: 10.10560548492041 ms
Time in queue: 8.301734924316406 ms


## Realtime Camera Stream

In [1]:
from ipywebrtc import CameraStream, AudioRecorder
from IPython.display import Audio, display
import torch
import io

model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=False
)

(get_speech_timestamps, _, read_audio, *_) = utils

Using cache found in /root/.cache/torch/hub/snakers4_silero-vad_master


In [2]:
camera = CameraStream(
    constraints={
    'facing_mode': 'user',
    'audio': True,
    'video': False
})
recorder = AudioRecorder(stream=camera)
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'facing_mode': 'user', '…

In [64]:
audio_bytes = io.BytesIO(recorder.audio.value).getvalue()

audio = read_audio(audio_bytes, sampling_rate=16000)

speech_timestamps = get_speech_timestamps(audio, model, sampling_rate=16000)
for ts in speech_timestamps:
    start = ts["start"]
    end = ts["end"]

    arr = audio[start: end]

    display(Audio(arr, rate=16000))

## Recorded Streaming

In [1]:
import os
os.environ['VLLM_USE_V1']='0'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch

from vllm import LLM, SamplingParams, AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from transformers import Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

from ipywebrtc import CameraStream, AudioRecorder
import soundfile as sf
from librosa import load
import io

from ipywebrtc import CameraStream, AudioRecorder
from IPython.display import Audio, display
import torch
import io

model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=False
)

(get_speech_timestamps, _, read_audio, *_) = utils

INFO 11-19 18:05:11 [__init__.py:244] Automatically detected platform cuda.


Using cache found in /root/.cache/torch/hub/snakers4_silero-vad_master


In [2]:
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"

engine_args = AsyncEngineArgs(
    model=MODEL_PATH, trust_remote_code=True, gpu_memory_utilization=0.95,
    limit_mm_per_prompt={'image': 0, 'video': 0, 'audio': 1},
    max_num_seqs=64,
    max_model_len=16384,
    dtype=torch.float16,
    enable_chunked_prefill=True
)
llm = AsyncLLMEngine.from_engine_args(engine_args)

processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_interleaved', 'interleaved', 'mrope_section'}
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'interleaved', 'mrope_section'}


INFO 11-19 18:05:21 [config.py:841] This model supports multiple tasks: {'reward', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 11-19 18:05:21 [config.py:1472] Using max model len 16384
INFO 11-19 18:05:23 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=5120.
INFO 11-19 18:05:23 [llm_engine.py:230] Initializing a V0 LLM engine (v0.11.1rc7.dev231+g8bd45fc0b.d20251119) with config: model='Qwen/Qwen3-Omni-30B-A3B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen3-Omni-30B-A3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='xgrammar', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_

You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour


INFO 11-19 18:05:26 [weight_utils.py:292] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/15 [00:00<?, ?it/s]


INFO 11-19 18:06:00 [default_loader.py:272] Loading weights took 34.70 seconds
INFO 11-19 18:06:01 [model_runner.py:1203] Model loading took 59.1623 GiB and 35.383682 seconds


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.




  torch.tensor([1] * torch.tensor(video_grid_thw).shape[0]))


INFO 11-19 18:06:08 [worker.py:294] Memory profiling takes 7.10 seconds
INFO 11-19 18:06:08 [worker.py:294] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.95) = 75.29GiB
INFO 11-19 18:06:08 [worker.py:294] model weights take 59.16GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 3.91GiB; the rest of the memory reserved for KV Cache is 12.13GiB.
INFO 11-19 18:06:09 [executor_base.py:113] # cuda blocks: 8278, # CPU blocks: 2730
INFO 11-19 18:06:09 [executor_base.py:118] Maximum concurrency for 16384 tokens per request: 8.08x
INFO 11-19 18:06:12 [model_runner.py:1513] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_s

Capturing CUDA graph shapes:   0%|          | 0/11 [00:00<?, ?it/s]

INFO 11-19 18:06:22 [model_runner.py:1671] Graph capturing finished in 10 secs, took 0.25 GiB
INFO 11-19 18:06:22 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 21.12 seconds


In [3]:
camera = CameraStream(
    constraints={
    'facing_mode': 'user',
    'audio': True,
    'video': False
})

recorder = AudioRecorder(
    stream=camera,
    format="webm",
    recording=True,
    filename="audio_stream",
    autosave=True,
)
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), autosave=True, filename='audio_stream', recording=True, s…

In [8]:
audio_bytes = io.BytesIO(recorder.audio.value).getvalue()
audio = read_audio(audio_bytes)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Summarize this audio in a single sentence"},
            {"type": "audio", "audio": "msagi.mp3"},
        ], 
    }
]

sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.95,
    top_k=20,
    max_tokens=16384,
)

text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# audios, images, videos = process_mm_info(messages, use_audio_in_video=False)

inputs = {
    'prompt': text,
    'multi_modal_data': {"audio": [audio]},
    "mm_processor_kwargs": {
        "use_audio_in_video": True,
    },
}

previous_text = ""
async for request_output in llm.generate(
    inputs,
    sampling_params=sampling_params,
    request_id=0
):
    current_text = request_output.outputs[0].text
    new_text = current_text[len(previous_text):]
    
    if new_text:
        print(new_text, end='', flush=True)
        previous_text = current_text
    
    if request_output.finished:
        metrics = request_output.metrics
        time_to_first_token = metrics.first_token_time - metrics.first_scheduled_time
        print(f"Time to first token: {time_to_first_token*1000} ms")

INFO 11-19 18:10:54 [async_llm_engine.py:210] Added request 0.
INFO 11-19 18:10:54 [metrics.py:417] Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 0.1 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
The audio contains a person saying, "Hey, what's up? Hello."INFO 11-19 18:10:54 [async_llm_engine.py:178] Finished request 0.
Time to first token: 104.42972183227539 ms


In [27]:
import websockets
import json
import base64
from IPython.display import Audio, display
import asyncio
import time
import random

ELEVENLABS_API_KEY = "sk_4902f02e03291772954c3f3a42ded11d174d09aca13bd0f6"
VOICE_ID = "21m00Tcm4TlvDq8ikWAM"

async def stream_tts():
    url = f"wss://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream-input?model_id=eleven_monolingual_v1"
    
    audio_chunks = []
    first_text_sent_time = None
    first_audio_received_time = None
    
    async with websockets.connect(url) as ws:
        await ws.send(json.dumps({
            "text": " ",
            "xi-api-key": ELEVENLABS_API_KEY,
        }))
        
        async def send_text():
            nonlocal first_text_sent_time
            previous_text = ""
            first_token = True
            
            async for request_output in llm.generate(inputs, sampling_params=sampling_params, request_id=random.randint(0, 10000)):
                current_text = request_output.outputs[0].text
                new_text = current_text[len(previous_text):]
                
                if new_text:
                    if first_token:
                        metrics = request_output.metrics
                        ttft = (metrics.first_token_time - metrics.first_scheduled_time) * 1000
                        print(f"vLLM Time to first token: {ttft:.2f} ms\n")
                        first_token = False
                    
                    print(new_text, end='', flush=True)
                    await ws.send(json.dumps({"text": new_text}))
                    
                    if first_text_sent_time is None:
                        first_text_sent_time = time.time()
                    
                    previous_text = current_text
                
                if request_output.finished:
                    await ws.send(json.dumps({"text": ""}))
                    break
        
        async def receive_audio():
            nonlocal first_audio_received_time
            while True:
                try:
                    response = await ws.recv()
                    data = json.loads(response)
                    
                    if "audio" in data and data["audio"]:
                        if first_audio_received_time is None:
                            first_audio_received_time = time.time()
                            ttfb = (first_audio_received_time - first_text_sent_time) * 1000
                            print(f"\n\nElevenLabs Time to first audio byte: {ttfb:.2f} ms")
                        audio_chunks.append(base64.b64decode(data["audio"]))
                    
                    if data.get("isFinal"):
                        break
                except websockets.exceptions.ConnectionClosed:
                    break
        
        await asyncio.gather(send_text(), receive_audio())
    
    if audio_chunks:
        audio_data = b"".join(audio_chunks)
        display(Audio(audio_data, rate=22050, autoplay=True))

await stream_tts()

INFO 11-19 18:28:29 [async_llm_engine.py:210] Added request 6465.
INFO 11-19 18:28:29 [metrics.py:417] Avg prompt throughput: 7.0 tokens/s, Avg generation throughput: 2.3 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
vLLM Time to first token: 99.90 ms

The audio contains a person saying "Hey, what's up?" followed by "Hello."INFO 11-19 18:28:29 [async_llm_engine.py:178] Finished request 6465.
INFO 11-19 18:28:29 [async_llm_engine.py:222] Aborted request 6465.


ElevenLabs Time to first audio byte: 939.82 ms
