## Transformers API

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from IPython.display import Audio
import soundfile as sf
import torch

from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

In [3]:
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"

# Load the checkpoint in fp16 with FlashAttention-2.
# `device_map="auto"` already decides where every weight lives at load time, so
# the model is NOT moved again with `.to("cuda:0")`: the extra move is redundant
# when everything fits on the single visible GPU and is rejected by Accelerate
# when any module had to be offloaded.
model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    dtype=torch.float16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)

# The processor bundles the chat template, tokenizer and media feature extraction.
processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_interleaved', 'mrope_section', 'interleaved'}
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'interleaved'}
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [25]:
# Whether audio tracks embedded in video inputs should be used (no video here).
USE_AUDIO_IN_VIDEO = False

# Single user turn: a text instruction plus the audio clip it refers to.
user_content = [
    {"type": "text", "text": "Describe this audio"},
    {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"},
]
conversation = [{"role": "user", "content": user_content}]

# Render the chat template to a prompt string (tokenization happens in the
# processor call below) and collect the media referenced by the conversation.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Build the model inputs, then move them to the model's device and cast to its dtype.
processor_kwargs = dict(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
inputs = processor(**processor_kwargs).to(model.device).to(model.dtype)

# Text-only generation: speech synthesis is switched off via return_audio=False.
generate_kwargs = dict(
    speaker="Chelsie",  # ["Chelsie", "Ethan", "Aiden"]
    thinker_return_dict_in_generate=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
    return_audio=False,
)
text_ids, audio = model.generate(**inputs, **generate_kwargs)

# Strip the prompt tokens so only the newly generated continuation is decoded.
prompt_len = inputs["input_ids"].shape[1]
generated_ids = text_ids.sequences[:, prompt_len:]
out_text = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

print(out_text[0])
# Playback stays disabled because audio generation is turned off above:
# Audio(audio.detach().cpu()[0], rate=24000, autoplay=True)

	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


The audio contains the sound of a person coughing. There are three distinct, sharp coughs in succession, followed by a brief pause and then a fourth, slightly longer cough. The coughs sound dry and forceful, suggesting the person may have a cold, irritation in the throat, or another respiratory condition. The recording is clear and close-miked, with no background noise or other sounds present.


## Realtime Check

In [26]:
# Run only the thinker sub-model, keeping per-step hidden states for later cells.

input_ids = inputs["input_ids"]

# Processor outputs that are forwarded to the thinker unchanged.
FORWARDED_KEYS = ("feature_attention_mask", "input_features", "attention_mask")

thinker_kwargs = dict(
    max_new_tokens=1024,
    eos_token_id=151645,
    output_hidden_states=True,
    return_dict_in_generate=True,
)
thinker_kwargs.update(
    (name, tensor) for name, tensor in inputs.items() if name in FORWARDED_KEYS
)

thinker_result = model.thinker.generate(input_ids, **thinker_kwargs)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


In [27]:
# Decode only the continuation: drop the first `prompt_len` (prompt) positions.
prompt_len = input_ids.shape[1]
new_token_ids = thinker_result.sequences[:, prompt_len:]
out_text = processor.batch_decode(
    new_token_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

print(out_text[0])

The audio contains the sound of a person coughing. There are three distinct, sharp coughs in succession, followed by a brief pause and then a fourth, slightly longer cough. The coughs sound dry and forceful, suggesting the person may have a cold, irritation in the throat, or another respiratory condition. The recording is clear and close-miked, with no background noise or other sounds present.


In [50]:
# Shape of the layer-index-0 hidden state recorded at the first generation step.
first_step_states = thinker_result.hidden_states[0]
first_step_states[0].shape

torch.Size([1, 51, 2048])

In [78]:
# thinker_result.hidden_states -> (token_pos, layer_pos, [batch, token, dim])

# Get the text embeddings and hidden embeddings from the thinker model by
# concatenating, along the token axis, one layer's output from every generation
# step: layer index 0 for `thinker_embed`, and the layer selected by
# `talker_config.accept_hidden_layer` for `thinker_hidden`.
thinker_embed = torch.cat([
    hidden_states[0]
    for hidden_states in thinker_result.hidden_states
], dim=1).to(input_ids.device) # [1, t, d]
thinker_hidden = torch.cat([
    hidden_states[model.config.talker_config.accept_hidden_layer]
    for hidden_states in thinker_result.hidden_states
], dim=1).to(input_ids.device) # [1, t, d]

# Positions of every im_start token in the prompt, followed by the total
# sequence length as a closing boundary.
# `squeeze(-1)` (rather than a bare `squeeze()`) drops only the trailing size-1
# column of the nonzero() result, so the index tensor stays 1-D even when the
# prompt holds a single im_start token; a 0-dim tensor could not be concatenated.
im_start_indexes = torch.cat(
    (
        torch.nonzero(input_ids[0] == model.config.im_start_token_id).squeeze(-1),
        torch.tensor([thinker_result.sequences.shape[-1]], device=input_ids.device, dtype=input_ids.dtype),
    ),
    dim=-1,
)  # Shape [n_starts + 1]; Take batch 0 since batched inference is not supported here.

# True wherever the sequence holds an audio / image / video placeholder token.
multimodal_mask = (
    (thinker_result.sequences == model.config.thinker_config.audio_token_id) |
    (thinker_result.sequences == model.config.thinker_config.image_token_id) |
    (thinker_result.sequences == model.config.thinker_config.video_token_id)
).to(input_ids.device)  # [1 t] # fmt: skip

# Token ids of the three TTS control tokens, shaped [1, 3] so they can be
# embedded and projected in a single pass.
talker_special_tokens = torch.tensor(
    [[
        model.config.tts_bos_token_id,
        model.config.tts_eos_token_id,
        model.config.tts_pad_token_id
    ]],
    device=input_ids.device,
    dtype=input_ids.dtype,
)

# Embed with the thinker's input embedding table, project through the talker's
# text projection, then split along the token axis into one tensor per token.
tts_bos_embed, tts_eos_embed, tts_pad_embed = (
    model.talker.text_projection(model.thinker.get_input_embeddings()(talker_special_tokens))
    .to(input_ids.device)
    .chunk(3, dim=1)
)

In [76]:
# Shape check for the talker-projected embeddings of the TTS special tokens.
special_token_embeds = model.thinker.get_input_embeddings()(talker_special_tokens)
model.talker.text_projection(special_token_embeds).shape

torch.Size([1, 3, 1024])

In [63]:
# The closing boundary used for im_start_indexes: total sequence length as a
# 1-element tensor matching input_ids in device and dtype.
total_len = thinker_result.sequences.shape[-1]
torch.tensor([total_len], device=input_ids.device, dtype=input_ids.dtype)

tensor([133], device='cuda:0')

In [66]:
# Total number of positions (prompt + generated) in the thinker output.
full_sequences = thinker_result.sequences
full_sequences.shape[-1]

133

In [61]:
# Shape of the prompt token ids taken from the processor output.
input_ids.shape

torch.Size([1, 51])

In [36]:
# Display the thinker sub-module to inspect its layer structure.
model.thinker

Qwen3OmniMoeThinkerForConditionalGeneration(
  (audio_tower): Qwen3OmniMoeAudioEncoder(
    (positional_embedding): SinusoidsPositionEmbedding()
    (layers): ModuleList(
      (0-31): 32 x Qwen3OmniMoeAudioEncoderLayer(
        (self_attn): Qwen3OmniMoeAudioAttention(
          (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((1280,), eps=1e-05, element

## vLLM Inference

In [1]:
import os
os.environ['VLLM_USE_V1']='0'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch

from vllm import LLM, SamplingParams
from transformers import Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

from ipywebrtc import CameraStream, AudioRecorder
import soundfile as sf
from librosa import load
import io

INFO 11-16 09:18:54 [__init__.py:244] Automatically detected platform cuda.


In [2]:
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"

# Per-prompt media limits: one audio clip, no images, no videos.
MM_LIMITS = {'image': 0, 'video': 0, 'audio': 1}

# Engine settings gathered in one place so they are easy to tune.
engine_kwargs = dict(
    model=MODEL_PATH,
    trust_remote_code=True,
    gpu_memory_utilization=0.95,
    limit_mm_per_prompt=MM_LIMITS,
    max_num_seqs=8,
    max_model_len=16384,
    dtype=torch.float16,
)
llm = LLM(**engine_kwargs)

# The processor is still needed on the client side to render the chat template.
processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'interleaved', 'mrope_interleaved'}
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'interleaved', 'mrope_section'}


INFO 11-16 09:19:03 [config.py:841] This model supports multiple tasks: {'reward', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 11-16 09:19:03 [config.py:1472] Using max model len 16384
INFO 11-16 09:19:06 [llm_engine.py:230] Initializing a V0 LLM engine (v0.11.1rc7.dev227+g26918cb13.d20251116) with config: model='Qwen/Qwen3-Omni-30B-A3B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen3-Omni-30B-A3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='xgrammar', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=No

You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour


INFO 11-16 09:19:08 [weight_utils.py:292] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/15 [00:00<?, ?it/s]


INFO 11-16 09:23:00 [default_loader.py:272] Loading weights took 232.07 seconds
INFO 11-16 09:23:01 [model_runner.py:1203] Model loading took 59.1623 GiB and 233.194678 seconds


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
  torch.tensor([1] * torch.tensor(video_grid_thw).shape[0]))


INFO 11-16 09:23:09 [worker.py:294] Memory profiling takes 7.66 seconds
INFO 11-16 09:23:09 [worker.py:294] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.95) = 75.29GiB
INFO 11-16 09:23:09 [worker.py:294] model weights take 59.16GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.74GiB; the rest of the memory reserved for KV Cache is 14.29GiB.
INFO 11-16 09:23:10 [executor_base.py:113] # cuda blocks: 9757, # CPU blocks: 2730
INFO 11-16 09:23:10 [executor_base.py:118] Maximum concurrency for 16384 tokens per request: 9.53x
INFO 11-16 09:23:15 [model_runner.py:1513] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_s

Capturing CUDA graph shapes:   0%|          | 0/4 [00:00<?, ?it/s]

INFO 11-16 09:23:20 [model_runner.py:1671] Graph capturing finished in 5 secs, took 0.11 GiB
INFO 11-16 09:23:20 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 18.90 seconds


In [3]:
# Media constraints for the browser stream: audio on, video off.
MIC_CONSTRAINTS = {
    'facing_mode': 'user',
    'audio': True,
    'video': False,
}
camera = CameraStream(constraints=MIC_CONSTRAINTS)

# Recorder widget bound to that stream. The inference cell reads
# "audio_stream.webm" — presumably the file produced by autosave with this
# filename/format; confirm against ipywebrtc behaviour.
recorder_options = dict(
    stream=camera,
    format="webm",
    recording=True,
    filename="audio_stream",
    autosave=True,
)
recorder = AudioRecorder(**recorder_options)
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), autosave=True, filename='audio_stream', recording=True, s…

In [11]:
# One flag for both media extraction and the vLLM multimodal processor. The two
# call sites previously disagreed (False vs True); the prompt is audio-only, so
# there is no video track to take audio from.
USE_AUDIO_IN_VIDEO = False

# Single user turn: an empty text part plus the recorded audio clip.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": ""},
            {"type": "audio", "audio": "audio_stream.webm"},
        ],
    }
]

sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.95,
    top_k=20,
    max_tokens=16384,
)

# Render the chat template to a prompt string; vLLM tokenizes it itself.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
audios, images, videos = process_mm_info(messages, use_audio_in_video=USE_AUDIO_IN_VIDEO)

inputs = {
    'prompt': text,
    'multi_modal_data': {},
    "mm_processor_kwargs": {
        "use_audio_in_video": USE_AUDIO_IN_VIDEO,
    },
}

# Attach only the modalities that are actually present in the conversation.
for modality, payload in (('image', images), ('video', videos), ('audio', audios)):
    if payload is not None:
        inputs['multi_modal_data'][modality] = payload

outputs = llm.generate(
    [inputs],
    sampling_params=sampling_params
)

print(outputs[0].outputs[0].text)

  librosa.load(
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  torch.tensor([1] * torch.tensor(video_grid_thw).shape[0]))


SNL can refer to several things, depending on the context. Here are the most common meanings:

1. **Saturday Night Live** – A long-running American live television comedy sketch show that airs on NBC. It first premiered in 1975 and is known for its satirical sketches, celebrity hosts, and musical performances. The show has launched the careers of many famous comedians and actors.

2. **Sodium Nitroprusside (SNP)** – In medical contexts, SNL might be a typo or shorthand for **Sodium Nitroprusside**, a medication used to treat high blood pressure and certain heart conditions. However, the standard abbreviation is **SNP**, not SNL.

3. **SNL (Software)** – In software development, SNL can refer to **Simulation Network Language**, a domain-specific language used in modeling and simulation, particularly in defense and aerospace applications.

4. **SNL (Society)** – In some contexts, it could refer to an organization or society, such as the **Society of Nuclear Medicine and Molecular Imaging

In [12]:
# Latency summary for the first request, derived from the timestamps vLLM
# attaches to the request output. All intervals are measured from the moment
# the request was first scheduled, so queueing time is reported separately.
num_tokens = len(outputs[0].outputs[0].token_ids)
metrics = outputs[0].metrics
time_in_queue = metrics.time_in_queue
time_to_first_token = metrics.first_token_time - metrics.first_scheduled_time
e2e_latency = metrics.finished_time - metrics.first_scheduled_time

# Per-token averages are undefined for very short generations; report NaN
# instead of raising ZeroDivisionError.
time_per_output_token = e2e_latency / num_tokens if num_tokens > 0 else float("nan")
# Inter-token latency averages the gaps between consecutive tokens, so it needs
# at least two generated tokens.
inter_token_latency = (
    (metrics.last_token_time - metrics.first_token_time) / (num_tokens - 1)
    if num_tokens > 1
    else float("nan")
)

print(f"Time to first token: {time_to_first_token*1000} ms")
print(f"E2E latency: {e2e_latency*1000} ms")
print(f"Time per output token: {time_per_output_token*1000} ms")
print(f"Inter-token latency: {inter_token_latency*1000} ms")
print(f"Time in queue: {time_in_queue*1000} ms")

Time to first token: 127.66051292419434 ms
E2E latency: 3524.7745513916016 ms
Time per output token: 10.45927166585045 ms
Inter-token latency: 10.109465746652512 ms
Time in queue: 638.0672454833984 ms


## Realtime Camera Stream

In [1]:
from ipywebrtc import CameraStream, AudioRecorder
from IPython.display import Audio, display
import torch
import io

# Fetch the Silero VAD model and its helper tuple via torch.hub (a cached copy
# is reused because force_reload is False).
vad_bundle = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=False,
)
model, utils = vad_bundle

# Only the first and third helpers are needed; the remaining ones are discarded.
get_speech_timestamps, _, read_audio, *_ = utils

Using cache found in /root/.cache/torch/hub/snakers4_silero-vad_master


In [2]:
# Media constraints for the browser stream: audio on, video off.
MIC_CONSTRAINTS = {
    'facing_mode': 'user',
    'audio': True,
    'video': False,
}
camera = CameraStream(constraints=MIC_CONSTRAINTS)

# In-memory recorder: the captured clip is read back from `recorder.audio.value`.
recorder = AudioRecorder(stream=camera)
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'facing_mode': 'user', '…

In [64]:
# Sample rate (Hz) shared by decoding, voice-activity detection and playback.
VAD_SAMPLE_RATE = 16000

# Snapshot the encoded recording held by the widget.
audio_bytes = bytes(recorder.audio.value)
if not audio_bytes:
    # Fail with an actionable message instead of an obscure decoder error.
    raise ValueError(
        "No audio has been recorded yet: record a clip with the widget before running this cell."
    )

# NOTE(review): read_audio is handed the raw encoded bytes, as before — confirm
# the installed silero-vad/torchaudio accepts bytes rather than a path or file object.
audio = read_audio(audio_bytes, sampling_rate=VAD_SAMPLE_RATE)

# Each timestamp's "start"/"end" is used directly as a slice index into `audio`.
speech_timestamps = get_speech_timestamps(audio, model, sampling_rate=VAD_SAMPLE_RATE)
for ts in speech_timestamps:
    start, end = ts["start"], ts["end"]

    arr = audio[start:end]

    # One audio player per detected speech segment.
    display(Audio(arr, rate=VAD_SAMPLE_RATE))

## Realtime Streaming