### Video Information Extracting with Qwen2.5-Omni

This notebook demonstrates how to use Qwen2.5-Omni to obtain information from the video stream.

In [2]:
# Cell 1: Install if needed (optional)
# !pip install torch transformers peft imageio opencv-python pillow qwen-omni-utils

# Cell 2: Imports
import os
import torch
import numpy as np
from PIL import Image
from IPython.display import Video, display
import imageio
import cv2

from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor, BitsAndBytesConfig
from peft import PeftModel
from qwen_omni_utils import process_mm_info
from generator import SyntheticVideoDataset  # Adjust if needed

# Cell 1: Load OmegaConf config just like Hydra
from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/config.yaml")
OmegaConf.to_container(cfg, resolve=True)  # To view if needed



{'defaults': ['_self_'],
 'dataset': {'use_video_files': False,
  'num_videos': 100,
  'num_classes': 5,
  'frames_per_video': 64,
  'frame_height': 56,
  'frame_width': 56,
  'fps': 24,
  'max_skip': 1,
  'seed': 42,
  'temp_folder': './temp_videos/',
  'delete_videos': False},
 'model': {'base_model': 'Qwen/Qwen2.5-Omni-7B',
  'use_4bit': False,
  'lora_r': 64,
  'lora_alpha': 128,
  'lora_target_modules': ['q_proj',
   'v_proj',
   'k_proj',
   'o_proj',
   'gate_proj',
   'up_proj',
   'down_proj'],
  'lora_weights': 'qwen_video_lora',
  'print_parameters': True},
 'train': {'batch_size': 4,
  'num_epochs': 8,
  'learning_rate': 2e-05,
  'precision': 'bf16',
  'system_prompt': 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.',
  'val_split': 0.2,
  'eval_after_steps': 10000},
 'wandb': {'project': 'qwen-video-classification',
  'run_name': 'qwen_lora_run',
  'mode': 'di

In [3]:
# Cell 2: Load model using config
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor, BitsAndBytesConfig
from peft import PeftModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = cfg.model.base_model
lora_weights = cfg.model.lora_weights

model = Qwen2_5OmniModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2")
model = model.thinker

model = PeftModel.from_pretrained(model, lora_weights)
model = model.to(device).eval()

processor = Qwen2_5OmniProcessor.from_pretrained(model_name)


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Qwen2_5OmniToken2WavModel must inference with fp32, but flash_attention_2 only supports fp16 and bf16, attention implementation of Qwen2_5OmniToken2WavModel will fallback to sdpa.
Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


Load model and processors.

In [37]:
# Cell 3: Load or generate video
from generator import SyntheticVideoDataset
from PIL import Image
import numpy as np
import imageio
import os

def save_video(frames, path=None, fps=None):
    if path is None:
        path = "synthetic.mp4"
    if fps is None:
        fps = 24  
    
    os.makedirs(os.path.dirname(path), exist_ok=True)
    writer = imageio.get_writer(path, fps=fps, codec="libx264", format="FFMPEG")
    for f in frames:
        writer.append_data(np.array(f.convert("RGB")))
    writer.close()

dataset = SyntheticVideoDataset(
    num_videos=cfg.dataset.num_videos,
    num_classes=cfg.dataset.num_classes,
    frames_per_video=cfg.dataset.frames_per_video,
    frame_height=cfg.dataset.frame_height,
    frame_width=cfg.dataset.frame_width,
    max_skip=cfg.dataset.max_skip,
    seed=None
)
import random
index = random.randint(0, len(dataset) - 1)
frames, label_id = dataset[index]
label_text = dataset.classes[label_id]
video_path = os.path.join(cfg.dataset.temp_folder, "demo_video.mp4")
save_video(frames,  video_path, fps=cfg.dataset.fps)





huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [38]:
# Cell 4: Display the video
from IPython.display import Video, display

display(Video(video_path, width=320, height=320))


In [18]:
# Cell 5: Inference function
from qwen_omni_utils import process_mm_info

def run_inference(model, processor, sys_prompt, prompt_text, video_frames, device, max_new_tokens=100):
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
            {"type": "text", "text": prompt_text},
            {"type": "video", "video": video_frames},
        ]}
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    audios, images, videos = process_mm_info(messages, use_audio_in_video=False)

    inputs = processor(
        text=text,
        audios=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        truncation=True,
        use_audio_in_video=False
    )
    inputs = inputs.to(device).to(model.dtype)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens
        )

    return processor.batch_decode(output, skip_special_tokens=True)[0].strip()


In [19]:
# Cell 6: Final run
sys_prompt = cfg.train.system_prompt
prompt_text = "Please classify this video. Choose one of: horizontal_move, vertical_move, blinking_dot, random_teleport, bouncing_diag. Respond with only the label."

prediction = run_inference(model, processor, sys_prompt, prompt_text, frames, device, cfg.demo.max_new_tokens)

print(f"🧠 Ground Truth: {label_text}")
print(f"🤖 Model Prediction: {prediction}")


🧠 Ground Truth: horizontal move
🤖 Model Prediction: system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.
user
Please classify this video. Choose one of: horizontal_move, vertical_move, blinking_dot, random_teleport, bouncing_diag. Respond with only the label.
assistant
horizontal_move
