# Install requirements

In [4]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Collecting openvino==2025.2.0 (from -r requirements.txt (line 3))
  Using cached openvino-2025.2.0-19140-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting numpy==2.2.6 (from -r requirements.txt (line 4))
  Using cached numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting opencv-python==4.11.0.86 (from -r requirements.txt (line 5))
  Using cached opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata (20 kB)
Collecting pillow==11.3.0 (from -r requirements.txt (line 6))
  Using cached pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting transformers==4.53.3 (from -r requirements.txt (line 7))
  Using cached transformers-4.53.3-py3-none-any.whl.metadata (40 kB)
Collecting torch==2.8.0 (from -r requirements.txt (line 8))
  Using cached https://download.pytorch.org/whl/cpu/torch-2.8.0-cp312-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting

# Pre steps to Optimize and upload to HF

In [None]:
# Save model to OpenVINO format

from optimum.intel.openvino import OVModelForVisualCausalLM

# First time: export and save
model = OVModelForVisualCausalLM.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf", 
    export=True,
    trust_remote_code=True
)

model.save_pretrained("./llava_openvino_model")

# Future times: load from local saved version (much faster)
model = OVModelForVisualCausalLM.from_pretrained("./llava_openvino_model")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 22.37it/s]
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor

## Upload OV model to HF 

In [None]:
# Create model card and upload to Hugging Face Hub

from huggingface_hub import HfApi, create_repo
import os

# Replace with your desired repo name
REPO_NAME = "llava-next-video-openvino"  # Change this to your preferred name
HF_USERNAME = "ezelanza"  # Replace with your HF username

# Create model card content
model_card = """---
license: apache-2.0
base_model: llava-hf/LLaVA-NeXT-Video-7B-hf
tags:
- openvino
- llava
- multimodal
- video
- visual-question-answering
---

# LLaVA-NeXT-Video OpenVINO Model

This is an OpenVINO optimized version of the LLaVA-NeXT-Video-7B-hf model.

## Model Description
- **Base Model**: llava-hf/LLaVA-NeXT-Video-7B-hf
- **Optimization**: Converted to OpenVINO format for efficient inference
- **Size**: ~7B parameters

## Usage

```python
from optimum.intel.openvino import OVModelForVisualCausalLM

model = OVModelForVisualCausalLM.from_pretrained("YOUR_USERNAME/llava-next-video-openvino")
```

## License
This model inherits the license from the original LLaVA-NeXT model.
"""

# Save model card
with open("README.md", "w") as f:
    f.write(model_card)

print("Model card created: README.md")


In [None]:
# Upload model to Hugging Face Hub

from huggingface_hub import HfApi
import os
# Login to Hugging Face

from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)

# Configuration - UPDATE THESE VALUES
REPO_NAME = "ezelanza/llava-next-video-openvino"  # Your desired repo name
# The username will be automatically detected from your login

api = HfApi()

# Create repository
try:
    repo_url = api.create_repo(
        repo_id=REPO_NAME,
        exist_ok=True,
        repo_type="model"
    )
    print(f"Repository created/exists: {repo_url}")
except Exception as e:
    print(f"Repository creation error: {e}")

# Upload model files if they exist
if os.path.exists("./llava_openvino_model"):
    print("Uploading model files...")
    api.upload_folder(
        folder_path="./llava_openvino_model",
        repo_id=REPO_NAME,
        repo_type="model"
    )
    
    # Upload README
    if os.path.exists("README.md"):
        api.upload_file(
            path_or_fileobj="README.md",
            path_in_repo="README.md",
            repo_id=REPO_NAME,
            repo_type="model"
        )
    
    print(f"✅ Model uploaded successfully!")
    print(f"🔗 View your model at: https://huggingface.co/{api.whoami()['name']}/{REPO_NAME}")
else:
    print("❌ Model directory './llava_openvino_model' not found.")
    print("Run the first cell to save the model first.")


## Optimize the model 

In [None]:
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import LlavaNextVideoProcessor
from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)
model_id = "ezelanza/llava-next-video-openvino"


model = OVModelForVisualCausalLM.from_pretrained(model_id)
processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")


In [None]:
from optimum.intel import OVQuantizationConfig, OVWeightQuantizationConfig, OVPipelineQuantizationConfig

dataset, num_samples = "contextual", 50

# weight-only 8bit
woq_8bit = OVWeightQuantizationConfig(bits=8)

# weight-only 4bit
woq_4bit = OVWeightQuantizationConfig(bits=4, group_size=16)

# static quantization
static_8bit = OVQuantizationConfig(bits=8, dataset=dataset, num_samples=num_samples)

# pipeline quantization: applying different quantization on each components
ppl_q = OVPipelineQuantizationConfig(
    quantization_configs={
        "lm_model": OVQuantizationConfig(bits=8),
        "multimodal_model": OVWeightQuantizationConfig(bits=8),
        "text_embeddings_model": OVWeightQuantizationConfig(bits=8),
        "vision_embeddings_model": OVWeightQuantizationConfig(bits=8),
        "vision_model": OVWeightQuantizationConfig(bits=8) 
    },
    dataset=dataset,
    num_samples=num_samples,
)

In [None]:
from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

model_id = "ezelanza/llava-next-video-openvino"

q_model = OVModelForVisualCausalLM.from_pretrained(model_id, quantization_config=woq_8bit)
int8_model_path = "llava_next_video_int8"
q_model.save_pretrained(int8_model_path)

In [None]:
# Upload model to Hugging Face Hub

from huggingface_hub import HfApi
import os
# Login to Hugging Face

from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)

# Configuration - UPDATE THESE VALUES
REPO_NAME = "ezelanza/llava-next-video-openvino-int8"  # Your desired repo name
# The username will be automatically detected from your login

api = HfApi()

# Create repository
try:
    repo_url = api.create_repo(
        repo_id=REPO_NAME,
        exist_ok=True,
        repo_type="model"
    )
    print(f"Repository created/exists: {repo_url}")
except Exception as e:
    print(f"Repository creation error: {e}")

# Upload model files if they exist
if os.path.exists("./llava_next_video_int8"):
    print("Uploading model files...")
    api.upload_folder(
        folder_path="./llava_next_video_int8",
        repo_id=REPO_NAME,
        repo_type="model"
    )
    
    # Upload README
    if os.path.exists("README.md"):
        api.upload_file(
            path_or_fileobj="README.md",
            path_in_repo="README.md",
            repo_id=REPO_NAME,
            repo_type="model"
        )
    
    print(f"✅ Model uploaded successfully!")
    print(f"🔗 View your model at: https://huggingface.co/{api.whoami()['name']}/{REPO_NAME}")
else:
    print("❌ Model directory './llava_openvino_model' not found.")
    print("Run the first cell to save the model first.")

# Run inference with HF video dataset

In [1]:
from huggingface_hub import hf_hub_download 
from transformers import LlavaNextVideoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import LlavaNextVideoProcessor
from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)

#load model in memory
model_id = "ezelanza/llava-next-video-openvino-int8"

processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = OVModelForVisualCausalLM.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm


Go to: https://huggingface.co/settings/tokens
Create a new token with WRITE permissions



You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")

conversation = [
    {

        "role": "user",
        "content": [
            {"type": "text", "text": "What is happening in the video?"},
            {"type": "video", "path": video_path},
            ],
    },
]

inputs = processor.apply_chat_template(
    conversation,
    num_frames=4,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True
)


Unused or unrecognized kwargs: return_tensors.


In [3]:
output = model.generate(**inputs, max_new_tokens=60)
    
response = processor.batch_decode(
        output,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )[0]

    
if "ASSISTANT:" in response:
        description = response.split("ASSISTANT:")[-1].strip()
else:
        description = response.strip()
    
print(f"CAPTION GENERATED (video frames): {description}")

CAPTION GENERATED (video frames): In the video, we see a young child sitting on a bed, wearing glasses and engrossed in reading a book. The child appears to be focused on the book, possibly reading or looking at the pictures. The room has a cozy and lived-in feel, with various items


# Run inference with frames array (from HF video)

In [1]:
import cv2
import numpy as np
from pathlib import Path
import time
from transformers import LlavaNextVideoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM
from huggingface_hub import hf_hub_download 

def extract_video_frames(video_path, num_frames=4, width=36, height=36):
    """Extract evenly spaced frames from a video file."""
    cap = cv2.VideoCapture(video_path)
    
    if not cap.isOpened():
        print("Error: Could not open video file")
        return []
    
    total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    duration = total_video_frames / fps
    
    print(f"Video info: {total_video_frames} frames, {fps:.1f} FPS, {duration:.1f} seconds")
    
    # Calculate frame indices to extract (evenly spaced)
    frame_indices = np.linspace(0, total_video_frames-1, num_frames, dtype=int)
    
    frames = []
    for i, frame_idx in enumerate(frame_indices):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()
        if ret:
            # Resize frame to reduce processing time
            frame_resized = cv2.resize(frame, (width, height))
            # Convert BGR to RGB
            frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
            frames.append(frame_rgb)
            print(f"Extracted frame {i+1}/{num_frames} at frame {frame_idx}")
    
    cap.release()
    return frames

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")

# Extract frames from video
frames = extract_video_frames(video_path, num_frames=4, width=120, height=80)
    
# Save frames as temporary images
frame_paths = []
for i, frame in enumerate(frames):
    frame_path = f"video_frame_{i}.jpg"
    cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    frame_paths.append(Path(frame_path))
    print(f"Saved frame {i+1} as {frame_path}")


Video info: 243 frames, 25.0 FPS, 9.7 seconds
Extracted frame 1/4 at frame 0
Extracted frame 2/4 at frame 80
Extracted frame 3/4 at frame 161
Extracted frame 4/4 at frame 242
Saved frame 1 as video_frame_0.jpg
Saved frame 2 as video_frame_1.jpg
Saved frame 3 as video_frame_2.jpg
Saved frame 4 as video_frame_3.jpg


In [4]:
# Use frames as images in conversation
#load model in memory
model_id = "ezelanza/llava-next-video-openvino-int8"

processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = OVModelForVisualCausalLM.from_pretrained(model_id)
    

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
conversation_with_frames = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe what you see in these images. What is happening?"},
                *[{"type": "image", "image": path.as_posix()} for path in frame_paths],
            ],
        },
    ]
    
    # Process with the same model and processor
inputs_with_frames = processor.apply_chat_template(
        conversation_with_frames,
        num_frames=1,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True
    )
    
    # Generate response
out_with_frames = model.generate(**inputs_with_frames, max_new_tokens=60)
    
response_with_frames = processor.batch_decode(
        out_with_frames,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )[0]
    
if "ASSISTANT:" in response_with_frames:
        description_with_frames = response_with_frames.split("ASSISTANT:")[-1].strip()
else:
        description_with_frames = response_with_frames.strip()
    
print(f"CAPTION GENERATED (video frames): {description_with_frames}")

CAPTION GENERATED (video frames): In the image, there is a young child sitting on a bed, engrossed in reading a book. The child is wearing glasses and appears to be focused on the content of the book, which is open in front of them. The child is dressed in a light-colored top


# Run inference with frames array (from local webcam)

In [1]:
from huggingface_hub import hf_hub_download 
from transformers import LlavaNextVideoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import LlavaNextVideoProcessor
from huggingface_hub import login
import getpass

print("Go to: https://huggingface.co/settings/tokens")
print("Create a new token with WRITE permissions")
print()

token = getpass.getpass("Enter your HF token: ")
login(token=token)

#load model in memory
model_id = "ezelanza/llava-next-video-openvino-int8"

  from .autonotebook import tqdm as notebook_tqdm


Go to: https://huggingface.co/settings/tokens
Create a new token with WRITE permissions



In [6]:
import cv2
import numpy as np
from pathlib import Path
import time
from transformers import LlavaNextVideoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM


def capture_webcam_to_mp4(output_path="output.mp4", duration_seconds=3, fps=4, width=320, height=240):
    """Capture frames from webcam and save as an MP4 video."""
    cap = cv2.VideoCapture(0)  # 0 for default webcam
    
    if not cap.isOpened():
        print("Error: Could not open webcam")
        return False
    
    # Set resolution to reduce processing time
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
    
    # Verify the resolution was set
    actual_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    actual_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    print(f"Webcam resolution set to: {actual_width}x{actual_height}")

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # Codec for .mp4
    out = cv2.VideoWriter(output_path, fourcc, fps, (actual_width, actual_height))
    
    total_frames = int(duration_seconds * fps)
    frame_interval = 1.4 / fps  # Time between frames
    
    print(f"Capturing {total_frames} frames over {duration_seconds} seconds...")
    print("Starting in 3 seconds...")
    
    # Countdown
    for i in range(3, 0, -1):
        print(f"{i}...")
        time.sleep(1)
    
    print("Starting frame capture...")
    start_time = time.time()
    
    for i in range(total_frames):
        ret, frame = cap.read()
        if not ret:
            print(f"Warning: Frame {i} not captured correctly.")
            break
        
        # Resize frame to target size
        frame_resized = cv2.resize(frame, (actual_width, actual_height))
        
        out.write(frame_resized)  # Write frame to video file
        
        elapsed_time = time.time() - start_time
        print(f"Captured frame {i+1}/{total_frames} at {elapsed_time:.1f}s")
        
        # Wait to maintain fps timing
        if i < total_frames - 1:
            time.sleep(frame_interval)
    
    cap.release()
    out.release()
    print(f"Capture complete! Video saved to: {output_path}")
    return True


In [7]:
import cv2
from pathlib import Path

processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = OVModelForVisualCausalLM.from_pretrained(model_id)

output_path="inference.mp4"
capture_webcam_to_mp4(output_path,duration_seconds=3,fps=4)
    
conversation_webcam = [
    {

        "role": "user",
        "content": [
            {"type": "text", "text": "What is happening in the video?"},
            {"type": "video", "path": output_path},
            ],
    },
]

Webcam resolution set to: 320x240
Capturing 12 frames over 3 seconds...
Starting in 3 seconds...
3...
2...
1...
Starting frame capture...
Captured frame 1/12 at 0.2s
Captured frame 2/12 at 0.5s
Captured frame 3/12 at 0.9s
Captured frame 4/12 at 1.3s
Captured frame 5/12 at 1.6s
Captured frame 6/12 at 2.0s
Captured frame 7/12 at 2.3s
Captured frame 8/12 at 2.7s
Captured frame 9/12 at 3.0s
Captured frame 10/12 at 3.4s
Captured frame 11/12 at 3.7s
Captured frame 12/12 at 4.1s
Capture complete! Video saved to: inference.mp4


In [8]:
inputs_webcam = processor.apply_chat_template(
    conversation_webcam,
    num_frames=4,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True
)

out_webcam = model.generate(**inputs_webcam, max_new_tokens=60)

response_webcam = processor.batch_decode(
                    out_webcam,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True
                )[0]

if "ASSISTANT:" in response_webcam:
    description_webcam = response_webcam.split("ASSISTANT:")[-1].strip()
else:
    # If no ASSISTANT marker, use the full response
    description_webcam = response_webcam.strip()

print(f"CAPTION GENERATED: {description_webcam}")

Unused or unrecognized kwargs: return_tensors.


CAPTION GENERATED: In the video, we see a man wearing glasses and a black jacket who appears to be in a room with a plant in the background. He is holding up his hand and making a gesture that could be interpreted as a wave or a greeting. The man seems to be in a
