In [1]:
import torch
from PIL import Image
import requests
from transformers import AutoImageProcessor, AutoModel
import cv2
import torch
from transformers.image_utils import load_image

In [None]:
def video2frames(vidfile, save_folder):
    """ Convert input video to images """
    count = 0
    cap = cv2.VideoCapture(vidfile)
    while(cap.isOpened()):
        ret, frame = cap.read()
        if ret == True:
            cv2.imwrite(f'{save_folder}/{count:04d}.jpg', frame)
            count += 1
        else:
            break
    cap.release()
    return count



In [None]:
vid_example = '/home/milo/Documents/phd/VideoMimic/tram/example_video.mov'
outpath = '/home/milo/Documents/phd/VideoMimic/src/videomimic/data/example'
video2frames(vid_example, outpath)

In [None]:
from huggingface_hub import notebook_login
notebook_login()
# 

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
def load_dinov2_model():
    processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
    model = AutoModel.from_pretrained('facebook/dinov2-base')
    return processor, model

def extract_dinov2_features(images, processor, model):
    inputs = processor(images=images, return_tensors="pt")
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    return last_hidden_states[:, 0, :]



In [4]:
def load_dinov3_model():    
    processor = AutoImageProcessor.from_pretrained("facebook/dinov3-vitl16-pretrain-lvd1689m")
    model = AutoModel.from_pretrained("facebook/dinov3-vitl16-pretrain-lvd1689m")
    return processor, model

def extract_dinov3_features(images, processor, model):
    inputs = processor(images=images, return_tensors="pt")
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    return last_hidden_states[:, 0, :]


In [11]:
processor, model = load_dinov2_model()


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
dv3_processor, dv3_model = load_dinov3_model()

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
import time
import torch
import numpy as np
from PIL import Image
import os
from pathlib import Path

def benchmark_dinov3_inference(processor, model, image_paths, batch_sizes=[1, 2, 4, 8, 16], num_warmup=5, num_iterations=20):
    """
    Benchmark DINOv3 inference speed with different batch sizes
    
    Args:
        processor: DINOv3 image processor
        model: DINOv3 model
        image_paths: List of image paths to use for testing
        batch_sizes: List of batch sizes to test
        num_warmup: Number of warmup iterations
        num_iterations: Number of timing iterations
    
    Returns:
        Dictionary with timing results for each batch size
    """
    device = next(model.parameters()).device
    print(f"Running on device: {device}")
    
    # Load all images once
    images = []
    for path in image_paths:
        if os.path.exists(path):
            image = load_image(path)
            images.append(image)
    
    print(f"Loaded {len(images)} images for testing")
    
    results = {}
    
    for batch_size in batch_sizes:
        print(f"\nTesting batch size: {batch_size}")
        
        # Prepare batches
        batches = []
        for i in range(0, len(images), batch_size):
            batch = images[i:i+batch_size]
            batches.append(batch)
        
        if not batches:
            print(f"No batches created for batch size {batch_size}")
            continue
            
        # Warmup
        print(f"Warming up with {num_warmup} iterations...")
        for i in range(num_warmup):
            batch = batches[i % len(batches)]
            with torch.no_grad():
                inputs = processor(images=batch, return_tensors="pt").to(device)
                outputs = model(**inputs)
        
        # Timing
        print(f"Running {num_iterations} timing iterations...")
        times = []
        
        for i in range(num_iterations):
            batch = batches[i % len(batches)]
            
            start_time = time.time()
            with torch.no_grad():
                inputs = processor(images=batch, return_tensors="pt").to(device)
                outputs = model(**inputs)
                features = outputs.last_hidden_state[:, 0, :]  # CLS token
            end_time = time.time()
            
            inference_time = end_time - start_time
            times.append(inference_time)
        
        # Calculate statistics
        times = np.array(times)
        mean_time = np.mean(times)
        std_time = np.std(times)
        fps = batch_size / mean_time
        
        results[batch_size] = {
            'mean_time': mean_time,
            'std_time': std_time,
            'fps': fps,
            'min_time': np.min(times),
            'max_time': np.max(times)
        }
        
        print(f"Batch size {batch_size}:")
        print(f"  Mean inference time: {mean_time:.4f}s ± {std_time:.4f}s")
        print(f"  FPS: {fps:.2f}")
        print(f"  Min time: {np.min(times):.4f}s")
        print(f"  Max time: {np.max(times):.4f}s")
    
    return results


In [7]:
# Check if GPU is available and move model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Move model to device
dv3_model = dv3_model.to(device)
print(f"Model moved to {device}")

# Check GPU memory if available
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"Available GPU Memory: {torch.cuda.memory_reserved(0) / 1e9:.1f} GB")


Using device: cuda
Model moved to cuda
GPU: NVIDIA GeForce RTX 5090
GPU Memory: 33.6 GB
Available GPU Memory: 1.2 GB


In [8]:
# Prepare test images - use the example video frames
example_dir = '/home/milo/Documents/phd/VideoMimic/src/videomimic/data/example'
image_paths = []

if os.path.exists(example_dir):
    # Get all jpg files from the example directory
    for file in os.listdir(example_dir):
        if file.endswith('.jpg'):
            image_paths.append(os.path.join(example_dir, file))
    
    # Sort to ensure consistent ordering
    image_paths.sort()
    print(f"Found {len(image_paths)} images in {example_dir}")
    
    # Show first few paths
    for i, path in enumerate(image_paths[:5]):
        print(f"  {i}: {os.path.basename(path)}")
else:
    print(f"Directory {example_dir} not found!")
    # Fallback: create a list with the single image we know exists
    single_image = '/home/milo/Documents/phd/VideoMimic/src/videomimic/data/example/0000.jpg'
    if os.path.exists(single_image):
        image_paths = [single_image] * 10  # Duplicate for testing
        print(f"Using single image duplicated 10 times: {single_image}")
    else:
        print("No test images found!")


Found 247 images in /home/milo/Documents/phd/VideoMimic/src/videomimic/data/example
  0: 0000.jpg
  1: 0001.jpg
  2: 0002.jpg
  3: 0003.jpg
  4: 0004.jpg


In [9]:
# Run the benchmark test
print("=" * 60)
print("DINOv3 INFERENCE SPEED BENCHMARK")
print("=" * 60)

# Test different batch sizes to find optimal throughput
batch_sizes_to_test = [1, 2, 4, 8, 16, 32] if len(image_paths) >= 32 else [1, 2, 4, 8]

results = benchmark_dinov3_inference(
    processor=dv3_processor,
    model=dv3_model,
    image_paths=image_paths,
    batch_sizes=batch_sizes_to_test,
    num_warmup=3,
    num_iterations=10
)


DINOv3 INFERENCE SPEED BENCHMARK
Running on device: cuda:0
Loaded 247 images for testing

Testing batch size: 1
Warming up with 3 iterations...
Running 10 timing iterations...
Batch size 1:
  Mean inference time: 0.0103s ± 0.0002s
  FPS: 97.27
  Min time: 0.0099s
  Max time: 0.0107s

Testing batch size: 2
Warming up with 3 iterations...
Running 10 timing iterations...
Batch size 2:
  Mean inference time: 0.0150s ± 0.0008s
  FPS: 133.51
  Min time: 0.0141s
  Max time: 0.0172s

Testing batch size: 4
Warming up with 3 iterations...
Running 10 timing iterations...
Batch size 4:
  Mean inference time: 0.0291s ± 0.0013s
  FPS: 137.49
  Min time: 0.0276s
  Max time: 0.0324s

Testing batch size: 8
Warming up with 3 iterations...
Running 10 timing iterations...
Batch size 8:
  Mean inference time: 0.0520s ± 0.0017s
  FPS: 153.74
  Min time: 0.0503s
  Max time: 0.0560s

Testing batch size: 16
Warming up with 3 iterations...
Running 10 timing iterations...
Batch size 16:
  Mean inference time: 0.

In [None]:
# Analyze results and provide recommendations
print("\n" + "=" * 60)
print("BENCHMARK RESULTS SUMMARY")
print("=" * 60)

best_fps = 0
best_batch_size = 1
target_fps = 30

print(f"{'Batch Size':<12} {'FPS':<8} {'Time (ms)':<12} {'Status':<15}")
print("-" * 50)

for batch_size in sorted(results.keys()):
    fps = results[batch_size]['fps']
    time_ms = results[batch_size]['mean_time'] * 1000
    status = "✓ TARGET MET" if fps >= target_fps else "✗ Below target"
    
    print(f"{batch_size:<12} {fps:<8.2f} {time_ms:<12.2f} {status:<15}")
    
    if fps > best_fps:
        best_fps = fps
        best_batch_size = batch_size

print(f"\nBest performance: {best_fps:.2f} FPS with batch size {best_batch_size}")

if best_fps >= target_fps:
    print(f"🎉 SUCCESS: Achieved {best_fps:.2f} FPS (target: {target_fps} FPS)")
    print(f"Recommended batch size: {best_batch_size}")
else:
    print(f"⚠️  WARNING: Best FPS ({best_fps:.2f}) is below target ({target_fps} FPS)")
    print("Consider:")
    print("  - Using a smaller/faster model (e.g., DINOv2-base instead of DINOv3-large)")
    print("  - Optimizing image preprocessing")
    print("  - Using model quantization")
    print("  - Running on a more powerful GPU")


In [None]:
# Additional optimization test: Compare DINOv2 vs DINOv3 performance
print("\n" + "=" * 60)
print("DINOv2 vs DINOv3 COMPARISON")
print("=" * 60)

# Test DINOv2 for comparison
print("Testing DINOv2-base performance...")
processor_dv2, model_dv2 = load_dinov2_model()
model_dv2 = model_dv2.to(device)

# Quick test with batch size 1
test_image = load_image(image_paths[0])
times_dv2 = []

print("Warming up DINOv2...")
for _ in range(3):
    with torch.no_grad():
        inputs = processor_dv2(images=test_image, return_tensors="pt").to(device)
        outputs = model_dv2(**inputs)

print("Timing DINOv2...")
for _ in range(10):
    start_time = time.time()
    with torch.no_grad():
        inputs = processor_dv2(images=test_image, return_tensors="pt").to(device)
        outputs = model_dv2(**inputs)
    end_time = time.time()
    times_dv2.append(end_time - start_time)

dv2_fps = 1.0 / np.mean(times_dv2)
dv2_time_ms = np.mean(times_dv2) * 1000

print(f"DINOv2-base: {dv2_fps:.2f} FPS ({dv2_time_ms:.2f} ms per image)")
print(f"DINOv3-large: {results[1]['fps']:.2f} FPS ({results[1]['mean_time']*1000:.2f} ms per image)")

if dv2_fps >= target_fps:
    print(f"✅ DINOv2-base meets target FPS ({target_fps})")
else:
    print(f"❌ DINOv2-base also below target FPS ({target_fps})")


In [None]:
url = "/home/milo/Documents/phd/VideoMimic/src/videomimic/data/example/0000.jpg"
image = load_image(url)
features = extract_dinov2_features(image, processor, model)

In [25]:
print(image.size)
print(features.shape)


(720, 1280)
torch.Size([1, 768])


In [5]:
url = "/home/milo/Documents/phd/VideoMimic/src/videomimic/data/example/0000.jpg"
image = load_image(url)
image_list = [image, image]
features = extract_dinov3_features(image_list, dv3_processor, dv3_model)
print(image.size)
print(features.shape)

(720, 1280)
torch.Size([2, 1024])
