In [1]:
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2"
).to("cuda")

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "Can you describe this image?"},
        ]
    },
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)

In [None]:
import time
import psutil
import threading
import gc

In [None]:
class GPUMemoryMonitor:
    def __init__(self):
        self.peak_memory = 0
        self.monitoring = False
        self.monitor_thread = None
    
    def get_gpu_memory_mb(self):
        """Get current GPU memory usage in MB"""
        if torch.cuda.is_available():
            return torch.cuda.memory_allocated() / 1024 / 1024
        return 0
    
    def monitor_memory(self):
        """Monitor GPU memory usage in a separate thread"""
        while self.monitoring:
            current_memory = self.get_gpu_memory_mb()
            if current_memory > self.peak_memory:
                self.peak_memory = current_memory
            time.sleep(0.001)  # Check every 1ms
    
    def start_monitoring(self):
        """Start monitoring GPU memory"""
        self.peak_memory = self.get_gpu_memory_mb()
        self.monitoring = True
        self.monitor_thread = threading.Thread(target=self.monitor_memory)
        self.monitor_thread.start()
    
    def stop_monitoring(self):
        """Stop monitoring and return peak memory usage"""
        self.monitoring = False
        if self.monitor_thread:
            self.monitor_thread.join()
        return self.peak_memory

# Initialize the memory monitor
memory_monitor = GPUMemoryMonitor()

In [5]:
import torch
import gc
import time

# Clear GPU cache before measurement
torch.cuda.empty_cache()
gc.collect()

# Get initial memory usage
initial_memory = memory_monitor.get_gpu_memory_mb()
print(f"Initial GPU memory usage: {initial_memory:.2f} MB")

# Start memory monitoring
memory_monitor.start_monitoring()

# Start timing
start_time = time.time()

# Generate text
generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)

# Stop timing
inference_time = time.time() - start_time

# Stop memory monitoring
peak_memory = memory_monitor.stop_monitoring()

# Decode the generated text
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

# Calculate memory usage
memory_increase = peak_memory - initial_memory
final_memory = memory_monitor.get_gpu_memory_mb()

# Print results
print(f"\n=== Performance Metrics ===")
print(f"Generated text: {generated_texts[0]}")
print(f"\n=== Timing ===")
print(f"Inference time: {inference_time:.4f} seconds")
print(f"Tokens per second: {64/inference_time:.2f} tokens/sec")

print(f"\n=== Memory Usage ===")
print(f"Initial GPU memory: {initial_memory:.2f} MB")
print(f"Peak GPU memory: {peak_memory:.2f} MB")
print(f"Final GPU memory: {final_memory:.2f} MB")
print(f"Memory increase during generation: {memory_increase:.2f} MB")

# Additional GPU memory info
if torch.cuda.is_available():
    print(f"\n=== GPU Memory Details ===")
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024 / 1024:.2f} MB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024 / 1024:.2f} MB")
    print(f"GPU memory cached: {torch.cuda.memory_cached() / 1024 / 1024:.2f} MB")

Initial GPU memory usage: 4321.53 MB

=== Performance Metrics ===
Generated text: User:



Can you describe this image?
Assistant: The image depicts a close-up view of a bee on a pink flower. The bee is positioned in the center of the flower, with its body prominently visible. The bee appears to be engaged in the act of pollination, as it is surrounded by the petals of the flower. The flower itself is vibrant and has a

=== Timing ===
Inference time: 2.8372 seconds
Tokens per second: 22.56 tokens/sec

=== Memory Usage ===
Initial GPU memory: 4321.53 MB
Peak GPU memory: 4585.85 MB
Final GPU memory: 4330.09 MB
Memory increase during generation: 264.32 MB

=== GPU Memory Details ===
GPU memory allocated: 4330.09 MB
GPU memory reserved: 4908.00 MB
GPU memory cached: 4908.00 MB


  print(f"GPU memory cached: {torch.cuda.memory_cached() / 1024 / 1024:.2f} MB")


In [6]:
import pandas as pd
from tabulate import tabulate
import random

def create_batch_inputs(batch_size):
    """Create batched inputs by processing multiple messages together with proper padding"""
    # Set padding to left for proper batching
    processor.tokenizer.padding_side = "left"
    
    # Create batch_size number of the same message format
    batch_messages = []
    
    for i in range(batch_size):
        batch_messages.append([
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
                    {"type": "text", "text": f"Can you describe this image? please {i} {random.randint(0, 100)}"},
                ]
            }
        ])
    
    # Process each conversation separately then combine
    all_inputs = []
    for single_messages in batch_messages:
        single_input = processor.apply_chat_template(
            single_messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )
        all_inputs.append(single_input)
    
    # Now combine all inputs with proper padding
    # Get the maximum sequence length
    max_len = max(inp['input_ids'].size(1) for inp in all_inputs)
    
    # Pad all sequences to the same length (left padding)
    batch_input_ids = []
    batch_attention_masks = []
    batch_pixel_values = []
    
    for inp in all_inputs:
        input_ids = inp['input_ids'].squeeze(0)  # Remove batch dim
        attention_mask = inp['attention_mask'].squeeze(0)
        
        # Left pad
        pad_length = max_len - len(input_ids)
        if pad_length > 0:
            # Pad with tokenizer.pad_token_id
            pad_token_id = processor.tokenizer.pad_token_id if processor.tokenizer.pad_token_id is not None else processor.tokenizer.eos_token_id
            input_ids = torch.cat([torch.full((pad_length,), pad_token_id, dtype=input_ids.dtype), input_ids])
            attention_mask = torch.cat([torch.zeros(pad_length, dtype=attention_mask.dtype), attention_mask])
        
        batch_input_ids.append(input_ids)
        batch_attention_masks.append(attention_mask)
        if 'pixel_values' in inp:
            batch_pixel_values.append(inp['pixel_values'])
    
    # Stack into batches
    batch_inputs = {
        'input_ids': torch.stack(batch_input_ids).to(model.device),
        'attention_mask': torch.stack(batch_attention_masks).to(model.device),
    }
    
    if batch_pixel_values:
        batch_inputs['pixel_values'] = torch.cat(batch_pixel_values, dim=0).to(model.device, dtype=torch.bfloat16)
    
    return batch_inputs

def measure_batch_performance(batch_size, max_new_tokens=64, num_runs=10):
    """Measure performance for a given batch size with multiple runs for averaging"""
    print(f"Testing batch size: {batch_size} ({num_runs} runs)")
    
    # Create batch inputs once
    try:
        batch_inputs = create_batch_inputs(batch_size)
        
        # Check sequence length before proceeding
        seq_len = batch_inputs['input_ids'].shape[1]
        batch_size_actual = batch_inputs['input_ids'].shape[0]
        print(f"  Batch shape: {batch_inputs['input_ids'].shape} (batch_size={batch_size_actual}, seq_len={seq_len})")
        
        if seq_len > 16384:  # Model's max sequence length
            print(f"  ⚠️  Sequence length {seq_len} exceeds model limit of 16384")
            print(f"  Skipping batch size {batch_size} due to sequence length limit")
            return None
            
    except Exception as e:
        print(f"Error creating batch inputs for size {batch_size}: {e}")
        return None
    
    # Store results from all runs
    all_results = []
    
    for run in range(num_runs):
        print(f"  Run {run + 1}/{num_runs}...", end=" ")
        
        # Clear GPU cache before each measurement
        torch.cuda.empty_cache()
        gc.collect()
        
        # Get initial memory usage
        initial_memory = memory_monitor.get_gpu_memory_mb()
        
        # Start memory monitoring
        memory_monitor.start_monitoring()
        
        # Start timing
        start_time = time.time()
        
        try:
            # Generate text
            generated_ids = model.generate(**batch_inputs, do_sample=False, max_new_tokens=max_new_tokens)
            
            # Stop timing
            inference_time = time.time() - start_time
            
            # Stop memory monitoring
            peak_memory = memory_monitor.stop_monitoring()
            
            # Calculate metrics for this run
            memory_increase = peak_memory - initial_memory
            final_memory = memory_monitor.get_gpu_memory_mb()
            total_tokens = batch_size * max_new_tokens
            tokens_per_second = total_tokens / inference_time
            time_per_sample = inference_time / batch_size
            
            run_result = {
                'inference_time': inference_time,
                'time_per_sample': time_per_sample,
                'tokens_per_second': tokens_per_second,
                'initial_memory': initial_memory,
                'peak_memory': peak_memory,
                'memory_increase': memory_increase,
                'final_memory': final_memory
            }
            
            all_results.append(run_result)
            print(f"✓ {inference_time:.3f}s")
            
        except Exception as e:
            print(f"✗ Error: {e}")
            memory_monitor.stop_monitoring()
            continue
    
    if not all_results:
        print(f"  All runs failed for batch size {batch_size}")
        return None
    
    # Calculate averages
    print(f"  Completed {len(all_results)}/{num_runs} successful runs")
    
    avg_result = {
        'batch_size': batch_size,
        'inference_time': sum(r['inference_time'] for r in all_results) / len(all_results),
        'time_per_sample': sum(r['time_per_sample'] for r in all_results) / len(all_results),
        'tokens_per_second': sum(r['tokens_per_second'] for r in all_results) / len(all_results),
        'initial_memory': sum(r['initial_memory'] for r in all_results) / len(all_results),
        'peak_memory': sum(r['peak_memory'] for r in all_results) / len(all_results),
        'memory_increase': sum(r['memory_increase'] for r in all_results) / len(all_results),
        'final_memory': sum(r['final_memory'] for r in all_results) / len(all_results),
        'num_successful_runs': len(all_results)
    }
    
    return avg_result

# Test different batch sizes - now with proper batching (separate sequences)
batch_sizes = [1, 2, 4, 8, 16, 32, 64]  # Can test larger batches now since sequences aren't concatenated
results = []

print("=== Testing Different Batch Sizes (10 runs each for averaging) ===\n")

for batch_size in batch_sizes:
    result = measure_batch_performance(batch_size, num_runs=10)
    if result is not None:
        results.append(result)
        print(f"✓ Batch size {batch_size} completed successfully (avg from {result['num_successful_runs']} runs)")
    else:
        print(f"✗ Batch size {batch_size} failed")
        break  # Stop if we hit memory limits
    print()

# Create and display results table
if results:
    df = pd.DataFrame(results)
    
    # Format the table
    formatted_df = df.copy()
    formatted_df['inference_time'] = formatted_df['inference_time'].apply(lambda x: f"{x:.4f}s")
    formatted_df['time_per_sample'] = formatted_df['time_per_sample'].apply(lambda x: f"{x:.4f}s")
    formatted_df['tokens_per_second'] = formatted_df['tokens_per_second'].apply(lambda x: f"{x:.2f}")
    formatted_df['initial_memory'] = formatted_df['initial_memory'].apply(lambda x: f"{x:.1f} MB")
    formatted_df['peak_memory'] = formatted_df['peak_memory'].apply(lambda x: f"{x:.1f} MB")
    formatted_df['memory_increase'] = formatted_df['memory_increase'].apply(lambda x: f"{x:.1f} MB")
    formatted_df['final_memory'] = formatted_df['final_memory'].apply(lambda x: f"{x:.1f} MB")
    formatted_df['num_successful_runs'] = formatted_df['num_successful_runs'].apply(lambda x: f"{x}/10")
    
    # Rename columns for better display
    formatted_df.columns = [
        'Batch Size', 'Avg Total Time', 'Avg Time/Sample', 'Avg Tokens/Sec', 
        'Avg Initial Mem', 'Avg Peak Mem', 'Avg Memory Inc', 'Avg Final Mem', 'Success Rate'
    ]
    
    print("=== Performance Results Table (Averaged from 10 runs) ===")
    print(tabulate(formatted_df, headers='keys', tablefmt='grid', showindex=False))
    
    # Also create a summary with key metrics
    print("\n=== Key Insights ===")
    print(f"• Best throughput: {df.loc[df['tokens_per_second'].idxmax(), 'batch_size']} batch size with {df['tokens_per_second'].max():.2f} tokens/sec")
    print(f"• Lowest latency per sample: {df.loc[df['time_per_sample'].idxmin(), 'batch_size']} batch size with {df['time_per_sample'].min():.4f}s per sample")
    print(f"• Memory usage scales from {df['memory_increase'].min():.1f} MB to {df['memory_increase'].max():.1f} MB")
    
else:
    print("No successful results to display.")

=== Testing Different Batch Sizes (10 runs each for averaging) ===

Testing batch size: 1 (10 runs)
  Batch shape: torch.Size([1, 1102]) (batch_size=1, seq_len=1102)
  Run 1/10... ✓ 2.196s
  Run 2/10... ✓ 2.188s
  Run 3/10... ✓ 2.185s
  Run 4/10... ✓ 2.179s
  Run 5/10... ✓ 2.161s
  Run 6/10... ✓ 2.177s
  Run 7/10... ✓ 2.189s
  Run 8/10... ✓ 2.174s
  Run 9/10... ✓ 2.184s
  Run 10/10... ✓ 2.175s
  Completed 10/10 successful runs
✓ Batch size 1 completed successfully (avg from 10 runs)

Testing batch size: 2 (10 runs)
  Batch shape: torch.Size([2, 1103]) (batch_size=2, seq_len=1103)
  Run 1/10... ✓ 2.433s
  Run 2/10... ✓ 2.334s
  Run 3/10... ✓ 2.322s
  Run 4/10... ✓ 2.333s
  Run 5/10... ✓ 2.322s
  Run 6/10... ✓ 2.348s
  Run 7/10... ✓ 2.339s
  Run 8/10... ✓ 2.350s
  Run 9/10... ✓ 2.341s
  Run 10/10... ✓ 2.321s
  Completed 10/10 successful runs
✓ Batch size 2 completed successfully (avg from 10 runs)

Testing batch size: 4 (10 runs)
  Batch shape: torch.Size([4, 1103]) (batch_size=4, seq_le

In [7]:
# Let's check if we have results and display them if the previous cell output was truncated
try:
    if 'results' in locals() and results:
        df = pd.DataFrame(results)
        
        # Format the table
        formatted_df = df.copy()
        formatted_df['inference_time'] = formatted_df['inference_time'].apply(lambda x: f"{x:.4f}s")
        formatted_df['time_per_sample'] = formatted_df['time_per_sample'].apply(lambda x: f"{x:.4f}s")
        formatted_df['tokens_per_second'] = formatted_df['tokens_per_second'].apply(lambda x: f"{x:.2f}")
        formatted_df['initial_memory'] = formatted_df['initial_memory'].apply(lambda x: f"{x:.1f} MB")
        formatted_df['peak_memory'] = formatted_df['peak_memory'].apply(lambda x: f"{x:.1f} MB")
        formatted_df['memory_increase'] = formatted_df['memory_increase'].apply(lambda x: f"{x:.1f} MB")
        formatted_df['final_memory'] = formatted_df['final_memory'].apply(lambda x: f"{x:.1f} MB")
        
        # Rename columns for better display
        formatted_df.columns = [
            'Batch Size', 'Total Time', 'Time/Sample', 'Tokens/Sec', 
            'Initial Mem', 'Peak Mem', 'Memory Increase', 'Final Mem'
        ]
        
        print("=== Performance Results Table ===")
        print(tabulate(formatted_df, headers='keys', tablefmt='grid', showindex=False))
        
        # Also create a summary with key metrics
        print("\n=== Key Insights ===")
        print(f"• Best throughput: Batch size {df.loc[df['tokens_per_second'].idxmax(), 'batch_size']} with {df['tokens_per_second'].max():.2f} tokens/sec")
        print(f"• Lowest latency per sample: Batch size {df.loc[df['time_per_sample'].idxmin(), 'batch_size']} with {df['time_per_sample'].min():.4f}s per sample")
        print(f"• Memory usage scales from {df['memory_increase'].min():.1f} MB to {df['memory_increase'].max():.1f} MB")
        print(f"• Peak memory usage ranges from {df['peak_memory'].min():.1f} MB to {df['peak_memory'].max():.1f} MB")
        
        # Show efficiency metrics
        print(f"\n=== Efficiency Analysis ===")
        for _, row in df.iterrows():
            efficiency = row['batch_size'] / row['inference_time']
            print(f"• Batch {int(row['batch_size'])}: {efficiency:.2f} samples/second, {row['memory_increase']:.1f} MB memory increase")
        
    else:
        print("No results found. Please run the previous cell first.")
        
except Exception as e:
    print(f"Error displaying results: {e}")
    print("Results might not be available yet.")

Error displaying results: Length mismatch: Expected axis has 9 elements, new values have 8 elements
Results might not be available yet.


In [8]:
# Compact summary of results
if 'results' in locals() and results:
    print("=== COMPACT PERFORMANCE SUMMARY (Averaged from 10 runs each) ===")
    print("Batch | Avg Total Time | Avg Tokens/Sec | Avg Memory Inc | Avg Peak Memory | Success")
    print("------|----------------|----------------|----------------|-----------------|--------")
    
    for result in results:
        print(f"{result['batch_size']:5d} | {result['inference_time']:13.4f}s | {result['tokens_per_second']:13.2f} | {result['memory_increase']:13.1f}MB | {result['peak_memory']:14.1f}MB | {result['num_successful_runs']:2d}/10")
    
    # Best performance summary
    df = pd.DataFrame(results)
    best_throughput_idx = df['tokens_per_second'].idxmax()
    best_latency_idx = df['time_per_sample'].idxmin()
    
    print(f"\n🚀 BEST THROUGHPUT: Batch {int(df.loc[best_throughput_idx, 'batch_size'])} = {df.loc[best_throughput_idx, 'tokens_per_second']:.1f} tokens/sec")
    print(f"⚡ BEST LATENCY: Batch {int(df.loc[best_latency_idx, 'batch_size'])} = {df.loc[best_latency_idx, 'time_per_sample']:.4f}s per sample")
    print(f"💾 MEMORY RANGE: {df['memory_increase'].min():.1f}MB - {df['memory_increase'].max():.1f}MB increase")
else:
    print("No results to display.")

=== COMPACT PERFORMANCE SUMMARY (Averaged from 10 runs each) ===
Batch | Avg Total Time | Avg Tokens/Sec | Avg Memory Inc | Avg Peak Memory | Success
------|----------------|----------------|----------------|-----------------|--------
    1 |        2.1808s |         29.35 |         253.7MB |         4594.8MB | 10/10
    2 |        2.3441s |         54.61 |         504.7MB |         4856.9MB | 10/10
    4 |        2.5139s |        101.84 |        1011.1MB |         5385.2MB | 10/10
    8 |        4.0339s |        126.93 |        2044.8MB |         6463.1MB | 10/10
   16 |        5.2270s |        195.91 |        4153.1MB |         8659.1MB | 10/10
   32 |        8.5899s |        238.42 |        8187.4MB |        12870.3MB | 10/10
   64 |       16.2781s |        251.63 |       16486.1MB |        21519.8MB | 10/10

🚀 BEST THROUGHPUT: Batch 64 = 251.6 tokens/sec
⚡ BEST LATENCY: Batch 64 = 0.2543s per sample
💾 MEMORY RANGE: 253.7MB - 16486.1MB increase


In [None]:
# h100

=== COMPACT PERFORMANCE SUMMARY (Averaged from 10 runs each) ===
Batch | Avg Total Time | Avg Tokens/Sec | Avg Memory Inc | Avg Peak Memory | Success
------|----------------|----------------|----------------|-----------------|--------
    1 |        1.0849s |         58.99 |         255.0MB |         4619.5MB | 10/10
    2 |        1.2191s |        104.99 |         517.6MB |         4893.2MB | 10/10
    4 |        1.3328s |        192.08 |        1003.0MB |         5400.5MB | 10/10
    8 |        2.3268s |        220.05 |        1981.8MB |         6423.5MB | 10/10
   16 |        3.2665s |        313.48 |        4078.6MB |         8608.1MB | 10/10
   32 |        5.7176s |        358.20 |        8109.9MB |        12816.2MB | 10/10

🚀 BEST THROUGHPUT: Batch 32 = 358.2 tokens/sec
⚡ BEST LATENCY: Batch 32 = 0.1787s per sample
💾 MEMORY RANGE: 255.0MB - 8109.9MB increase


In [None]:
# a100

=== COMPACT PERFORMANCE SUMMARY (Averaged from 10 runs each) ===
Batch | Avg Total Time | Avg Tokens/Sec | Avg Memory Inc | Avg Peak Memory | Success
------|----------------|----------------|----------------|-----------------|--------
    1 |        2.1808s |         29.35 |         253.7MB |         4594.8MB | 10/10
    2 |        2.3441s |         54.61 |         504.7MB |         4856.9MB | 10/10
    4 |        2.5139s |        101.84 |        1011.1MB |         5385.2MB | 10/10
    8 |        4.0339s |        126.93 |        2044.8MB |         6463.1MB | 10/10
   16 |        5.2270s |        195.91 |        4153.1MB |         8659.1MB | 10/10
   32 |        8.5899s |        238.42 |        8187.4MB |        12870.3MB | 10/10
   64 |       16.2781s |        251.63 |       16486.1MB |        21519.8MB | 10/10

🚀 BEST THROUGHPUT: Batch 64 = 251.6 tokens/sec
⚡ BEST LATENCY: Batch 64 = 0.2543s per sample
💾 MEMORY RANGE: 253.7MB - 16486.1MB increase
