In [6]:
# Dynamic Batching Benchmarking

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import asyncio
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoTokenizer, AutoModel
import batch
from batch import inference
import torch

# Load a pretrained model (e.g., BERT)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Helper function to generate random text
def generate_random_text(num_words=10):
    words = np.random.choice(['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog'], size=num_words)
    return ' '.join(words)

# Patch the model's forward method for dynamic batching
def forward_wrapper(*args, **kwargs):
    print("Forward called with args:", args, "and kwargs:", kwargs)
    with torch.no_grad():
        return model.forward(*args, **kwargs)

model.forward = inference.dynamically(forward_wrapper)

# Simple batching function
def simple_batch_inference(texts, batch_size=32):
    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        outputs = forward_wrapper(**inputs)
        results.extend(outputs['last_hidden_state'][:, 0, :].numpy())
    return results

# Dynamic batching function
def dynamic_batch_inference(texts):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    return outputs["last_hidden_state"][:, 0, :].numpy()

# Benchmark: Simple vs. Dynamic Batching (without threads or asyncio)
def benchmark_simple_vs_dynamic(num_requests=1000):
    texts = [generate_random_text() for _ in range(num_requests)]
    
    start_time = time.time()
    simple_results = simple_batch_inference(texts)
    simple_time = time.time() - start_time
    
    start_time = time.time()
    dynamic_results = dynamic_batch_inference(texts)
    dynamic_time = time.time() - start_time
    
    return simple_time, dynamic_time

# Benchmark: Simple vs. Dynamic Batching with Threads
def benchmark_threaded(num_requests=1000, num_threads=4):
    texts = [generate_random_text() for _ in range(num_requests)]
    
    def process_simple(text):
        return simple_batch_inference([text])[0]
    
    def process_dynamic(text):
        return dynamic_batch_inference([text])[0]
    
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        simple_results = list(executor.map(process_simple, texts))
    simple_time = time.time() - start_time
    
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        dynamic_results = list(executor.map(process_dynamic, texts))
    dynamic_time = time.time() - start_time
    
    return simple_time, dynamic_time

# Benchmark: Simple vs. Dynamic Batching with Asyncio
async def benchmark_asyncio(num_requests=1000):
    texts = [generate_random_text() for _ in range(num_requests)]
    
    async def process_simple(text):
        return simple_batch_inference([text])[0]
    
    async def process_dynamic(text):
        return dynamic_batch_inference([text])[0]
    
    start_time = time.time()
    simple_results = await asyncio.gather(*[process_simple(text) for text in texts])
    simple_time = time.time() - start_time
    
    start_time = time.time()
    dynamic_results = await asyncio.gather(*[process_dynamic(text) for text in texts])
    dynamic_time = time.time() - start_time
    
    return simple_time, dynamic_time

# Run benchmarks
print("Running benchmarks...")

simple_time, dynamic_time = benchmark_simple_vs_dynamic()
print(f"Simple vs. Dynamic (no threads/asyncio): {simple_time:.2f}s vs {dynamic_time:.2f}s")

simple_time_threaded, dynamic_time_threaded = benchmark_threaded()
print(f"Simple vs. Dynamic (threaded): {simple_time_threaded:.2f}s vs {dynamic_time_threaded:.2f}s")

simple_time_asyncio, dynamic_time_asyncio = asyncio.run(benchmark_asyncio())
print(f"Simple vs. Dynamic (asyncio): {simple_time_asyncio:.2f}s vs {dynamic_time_asyncio:.2f}s")

# Prepare data for visualization
data = {
    'Method': ['Simple', 'Dynamic'] * 3,
    'Scenario': ['No Threads/Asyncio'] * 2 + ['Threaded'] * 2 + ['Asyncio'] * 2,
    'Time (s)': [simple_time, dynamic_time,
                 simple_time_threaded, dynamic_time_threaded,
                 simple_time_asyncio, dynamic_time_asyncio]
}

df = pd.DataFrame(data)

# Create bar plot
plt.figure(figsize=(12, 6))
sns.barplot(x='Scenario', y='Time (s)', hue='Method', data=df)
plt.title('Benchmarking Results: Simple vs. Dynamic Batching')
plt.ylabel('Time (seconds)')
plt.legend(title='Batching Method')
plt.show()

# Calculate speedup
speedup_no_threads = simple_time / dynamic_time
speedup_threaded = simple_time_threaded / dynamic_time_threaded
speedup_asyncio = simple_time_asyncio / dynamic_time_asyncio

print(f"\nSpeedup (Dynamic vs. Simple):")
print(f"No Threads/Asyncio: {speedup_no_threads:.2f}x")
print(f"Threaded: {speedup_threaded:.2f}x")
print(f"Asyncio: {speedup_asyncio:.2f}x")

# Create speedup bar plot
speedup_data = {
    'Scenario': ['No Threads/Asyncio', 'Threaded', 'Asyncio'],
    'Speedup': [speedup_no_threads, speedup_threaded, speedup_asyncio]
}

df_speedup = pd.DataFrame(speedup_data)

plt.figure(figsize=(10, 5))
sns.barplot(x='Scenario', y='Speedup', data=df_speedup)
plt.title('Speedup of Dynamic Batching vs. Simple Batching')
plt.ylabel('Speedup Factor')
plt.show()

# Restore original forward method
model.forward = original_forward



False
Running benchmarks...
Forward called with args: () and kwargs: {'input_ids': tensor([[  101,  1996, 14523,  2058, 13971,  3899, 14523,  2829,  3899,  4419,
          4419,   102],
        [  101,  3899,  2829,  2829,  1996,  3899,  3899,  1996,  2058,  4419,
          2829,   102],
        [  101,  2829,  2829, 13971,  1996, 13971,  4248,  4419,  3899,  4248,
          2829,   102],
        [  101,  2829, 14523, 13971,  2829,  4248,  3899,  3899,  1996,  2829,
         14523,   102],
        [  101,  1996,  3899,  4248, 14523, 14523,  2058,  4248,  1996,  2829,
          4248,   102],
        [  101,  3899,  4419,  4248, 13971,  4248,  4419,  4419,  1996, 14523,
         13971,   102],
        [  101,  2058,  2829, 14523,  2058,  2058,  4248, 13971,  3899,  4248,
          4419,   102],
        [  101,  1996,  4419, 14523,  2058,  1996, 14523,  1996,  2829,  3899,
          2829,   102],
        [  101, 14523,  2829,  2058,  4419,  2829,  1996,  4248, 13971,  1996,
          3899

In [5]:
%pip install torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
