In [1]:
# Benchmarking SentenceTransformers and AutoModel with and without batching

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from timeit import timeit
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import batch
from batch import inference
import torch

# Load models
# Checkpoint id used for both the Hugging Face tokenizer and model loaded below.
auto_model_name = "bert-base-uncased"

# Sentence-embedding model; the benchmarks below call its `encode` method.
st_model = SentenceTransformer('all-MiniLM-L6-v2')

# Tokenizer and model are created from the same checkpoint id.
auto_tokenizer = AutoTokenizer.from_pretrained(auto_model_name)
auto_model = AutoModel.from_pretrained(auto_model_name)

# Helper function to generate random sentences
def generate_random_sentences(num_sentences, words_per_sentence=10, seed=None):
    """Build synthetic benchmark sentences from a small fixed vocabulary.

    Args:
        num_sentences: How many sentences to produce.
        words_per_sentence: Number of words drawn (with replacement) for
            each sentence.
        seed: Optional seed. When given, words are drawn from a dedicated
            ``np.random.default_rng(seed)`` so repeated calls return the
            same sentences. When ``None`` (the default) the global
            ``np.random`` state is used, exactly as before.

    Returns:
        A list of ``num_sentences`` strings, each made of
        ``words_per_sentence`` space-separated vocabulary words.
    """
    vocab = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog']
    # One generator per call (not per sentence): a seed then fixes the whole
    # list instead of making every sentence identical.
    choose = np.random.choice if seed is None else np.random.default_rng(seed).choice
    return [' '.join(choose(vocab, size=words_per_sentence)) for _ in range(num_sentences)]

# Benchmark functions
def benchmark_st_no_batch(sentences):
    """Encode every sentence with its own `st_model.encode` call.

    Returns a list holding one `encode` result per input sentence, in input
    order.
    """
    embeddings = []
    for text in sentences:
        # One model call per sentence: this is the unbatched baseline.
        embeddings.append(st_model.encode(text))
    return embeddings

@batch.dynamically
def st_with_general_batch(sents: list[str]):
    """Encode a list of sentences with a single `st_model.encode` call.

    NOTE(review): presumably `batch.dynamically` collects incoming calls and
    hands them to this function as one list -- confirm against the `batch`
    package, which is not visible in this notebook.
    """
    embeddings = st_model.encode(sents)
    return embeddings

@inference.dynamically
def auto_with_inference_batch(feats):
    """Run `auto_model` on `feats` without gradient tracking.

    `feats` is unpacked into keyword arguments for the model call; the
    model output's `last_hidden_state` is returned.

    NOTE(review): presumably `inference.dynamically` assembles `feats` from
    several callers before invoking this function -- confirm against the
    `batch` package.
    """
    with torch.no_grad():
        return auto_model(**feats).last_hidden_state



def benchmark_auto_with_inference_batch(sentences):
    """Tokenize all sentences together, run one forward pass, return first-token vectors.

    The sentences are padded/truncated into a single tensor batch, passed
    through `auto_model` with gradients disabled, and position 0 of
    `last_hidden_state` is returned for every sentence as a NumPy array.

    NOTE(review): despite its name this function calls `auto_model` directly
    and never goes through `auto_with_inference_batch`, so it measures one
    static batch rather than the `inference.dynamically` wrapper -- confirm
    this is the intended measurement.
    """
    encoded = auto_tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden_states = auto_model(**encoded).last_hidden_state
    # Keep only sequence position 0 for each sentence.
    first_token = hidden_states[:, 0, :]
    return first_token.numpy()

# Run benchmarks
NUM_RUNS = 5  # timed repetitions per method; the reported time is their mean
SEED = 42     # fixes the global NumPy state so every run times the same sentences

np.random.seed(SEED)
num_sentences = 100
sentences = generate_random_sentences(num_sentences)


def benchmark_auto_no_batch(sentences):
    """Tokenize and run `auto_model` once per sentence (batch size 1).

    This is the unbatched counterpart of `benchmark_auto_with_inference_batch`:
    same tokenizer, same model, same first-token pooling, but one forward
    pass per sentence. Returns a list with one NumPy vector per sentence.
    """
    embeddings = []
    with torch.no_grad():
        for sentence in sentences:
            inputs = auto_tokenizer(sentence, return_tensors="pt", truncation=True)
            outputs = auto_model(**inputs)
            # The batch holds a single sentence; keep its position-0 vector.
            embeddings.append(outputs.last_hidden_state[0, 0, :].numpy())
    return embeddings


# Each entry pairs a display label with a zero-argument callable for `timeit`.
benchmarks = [
    ("SentenceTransformers (No Batch)", lambda: benchmark_st_no_batch(sentences)),
    # NOTE(review): assumes the `batch.dynamically`-wrapped function can be
    # called directly with the full list -- confirm against the `batch` package.
    ("SentenceTransformers (General Batch)", lambda: st_with_general_batch(sentences)),
    ("AutoModel (No Batch)", lambda: benchmark_auto_no_batch(sentences)),
    ("AutoModel (Inference Batch)", lambda: benchmark_auto_with_inference_batch(sentences)),
]

results = []
for name, func in benchmarks:
    time_taken = timeit(func, number=NUM_RUNS) / NUM_RUNS  # mean seconds per run
    results.append((name, time_taken))

# Prepare data for visualization
df = pd.DataFrame(results, columns=['Method', 'Time (s)'])

# Create bar plot
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Method', y='Time (s)', data=df, ax=ax)
ax.set_title('Benchmarking Results: SentenceTransformers vs AutoModel (With and Without Batching)')
ax.set_ylabel('Time (seconds)')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Calculate and display speedups.
# Look timings up by label so reordering `benchmarks` cannot silently pair
# the wrong rows.
timings = dict(results)
st_speedup = (
    timings["SentenceTransformers (No Batch)"]
    / timings["SentenceTransformers (General Batch)"]
)
auto_speedup = timings["AutoModel (No Batch)"] / timings["AutoModel (Inference Batch)"]

print(f"SentenceTransformers Speedup (General Batch vs No Batch): {st_speedup:.2f}x")
print(f"AutoModel Speedup (Inference Batch vs No Batch): {auto_speedup:.2f}x")

# Create speedup bar plot
speedup_data = {
    'Model': ['SentenceTransformers', 'AutoModel'],
    'Speedup': [st_speedup, auto_speedup]
}

df_speedup = pd.DataFrame(speedup_data)

fig_speedup, ax_speedup = plt.subplots(figsize=(10, 5))
sns.barplot(x='Model', y='Speedup', data=df_speedup, ax=ax_speedup)
ax_speedup.set_title('Speedup of Batched vs Non-Batched Processing')
ax_speedup.set_ylabel('Speedup Factor')
plt.show()

# `auto_model.forward` is never replaced in this cell, so there is nothing to
# restore afterwards.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:  12%|#1        | 52.4M/440M [00:00<?, ?B/s]

Running benchmarks...
Forward called with args: () and kwargs: {'input_ids': tensor([[  101, 13971,  3899,  1996,  2829,  2058, 13971, 13971,  4248, 13971,
          3899,   102],
        [  101,  2058,  2829,  3899, 13971,  2829,  4419, 13971,  4419,  2058,
          4248,   102],
        [  101,  4419,  4419,  2058,  1996, 13971,  2829,  2829,  2058, 13971,
         13971,   102],
        [  101, 14523,  2058,  2058,  1996, 13971, 13971, 13971,  2058,  4419,
          2058,   102],
        [  101,  1996,  4248, 13971,  2829, 14523,  2829,  4419,  1996, 14523,
          1996,   102],
        [  101, 14523,  4248,  2058,  4248, 13971,  4419,  1996,  1996,  2829,
         13971,   102],
        [  101,  4248, 14523,  2058,  1996, 14523,  3899,  4248,  3899,  2829,
          4248,   102],
        [  101,  4248,  1996,  4248, 13971, 14523,  2058,  4248, 13971,  2058,
          2058,   102],
        [  101,  2829,  1996,  2829,  2058,  2058,  2058, 13971,  1996,  4248,
          2829,   10

KeyboardInterrupt: 

In [5]:
# NOTE(review): the benchmark cell already runs `import torch`, so this install
# only helps when executed before that cell (restart the kernel afterwards if
# the package was newly installed). TODO: pin a torch version for reproducibility.
%pip install torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
