In [1]:
from batch.asyncio_.batch_processor import dynamically as async_dynamically
from batch.thread_.batch_processor import dynamically as thread_dynamically

from batch.inference.asyncio_ import dynamically as async_inference_dynamically
from batch.inference.thread_ import dynamically as thread_inference_dynamically

from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the baseline embedding model. The next cell binds `model.encode` into
# the `encode` partial; a later cell rebinds the name `model` to a different
# encoder, so from then on only `encode` (and wrappers built on it) still
# refer to this instance.
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

In [3]:
from functools import partial

# Pre-bind every encode() option so that all benchmarks built on `encode`
# call the model with exactly the same settings; only the list of texts
# varies per call.
encode = partial(
    model.encode,
    prompt_name=None,
    prompt=None,
    batch_size=32,
    show_progress_bar=False,  # keep per-call progress bars out of the benchmark output
    output_value="sentence_embedding",
    precision="float32",
    convert_to_tensor=False,
    convert_to_numpy=True,
    # None = run on the device the model was loaded onto, instead of
    # hard-coding "mps", which only exists on Apple Silicon and makes the
    # notebook fail on any other machine.
    device=None,
    normalize_embeddings=True,
)

In [4]:
from concurrent.futures import ThreadPoolExecutor
import timeit

def benchmark():
    """Issue 1000 single-sentence `encode` calls through a 128-thread pool.

    Returns:
        The per-call results, in submission order.
    """
    with ThreadPoolExecutor(max_workers=128) as pool:
        pending = []
        for _ in range(1000):
            pending.append(pool.submit(encode, ["Hello, world!"]))
        # Collect in submission order; leaving the `with` block waits for the pool.
        return [task.result() for task in pending]

# Time several complete runs of the baseline
number_of_runs = 3
execution_time = timeit.Timer(benchmark).timeit(number=number_of_runs)

# Report the mean wall-clock time of one run
average_time = execution_time / number_of_runs
print(f"Average execution time: {average_time:.4f} seconds")

Average execution time: 29.0773 seconds


In [5]:
# Wrap `encode` with the asyncio-flavoured `dynamically` batch processor.
# NOTE(review): presumably this merges concurrent calls into shared batches —
# confirm against the `batch` package.
async_encode = async_dynamically(encode)

def benchmark_async():
    """Send 1000 single-sentence requests to `async_encode` from 128 threads.

    Returns:
        The per-call results, in submission order.
    """
    with ThreadPoolExecutor(max_workers=128) as pool:
        handles = list(
            pool.submit(async_encode, ["Hello, world!"]) for _ in range(1000)
        )
        outputs = []
        for handle in handles:
            outputs.append(handle.result())
    return outputs

# Time several complete runs
number_of_runs = 3
execution_time = timeit.Timer(benchmark_async).timeit(number=number_of_runs)

# Report the mean wall-clock time of one run
average_time = execution_time / number_of_runs
print(f"Average execution time for async_encode: {average_time:.4f} seconds")

Average execution time for async_encode: 1.8133 seconds


In [6]:
# Wrap `encode` with the thread-flavoured `dynamically` batch processor.
# NOTE(review): presumably this merges concurrent calls into shared batches —
# confirm against the `batch` package.
thread_encode = thread_dynamically(encode)

def benchmark_thread():
    """Send 1000 single-sentence requests to `thread_encode` from 128 threads.

    Returns:
        The per-call results, in submission order.
    """
    with ThreadPoolExecutor(max_workers=128) as pool:
        jobs = [pool.submit(thread_encode, ["Hello, world!"]) for _ in range(1000)]
        return [job.result() for job in jobs]

# Time several complete runs
number_of_runs = 3
execution_time = timeit.Timer(benchmark_thread).timeit(number=number_of_runs)

# Report the mean wall-clock time of one run
average_time = execution_time / number_of_runs
print(f"Average execution time for thread_encode: {average_time:.4f} seconds")

Average execution time for thread_encode: 1.7592 seconds


In [7]:
from ofen.models import TextEncoder
# Second encoder under test, loaded from the same checkpoint name.
# This rebinds `model`; the cells further down that patch `model.forward`
# operate on this object.
model = TextEncoder("mixedbread-ai/mxbai-embed-large-v1")

def ofen_encode(texts):
    """Embed `texts` with the ofen encoder and return only the embeddings.

    Args:
        texts: Input passed straight through to `model.encode`.

    Returns:
        The `.embeddings` attribute of the encode result.
    """
    encode_options = dict(
        batch_size=32,
        show_progress=False,
        normalize=True,
        dimensions=None,
        encoding_format="float",
    )
    return model.encode(texts, **encode_options).embeddings




In [8]:
import timeit

def benchmark_ofen():
    """Issue 1000 single-sentence `ofen_encode` calls through a 128-thread pool.

    Named distinctly from the baseline `benchmark` so that this definition
    does not silently replace it in the notebook namespace.

    Returns:
        The per-call results, in submission order.
    """
    with ThreadPoolExecutor(max_workers=128) as executor:
        futures = [executor.submit(ofen_encode, ["Hello, world!"]) for _ in range(1000)]
        results = [future.result() for future in futures]
    return results

# Time several complete runs and report the mean wall-clock time of one run
number_of_runs = 3
execution_time = timeit.timeit(benchmark_ofen, number=number_of_runs)
average_time = execution_time / number_of_runs

print(f"Execution time: {average_time:.4f} seconds")

Execution time: 24.2413 seconds


In [9]:
import timeit

# Wrap `ofen_encode` with the asyncio-flavoured `dynamically` batch processor.
# NOTE(review): presumably this merges concurrent calls into shared batches —
# confirm against the `batch` package.
async_ofen_encode = async_dynamically(ofen_encode)

def benchmark_async_ofen():
    """Send 1000 single-sentence requests to `async_ofen_encode` from 128 threads.

    Returns:
        The per-call results, in submission order.
    """
    with ThreadPoolExecutor(max_workers=128) as pool:
        submitted = []
        for _ in range(1000):
            submitted.append(pool.submit(async_ofen_encode, ["Hello, world!"]))
        return [item.result() for item in submitted]

# Time several complete runs and report the mean wall-clock time of one run
number_of_runs = 3
execution_time = timeit.Timer(benchmark_async_ofen).timeit(number=number_of_runs)
average_time = execution_time / number_of_runs

print(f"Async OFen Execution time: {average_time:.4f} seconds")

Async OFen Execution time: 1.7326 seconds


In [10]:
import timeit

# Wrap `ofen_encode` with the thread-flavoured `dynamically` batch processor.
# NOTE(review): presumably this merges concurrent calls into shared batches —
# confirm against the `batch` package.
thread_ofen_encode = thread_dynamically(ofen_encode)

def benchmark_thread_ofen():
    """Send 1000 single-sentence requests to `thread_ofen_encode` from 128 threads.

    Returns:
        The per-call results, in submission order.
    """
    with ThreadPoolExecutor(max_workers=128) as pool:
        tickets = list(
            pool.submit(thread_ofen_encode, ["Hello, world!"]) for _ in range(1000)
        )
        collected = []
        for ticket in tickets:
            collected.append(ticket.result())
    return collected

# Time several complete runs and report the mean wall-clock time of one run
number_of_runs = 3
execution_time = timeit.Timer(benchmark_thread_ofen).timeit(number=number_of_runs)
average_time = execution_time / number_of_runs
print(f"Thread OFen Execution time: {average_time:.4f} seconds")

Thread OFen Execution time: 1.7014 seconds


In [11]:
import timeit
from concurrent.futures import ThreadPoolExecutor

# Capture the encoder's own forward exactly once per model instance.
# Later cells replace `model.forward` with a wrapper that ends up calling
# `the_forward`; re-running this cell after that would otherwise capture the
# wrapper itself and make `the_forward` call back into it. A freshly created
# model is a different object, so it is captured again.
if globals().get("_forward_owner") is not model:
    forward = model.forward
    _forward_owner = model


def the_forward(features: dict):
    """Run the captured (unpatched) forward pass and return its embeddings.

    Args:
        features: Mapping that is unpacked as keyword arguments into the
            model's forward pass.
            NOTE(review): presumably tokenizer outputs — confirm against
            ofen's TextEncoder.

    Returns:
        The value stored under the "embeddings" key of the forward output.
    """
    return forward(**features)["embeddings"]

In [12]:
# Route the forward pass through the asyncio-flavoured inference wrapper.
# NOTE(review): presumably this batches concurrent forward calls — confirm
# against the `batch` package.
batched_forward = async_inference_dynamically(the_forward)
# `the_forward` strips the output down to the embeddings, so re-wrap them
# under the "embeddings" key that the original forward output carried.
model.forward = lambda **kwargs: {"embeddings": batched_forward(**kwargs)}

def benchmark_async_inference():
    """Issue 1000 single-sentence `model.encode` calls through a 128-thread pool.

    Named distinctly from the earlier `benchmark` functions so that this
    definition does not silently replace them in the notebook namespace.

    Returns:
        The per-call results, in submission order.
    """
    with ThreadPoolExecutor(max_workers=128) as executor:
        futures = [executor.submit(model.encode, ["Hello, world!"]) for _ in range(1000)]
        results = [future.result() for future in futures]
    return results

# Time several complete runs and report the mean wall-clock time of one run
number_of_runs = 3
execution_time = timeit.timeit(benchmark_async_inference, number=number_of_runs)
average_time = execution_time / number_of_runs
print(f"Async Inference Execution time: {average_time:.4f} seconds")

Async Inference Execution time: 1.8018 seconds


In [13]:
# Route the forward pass through the thread-flavoured inference wrapper.
# NOTE(review): presumably this batches concurrent forward calls — confirm
# against the `batch` package.
batched_forward = thread_inference_dynamically(the_forward)
# `the_forward` returns bare embeddings; put them back under the
# "embeddings" key that the original forward output carried.
model.forward = lambda **features: {"embeddings": batched_forward(**features)}

def benchmark_thread_inference():
    """Issue 1000 single-sentence `model.encode` calls through a 128-thread pool.

    Returns:
        The per-call results, in submission order.
    """
    with ThreadPoolExecutor(max_workers=128) as pool:
        queued = []
        for _ in range(1000):
            queued.append(pool.submit(model.encode, ["Hello, world!"]))
        return [entry.result() for entry in queued]

# Time several complete runs and report the mean wall-clock time of one run
number_of_runs = 3
execution_time = timeit.Timer(benchmark_thread_inference).timeit(number=number_of_runs)
average_time = execution_time / number_of_runs
print(f"Thread Inference Execution time: {average_time:.4f} seconds")

Thread Inference Execution time: 1.7984 seconds
