# vectorwrap Performance Benchmark

This notebook benchmarks vectorwrap across all supported backends with 10,000 vectors.

## Setup

**Requirements:**
- `pip install "vectorwrap[sqlite,duckdb]" numpy matplotlib`
- PostgreSQL with pgvector extension (optional)
- MySQL 8.2+ (optional)

**Test Dataset:**
- 10,000 random 384-dimensional vectors (typical sentence-transformer size)
- 1,000 query operations
- Measures: insert throughput (vectors/second) and query throughput (QPS); memory usage is not measured by this notebook

In [None]:
import os
import platform
import tempfile
import time
from typing import List, Tuple, Dict, Any

import matplotlib.pyplot as plt
import numpy as np

from vectorwrap import VectorDB

# Benchmark parameters -- edit these to change the size of the workload.
NUM_VECTORS = 10_000  # vectors inserted into each backend
NUM_QUERIES = 1_000   # similarity searches issued against each backend
TOP_K = 5             # neighbours requested by every query
VECTOR_DIM = 384      # typical sentence-transformer embedding width

# Echo the workload so it is recorded alongside the notebook's results.
print(f"Benchmarking {NUM_VECTORS:,} vectors, {NUM_QUERIES:,} queries")
print(f"Vector dimension: {VECTOR_DIM}")

## Generate Test Data

In [None]:
# Generate random normalized vectors (realistic embeddings)
# Seed NumPy's global random state first so every run draws the same vectors.
np.random.seed(42)

def generate_vectors(n: int, dim: int) -> List[List[float]]:
    """Return ``n`` random unit-length vectors of size ``dim`` as nested lists.

    Values are drawn from NumPy's global random state, so the output is
    reproducible only when that state has been seeded beforehand.

    Args:
        n: Number of vectors to generate.
        dim: Number of components per vector.

    Returns:
        A list of ``n`` lists, each holding ``dim`` Python floats.
    """
    gaussian = np.random.randn(n, dim).astype(np.float32)
    # Divide every row by its own L2 norm so each vector ends up with length 1.
    row_lengths = np.linalg.norm(gaussian, axis=1, keepdims=True)
    return (gaussian / row_lengths).tolist()

# Build both datasets up front. The insert set is drawn before the query set,
# so the two consume the random stream in a fixed order.
print("Generating test vectors...")
insert_vectors, query_vectors = (
    generate_vectors(NUM_VECTORS, VECTOR_DIM),
    generate_vectors(NUM_QUERIES, VECTOR_DIM),
)

# Sanity output: dataset sizes, plus one norm that should print as 1.000.
print(f"Generated {len(insert_vectors):,} insert vectors")
print(f"Generated {len(query_vectors):,} query vectors")
print(f"Sample vector norm: {np.linalg.norm(insert_vectors[0]):.3f}")

## Benchmark Functions

In [None]:
def benchmark_backend(db_url: str, name: str) -> Dict[str, Any]:
    """Time collection creation, inserts and queries against one backend.

    Intervals are measured with ``time.perf_counter()``, a monotonic
    high-resolution clock, so they are not distorted by system clock
    adjustments during a run.

    Args:
        db_url: Connection URL handed to ``VectorDB``.
        name: Human-readable backend label used in the progress output and in
            the returned record.

    Returns:
        On success, a dict with ``name``, ``success=True``, ``create_time``,
        ``insert_time``, ``insert_throughput``, ``query_time``, ``query_qps``,
        ``total_results`` and ``avg_results_per_query``. If any step raises,
        a dict with ``name``, ``success=False`` and ``error`` (the exception
        text) is returned instead, so one unavailable backend does not stop
        the remaining runs.
    """
    print(f"\nTesting {name}...")

    try:
        # Initialize database
        db = VectorDB(db_url)
        collection_name = "bench_test"

        # Create collection
        start_time = time.perf_counter()
        db.create_collection(collection_name, VECTOR_DIM)
        create_time = time.perf_counter() - start_time

        # Benchmark inserts: one upsert call per vector, using the list index
        # as the record id. The progress prints fall inside the timed region.
        print(f"  Inserting {NUM_VECTORS:,} vectors...")
        start_time = time.perf_counter()

        for i, vector in enumerate(insert_vectors):
            db.upsert(collection_name, i, vector)
            if (i + 1) % 1000 == 0:
                print(f"    Inserted {i+1:,} vectors...")

        insert_time = time.perf_counter() - start_time
        insert_throughput = NUM_VECTORS / insert_time

        # Benchmark queries: count the returned hits so the summary can show
        # how many results each query produced on average.
        print(f"  Running {NUM_QUERIES:,} queries...")
        start_time = time.perf_counter()

        total_results = 0
        for i, query_vector in enumerate(query_vectors):
            hits = db.query(collection_name, query_vector, top_k=TOP_K)
            total_results += len(hits)
            if (i + 1) % 100 == 0:
                print(f"    Completed {i+1:,} queries...")

        query_time = time.perf_counter() - start_time
        query_qps = NUM_QUERIES / query_time

        summary = {
            'name': name,
            'success': True,
            'create_time': create_time,
            'insert_time': insert_time,
            'insert_throughput': insert_throughput,
            'query_time': query_time,
            'query_qps': query_qps,
            'total_results': total_results,
            'avg_results_per_query': total_results / NUM_QUERIES
        }

        print(f"  {name} completed successfully")
        print(f"     Insert: {insert_throughput:.1f} vectors/sec")
        print(f"     Query:  {query_qps:.1f} QPS")

        return summary

    except Exception as e:
        # Best effort by design: report the failure and let the caller move on
        # to the next backend.
        print(f"  {name} failed: {e}")
        return {
            'name': name,
            'success': False,
            'error': str(e)
        }

## Run Benchmarks

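We test every configured backend. The PostgreSQL and MySQL runs fail if no server is reachable at the configured URL — that is expected, and the failure is recorded in the results instead of stopping the notebook.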

In [None]:
# Test configuration: the in-memory backends need no setup.
backends_to_test = [
    ("sqlite:///:memory:", "SQLite (in-memory)"),
    ("duckdb:///:memory:", "DuckDB (in-memory)"),
]

# Add file-based backends.
# The scratch directory must still exist when the benchmark cell opens these
# files, so the TemporaryDirectory object is kept in a variable instead of
# being used as a `with` block (which would delete the directory as soon as
# this cell finished). Call `bench_tmpdir.cleanup()` once benchmarking is done.
bench_tmpdir = tempfile.TemporaryDirectory(prefix="vectorwrap_bench_")
tmpdir = bench_tmpdir.name
sqlite_file = os.path.join(tmpdir, "bench.db")
duckdb_file = os.path.join(tmpdir, "bench.duckdb")

backends_to_test.extend([
    (f"sqlite:///{sqlite_file}", "SQLite (file)"),
    (f"duckdb:///{duckdb_file}", "DuckDB (file)"),
])

# Add network databases if available.
# The connection URLs can be overridden through environment variables so real
# credentials never have to be written into the notebook; the defaults match
# a local development setup.
network_backends = [
    (
        os.environ.get(
            "VECTORWRAP_BENCH_POSTGRES_URL",
            "postgresql://postgres:secret@localhost/postgres",
        ),
        "PostgreSQL + pgvector",
    ),
    (
        os.environ.get(
            "VECTORWRAP_BENCH_MYSQL_URL",
            "mysql://root:secret@localhost:3306/vectordb",
        ),
        "MySQL (JSON fallback)",
    ),
]

print("Available backends to test:")
for url, name in backends_to_test + network_backends:
    print(f"  - {name}")

print("\nNetwork databases will be skipped if not available")

In [None]:
# Run benchmarks
results = []

# A single pass covers every backend: local ones first, then the network
# databases (which may fail when no server is running). Appending inside the
# loop keeps partial results available if the run is interrupted.
for db_url, name in backends_to_test + network_backends:
    result = benchmark_backend(db_url, name)
    results.append(result)

print("\nAll benchmarks completed!")

## Results Analysis

In [None]:
# Split the collected records into completed and failed runs.
successful_results = [run for run in results if run.get('success', False)]
failed_results = [run for run in results if not run.get('success', False)]

# Summary table: one fixed-width row per backend that completed.
print(f"Performance Results ({len(successful_results)} backends tested)\n")
print(f"{'Backend':<25} {'Insert (vec/s)':<15} {'Query (QPS)':<12} {'Avg Results':<12}")
print("-" * 70)

for result in successful_results:
    print(f"{result['name']:<25} "
          f"{result['insert_throughput']:<15.1f} "
          f"{result['query_qps']:<12.1f} "
          f"{result['avg_results_per_query']:<12.1f}")

# List the backends that could not be benchmarked, with the recorded error.
if failed_results:
    print(f"\nFailed backends ({len(failed_results)} failed):")
    for result in failed_results:
        print(f"  - {result['name']}: {result.get('error', 'Unknown error')}")

## Performance Visualization

In [None]:
if not successful_results:
    print("No successful benchmarks to visualize")
else:
    # Side-by-side bar charts: insert throughput on the left, query QPS on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    names = [r['name'] for r in successful_results]
    insert_throughput = [r['insert_throughput'] for r in successful_results]
    query_qps = [r['query_qps'] for r in successful_results]

    # Both charts share the same layout, so describe each one as data and
    # draw them in a single loop: (axes, values, bar colour, title, y-label).
    chart_specs = [
        (ax1, insert_throughput, 'skyblue',
         'Insert Throughput (vectors/second)', 'Vectors per Second'),
        (ax2, query_qps, 'lightcoral',
         'Query Performance (QPS)', 'Queries per Second'),
    ]

    for axis, values, colour, title, y_label in chart_specs:
        bars = axis.bar(names, values, color=colour, alpha=0.7)
        axis.set_title(title, fontsize=14, fontweight='bold')
        axis.set_ylabel(y_label)
        axis.tick_params(axis='x', rotation=45)

        # Value labels sit just above each bar, offset by 1% of the tallest bar.
        label_offset = max(values) * 0.01
        for bar, value in zip(bars, values):
            axis.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                      f'{value:.0f}', ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Performance summary: the best backend for each metric.
    best_insert = max(successful_results, key=lambda x: x['insert_throughput'])
    best_query = max(successful_results, key=lambda x: x['query_qps'])

    print(f"\nPerformance Champions:")
    print(f"  Fastest Insert: {best_insert['name']} ({best_insert['insert_throughput']:.0f} vec/s)")
    print(f"  Fastest Query:  {best_query['name']} ({best_query['query_qps']:.0f} QPS)")

## Recommendations

Based on the benchmark results:

In [None]:
print("Usage Recommendations:\n")

if successful_results:
    # Group the successful runs by the kind of backend named in their label.
    # These lists get their own names: reusing `network_backends` here would
    # overwrite the (url, name) list that the benchmark cell iterates over and
    # break any later re-run of that cell.
    memory_results = [r for r in successful_results if 'memory' in r['name'].lower()]
    file_results = [r for r in successful_results if 'file' in r['name'].lower()]
    network_results = [r for r in successful_results if any(db in r['name'].lower() for db in ['postgresql', 'mysql'])]

    print("For Development & Testing:")
    if memory_results:
        fastest_memory = max(memory_results, key=lambda x: x['query_qps'])
        print(f"   → {fastest_memory['name']} - Fastest in-memory option ({fastest_memory['query_qps']:.0f} QPS)")

    print("\nFor Prototyping & Small Apps:")
    if file_results:
        fastest_file = max(file_results, key=lambda x: x['query_qps'])
        print(f"   → {fastest_file['name']} - Best persistent option ({fastest_file['query_qps']:.0f} QPS)")

    print("\nFor Production:")
    if network_results:
        fastest_network = max(network_results, key=lambda x: x['query_qps'])
        print(f"   → {fastest_network['name']} - Scalable production choice ({fastest_network['query_qps']:.0f} QPS)")
    else:
        # No network database was benchmarked, so fall back to generic advice.
        print("   → PostgreSQL + pgvector - Best for production (requires setup)")
        print("   → MySQL 8.2+ with native VECTOR - Good alternative")

    print("\nFor Analytics + Vectors:")
    duckdb_results = [r for r in successful_results if 'duckdb' in r['name'].lower()]
    if duckdb_results:
        best_duckdb = max(duckdb_results, key=lambda x: x['query_qps'])
        print(f"   → {best_duckdb['name']} - Combines analytics with vector search ({best_duckdb['query_qps']:.0f} QPS)")
else:
    print("Install vectorwrap with: `pip install \"vectorwrap[sqlite,duckdb]\"`")
    print("For production, set up PostgreSQL + pgvector or MySQL 8.2+")

print(f"\nTest Configuration:")
print(f"   • Vectors: {NUM_VECTORS:,} x {VECTOR_DIM}D (normalized)")
print(f"   • Queries: {NUM_QUERIES:,} x top-{TOP_K}")
# platform works on every OS; os.uname() is missing on Windows and its result
# has no `system` attribute (the field is called `sysname`).
print(f"   • Machine: {platform.machine()} / {platform.system()}")

print("\nNote: Performance varies by dataset size, vector dimensions, and hardware.")
print("      Run this benchmark on your target environment for accurate results.")