# Cosine Similarity

*How fast can we make it?*


In [69]:
import numpy as np
import torch
import time

# Create a larger synthetic dataset for testing
def create_test_data(n_vectors, dim):
    ids = [f"id_{i}" for i in range(n_vectors)]
    embeddings = np.random.randn(n_vectors, dim)
    return list(zip(ids, embeddings))

def run_test(A, B, device_type):
    # Extract IDs and embeddings
    start_time = time.time()

    A_ids = [item[0] for item in A]
    A_embeddings = np.array([item[1] for item in A])
    
    B_ids = [item[0] for item in B]
    B_embeddings = np.array([item[1] for item in B])
    
    device = torch.device(device_type)
    
    # Warm-up for GPU (to initialize CUDA)
    if device_type == 'cuda':
        dummy = torch.zeros(1).to(device)
        dummy = dummy + 1
        torch.cuda.synchronize()
    
    # Transfer data to device (not timed)
    A_tensor = torch.tensor(A_embeddings, dtype=torch.float32).to(device)
    B_tensor = torch.tensor(B_embeddings, dtype=torch.float32).to(device)
    
    # Start timing AFTER data transfer
    
    A_normalized = A_tensor / torch.norm(A_tensor, dim=1, keepdim=True)
    B_normalized = B_tensor / torch.norm(B_tensor, dim=1, keepdim=True)
    
    similarities = torch.matmul(A_normalized, B_normalized.T)
    
    # Ensure GPU operations are complete before stopping timer
    if device_type == 'cuda':
        torch.cuda.synchronize()
    
    
    # Don't include transfer back to CPU in timing
    similarities_cpu = similarities.cpu().numpy()
    compute_time = time.time() - start_time

    # Results processing not included in timing
    
    return compute_time, similarities_cpu

# Create larger test data
embedding_dim = 1000  # Common for embeddings
A = create_test_data(10_000, embedding_dim)
B = create_test_data(10_000, embedding_dim)

# Run on CPU
cpu_time, cpu_results = run_test(A, B, 'cpu')
print(f"CPU time: {cpu_time:.4f} seconds")
print(len(cpu_results))

# Run on GPU if available
if torch.cuda.is_available():
    # Run twice, first for warm-up
    # _, _ = run_test(A, B, 'cuda')
    gpu_time, gpu_results = run_test(A, B, 'cuda')
    print(f"GPU time: {gpu_time:.4f} seconds")
    print(f"GPU speedup: {cpu_time/gpu_time:.2f}x")
else:
    print("CUDA not available")

CPU time: 0.2187 seconds
10000
GPU time: 0.1974 seconds
GPU speedup: 1.11x


In [72]:
import numpy as np
import torch
import time

# Create a larger synthetic dataset for testing
def create_test_data(n_vectors, dim):
    ids = [f"id_{i}" for i in range(n_vectors)]
    embeddings = np.random.randn(n_vectors, dim)
    return list(zip(ids, embeddings))

def run_test(A, B, device_type):
    # Extract IDs and embeddings
    start_time = time.time()

    A_ids = [item[0] for item in A]
    A_embeddings = np.array([item[1] for item in A])
    
    B_ids = [item[0] for item in B]
    B_embeddings = np.array([item[1] for item in B])
    
    device = torch.device(device_type)
    
    # Warm-up for GPU (to initialize CUDA)
    if device_type == 'cuda':
        dummy = torch.zeros(1).to(device)
        dummy = dummy + 1
        torch.cuda.synchronize()
    
    # Transfer data to device (not timed)
    A_tensor = torch.tensor(A_embeddings, dtype=torch.float32).to(device)
    B_tensor = torch.tensor(B_embeddings, dtype=torch.float32).to(device)
    
    # Start timing AFTER data transfer
    
    A_normalized = A_tensor / torch.norm(A_tensor, dim=1, keepdim=True)
    B_normalized = B_tensor / torch.norm(B_tensor, dim=1, keepdim=True)
    
    similarities = torch.matmul(A_normalized, B_normalized.T)
    
    # Ensure GPU operations are complete before stopping timer
    if device_type == 'cuda':
        torch.cuda.synchronize()
    
    
    # Don't include transfer back to CPU in timing
    similarities_cpu = similarities.cpu().numpy()
    compute_time = time.time() - start_time
    similarities_cpu = similarities_cpu.reshape(-1)

    # Results processing not included in timing
    
    return compute_time, similarities_cpu

# Create larger test data
embedding_dim = 1000  # Common for embeddings
A = create_test_data(10_000, embedding_dim)
B = create_test_data(10_000, embedding_dim)

# Run on CPU
cpu_time, cpu_results = run_test(A, B, 'cpu')
print(f"CPU time: {cpu_time:.4f} seconds")
print(f"Result shape: {cpu_results.shape}")  # Print the full shape instead of just length
print(f"Total number of similarity scores: {cpu_results.size}")  # Total number of elements
print("---------------")
# Run on GPU if available
if torch.cuda.is_available():
    # Run twice, first for warm-up
    # _, _ = run_test(A, B, 'cuda')
    gpu_time, gpu_results = run_test(A, B, 'cuda')
    print(f"GPU time: {gpu_time:.4f} seconds")
    print(f"GPU speedup: {cpu_time/gpu_time:.2f}x")
    print(f"Result shape: {gpu_results.shape}")  # Print the full shape
    print(f"Total number of similarity scores: {gpu_results.size}")  # Total number of elements
else:
    print("CUDA not available")

CPU time: 0.2172 seconds
Result shape: (100000000,)
Total number of similarity scores: 100000000
---------------
GPU time: 0.2024 seconds
GPU speedup: 1.07x
Result shape: (100000000,)
Total number of similarity scores: 100000000
