In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "BAAI/bge-base-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


## Test with one embedding

In [2]:
text = "I love you"
text2 = "They hate me"

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)

inputs2 = tokenizer(text2, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    outputs = model(**inputs2)
    embeddings2 = outputs.last_hidden_state.mean(dim=1)

# Assuming embeddings and embeddings2 are tensors of shape (1, hidden_dim)
cosine_similarity = torch.nn.functional.cosine_similarity(embeddings, embeddings2, dim=1)

# Print the similarity score
print("Cosine Similarity:", cosine_similarity.item())

Cosine Similarity: 0.5622000098228455


In [None]:
import faiss
import torch

# Assuming embeddings and embeddings2 are PyTorch tensors on GPU
embeddings = embeddings.contiguous()
embeddings2 = embeddings2.contiguous()

# Normalize embeddings on GPU using PyTorch
embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
embeddings2 = embeddings2 / embeddings2.norm(dim=1, keepdim=True)

# Convert embeddings to NumPy arrays on CPU
embeddings_np = embeddings.cpu().numpy()
embeddings2_np = embeddings2.cpu().numpy()

# Initialize GPU resources
res = faiss.StandardGpuResources()

# Create a GPU index directly
gpu_index = faiss.GpuIndexFlatIP(res, embeddings.size(1))

# Add embeddings to the index
gpu_index.add(embeddings_np)

# Perform search
k = 5
distances, indices = gpu_index.search(embeddings2_np, k)

# Convert results to CPU tensors if needed
distances = torch.from_numpy(distances).cpu()
indices = torch.from_numpy(indices).cpu()

print("Indices of nearest neighbors:", indices)
print("Distances to nearest neighbors:", distances)

Indices of nearest neighbors: tensor([[ 0, -1, -1, -1, -1]])
Distances to nearest neighbors: tensor([[ 5.6220e-01, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38]])
