In [1]:
documents = [
    "climate change",
    "quantum computing",
    "renaissance art",
    "neuroscience",
    "cryptocurrency",
    "medieval history",
    "oceanography",
    "robotics",
    "classical music",
    "genetic engineering",
    "bethoven",
]

In [2]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings
embeddings = model.encode(documents, convert_to_tensor=False)

In [3]:
import faiss
import numpy as np

# Convert embeddings to float32 NumPy array
embeddings = np.array(embeddings).astype('float32')

# Initialize a Faiss index
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance (Euclidean)

# Add embeddings to the index
index.add(embeddings)

In [4]:
print(index)

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x32eac2430> >


In [None]:
# Define a query sentence
query = "sun and ice and melting"

# Compute the embedding for the query
query_embedding = model.encode([query], convert_to_tensor=False).astype('float32')

# Search the index for the top 3 most similar sentences
k = 10
distances, indices = index.search(query_embedding, k)

# Display the results
print("Query:", query)
print("\nTop 3 most similar sentences in the corpus:")

for i, idx in enumerate(indices[0]):
    print(f"{i+1}. {documents[idx]} (Distance: {distances[0][i]:.4f})")

Query: sun and ice and melting

Top 3 most similar sentences in the corpus:
1. climate change (Distance: 1.0630)
2. cryptocurrency (Distance: 1.5013)
3. oceanography (Distance: 1.6184)
4. classical music (Distance: 1.7625)
5. renaissance art (Distance: 1.7873)
6. robotics (Distance: 1.8202)
7. medieval history (Distance: 1.8522)
8. bethoven (Distance: 1.8590)
9. quantum computing (Distance: 1.8774)
10. neuroscience (Distance: 1.9257)
