In [4]:
# Toy corpus for the retrieval demo: five one-sentence animal descriptions.
# Order matters: search results are mapped back to text by list position
# (documents[idx]), so it must match the order the embeddings are indexed in.
documents = [
    "Cats are independent pets.",
    "Dogs are loyal and friendly animals.",
    "Birds can fly and live in trees.",
    "Fish live in water and need aquariums.",
    "Tigers are wild animals found in forests."
]

In [10]:
from sentence_transformers import SentenceTransformer
# Load the 'all-MiniLM-L6-v2' sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode the whole corpus in one call: one embedding row per document
# (the shape check in the following cell reports (5, 384)).
doc_embeddings = model.encode(documents)

In [15]:
# Sanity check: show the embedding matrix shape (rows, columns), then the
# column count alone -- the vector dimensionality used to size the index.
embedding_shape = doc_embeddings.shape
print(embedding_shape)
print(embedding_shape[1])

(5, 384)
384


In [17]:
import faiss
import numpy as np

# Exact (brute-force) index sized to the embedding dimensionality. IndexFlatL2
# ranks by squared Euclidean distance, so smaller scores mean closer matches.
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# FAISS only accepts float32 input. Cast explicitly (as the cosine section
# below does) instead of relying on the encoder's default output dtype.
index.add(np.array(doc_embeddings).astype("float32"))

In [27]:
# Free-text question to match against the indexed documents.
query = "Which animals make friendly pets?"
# Encode as a one-element batch (index.search is later called with this array
# and its first result row is read), then cast to float32 for FAISS.
query_embedding = np.array(model.encode([query])).astype("float32")

In [36]:
k = 5  # number of neighbours to retrieve (equals the corpus size here)
distances, indices = index.search(query_embedding, k)

print("\nTop Matches:\n")
# Row 0 holds the hits for the single query; each entry is a position in
# `documents`.
print("\n".join(f"Content: {documents[doc_id]}" for doc_id in indices[0]))


Top Matches:

Content: Dogs are loyal and friendly animals.
Content: Cats are independent pets.
Content: Tigers are wild animals found in forests.
Content: Fish live in water and need aquariums.
Content: Birds can fly and live in trees.


In [37]:
# Raw search output: document positions first, then the matching L2 scores
# (one row per query, columns in ranked order).
print(indices, distances, sep="\n")

[[1 0 4 3 2]]
[[0.62283707 0.9848762  1.2443695  1.3975534  1.4905014 ]]


### FAISS cosine similarity

In [38]:
# Same five-sentence corpus as the first cell of the notebook, restated at the
# start of the cosine-similarity section.
# NOTE(review): this is an exact duplicate of the earlier definition; if the
# corpus changes, both copies must be edited together.
documents = [
    "Cats are independent pets.",
    "Dogs are loyal and friendly animals.",
    "Birds can fly and live in trees.",
    "Fish live in water and need aquariums.",
    "Tigers are wild animals found in forests."
]

In [39]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed the corpus and rescale every row to unit L2 length in one step, so
# that an inner product between rows equals their cosine similarity.
doc_embeddings = normalize(model.encode(documents), norm='l2')

In [40]:
# Inner-product index over the unit-length document vectors (cosine search).
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
unit_doc_vectors = np.array(doc_embeddings).astype("float32")  # FAISS needs float32
index.add(unit_doc_vectors)


In [41]:
# NOTE(review): this cell is a line-for-line copy of the cell directly above.
# It creates a fresh IndexFlatIP and adds the same normalized embeddings, so
# the index ends up with the same contents either way. Candidate for deletion.
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Use Inner Product instead of L2
index.add(np.array(doc_embeddings).astype("float32"))


In [42]:
k = 5  # number of neighbours to retrieve (equals the corpus size here)

# Inner product equals cosine similarity only when BOTH vectors are unit
# length. The documents were normalized before indexing, but the query vector
# comes from the L2 section and was never normalized there. Normalize it here
# instead of relying on the encoder happening to emit unit-length vectors.
query_unit = normalize(query_embedding, norm='l2').astype("float32")

# With IndexFlatIP the returned scores are similarities (higher = closer).
# The name `distances` is kept because the following cell prints it.
distances, indices = index.search(query_unit, k)

print("\nTop Matches:\n")
for idx in indices[0]:
    print(f"Content: {documents[idx]}")


Top Matches:

Content: Dogs are loyal and friendly animals.
Content: Cats are independent pets.
Content: Tigers are wild animals found in forests.
Content: Fish live in water and need aquariums.
Content: Birds can fly and live in trees.


In [43]:
# Raw search output: document positions, then their inner-product scores.
# Unlike the L2 run above, larger scores here mean better matches.
print(indices, distances, sep="\n")

[[1 0 4 3 2]]
[[0.68858135 0.50756186 0.37781513 0.3012232  0.25474924]]
