HNSW INDEXING USING FAISS

In [None]:
!pip install -qq langchain langchain faiss-cpu langchain-huggingface langchain-community

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Small sample corpus used throughout the notebook.
document = """
The Taj Mahal is a white marble mausoleum located in Agra, India.
It was commissioned in 1632 by the Mughal emperor Shah Jahan to house the tomb of his favorite wife, Mumtaz Mahal.
The Taj Mahal is widely recognized as the jewel of Muslim art in India and one of the universally admired masterpieces of world heritage.
It attracts millions of tourists every year.
"""

# Cut the text into pieces of at most 100 characters, with 20 characters
# shared between neighbouring pieces.
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
chunks = splitter.split_text(document)

# Encode each chunk with the sentence-embedding model; `embeddings` holds
# one vector per chunk and is what gets added to the FAISS indexes below.
model = SentenceTransformer('all-MiniLM-L6-V2')
embeddings = model.encode(chunks)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Create an HNSW index and add the embeddings

In [None]:
import faiss
import numpy as np

# Vector width, taken from the second axis of the embedding matrix.
embedding_dim = embeddings.shape[1]

# HNSW graph index over raw (uncompressed) vectors; 32 is the number of
# graph links kept per vector.
hnsw_index = faiss.IndexHNSWFlat(embedding_dim, 32)

# Search-time and build-time candidate-list sizes. Both must be set
# before add() for efConstruction to affect how the graph is built.
hnsw_index.hnsw.efSearch = 16
hnsw_index.hnsw.efConstruction = 40

hnsw_index.add(embeddings)

n_indexed = hnsw_index.ntotal
print("vector indexing", n_indexed)

vector indexing 6


In [None]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores.faiss import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings


# Wrap every text chunk in a LangChain Document so the vector store can hold it.
doc = [Document(page_content=chunk) for chunk in chunks]

# Use HuggingFaceEmbeddings as the embedding object
embedding_model_langchain = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-V2')

# FAISS.from_documents() would create a flat (brute-force) index, so the QA
# chain would never touch HNSW. Build the store around an explicit HNSW index
# instead. The dimension is probed from the embedding model so this cell does
# not depend on variables left over from earlier cells.
langchain_dim = len(embedding_model_langchain.embed_query("dimension probe"))
langchain_hnsw_index = faiss.IndexHNSWFlat(langchain_dim, 32)
langchain_hnsw_index.hnsw.efConstruction = 40  # must be set before vectors are added
langchain_hnsw_index.hnsw.efSearch = 16

vectorstore = FAISS(
    embedding_function=embedding_model_langchain,
    index=langchain_hnsw_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
vectorstore.add_documents(doc)

retriever = vectorstore.as_retriever()

In [None]:
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline  # replaces deprecated langchain.llms import
from langchain.chains import RetrievalQA

# Step 1: Load HuggingFace LLM
hf_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=100)

# Step 2: Wrap it in LangChain-compatible LLM
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Step 3: Create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Step 4: Ask a question!
# Chain.run() is deprecated; invoke() takes the question under the "query"
# key and returns a dict whose "result" entry holds the generated answer.
query = "When was it built?"
answer = qa_chain.invoke({"query": query})["result"]
print("Answer:", answer)





Device set to use cpu


Answer: 1632


In [None]:
# Ask a second question through the same chain.
# invoke() replaces the deprecated run(); the answer text is under "result".
query = "Who commissioned it??"
answer = qa_chain.invoke({"query": query})["result"]
print("Answer:", answer)

Answer: Mughal emperor Shah Jahan


In [None]:
import faiss
import numpy as np
import time

# Step 1: Build Flat Index (exact, brute-force L2 search; the baseline)
embedding_dim = embeddings.shape[1]
flat_index = faiss.IndexFlatL2(embedding_dim)

# Step 2: Add embeddings
flat_index.add(embeddings)

# Step 3: Create query embedding
query = "Who built the Taj Mahal?"
query_embedding = model.encode([query], convert_to_numpy=True)

# Step 4: Search and time it
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can be too coarse for sub-millisecond calls.
start = time.perf_counter()
D, I = flat_index.search(query_embedding, k=1)
end = time.perf_counter()

# Step 5: Show results
print(f"⏱️ Search Time (Flat): {round((end - start) * 1000, 3)} ms")
top_idx = int(I[0][0])
if top_idx < 0:
    # FAISS uses label -1 for "no result"; chunks[-1] would silently show the last chunk.
    print("No neighbour found (FAISS returned -1)")
else:
    print(f"📍 Top Chunk Index: {top_idx}")
    print(f"📄 Top Chunk Text: {chunks[top_idx]}")


⏱️ Search Time (Flat): 0.503 ms
📍 Top Chunk Index: 3
📄 Top Chunk Text: The Taj Mahal is widely recognized as the jewel of Muslim art in India and one of the universally


In [None]:
# Step 1: Create quantizer and IVF index
nlist = 2  # Number of clusters (small since few chunks)
quantizer = faiss.IndexFlatL2(embedding_dim)
ivf_index = faiss.IndexIVFFlat(quantizer, embedding_dim, nlist)

# Step 2: Train IVF index (learns the nlist cluster centroids)
ivf_index.train(embeddings)

# Step 3: Add vectors
ivf_index.add(embeddings)

# Step 4: Optional - set nprobe (how many clusters to search)
# Tied to nlist so every cluster is scanned even if nlist is changed above.
ivf_index.nprobe = nlist

# Step 5: Encode query
query_embedding = model.encode([query], convert_to_numpy=True)

# Step 6: Search and time it
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can be too coarse for sub-millisecond calls.
start = time.perf_counter()
D, I = ivf_index.search(query_embedding, k=1)
end = time.perf_counter()

# Step 7: Show result
print(f"⏱️ Search Time (IVF): {round((end - start) * 1000, 3)} ms")
top_idx = int(I[0][0])
if top_idx < 0:
    # FAISS uses label -1 when the probed clusters yield no result;
    # chunks[-1] would silently show the last chunk instead.
    print("No neighbour found (FAISS returned -1)")
else:
    print(f"📍 Top Chunk Index: {top_idx}")
    print(f"📄 Top Chunk Text: {chunks[top_idx]}")


⏱️ Search Time (IVF): 5.467 ms
📍 Top Chunk Index: 3
📄 Top Chunk Text: The Taj Mahal is widely recognized as the jewel of Muslim art in India and one of the universally


In [None]:
# Step 1: Create HNSW Index (M = number of connections per vector)
hnsw_index = faiss.IndexHNSWFlat(embedding_dim, 32)

# Step 2: Optional tuning
hnsw_index.hnsw.efConstruction = 40  # For indexing quality
hnsw_index.hnsw.efSearch = 16        # For query time recall

# Step 3: Add vectors
hnsw_index.add(embeddings)

# Step 4: Encode query
query_embedding = model.encode([query], convert_to_numpy=True)

# Step 5: Search and time it
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can be too coarse for sub-millisecond calls.
start = time.perf_counter()
D, I = hnsw_index.search(query_embedding, k=1)
end = time.perf_counter()

# Step 6: Show result
print(f"⏱️ Search Time (HNSW): {round((end - start) * 1000, 3)} ms")
top_idx = int(I[0][0])
if top_idx < 0:
    # FAISS uses label -1 for "no result"; chunks[-1] would silently show the last chunk.
    print("No neighbour found (FAISS returned -1)")
else:
    print(f"📍 Top Chunk Index: {top_idx}")
    print(f"📄 Top Chunk Text: {chunks[top_idx]}")


⏱️ Search Time (HNSW): 0.47 ms
📍 Top Chunk Index: 3
📄 Top Chunk Text: The Taj Mahal is widely recognized as the jewel of Muslim art in India and one of the universally
