In [None]:
from langchain_community.document_loaders import PDFMinerLoader # PyPDFLoader, UnstructuredPDFLoader,
from langchain_experimental.text_splitter import SemanticChunker # RecursiveCharacterTextSplitter # CharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import Chroma, FAISS
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_qdrant import QdrantVectorStore
import glob

In [None]:
pdfs = glob.glob('Pdf/*.pdf')
all_docs=[]
# loader = PyPDFLoader(tmp_filepath) # not properly working. 
# docs = loader.load()

In [3]:
# cleaning the extracted text
def is_valid_doc(doc: Document) -> bool:
    text = doc.page_content
    clean_text = text.replace('\x0c', '').strip() # Remove whitespace + form feed (\x0c) and check length
    return len(clean_text) > 30  # or any reasonable threshold

In [4]:
for pdf_path in pdfs[0:100]:
    loader = PDFMinerLoader(str(pdf_path))
    docs = loader.load()
    valid_docs = [doc for doc in docs if is_valid_doc(doc)]
    all_docs.extend(valid_docs)

In [5]:
# Splitting the text into chunks

# Sematic splitting with openai or huggingFace
# openai_chunking = OpenAIEmbeddings(model="text-embedding-3-small")
HF_chunking = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", cache_folder = '.')
chunker = SemanticChunker(
    embeddings=HF_chunking,
    breakpoint_threshold_type="percentile",       # or "standard"
    breakpoint_threshold_amount=0.9,           # more strict
    min_chunk_size=800,                        # at least 300 token
)
chunks = chunker.split_documents(all_docs)

# CharacterTextSplitter and RecursiveCharacterTextSplitter both works good with seperator '\n\n'  
# splitter = RecursiveCharacterTextSplitter(separators="\n\n", chunk_size=1000, chunk_overlap=200)
# chunks = splitter.split_documents(all_docs)

  HF_chunking = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", cache_folder = '.')
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Embedding models
# HF_MiniLM = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", cache_folder='.')
# HF_MPNet = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", cache_folder='.')
# HF_BGE = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5", cache_folder='.')
openai_embedding = OpenAIEmbeddings(model="text-embedding-3-small")

  openai_embedding = OpenAIEmbeddings(model="text-embedding-3-small")


In [7]:
# Vector stores
chroma_db = Chroma.from_documents(
    documents=chunks,
    embedding=openai_embedding,
    persist_directory="./chroma_store"
)
chroma_db.persist()

faiss_index = FAISS.from_documents(chunks, openai_embedding)
faiss_index.save_local("faiss_index")
# faiss_index = FAISS.load_local("faiss_index", embeddings=openai_embedding) # Load later

qdrant = QdrantClient()  
collection_name = "semantic_chunks"
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=openai_embedding.embed_query("test").__len__(), distance=Distance.COSINE),
) # Create collection
qdrant_store = QdrantVectorStore.from_documents(
    documents=chunks,
    embedding=openai_embedding,
    collection_name=collection_name,
) # Store


  chroma_db.persist()
  qdrant.recreate_collection(


In [8]:
chroma_retriever = chroma_db.as_retriever(search_kwargs={"k": 5}) # Maximum Marginal Relevance - 
faiss_retriever = faiss_index.as_retriever(search_kwargs={"k": 5})
qdrant_retriever = qdrant_store.as_retriever(search_kwargs={"k": 5})

In [9]:
# define LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)
# RAG chain
rag_chain = RetrievalQA.from_chain_type(llm=llm, retriever=qdrant_retriever)

  llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)


In [10]:
query = "On which date was this session arranged?"
result = rag_chain(query)
result

  result = rag_chain(query)


{'query': 'On which date was this session arranged?',
 'result': 'The session was arranged for November 16th at 11:00 AM in 2318 Rayburn House Office Building.'}

In [None]:
# checking the latency
import time
import pandas as pd

retrievers = {
    "chroma": chroma_retriever,
    "faiss":  faiss_retriever,
    "qdrant": qdrant_retriever,
}

queries = [
    "What is the problem in North Carolina?",
    "On which date was this session arranged?",
    # …add more queries as needed…
]

records = []
for store_name, retriever in retrievers.items():
    for q in queries:
        start = time.perf_counter()
        docs = retriever.get_relevant_documents(q)
        end   = time.perf_counter()
        records.append({
            "vector_store": store_name,
            "query":         q,
            "latency_s":    end - start,
            "num_results":  len(docs)
        })

df = pd.DataFrame(records)
print(df)

summary = df.groupby("vector_store")["latency_s"].agg(["min","mean","max","std"])
print("\nSummary:\n", summary)

  docs = retriever.get_relevant_documents(q)


  vector_store                                     query  latency_s  \
0       chroma    What is the problem in North Carolina?   1.086196   
1       chroma  On which date was this session arranged?   0.323732   
2        faiss    What is the problem in North Carolina?   2.381728   
3        faiss  On which date was this session arranged?   0.849033   
4       qdrant    What is the problem in North Carolina?   0.837467   
5       qdrant  On which date was this session arranged?   0.585165   

   num_results  
0            5  
1            5  
2            5  
3            5  
4            5  
5            5  

Summary:
                    min      mean       max       std
vector_store                                        
chroma        0.323732  0.704964  1.086196  0.539144
faiss         0.849033  1.615381  2.381728  1.083779
qdrant        0.585165  0.711316  0.837467  0.178404
