In [36]:
# Environment & config
import getpass
import os
from pathlib import Path

# The Gemini API key must not live in the notebook source: anything written
# here is saved and shared together with the .ipynb file. Use the
# GOOGLE_API_KEY environment variable when it is already set; otherwise ask
# for the key interactively (getpass does not echo the typed value).
# NOTE(review): a literal key used to be hardcoded in this cell — treat that
# key as leaked and revoke it.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("GOOGLE_API_KEY: ")

# Milvus collection that stores the PDF chunks, and the server to reach it on.
COLLECTION_NAME         = "pdf_documents"
MILVUS_HOST, MILVUS_PORT = "localhost", "19530"

# Set to True only when you add/replace PDFs
REBUILD_INDEX = True

In [37]:
# Core imports & Milvus connection
from pymilvus import connections
from langchain.embeddings import SentenceTransformerEmbeddings

# Register the "default" pymilvus connection against the configured server.
connections.connect(
    alias="default",
    host=MILVUS_HOST,
    port=MILVUS_PORT,
)

# Embedding model shared by the indexing cell and the retrieval cell.
embeddings = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [38]:
# (Re)build Milvus index  ▸ run only when REBUILD_INDEX = True
if REBUILD_INDEX:
    from langchain.document_loaders import PyPDFLoader
    from langchain.text_splitter import CharacterTextSplitter
    from langchain.vectorstores import Milvus

    folder_path = Path("./pdfs")
    splitter    = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    # glob() order is filesystem-dependent; sort so chunks are ingested in the
    # same order on every run.
    pdf_paths = sorted(folder_path.glob("*.pdf"))

    # Load every PDF and split it into chunks with the splitter above.
    docs = []
    for pdf in pdf_paths:
        docs.extend(
            PyPDFLoader(str(pdf)).load_and_split(text_splitter=splitter)
        )

    # The call below passes drop_old=True, i.e. it replaces the existing
    # collection. With a missing/empty ./pdfs folder there is nothing to put
    # back, so stop here instead of wiping the current index.
    if not docs:
        raise FileNotFoundError(
            f"No PDF content found under {folder_path.resolve()} - "
            f"collection '{COLLECTION_NAME}' was left untouched."
        )

    Milvus.from_documents(
        docs,
        embeddings,
        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
        collection_name=COLLECTION_NAME,
        drop_old=True,
        # Index and search must agree on the metric (COSINE in both places).
        index_params={"index_type": "IVF_FLAT",
                      "metric_type": "COSINE",
                      "params": {"nlist": 128}},
        search_params={"metric_type": "COSINE",
                       "params": {"nprobe": 10}},
    )
    print(f"✅ Indexed {len(docs)} chunks → {COLLECTION_NAME}")
else:
    print("⏩  Skipping rebuild (REBUILD_INDEX = False)")


✅ Indexed 31 chunks → pdf_documents


In [39]:
# Quick RAG sanity check (Optional)
from langchain.vectorstores import Milvus
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
# Import from langchain.prompts: the root-level `from langchain import
# PromptTemplate` shortcut is deprecated.
from langchain.prompts import PromptTemplate

# Re-open the collection for querying; the search metric matches the one the
# index was built with in the rebuild cell (COSINE).
vectorstore = Milvus(
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    search_params={"metric_type": "COSINE", "params": {"nprobe": 10}},
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
llm       = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.1)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are an expert assistant. Use the context (with page numbers) "
        "to answer.\n\nContext:\n{context}\n\nQuestion:\n{question}\n\n"
        "Answer:\n1. Summary – one sentence.\n2. Key Points – bullet list with page citations."
    ),
)

# The answer prompt asks for page citations, but a "stuff" chain formats each
# retrieved chunk with its text only unless told otherwise. Tag every chunk
# with the `page` metadata stored at indexing time so the model can actually
# see which page a passage came from.
# NOTE(review): PyPDFLoader appears to record `page` as a 0-based index —
# confirm whether citations should be shifted to match printed page numbers.
document_prompt = PromptTemplate(
    input_variables=["page_content", "page"],
    template="[page {page}]\n{page_content}",
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt, "document_prompt": document_prompt},
)

# .invoke() replaces the deprecated direct call `qa_chain({...})`; the answer
# text is still returned under the "result" key.
print(qa_chain.invoke({"query": "What are sets?"})["result"])


1. Summary: Sets are unordered collections that contain only one of each distinct value and are similar to arrays but must contain only one data type.

2. Key Points:
* Sets are similar to arrays and must contain only one type (page 9).
* Sets are unordered (page 9).
* Sets can only contain one of each distinct value (page 9).
* Sets can be created using `Set([values])` or by declaring a variable of type `Set` and adding values using `.insert()` and removing values using `.remove()` (page 9).
