In [1]:
from langchain_community.document_loaders import PyPDFLoader

In [2]:
FILE_PATH=r"C:\Monfort\AgenticAI\AgenticAI\Assignments\PlayingitMyWay.pdf"

In [3]:
loader=PyPDFLoader(FILE_PATH)

In [4]:
len(loader.load())

351

In [5]:
pages=loader.load()

In [6]:
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [8]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,#hyperparameter
    chunk_overlap=100 #hyperparemeter
)

In [9]:
split_docs = splitter.split_documents(pages)

In [10]:
len(split_docs)

783

In [11]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")



  from .autonotebook import tqdm as notebook_tqdm


In [36]:
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType , Collection, connections
import os
from dotenv import load_dotenv


api_key = os.getenv("pytoken")
endpoint = os.getenv("pymilvusurl")

client = MilvusClient(
    uri=endpoint,
    token=api_key
)


In [13]:
collection_name = "sachin_collection"
if collection_name not in client.list_collections():
    # Define schema properly
    schema = CollectionSchema(
        fields=[
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
            FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384)
        ],
        description="Sachin book embedding collection"
    )

    client.create_collection(
        collection_name=collection_name,
        schema=schema
    )
    print(f"Collection '{collection_name}' created successfully.")

In [37]:

connections.connect(
    alias="default",
    uri=os.getenv("pymilvusurl"),
    token=os.getenv("pytoken")
)
collection = Collection("sachin_collection")

# Define index parameters as a dict
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128}
}

# Call create_index on the instance (self is passed automatically)
collection.create_index(field_name="vector", index_params=index_params)


Status(code=0, message=)

In [15]:
# Generate embeddings for the split documents
vectors = embeddings.embed_documents([doc.page_content for doc in split_docs])

data = [
    {"text": split_docs[idx].page_content, "vector": vec}
    for idx, vec in enumerate(vectors)
]

client.insert(
    collection_name="sachin_collection",
    data=data
)

print("Data successfully stored in Zilliz!")

Data successfully stored in Zilliz!


In [16]:
# Assuming collection name and vector field are defined correctly
collection_name = "sachin_collection"

# Step 1: Generate the query vector
query_vector = embeddings.embed_query("When did sachin made debut in Irani Trophy?")

# Step 2: Define search parameters
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10}
}

## Load the collection into memory before search
client.load_collection(collection_name="sachin_collection")

# Now perform the search
results = client.search(
    collection_name=collection_name,
    data=[query_vector],
    limit=5,
    search_params=search_params,
    output_fields=["text"],
    vector_field="vector"
)

for hit in results[0]:
    print(hit)

{'id': 458312172764778213, 'distance': 0.7788976430892944, 'entity': {'text': 'for him. In my debut match I scored 100 not out and in the process became the youngest Indian to\nscore a century on his first-class debut. I finished the 1988–89 season as Mumbai’s highest run-scorer\nand made half-centuries in six of the seven matches I played. Mumbai lost a hard-fought semi-final\nagainst Delhi and I ended my debut season with a respectable batting average of 64.77. In the semi-\nfinal, Madan Lal, former India fast bowler and coach, was playing for Delhi and I remember playing a\nstraight drive to him that was much talked about that evening. It was a shot that got me noticed,\nadding to my stock at the time. Everything about the shot was perfect – balance, head position, timing\n– and the ball raced to the boundary.\nMy performances for Mumbai got me selected for the season-opening Irani Trophy match at the\nbeginning of November 1989. The Irani Trophy, between the Ranji Trophy champions 

In [17]:
def retriever(query, top_k=5):
        query_vector = embeddings.embed_query(query)
        search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
        
        results = client.search(
            collection_name= collection_name,
            data=[query_vector],
            limit=top_k,
            search_params=search_params,
            output_fields=["text"],
            vector_field="vector"
        )
        
        return [hit["text"] for hit in results[0]]


docs = retriever("When did sachin made debut in Irani Trophy")
for doc in docs:
    print(doc)


for him. In my debut match I scored 100 not out and in the process became the youngest Indian to
score a century on his first-class debut. I finished the 1988–89 season as Mumbai’s highest run-scorer
and made half-centuries in six of the seven matches I played. Mumbai lost a hard-fought semi-final
against Delhi and I ended my debut season with a respectable batting average of 64.77. In the semi-
final, Madan Lal, former India fast bowler and coach, was playing for Delhi and I remember playing a
straight drive to him that was much talked about that evening. It was a shot that got me noticed,
adding to my stock at the time. Everything about the shot was perfect – balance, head position, timing
– and the ball raced to the boundary.
My performances for Mumbai got me selected for the season-opening Irani Trophy match at the
beginning of November 1989. The Irani Trophy, between the Ranji Trophy champions and the Rest of
India, is a key component of the Indian domestic cricket calendar and is

In [18]:
###BM25

from langchain.retrievers import BM25Retriever

# Create a BM25 retriever from your list of documents
bm25_retriever = BM25Retriever.from_documents(split_docs)

# Retrieve top relevant docs based on BM25 scoring
query = "when did sachin made debut in Irani trophy?"
top_docs = bm25_retriever.get_relevant_documents(query)

# Print top-ranked results
for doc in top_docs:
    print(doc.page_content)


for him. In my debut match I scored 100 not out and in the process became the youngest Indian to
score a century on his first-class debut. I finished the 1988–89 season as Mumbai’s highest run-scorer
and made half-centuries in six of the seven matches I played. Mumbai lost a hard-fought semi-final
against Delhi and I ended my debut season with a respectable batting average of 64.77. In the semi-
final, Madan Lal, former India fast bowler and coach, was playing for Delhi and I remember playing a
straight drive to him that was much talked about that evening. It was a shot that got me noticed,
adding to my stock at the time. Everything about the shot was perfect – balance, head position, timing
– and the ball raced to the boundary.
My performances for Mumbai got me selected for the season-opening Irani Trophy match at the
beginning of November 1989. The Irani Trophy, between the Ranji Trophy champions and the Rest of
India, is a key component of the Indian domestic cricket calendar and is

  top_docs = bm25_retriever.get_relevant_documents(query)


In [19]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [20]:
import pprint

In [21]:
pprint.pprint(prompt.messages)

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [22]:
# from langchain.prompts import ChatPromptTemplate
# # Custom prompt
# prompt = ChatPromptTemplate.from_template(
#     """You are a helpful assistant for answering questions using provided context.

# Use the following context to answer the question. If the answer is not contained in the context, say "I don't know".

# Context:
# {context}

# Question:
# {question}

# Answer:"""
# )

In [23]:
import os
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")


from langchain_google_genai import ChatGoogleGenerativeAI
model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')

In [24]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

In [25]:
def format_docs(docs):
    return "\n\n".join(doc for doc in docs)
    
    

In [26]:
retriever_runnable = RunnableLambda(retriever)
format_docs_runnable = RunnableLambda(format_docs)  

# Compose the RAG chain
rag_chain = (
    {"context": retriever_runnable | format_docs_runnable, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [27]:

question = "When did sachin made debut in Irani Trophy?"
answer = rag_chain.invoke(question)
print("Answer:", answer)

Answer: Sachin made his Irani Trophy debut at the beginning of November 1989.  He played for the Rest of India and scored a century against Delhi.  This was also when he was selected for the Indian team touring Pakistan.


In [28]:
from docx import Document

# Create a new Word document
doc = Document()
doc.add_heading("Retrieved Documents", level=1)

doc.add_paragraph(answer)

# Save the document
doc.save("Results.docx")

print("Document successfully saved as retrieved_results.docx!")

Document successfully saved as retrieved_results.docx!
