In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [1]:
import os
from dotenv import load_dotenv
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")
os.environ['HF_TOKEN']=os.getenv("HF_TOKEN")

In [2]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
embeddings.embed_query("aur kya kr rha h")

In [None]:
dim_len = embeddings.embed_query("Hello, how are you?")
print(dim_len)

In [5]:
len(dim_len)

3072

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
documents=["what is a capital of USA?",
           "Who is a president of USA?",
           "Who is a prime minister of India?"]

In [None]:
doc_embeddings=embeddings.embed_documents(documents)
doc_embeddings

In [9]:
my_query="Narendra modi is prime minister of india?"

In [10]:
query_embedding=embeddings.embed_query(my_query)

In [11]:
cosine_similarity([query_embedding],doc_embeddings)

array([[0.17597817, 0.31412641, 0.64803648]])

In [12]:
from sklearn.metrics.pairwise import euclidean_distances

In [14]:
euclidean_distances([query_embedding],doc_embeddings)

array([[1.28376157, 1.17121613, 0.83900363]])

In [15]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [21]:
index=faiss.IndexFlatL2(3072)

In [22]:
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000019D46DB8750> >

In [23]:
vector_store=FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [24]:
vector_store.add_texts(["AI is future","AI is powerful","Dogs are cute"])

['c4406cf2-d2e9-44f1-8915-80a544cb3534',
 'bdef99e5-2e54-42a9-8c3c-d65a208fd1bb',
 '7c4944b1-6ca8-4a87-99c8-8eaf1bce9f4f']

In [25]:
vector_store.index_to_docstore_id

{0: 'c4406cf2-d2e9-44f1-8915-80a544cb3534',
 1: 'bdef99e5-2e54-42a9-8c3c-d65a208fd1bb',
 2: '7c4944b1-6ca8-4a87-99c8-8eaf1bce9f4f'}

In [26]:
results = vector_store.similarity_search("Tell me about AI", k=3)

In [27]:
results

[Document(id='c4406cf2-d2e9-44f1-8915-80a544cb3534', metadata={}, page_content='AI is future'),
 Document(id='bdef99e5-2e54-42a9-8c3c-d65a208fd1bb', metadata={}, page_content='AI is powerful'),
 Document(id='7c4944b1-6ca8-4a87-99c8-8eaf1bce9f4f', metadata={}, page_content='Dogs are cute')]


| Feature               | `Flat`                | `IVF` (Inverted File Index)        | `HNSW` (Graph-based Index)          |
| --------------------- | --------------------- | ---------------------------------- | ----------------------------------- |
| Type of Search     | Exact                 | Approximate (cluster-based)        | Approximate (graph-based traversal) |
| Speed               | Slow (linear scan)    | Fast (search only in top clusters) | Very Fast (graph walk)              |



| Dataset Size              | Recommended Index                 |
| ------------------------- | --------------------------------- |
| UPTO 1L                     | `IndexFlatL2` or `IndexFlatIP`    |
| UPTO 1M                  | `IndexIVFFlat` or `IndexHNSWFlat` |
| > 1M                      | `IndexIVFPQ` or `IndexHNSWFlat`   |

In [28]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('IPL_Season_Summary_2008_2025.pdf')
pdf_data = loader.load()
pdf_data

[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20250607130752', 'source': 'IPL_Season_Summary_2008_2025.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='IPL Season Summary (2008 - 2025)\nIPL 2008 Summary\nWinner: Rajasthan Royals\nRunner-Up: Chennai Super Kings\nFinal Venue: DY Patil Stadium, Navi Mumbai\nSummary: Shane Warne led Rajasthan to a fairytale win.\nPage 1'),
 Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20250607130752', 'source': 'IPL_Season_Summary_2008_2025.pdf', 'total_pages': 18, 'page': 1, 'page_label': '2'}, page_content='IPL Season Summary (2008 - 2025)\nIPL 2009 Summary\nWinner: Deccan Chargers\nRunner-Up: RCB\nFinal Venue: Wanderers, Johannesburg\nSummary: Tournament held in South Africa due to elections.\nPage 2'),
 Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'P

In [29]:
index=faiss.IndexFlatIP(3072)
vector_store=FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [30]:
vector_store.add_documents(pdf_data)

['763272bf-cd32-4f1b-8e77-96a831490d53',
 '2c2a527a-39ae-4c43-b356-fa38a30be4a3',
 '3b8ff115-e861-4819-a030-9a575652bae7',
 '5abc41f6-a43f-4779-b323-ffbcaed8fdf3',
 'bdd99bb8-51eb-49b9-ab7d-df7078f0072b',
 '3f8ab664-8a70-4f69-b4ac-cc030d703748',
 'ee245eae-859e-446b-ad53-f62828e141bc',
 'e6a1c09a-86f6-4afc-a2e7-b538d7d1e45e',
 '7347dc42-b3e5-4f70-ba0e-c4901c2abda9',
 '6e1ef39a-b171-41cc-8910-a09f9cabe25e',
 'ccd6d324-bbab-4472-a1be-936a2e4552bf',
 '6809b495-8e5b-49e3-87af-f2624f77458a',
 '19ca5f08-d23a-4d91-ab58-110e2c9a91c4',
 'ed3b2434-6a1d-42df-aee9-24a0072bf328',
 'a3f2ec50-f68a-40ed-aa28-f5684884611d',
 'c15a9a02-4acb-4136-adb3-0acb8ae562fb',
 '7cc5020a-88f8-4185-b6c3-149cca42ca16',
 '0e8c9496-950b-41f1-b761-4e6eea8a0186']

In [33]:
answer = vector_store.similarity_search(
    "2016 me ipl kon jeeta tha",
    k=2 #hyperparameter
    
)

In [34]:
answer[0].page_content

"IPL Season Summary (2008 - 2025)\nIPL 2016 Summary\nWinner: Sunrisers Hyderabad\nRunner-Up: RCB\nFinal Venue: Bengaluru\nSummary: Warner's SRH triumphed over Kohli's RCB.\nPage 9"

NOW TRY TO MAKE STRUCTURED ANSWER

In [44]:
from langchain_openai import ChatOpenAI
model=ChatOpenAI(model = "o1-mini")

In [35]:
retriever=vector_store.as_retriever(
    search_kwargs={"k": 2} #hyperparameter
)

In [36]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [37]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [38]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [43]:
answer = vector_store.similarity_search(
    "2016 me ipl kon jeeta tha",
    k=2 #hyperparameter
    
)
answer[0].page_content

"IPL Season Summary (2008 - 2025)\nIPL 2016 Summary\nWinner: Sunrisers Hyderabad\nRunner-Up: RCB\nFinal Venue: Bengaluru\nSummary: Warner's SRH triumphed over Kohli's RCB.\nPage 9"

In [40]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [41]:
rag_chain.invoke("2016 me ipl kon jeeta tha")

"Sunrisers Hyderabad won the IPL in 2016. They defeated Royal Challengers Bangalore in the final held in Bengaluru. This victory marked SRH's first IPL title."