In [3]:
# pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph

In [2]:
# You may need additional installs:
# ! pip install langchain langchain-groq langchain-ollama langchain-chroma PyPDF2
import os
from getpass import getpass
from dotenv import load_dotenv

# Load the environment variables from .env
load_dotenv()

# --- LangSmith Settings (optional) ---
# Never paste a real API key into the notebook: put LANGCHAIN_API_KEY in the
# .env file read by load_dotenv() above instead.
# NOTE(review): an earlier revision of this cell contained what looks like a
# real LangSmith key; treat it as leaked and rotate it.
#os.environ['LANGCHAIN_TRACING_V2'] = 'true'
#os.environ['LANGSMITH_ENDPOINT'] = 'https://api.smith.langchain.com'
#os.environ['LANGSMITH_PROJECT'] = 'multimodal_RAG'
#os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
#os.environ['LANGCHAIN_API_KEY'] = "<your-langsmith-api-key>"


True

In [3]:
# --- Groq credentials ---
# Ask for the key interactively only when GROQ_API_KEY is missing or empty
# in the environment; an existing non-empty value is left untouched.
if os.environ.get("GROQ_API_KEY", "") == "":
    os.environ["GROQ_API_KEY"] = getpass("Enter API key for Groq: ")



In [4]:
# --- Import necessary modules ---
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
# PDF loader from LangChain
from langchain.document_loaders import PyPDFLoader
# Groq LLM and Ollama embeddings
from langchain_groq import ChatGroq
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableMap, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [3]:
#### 1) Load PDF Documents ####
# Point PDF_PATH at the PDF you want to query.
PDF_PATH = "data/Lectures/Knn and Prob.pdf"
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()


In [4]:
#### 2) Split the text into chunks ####
# Chunking parameters are named so they are easy to find and tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(docs)


In [5]:
#### 3) Create Embeddings + Vector Store ####
# Name of the Ollama model used to embed the chunks.
EMBEDDING_MODEL = "llama3.2"
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Index every chunk in Chroma, then expose the store through a retriever.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
)
retriever = vectorstore.as_retriever()



In [6]:
# 4) Helper that flattens retrieved docs into one context string
def format_docs(docs):
    """Join the ``page_content`` of each document with a blank line between them.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the page contents separated by ``"\\n\\n"``;
        the empty string when ``docs`` yields nothing.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

# Wrap the plain function so it can be composed with other runnables via `|`.
format_docs_runnable = RunnableLambda(format_docs)


In [7]:
# 5) Construct the Prompt Template (a Runnable)

# The raw template text lives in its own constant. It contains two
# placeholders, {context} and {question}.
RAG_PROMPT_TEXT = """You are a helpful assistant.
You are given the following context:
{context}

Please use this context to answer the question below as completely as possible.

Question: {question}"""

# Build a ChatPromptTemplate from the text rather than using the bare string.
prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT_TEXT)

In [8]:
#### 6) Create your LLM ####
# The previous model ID, "llama3-8b-8192", has been retired by Groq, so
# requests that name it are rejected. "llama-3.1-8b-instant" is the
# replacement Groq recommends for it.
# NOTE(review): confirm the current model list on Groq's deprecations page.
GROQ_MODEL = "llama-3.1-8b-instant"
llm = ChatGroq(model=GROQ_MODEL)



In [9]:
#### 7) Build the RAG chain ####
# Stage 1: send the incoming question to the retriever and flatten the
# retrieved documents into one text block.
context_pipeline = retriever | format_docs_runnable

# Stage 2: fan the input out into the two variables the prompt expects;
# "question" is forwarded unchanged.
prompt_inputs = RunnableMap({
    "context": context_pipeline,
    "question": RunnablePassthrough(),
})

# Stage 3: fill the prompt, call the LLM, and parse its output as a string.
rag_chain = prompt_inputs | prompt_template | llm | StrOutputParser()


In [10]:
#### 8) Ask a question about the PDF ####
# Run the whole chain once for a single question and show the text answer.
question = "What is the main takeaway from the PDF?"
answer = rag_chain.invoke(question)
print(answer)


Based on the provided context, the main takeaway from the PDF is that the k-Nearest Neighbors (kNN) algorithm can be sensitive to the choice of the parameter k, which determines the number of nearest neighbors to consider when making a classification decision.

The PDF highlights that:

* A small value of k (e.g., k=1) can result in a complex decision boundary that may lead to overfitting.
* Increasing k (e.g., k=15) can improve the decision boundary by reducing the impact of noise and outliers, but may also increase the risk of underfitting if k is too large.

The PDF also illustrates how the choice of k can affect the classification decision in a specific example, showing how changing k from 3 to 5 can change the classification outcome from Red to Green.

Overall, the main takeaway is that the choice of k is crucial in kNN and requires careful consideration to achieve good classification performance.
