In [20]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

To load the data from a web page

In [21]:
# Web page used as the knowledge source for the Q&A chain.
URL = "https://ollama.com/library/nomic-embed-text"
loader = WebBaseLoader(web_path=URL)


To load the data from a PDF for the chatbot

In [22]:
#loader = PyPDFLoader("data/handbook.pdf")

In [23]:
# Use `load()` rather than `load_and_split()`: the latter runs its own default
# text splitter before returning (and is marked as deprecated in LangChain), so
# the number printed here would not necessarily be a page count and the text
# would be split twice. Chunking is done once, explicitly, by the
# RecursiveCharacterTextSplitter cell that follows.
pages = loader.load()
print(f"Number of pages: {len(pages)}")

Number of pages: 1


In [24]:
# Cut the loaded documents into chunks of size 200 with an overlap of 50,
# then display how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(pages)
len(chunks)

24

In [25]:
# Embedding client for the Ollama model "embeddinggemma"; it is passed to the
# vector store to embed the chunks.
# NOTE(review): presumably this model must already be available in the local
# Ollama instance — confirm.
embeddings = OllamaEmbeddings(model="embeddinggemma")

In [26]:
# Build a Chroma vector store from the chunks, embedding them with `embeddings`.
# NOTE(review): no collection name or persist directory is given; confirm
# whether re-running this cell in the same kernel adds the same chunks again.
vector_store = Chroma.from_documents(documents=chunks, embedding=embeddings)

In [27]:
# Expose the vector store as a retriever (no search options are overridden);
# it is the first step of the "context" branch of the chain.
retriever = vector_store.as_retriever()

In [28]:
def format_docs(docs):
    """Join the ``page_content`` of every document into a single string.

    Documents are kept in their given order and separated by a blank line.
    """
    contents = (doc.page_content for doc in docs)
    return "\n\n".join(contents)

In [29]:
# Ollama-served model "llama3.2"; used as the LLM step of the chain.
llm = OllamaLLM(model="llama3.2")


In [30]:
# Prompt for the Q&A chain. `{question}` and `{context}` are the two
# placeholders filled in when the chain runs. The string is passed to
# PromptTemplate.from_template exactly as written, including the leading
# indentation inside the triple-quoted literal.
template="""SYSTEM: You are a QnA bot.
            Be factual and concise in your answers.
            Respond to the following question: {question} only from 
            the below context: {context}.
            If you don't know the answer, just say that you don't know,
            don't try to make up an answer.
            """
prompt = PromptTemplate.from_template(template)

In [31]:
# RAG pipeline. The input question takes two branches:
#   "context":  retriever results merged into one string by format_docs
#   "question": the input passed through as-is
# Both values fill the prompt, which is sent to the LLM; the LLM result then
# goes through StrOutputParser.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [32]:
#for pdf file
#chain.invoke("What is work from home policy?")

In [None]:
# Question for the web-page source (the nomic-embed-text library page).
chain.invoke("what is the size of nomic-embed-text?")

'The size of nomic-embed-text is 274MB.'