In [3]:
from langchain_community.document_loaders import WebBaseLoader

In [6]:
# Configure a web loader for the Wikipedia article that serves as the
# knowledge source of this notebook. Nothing is fetched until `.load()` is called.
loader = WebBaseLoader(
    web_path="https://en.wikipedia.org/wiki/Werner_Heisenberg",
)
loader

<langchain_community.document_loaders.web_base.WebBaseLoader at 0x110630cd0>

In [8]:
# Fetch the page and parse it into a list of Document objects
# (each carries `metadata` and the extracted `page_content`).
docs = loader.load()
# NOTE(review): displaying `docs` prints the full page text, navigation
# boilerplate included; consider showing `docs[0].metadata` or a short
# slice of `docs[0].page_content` instead.
docs

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Werner_Heisenberg', 'title': 'Werner Heisenberg - Wikipedia', 'language': 'en'}, page_content='\n\n\n\nWerner Heisenberg - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\

In [None]:
# RAG pipeline overview:
# load data -> documents -> chunks -> embeddings (vectors) -> vector store
#           -> retrieval -> LLM -> answer

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters: `chunk_size` is the upper bound on characters per chunk,
# `chunk_overlap` is how many characters neighbouring chunks may share so that
# text cut at a chunk boundary still appears intact in one of them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Break the loaded page into chunk-sized Document objects.
documents = text_splitter.split_documents(documents=docs)

# Number of chunks produced (each is at most `chunk_size` characters long).
len(documents)

154

In [11]:
from langchain_ollama import OllamaEmbeddings

# Embedding client that talks to the Ollama server on localhost:11434.
# NOTE(review): gemma:2b is presumably a text-generation model; a dedicated
# embedding model may give better retrieval quality -- confirm before reuse.
embeddings = OllamaEmbeddings(
    model="gemma:2b",
    base_url="http://localhost:11434",
)



In [12]:
from langchain_community.vectorstores import FAISS

# Embed every chunk with the Ollama embedding client and index the
# resulting vectors in a FAISS store.
vectordb = FAISS.from_documents(
    documents=documents,
    embedding=embeddings,
)

In [13]:
# Question used to probe the vector store.
query = "Who is the person mentioned in the documents?"

# Retrieve the chunks whose embeddings are closest to the embedded question.
result = vectordb.similarity_search(query=query)

# Text of the top-ranked chunk.
result[0].page_content

"Heisenberg was the target of an assassination by spy Moe Berg in the film The Catcher Was a Spy, based on real events. Heisenberg is also credited with building the atomic bomb used by the Axis in the Amazon TV series adaptation of the novel The Man in the High Castle by Philip K. Dick. Atomic bombs in this universe are referred to as Heisenberg Devices.\nThe 2015 TV film Kampen om Tongtvannet (The Heavy Water War: Stopping Hitler's Atomic Bomb) [187] directed by Per-Olav Sørensen, extensively features Werner Heisenberg and his career, including his nuclear research under the Nazis.   \nDaniel Craig portrayed Heisenberg in the 2002 film Copenhagen, an adaptation of Michael Frayn's play.[188] Matthias Schweighöfer portrayed Heisenberg in the 2023 biopic Oppenheimer.[189]\nHeisenberg is the namesake of Resident Evil Village secondary antagonist Karl Heisenberg. Heisenberg's research on ferromagnetism served as inspiration for the character's magnetic abilities."

In [14]:
from langchain_ollama import ChatOllama

# Chat model served by the same Ollama instance that provides the embeddings.
llm = ChatOllama(
    model="gemma:2b",
    base_url="http://localhost:11434",
)

In [23]:
# Document chain: formats the supplied documents into the prompt's {context}
# slot, sends the filled prompt to the LLM and returns the answer as a string.

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two placeholders: {input} is the user question, {context} is
# the text of the documents the answer must be based on.
prompt = ChatPromptTemplate.from_template(
    template="""
    Answer the following question based only on the provided context:
    <question>
        {input}
    </question>
    <context>
        {context}
    </context>
    """
)

document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Display the composed runnable to inspect its stages.
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, template='\n    Answer the following question based only on the provided context:\n    <question>\n        {input}\n    </question>\n    <context>\n        {context}\n    </context>\n    '), additional_kwargs={})])
| ChatOllama(model='gemma:2b', base_url='http://localhost:11434')
| StrOutputParser(), kwargs={}, config={'run_name': 'stuff_documents_chain'}, config_factories=[])

In [25]:
from langchain_core.documents import Document

# Run the document chain by hand with a single, manually supplied context
# document. `query` is reused from the similarity-search cell so the question
# answered here is exactly the one `result` was retrieved for; a second
# hard-coded copy of the string could silently drift out of sync.
print(
    document_chain.invoke(
        {
            "input": query,
            # Only the text of the top-ranked chunk is passed on; rebuilding
            # the Document from `page_content` alone leaves its metadata behind.
            "context": [Document(page_content=result[0].page_content)],
        }
    )
)

The person mentioned in the context is Werner Heisenberg. He is the namesake of Resident Evil Village secondary antagonist Karl Heisenberg. Heisenberg's research on ferromagnetism served as inspiration for the character's magnetic abilities.


In [17]:
# Show the text of the top-ranked retrieved chunk, i.e. the only context
# handed to the document chain in the manual invocation.
result[0].page_content

"Heisenberg was the target of an assassination by spy Moe Berg in the film The Catcher Was a Spy, based on real events. Heisenberg is also credited with building the atomic bomb used by the Axis in the Amazon TV series adaptation of the novel The Man in the High Castle by Philip K. Dick. Atomic bombs in this universe are referred to as Heisenberg Devices.\nThe 2015 TV film Kampen om Tongtvannet (The Heavy Water War: Stopping Hitler's Atomic Bomb) [187] directed by Per-Olav Sørensen, extensively features Werner Heisenberg and his career, including his nuclear research under the Nazis.   \nDaniel Craig portrayed Heisenberg in the 2002 film Copenhagen, an adaptation of Michael Frayn's play.[188] Matthias Schweighöfer portrayed Heisenberg in the 2023 biopic Oppenheimer.[189]\nHeisenberg is the namesake of Resident Evil Village secondary antagonist Karl Heisenberg. Heisenberg's research on ferromagnetism served as inspiration for the character's magnetic abilities."

Passing a hand-picked document works for a demonstration, but it is better practice to let the documents come from a retriever. The retriever dynamically selects the most relevant documents for a given question and passes them to the document chain as context.

In [26]:
from langchain.chains import create_retrieval_chain

# Retriever: expose the FAISS store through the retriever interface so that
# relevant chunks are looked up per question instead of being hand-picked.
retriever = vectordb.as_retriever()

# Retrieval chain: the retriever supplies the context documents and the
# document chain turns them, together with the question, into an answer.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)


In [27]:
# Ask the retrieval chain the same question that was used for the similarity
# search. `query` is reused rather than repeating the literal so all cells
# stay in sync. Only "input" is supplied; the context comes from the retriever.
response = retrieval_chain.invoke({"input": query})

# The generated text is stored under the "answer" key of the response.
response["answer"]

'The person mentioned in the context is Werner Heisenberg. He is the namesake of Resident Evil Village secondary antagonist Karl Heisenberg.'