In [24]:
import os
from dotenv import load_dotenv

# Read key/value pairs from a local .env file (if present) into os.environ.
load_dotenv()

# The previous `os.environ[k] = os.getenv(k)` was a no-op when the key was set
# and raised an opaque "TypeError: str expected, not NoneType" when it was not.
# Fail early with an actionable message instead; the OpenAI embeddings used to
# build the vector store need this key.
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [1]:
# PDF Reader
from langchain_community.document_loaders import PyPDFLoader

# Location of the source book, relative to the notebook's working directory.
pdf_path = '../data/Natural Language Processing with Python.pdf'

# The recorded preview of `docs` shows one Document per PDF page, each with
# `source` and `page` metadata.
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [5]:
# Preview the first 20 loaded page Documents. In the recorded output several
# of the leading pages have an empty page_content.
docs[:20]

[Document(page_content='', metadata={'source': '../data/Natural Language Processing with Python.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': '../data/Natural Language Processing with Python.pdf', 'page': 1}),
 Document(page_content='Natural Language Processing with Python', metadata={'source': '../data/Natural Language Processing with Python.pdf', 'page': 2}),
 Document(page_content='', metadata={'source': '../data/Natural Language Processing with Python.pdf', 'page': 3}),
 Document(page_content='Natural Language Processing\nwith Python\nSteven Bird, Ewan Klein, and Edward Loper\nBeijing •Cambridge •Farnham •Köln •Sebastopol •Taipei •Tokyo', metadata={'source': '../data/Natural Language Processing with Python.pdf', 'page': 4}),
 Document(page_content='Natural Language Processing with Python\nby Steven Bird, Ewan Klein, and Edward Loper\nCopyright © 2009 Steven Bird, Ewan Klein, and Edward Loper. All rights reserved.\nPrinted in the United States of America.\nPublis

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters; sizes are measured with `len`, i.e. in characters.
splitter_options = dict(chunk_size=1000, chunk_overlap=200, length_function=len)

splitter = RecursiveCharacterTextSplitter(**splitter_options)
doc_chunks = splitter.split_documents(docs)

In [7]:
# Preview the first 20 chunks produced by the splitter. In the recorded output
# the first chunks come from pages 2 and 4; the empty pages seen in `docs` do
# not show up here.
doc_chunks[:20]

[Document(page_content='Natural Language Processing with Python', metadata={'source': '../data/Natural Language Processing with Python.pdf', 'page': 2}),
 Document(page_content='Natural Language Processing\nwith Python\nSteven Bird, Ewan Klein, and Edward Loper\nBeijing •Cambridge •Farnham •Köln •Sebastopol •Taipei •Tokyo', metadata={'source': '../data/Natural Language Processing with Python.pdf', 'page': 4}),
 Document(page_content='Natural Language Processing with Python\nby Steven Bird, Ewan Klein, and Edward Loper\nCopyright © 2009 Steven Bird, Ewan Klein, and Edward Loper. All rights reserved.\nPrinted in the United States of America.\nPublished by O’Reilly Media, Inc., 1005 Gravenstein Highway North, Sebastopol, CA 95472.\nO’Reilly \nbooks may be purchased for educational, business, or sales promotional use. Online editions\nare also available for most titles ( http://my.safaribooksonline.com). For more information, contact our\ncorporate/institutional sales department: (800) 998

In [8]:
## FAISS vector database
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Only the first 30 chunks are embedded and indexed, so retrieval can only
# surface text from the opening part of the book.
# NOTE(review): the recorded run of this cell printed a deprecation warning,
# presumably for one of the langchain_community imports above; confirm which.
indexed_chunks = doc_chunks[:30]
embedding_model = OpenAIEmbeddings()
db = FAISS.from_documents(indexed_chunks, embedding_model)

  warn_deprecated(


In [9]:
## Vector database
# Sanity-check the index directly, without any LLM involved: run a similarity
# search for a sample question and keep the returned Documents in `result`.
query="What is NLTK?"
result = db.similarity_search(query)

In [10]:
# Display the Documents returned by the similarity search above.
result

[Document(page_content='NLTK-Data\nThis contains the linguistic corpora that are analyzed and processed in the book.\nNumPy (recommended)\nThis \nis a scientific computing library with support for multidimensional arrays and\nlinear algebra, required for certain probability, tagging, clustering, and classifica-\ntion tasks.\nMatplotlib (recommended)\nThis is a 2D plotting library for data visualization, and is used in some of the book’s\ncode samples that produce line graphs and bar charts.\nNetworkX (optional)\nThis is a library for storing and manipulating network structures consisting of\nnodes and edges. For visualizing semantic networks, also install the Graphviz\nlibrary.\nProver9 (optional)\nThis is an automated theorem prover for first-order and equational logic, used to\nsupport inference in language processing.\nNatural Language Toolkit (NLTK)\nNLTK was originally created in 2001 as part of a computational linguistics course in\nthe Department of Computer and Information Scie

In [11]:
## Chain and Retriever
from langchain_community.llms import Ollama
## Load the Ollama "llama2" LLM model
# NOTE(review): `llm` is rebound to a ChatGroq model by a later cell in this
# notebook, and the recorded chain reprs show ChatGroq, so this Ollama
# instance appears to be unused downstream. Confirm which model is intended.
llm=Ollama(model="llama2")
llm

Ollama()

In [25]:
from langchain_groq import ChatGroq

# Read the Groq credential from the environment; a missing key raises KeyError.
# `os` comes from the environment-setup cell.
groq_api_key = os.environ['GROQ_API_KEY']

# Chat model used by the chains built in the following cells.
llm = ChatGroq(
    model_name="llama3-8b-8192",
    groq_api_key=groq_api_key,
)

In [12]:
## Design ChatPrompt Template
from langchain_core.prompts import ChatPromptTemplate

# Template text, spelled out line by line. {context} receives the retrieved
# documents and {input} receives the user's question.
qa_template = (
    "\n"
    "Answer the following question based only on the provided context.\n"
    "Think step by step before providing a detailed answer.\n"
    "<context>\n"
    "{context}\n"
    "</context>\n"
    "Question: {input}\n"
)
prompt = ChatPromptTemplate.from_template(qa_template)

In [26]:
## Chain Introduction
## Create Stuff Document Chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Combine the LLM and the prompt into a chain that formats the supplied
# documents into the prompt's {context} slot before calling the model.
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [27]:
"""
Retrievers: A retriever is an interface that returns documents given
 an unstructured query. It is more general than a vector store.
 A retriever does not need to be able to store documents, only to 
 return (or retrieve) them. Vector stores can be used as the backbone
 of a retriever, but there are other types of retrievers as well. 
 https://python.langchain.com/docs/modules/data_connection/retrievers/   
"""

# Expose the FAISS vector store through the retriever interface; the recorded
# repr shows a VectorStoreRetriever wrapping the FAISS object.
retriever=db.as_retriever()
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x10c5b1c50>)

In [28]:
"""
Retrieval chain: This chain takes in a user inquiry, which is then
passed to the retriever to fetch relevant documents. Those documents
(and original inputs) are then passed to an LLM to generate a response.
https://python.langchain.com/docs/modules/chains/
"""
from langchain.chains import create_retrieval_chain

# Retrieval step feeding the stuff-documents chain: the retriever supplies the
# documents, and `document_chain` turns them plus the question into an answer.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [29]:
# Display the composed runnable to inspect how retriever, prompt and LLM are wired.
retrieval_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x10c5b1c50>), config={'run_name': 'retrieve_documents'})
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), config={'run_name': 'format_inputs'})
            | ChatPromptTemplate(input_variables=['context', 'input'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input'], template='\nAnswer the following question based only on the provided context.\nThink step by step before providing a detailed answer.\n<context>\n{context}\n</context>\nQuestion: {input}\n'))])
            | ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x117397450>, async_client=

In [30]:
# Run the retrieve-then-answer chain once and display the returned dict; the
# recorded output shows it carrying the 'input' and 'context' entries.
retrieval_chain.invoke({"input":"What is NLTK?"})

{'input': 'What is NLTK?',
 'context': [Document(page_content='NLTK-Data\nThis contains the linguistic corpora that are analyzed and processed in the book.\nNumPy (recommended)\nThis \nis a scientific computing library with support for multidimensional arrays and\nlinear algebra, required for certain probability, tagging, clustering, and classifica-\ntion tasks.\nMatplotlib (recommended)\nThis is a 2D plotting library for data visualization, and is used in some of the book’s\ncode samples that produce line graphs and bar charts.\nNetworkX (optional)\nThis is a library for storing and manipulating network structures consisting of\nnodes and edges. For visualizing semantic networks, also install the Graphviz\nlibrary.\nProver9 (optional)\nThis is an automated theorem prover for first-order and equational logic, used to\nsupport inference in language processing.\nNatural Language Toolkit (NLTK)\nNLTK was originally created in 2001 as part of a computational linguistics course in\nthe Depa

In [31]:
# Invoke the chain again, this time keeping the result in `res` for inspection.
# NOTE(review): this repeats the previous cell's query, so the retriever and
# the LLM are called a second time for the same question.
res = retrieval_chain.invoke({"input":"What is NLTK?"})

In [35]:
# Text of the first Document in the 'context' list returned by the chain.
res['context'][0].page_content

'NLTK-Data\nThis contains the linguistic corpora that are analyzed and processed in the book.\nNumPy (recommended)\nThis \nis a scientific computing library with support for multidimensional arrays and\nlinear algebra, required for certain probability, tagging, clustering, and classifica-\ntion tasks.\nMatplotlib (recommended)\nThis is a 2D plotting library for data visualization, and is used in some of the book’s\ncode samples that produce line graphs and bar charts.\nNetworkX (optional)\nThis is a library for storing and manipulating network structures consisting of\nnodes and edges. For visualizing semantic networks, also install the Graphviz\nlibrary.\nProver9 (optional)\nThis is an automated theorem prover for first-order and equational logic, used to\nsupport inference in language processing.\nNatural Language Toolkit (NLTK)\nNLTK was originally created in 2001 as part of a computational linguistics course in\nthe Department of Computer and Information Science at the University o

In [36]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain,create_history_aware_retriever


def _history_chat_prompt(system_text):
    """Build a chat prompt: system message, prior chat history, then the user input."""
    return ChatPromptTemplate.from_messages(
        [
            ("system", system_text),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )


### Contextualize question ###
# Instruction for the rewrite step: turn a follow-up question into a
# standalone one so that retrieval does not depend on the earlier turns.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = _history_chat_prompt(contextualize_q_system_prompt)

# Retriever that consults the LLM to rewrite the question when chat history is
# present (the recorded repr shows a branch on `chat_history`).
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

### Answer question ###
# NOTE(review): this instruction asks for "a detailed answer" and also for
# "three sentences maximum"; the two requests pull in opposite directions.
# Confirm which behavior is intended.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Think step by step before providing a detailed answer. "
    "Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
qa_prompt = _history_chat_prompt(system_prompt)

# Answering chain fed by the history-aware retriever.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [38]:
# Display the composed history-aware RAG runnable for inspection.
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableBranch(branches=[(RunnableLambda(lambda x: not x.get('chat_history', False)), RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x10c5b1c50>))], default=ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without

In [39]:
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory registry of chat histories, keyed by session id. It lives only as
# long as the kernel does.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying the conversation in ``store``.

    Returns:
        The history object stored under ``session_id``. A fresh
        ``ChatMessageHistory`` is created and registered if none exists yet.
    """
    try:
        return store[session_id]
    except KeyError:
        fresh_history = ChatMessageHistory()
        store[session_id] = fresh_history
        return fresh_history

# Wrap the RAG chain with per-session chat history. The key names below tell
# the wrapper which entries hold the user input, the history and the answer.
history_keys = dict(
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    **history_keys,
)

In [40]:
# The session id selects the chat history to use; the first call with
# "abc123" constructs that key in `store`.
session_config = {"configurable": {"session_id": "abc123"}}

conversational_rag_chain.invoke({"input": "What is NLTK?"}, config=session_config)

{'input': 'What is NLTK?',
 'chat_history': [],
 'context': [Document(page_content='NLTK-Data\nThis contains the linguistic corpora that are analyzed and processed in the book.\nNumPy (recommended)\nThis \nis a scientific computing library with support for multidimensional arrays and\nlinear algebra, required for certain probability, tagging, clustering, and classifica-\ntion tasks.\nMatplotlib (recommended)\nThis is a 2D plotting library for data visualization, and is used in some of the book’s\ncode samples that produce line graphs and bar charts.\nNetworkX (optional)\nThis is a library for storing and manipulating network structures consisting of\nnodes and edges. For visualizing semantic networks, also install the Graphviz\nlibrary.\nProver9 (optional)\nThis is an automated theorem prover for first-order and equational logic, used to\nsupport inference in language processing.\nNatural Language Toolkit (NLTK)\nNLTK was originally created in 2001 as part of a computational linguistic