In [1]:
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai chromadb bs4

Note: you may need to restart the kernel to use updated packages.


In [7]:
import os

In [4]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [10]:
from langchain_openai import ChatOpenAI

# Chat model used by the RAG chains below. The API key is read from the
# environment, so a missing OPENAI_API_KEY raises KeyError here.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [11]:
# Load, chunk and index the contents of the blog.
# The SoupStrainer limits parsing to the post title, header and body elements.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Give this index its own collection. Without `collection_name`, Chroma falls
# back to the default "langchain" collection, which the in-memory client shares
# across every `Chroma.from_documents` call in the same kernel, so the PDF
# index built later in this notebook would be mixed with these blog chunks.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    collection_name="agent_blog",
)

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join the `page_content` of every document into one string.

    Consecutive documents are separated by a blank line; an empty
    iterable yields an empty string.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


# RAG chain: the incoming question is passed through unchanged and is also
# sent to the retriever, whose documents are flattened into the prompt context.
rag_chain = (
    dict(
        context=retriever | format_docs,
        question=RunnablePassthrough(),
    )
    | prompt            # fill the prompt with context + question
    | llm               # generate the answer
    | StrOutputParser() # unwrap the model message into a plain string
)

In [12]:
# Ask the chain a sample question; the returned string is the cell output.
rag_chain.invoke(
    "What is Task Decomposition?"
)

'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps, making them more manageable. It involves transforming big tasks into multiple smaller tasks that are easier to handle and interpret. This process can be facilitated by prompting techniques like Chain of Thought or Tree of Thoughts.'

In [14]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts.prompt import PromptTemplate

In [15]:
# Load the PDF (including text extracted from embedded images), split it into
# chunks and index the chunks for retrieval.
file_path = "./Doc-3.pdf"
loader = PyPDFLoader(file_path, extract_images=True)

docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    keep_separator=False,
    separators=["\n\n", "\n", ""],
)

splits = text_splitter.split_documents(docs)

# Index the PDF in its own collection. Without `collection_name`, Chroma uses
# the default "langchain" collection, which the in-memory client shares across
# every `Chroma.from_documents` call in the same kernel. The blog index created
# earlier in this notebook lives there, so top-k retrieval over the PDF would
# otherwise return a mix of blog and PDF chunks.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    collection_name="doc3_pdf",
)

# Retrieve the 10 most similar chunks for each question.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. Please provide answer as per the given context. If the answer is not in the context please simply state I don't know and don't try to make up an answer.
      Question: {question}
      Context: {context}
      Answer:
      """

# Prompt with two slots: the retrieved context and the user's question.
prompt_template = PromptTemplate(
    template=template,
    input_variables=[
        "context",
        "question",
    ],
)

def format_docs(docs):
    """Join the `page_content` of every document into one string.

    Consecutive documents are separated by a blank line; an empty
    iterable yields an empty string.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


# RAG chain over the custom prompt: the question is passed through unchanged
# and is also sent to the retriever, whose documents become the context.
rag_chain = (
    dict(
        context=retriever | format_docs,
        question=RunnablePassthrough(),
    )
    | prompt_template   # fill the custom template with context + question
    | llm               # generate the answer
    | StrOutputParser() # unwrap the model message into a plain string
)

In [16]:
# Ask the chain to describe the project deliverables found in the retrieved
# context; the returned string is the cell output.
rag_chain.invoke(
    "Please extract comprehensive information about the software project "
    "deliverables as outlined in the context. Focus on identifying and "
    "describing each deliverable in detail."
)

"I don't know"

In [17]:
pip list

Package                                  Version
---------------------------------------- ---------------
aiohttp                                  3.9.3
aiosignal                                1.3.1
annotated-types                          0.6.0
anyio                                    4.3.0
appnope                                  0.1.4
argon2-cffi                              23.1.0
argon2-cffi-bindings                     21.2.0
arrow                                    1.3.0
asgiref                                  3.8.1
asttokens                                2.4.1
async-lru                                2.0.4
attrs                                    23.2.0
Babel                                    2.14.0
backoff                                  2.2.1
bcrypt                                   4.1.2
beautifulsoup4                           4.12.3
bleach                                   6.1.0
bs4                                      0.0.2
build                                    1.