In [1]:
! pip install -qU langchain langchain-core langchain-community langchain-openai

In [2]:
! pip install -qU qdrant-client protobuf==4.25.3

In [3]:
! pip install -qU tiktoken pymupdf

In [4]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [5]:
from langchain_openai import ChatOpenAI

openai_chat_model = ChatOpenAI(model="gpt-4o")

In [7]:
from langchain.document_loaders import PyMuPDFLoader

docs = PyMuPDFLoader("data/United states securities and exchange commission 10-Q.pdf").load()

In [8]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(
        text,
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 0,
    length_function = tiktoken_len,
)

split_chunks = text_splitter.split_documents(docs)

In [9]:
len(split_chunks)

229

In [10]:
max_chunk_length = 0

for chunk in split_chunks:
  max_chunk_length = max(max_chunk_length, tiktoken_len(chunk.page_content))

print(max_chunk_length)

199


In [11]:
from langchain_openai.embeddings import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [12]:
from langchain_community.vectorstores import Qdrant

qdrant_vectorstore = Qdrant.from_documents(
    split_chunks,
    embedding_model,
    location=":memory:",
    collection_name="Airbnb",
)

In [13]:
qdrant_retriever = qdrant_vectorstore.as_retriever()

In [14]:

from langchain_core.prompts import ChatPromptTemplate

system_template = "You are an expert in financial statements. Use the provided context to answer the user's query. You may not answer the user's query unless there is specific context in the following text. If you do not know the answer, or cannot answer, please respond with I don't know."
human_template = "{content}"

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_template),
    ("human", human_template)
])



RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

Never use the phrase "based on provided context"
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [15]:
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
)

In [16]:
response = retrieval_augmented_qa_chain.invoke({"question" : "What is the 'maximum number of shares to be sold under the 10b5-1 Trading plan' by Brian Chesky?"})

In [17]:
response["response"].content

'The maximum number of shares to be sold under the 10b5-1 Trading Plan by Brian Chesky is 1,146,000.'

In [18]:
for context in response["context"]:
  print("Context:")
  print(context)
  print("----")

Context:
page_content='Table of Contents\nItem 5. Other Information\nDirector and Officer 10b5-1 Trading Plans (“10b5-1 Plans”)\nThe following table sets forth the material terms of 10b5-1 Plans intended to satisfy the affirmative defense conditions of Rule 10b5–1(c) that were adopted, terminated, or modified\nby our directors and officers during the three months ended March\xa031, 2024:\nName and Title of Director or Officer\nAction\n\xa0Date\nExpiration Date\nMaximum Number of\nShares to be Sold\nUnder the Plan\nDavid Bernstein, Chief Accounting Officer\nAdopt\n2/22/2024\n1/27/2025\n41,000\nBrian Chesky, Chief Executive Officer and Director\nAdopt\n2/28/2024\n11/11/2024\n1,146,000\nJoseph Gebbia, Director\nAdopt\n2/29/2024' metadata={'source': 'data/United states securities and exchange commission 10-Q.pdf', 'file_path': 'data/United states securities and exchange commission 10-Q.pdf', 'page': 32, 'total_pages': 54, 'format': 'PDF 1.4', 'title': '0001559720-24-000017', 'author': 'EDG

In [21]:
response = retrieval_augmented_qa_chain.invoke({"question" : "How are you doing?"})

In [22]:
response

{'response': AIMessage(content="I'm doing well, thank you! How can I assist you today?", response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 1824, 'total_tokens': 1838}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_3e7d703517', 'finish_reason': 'stop', 'logprobs': None}, id='run-ba78e693-adb4-45cc-95b9-c7e02c040f81-0', usage_metadata={'input_tokens': 1824, 'output_tokens': 14, 'total_tokens': 1838}),
 'context': [Document(page_content='Table of Contents\nItem 3. Quantitative and Qualitative Disclosures About Market Risk\nThere have been no material changes in our market risk during the three months ended March 31, 2024. For additional information, see Part II, Item 7A. "Quantitative and\nQualitative Disclosures About Market Risk" in our Annual Report on Form 10-K for the year ended December 31, 2023.\nItem 4. Controls and Procedures\nEvaluation of Disclosure Controls and Procedures\nOur management, with the participation of our principal executi