In [14]:
from io import BytesIO
import os
import sys
import tempfile

from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.llm import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import streamlit as st

sys.path.append("../src")
from prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT

In [15]:
def get_pdf_text(pdf_docs: list[BytesIO]) -> str:
    """Extract the concatenated text of one or more PDFs.

    PyPDFLoader is constructed from a file path, so in-memory uploads
    (e.g. streamlit UploadedFile objects) are first written to a temporary
    ``.pdf`` file, which is removed again once its pages have been loaded.

    Args:
        pdf_docs (list[BytesIO]): PDFs to read. Each item may be a file-like
            object or a filesystem path (``str`` / ``os.PathLike``). A single
            path or file-like object is also accepted and is treated as a
            one-element list.

    Returns:
        str: Text of every page chunk of every PDF, joined in order without a
            separator. Empty string when ``pdf_docs`` is empty.
    """

    # A bare path would otherwise be iterated character by character, and a
    # bare buffer line by line, so wrap single items in a list first.
    if isinstance(pdf_docs, (str, os.PathLike)) or hasattr(pdf_docs, "read"):
        pdf_docs = [pdf_docs]

    page_texts = []

    for pdf in pdf_docs:
        if isinstance(pdf, (str, os.PathLike)):
            pages = PyPDFLoader(os.fspath(pdf)).load_and_split()
        else:
            # getvalue() returns the whole buffer regardless of the current
            # read position; fall back to read() for other file-like objects.
            data = pdf.getvalue() if hasattr(pdf, "getvalue") else pdf.read()

            # delete=False so the file can be reopened by the loader after
            # this handle is closed (required on Windows).
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(data)
                tmp_path = tmp.name

            try:
                pages = PyPDFLoader(tmp_path).load_and_split()
            finally:
                os.remove(tmp_path)

        page_texts.extend(page.page_content for page in pages)

    return "".join(page_texts)

def get_text_chunks(
    text_data: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> list[str]:
    """Split raw text into overlapping chunks

    Args:
        text_data (str): Raw text data
        chunk_size (int, optional): Maximum chunk length. Measured in
            characters because the splitter's length function is ``len``.
            Defaults to 1000.
        chunk_overlap (int, optional): Length shared between neighbouring
            chunks, in the same unit as ``chunk_size``. Defaults to 200.

    Returns:
        list[str]: List containing chunks of raw text input
    """

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    return splitter.split_text(text_data)

def get_vectorstore(chunks: list[str]) -> FAISS:
    """Instantiate Faiss vectorstore

    Args:
        chunks (list[str]): Text chunks to embed and index

    Raises:
        ValueError: If ``chunks`` is empty, since there would be nothing to
            embed and therefore nothing to build the index from.

    Returns:
        FAISS: Vectorstore object
    """

    # Fail early with a clear message instead of making an embeddings call
    # that cannot produce a usable index.
    if not chunks:
        raise ValueError("Cannot build a vectorstore from an empty list of chunks")

    # Uses text-embedding-ada-002 embedding model by default
    # ada-002 uses cl100k_base tokeniser and takes max 8191 input tokens
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)

    return vectorstore

def get_conversation_chain(vectorstore: FAISS) -> ConversationalRetrievalChain:
    """Build a retrieval-backed chat chain over the given vectorstore

    The chain uses a default ChatOpenAI model, retrieves context through the
    vectorstore's retriever, keeps the dialogue in a ConversationBufferMemory
    and returns the retrieved source documents together with the answer.

    Args:
        vectorstore (FAISS): FAISS vectorstore object to retrieve from

    Returns:
        ConversationalRetrievalChain: A conversation retrieval chain object
    """

    chat_model = ChatOpenAI()

    # output_key names which chain output the memory records; the chain
    # returns source documents alongside the answer.
    history = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        return_messages=True,
    )

    # Can consider changing 'chain_type' to ['map_reduce', 'refine', 'map_rerank']
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vectorstore.as_retriever(),
        memory=history,
        return_source_documents=True,
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
        combine_docs_chain_kwargs={"prompt": QA_PROMPT},
    )

In [16]:
# Load the annual report and concatenate the text of every page chunk
loader = PyPDFLoader("../data/crwd_annualreport_fy22.pdf")
pages = loader.load_and_split()
text_data = "".join(page.page_content for page in pages)

In [17]:
# Load environment variables from a .env file
# (presumably the OpenAI API key needed by the embeddings and chat model — confirm)
load_dotenv()

# Get pdf text data
# NOTE: text_data is taken from the previous cell, which loads the PDF
# directly; the get_pdf_text call below is currently disabled.
# raw_text = get_pdf_text("../data/crwd_annualreport_fy22.pdf")

# Convert text data into chunks
chunks = get_text_chunks(text_data)

# Create vector store
vectorstore = get_vectorstore(chunks)

# Create conversation chain; later cells query it through `conversation`
conversation = get_conversation_chain(vectorstore)

In [18]:
# Ask a single question against the indexed report.
# NOTE(review): the chain already carries a ConversationBufferMemory, so the
# explicit empty chat_history is presumably redundant — confirm.
result = conversation({"question": "what is crowdstrike's net profit for 2022?", "chat_history": []})

In [19]:
# Show the full response dict: question, chat_history, answer and source_documents
result

{'question': "what is crowdstrike's net profit for 2022?",
 'chat_history': [HumanMessage(content="what is crowdstrike's net profit for 2022?", additional_kwargs={}, example=False),
  AIMessage(content="Based on the provided information, CrowdStrike's net loss for 2022 is $232,378. Therefore, they did not have a net profit in 2022.", additional_kwargs={}, example=False)],
 'answer': "Based on the provided information, CrowdStrike's net loss for 2022 is $232,378. Therefore, they did not have a net profit in 2022.",
 'source_documents': [Document(page_content='Pacific, including Japan, and expanding current data centers overseas.\n•Extending Our Falcon Platform and Ecosystem. We designed our architecture to be open, interoperable, and highly\nextensible. We launched the CrowdStrike Store, the first open cloud-based application PaaS for cybersecurity, which\nallows customers to purchase CrowdStrike products and provides an ecosystem of trusted partners and applications for\nour customers 

In [20]:
# Check what kind of object the chain returns for each retrieved source
type(result["source_documents"][0])

langchain.schema.Document

In [34]:
# Keep the list of retrieved source documents for closer inspection
sample = result["source_documents"]

In [35]:
# Show every retrieved Document
sample

[Document(page_content='Pacific, including Japan, and expanding current data centers overseas.\n•Extending Our Falcon Platform and Ecosystem. We designed our architecture to be open, interoperable, and highly\nextensible. We launched the CrowdStrike Store, the first open cloud-based application PaaS for cybersecurity, which\nallows customers to purchase CrowdStrike products and provides an ecosystem of trusted partners and applications for\nour customers to choose from. We plan to continue investing in the CrowdStrike Store to empower our partners by\nmaking it easier to build applications and to enable our customers to more easily discover, try, and purchase additional\ncloud modules from both trusted partners and us.\n1127/06/2023, 21:47 crwd-20230131\nhttps://www .sec.gov/Ar chives/edgar/data/1535527/000153552723000008/crwd-20230131.htm 15/161Table of Contents\nWe have experienced significant growth, with revenue increasing from $1.5 billion in fiscal 2022 to $2.2\xa0billion in fisc

In [36]:
# Raw text of the first retrieved chunk
sample[0].page_content

'Pacific, including Japan, and expanding current data centers overseas.\n•Extending Our Falcon Platform and Ecosystem. We designed our architecture to be open, interoperable, and highly\nextensible. We launched the CrowdStrike Store, the first open cloud-based application PaaS for cybersecurity, which\nallows customers to purchase CrowdStrike products and provides an ecosystem of trusted partners and applications for\nour customers to choose from. We plan to continue investing in the CrowdStrike Store to empower our partners by\nmaking it easier to build applications and to enable our customers to more easily discover, try, and purchase additional\ncloud modules from both trusted partners and us.\n1127/06/2023, 21:47 crwd-20230131\nhttps://www .sec.gov/Ar chives/edgar/data/1535527/000153552723000008/crwd-20230131.htm 15/161Table of Contents\nWe have experienced significant growth, with revenue increasing from $1.5 billion in fiscal 2022 to $2.2\xa0billion in fiscal 2023,'

In [37]:
# Serialise the first retrieved Document to a JSON string
sample[0].json()

'{"page_content": "Pacific, including Japan, and expanding current data centers overseas.\\n\\u2022Extending Our Falcon Platform and Ecosystem. We designed our architecture to be open, interoperable, and highly\\nextensible. We launched the CrowdStrike Store, the first open cloud-based application PaaS for cybersecurity, which\\nallows customers to purchase CrowdStrike products and provides an ecosystem of trusted partners and applications for\\nour customers to choose from. We plan to continue investing in the CrowdStrike Store to empower our partners by\\nmaking it easier to build applications and to enable our customers to more easily discover, try, and purchase additional\\ncloud modules from both trusted partners and us.\\n1127/06/2023, 21:47 crwd-20230131\\nhttps://www .sec.gov/Ar chives/edgar/data/1535527/000153552723000008/crwd-20230131.htm 15/161Table of Contents\\nWe have experienced significant growth, with revenue increasing from $1.5 billion in fiscal 2022 to $2.2\\u00a0bi

In [39]:
# Label each retrieved chunk with its zero-based position in the result list
[f"Source {i}: {doc.page_content}" for i, doc in enumerate(sample)]

['Source 0: Pacific, including Japan, and expanding current data centers overseas.\n•Extending Our Falcon Platform and Ecosystem. We designed our architecture to be open, interoperable, and highly\nextensible. We launched the CrowdStrike Store, the first open cloud-based application PaaS for cybersecurity, which\nallows customers to purchase CrowdStrike products and provides an ecosystem of trusted partners and applications for\nour customers to choose from. We plan to continue investing in the CrowdStrike Store to empower our partners by\nmaking it easier to build applications and to enable our customers to more easily discover, try, and purchase additional\ncloud modules from both trusted partners and us.\n1127/06/2023, 21:47 crwd-20230131\nhttps://www .sec.gov/Ar chives/edgar/data/1535527/000153552723000008/crwd-20230131.htm 15/161Table of Contents\nWe have experienced significant growth, with revenue increasing from $1.5 billion in fiscal 2022 to $2.2\xa0billion in fiscal 2023,',
 

In [23]:
# First entry of the returned chat history (the user's question in the output above)
sample2 = result["chat_history"][0]

In [24]:
# Serialise that chat message to a JSON string
sample2.json()

'{"content": "what is crowdstrike\'s net profit for 2022?", "additional_kwargs": {}, "example": false}'

In [16]:
# Show the chat history returned with the result.
# NOTE(review): this cell's execution count is out of order and its saved
# output does not match the question asked above — re-run after a kernel restart.
result["chat_history"]

[HumanMessage(content='what is the net income?', additional_kwargs={}, example=False),
 AIMessage(content='The net income for the year ended January 31, 2023, is a loss of $182,285.', additional_kwargs={}, example=False)]