In [1]:
import os
import tempfile
from io import BytesIO

import streamlit as st
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

In [2]:
def get_pdf_text(pdf_docs: list[BytesIO]) -> str:
    """Extract the text of every page from one or more PDF documents.

    ``PyPDFLoader`` reads from a filesystem path, so in-memory documents
    (for example streamlit ``UploadedFile`` objects) are first written to a
    temporary file, which is removed again once the document has been read.

    Args:
        pdf_docs (list[BytesIO]): Documents to read. Each item may be an
            in-memory binary buffer or a filesystem path
            (``str`` / ``os.PathLike``).

    Returns:
        str: Text of all pages of all documents, in input order, joined with
            newlines. Empty string when ``pdf_docs`` is empty.
    """

    page_texts: list[str] = []

    for pdf in pdf_docs:
        if isinstance(pdf, (str, os.PathLike)):
            pdf_path, is_temporary = os.fspath(pdf), False
        else:
            # getvalue() returns the whole buffer regardless of the current
            # read position; fall back to read() for other file-like objects.
            data = pdf.getvalue() if hasattr(pdf, "getvalue") else pdf.read()
            # delete=False so the file can be reopened by the loader after
            # this handle is closed (required on Windows).
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(data)
            pdf_path, is_temporary = tmp.name, True

        try:
            # load() yields one document per page. load_and_split() would run
            # a default splitter first, and concatenating its overlapping
            # chunks may duplicate text; chunking is done by get_text_chunks.
            pages = PyPDFLoader(pdf_path).load()
            page_texts.extend(page.page_content for page in pages)
        finally:
            if is_temporary:
                os.remove(pdf_path)

    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next.
    return "\n".join(page_texts)

def get_text_chunks(
    text_data: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> list[str]:
    """Split raw text into overlapping chunks

    Args:
        text_data (str): Raw text data
        chunk_size (int, optional): Maximum chunk length, measured with
            ``len`` (i.e. in characters). Defaults to 1000.
        chunk_overlap (int, optional): Overlap between consecutive chunks,
            in the same unit as ``chunk_size``. Defaults to 200.

    Returns:
        list[str]: List containing chunks of raw text input
    """

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    return splitter.split_text(text_data)

def get_vectorstore(chunks: list[str]) -> FAISS:
    """Embed text chunks and index them in a FAISS vectorstore

    Args:
        chunks (list[str]): Text chunks

    Returns:
        FAISS: Vectorstore object

    Raises:
        ValueError: If ``chunks`` is empty, e.g. because no text could be
            extracted from the source documents.
    """

    # Fail early with a clear message instead of letting an empty input
    # surface as an unrelated-looking error during index construction.
    if not chunks:
        raise ValueError(
            "Cannot build a vector store from an empty list of text chunks."
        )

    # Uses text-embedding-ada-002 embedding model by default
    # ada-002 uses cl100k_base tokeniser and takes max 8191 input tokens
    embeddings = OpenAIEmbeddings()

    return FAISS.from_texts(texts=chunks, embedding=embeddings)

def get_conversation_chain(vectorstore: FAISS) -> ConversationalRetrievalChain:
    """Build a retrieval-backed chat chain on top of a vectorstore

    The chain is assembled from a ``ChatOpenAI`` model created with its
    default settings, the vectorstore's default retriever, and a buffer
    memory that stores the dialogue under the ``chat_history`` key as
    message objects.

    Args:
        vectorstore (FAISS): FAISS vectorstore object used for retrieval

    Returns:
        ConversationalRetrievalChain: Chain wired to the model, retriever
            and memory described above
    """

    chat_model = ChatOpenAI()
    history = ConversationBufferMemory(
        return_messages=True,
        memory_key="chat_history",
    )

    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vectorstore.as_retriever(),
        memory=history,
    )

def handle_userinput(user_question: str) -> None:
    """Send a question to the stored conversation chain and render the result

    Args:
        user_question (str): Question to pass to the chain under the
            ``question`` key

    Returns:
        None: The chain's raw result is written to the streamlit page
    """

    # The chain is read from streamlit session state on every call.
    chain = st.session_state.conversation
    st.write(chain({"question": user_question}))

In [3]:
# Reuse get_pdf_text instead of repeating its loading loop inline.
# The helper iterates over a collection of documents, so the single path is
# wrapped in a list.
PDF_PATH = "../data/crwd_annualreport_fy22.pdf"
text_data = get_pdf_text([PDF_PATH])

In [4]:
# Load environment variables from a .env file before any OpenAI client is
# created (presumably this is where the API key comes from; confirm locally).
load_dotenv()

# `text_data` holds the raw PDF text produced by the previous cell.

# Convert text data into chunks
chunks = get_text_chunks(text_data)

# Create vector store
vectorstore = get_vectorstore(chunks)

# Create conversation chain; the cells below query it
conversation = get_conversation_chain(vectorstore)

In [5]:
# Ask the chain a first question; it returns a dict holding the question,
# the chat history so far and the answer.
question = "who is crowdstrike's CEO?"
result = conversation({"question": question})

In [6]:
# Display the full chain output: question, chat_history and answer.
result

{'question': "who is crowdstrike's CEO?",
 'chat_history': [HumanMessage(content="who is crowdstrike's CEO?", additional_kwargs={}, example=False),
  AIMessage(content="CrowdStrike's CEO is George Kurtz.", additional_kwargs={}, example=False)],
 'answer': "CrowdStrike's CEO is George Kurtz."}

In [11]:
# First chat-history entry: the user's question, stored as a HumanMessage.
result["chat_history"][0]

HumanMessage(content="who is crowdstrike's CEO?", additional_kwargs={}, example=False)

In [None]:
# Show only the model's reply text. The original cell ended in a dangling
# attribute access, which is a SyntaxError on Restart & Run All.
result["answer"]