## Build a chat-based product using PDFs as a knowledge base

- [LangChain reference](https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/qdrant.html)
- [Qdrant reference](https://qdrant.tech/articles/langchain-integration/)

In [None]:
import glob
from os import environ
from pathlib import Path
from typing import List, Optional

import openai
import qdrant_client
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant

from llm.settings import llm_settings
from workspace.settings import ws_settings

# -*- Set OPENAI_API_KEY in one of 3 ways:
# 1. Set OPENAI_API_KEY variable in workspace/secrets/dev_jupyter_secrets.yml
#    NOTE: Container needs to be restarted to pickup new env: `phi ws restart dev:docker:jupyter -y`
# 2. In `workspace/secrets/openai-api-key.txt` (does not need restart)
# 3. Directly in this notebook (NOT RECOMMENDED)
#
# The file fallback runs only when the environment variable is unset or empty.
if not environ.get("OPENAI_API_KEY"):
    OPENAI_API_KEY_FILE = ws_settings.ws_root.joinpath(
        "workspace/secrets/openai-api-key.txt"
    )
    openai.api_key = OPENAI_API_KEY_FILE.read_text().strip()
    # The LangChain wrappers used in this notebook (OpenAIEmbeddings, ChatOpenAI)
    # resolve the key from the OPENAI_API_KEY environment variable rather than
    # from `openai.api_key`, so export it for this process as well.
    environ["OPENAI_API_KEY"] = openai.api_key

In [None]:
# Collect every PDF found directly under <ws_root>/data
pdfs_to_read: List[Path] = list(ws_settings.ws_root.glob("data/*.pdf"))

In [None]:
%%time

# Load every PDF listed in `pdfs_to_read`.
# `loader.load()` returns a list of Documents for a single file and that list is
# appended as a whole, so `loaded_pdfs` holds one list of Documents per PDF.
# NOTE(review): presumably one Document per page - confirm against PyPDFLoader docs.
loaded_pdfs: List[List[Document]] = []
for pdf in pdfs_to_read:
    # PyPDFLoader is given the path as a string
    loader = PyPDFLoader(str(pdf))
    # Load the PDF into a list of Documents
    document = loader.load()
    # Keep the Documents of this PDF grouped together
    loaded_pdfs.append(document)
    print(f"Loaded: {str(pdf)}")

In [None]:
%%time

# Split each loaded PDF into chunks (chunk_size=500, no overlap between chunks)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# One list of chunks per loaded PDF, in the same order as `loaded_pdfs`
chunked_pdfs: List[List[Document]] = []
for pages in loaded_pdfs:
    chunks = text_splitter.split_documents(pages)
    chunked_pdfs.append(chunks)
    print(f"Chunked PDF length: {len(chunks)}")

In [None]:
# Embedding model, constructed with library defaults; it is handed to the
# Qdrant vector store as its `embedding` when the store is built.
embeddings = OpenAIEmbeddings()

In [None]:
# Merge the per-PDF chunk lists into one flat list, preserving order
flat_chunked_pdf = []
for chunks in chunked_pdfs:
    flat_chunked_pdf.extend(chunks)

In [None]:
%%time

# Build a Qdrant vector store named "chat_with_pdf" from the flattened chunks,
# using `embeddings` as the embedding model.
# NOTE(review): with in-memory storage the collection presumably has to be
# rebuilt after a kernel restart - confirm.
qdrant = Qdrant.from_documents(
    documents=flat_chunked_pdf,
    embedding=embeddings,
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="chat_with_pdf",
)

In [None]:
# Wrap the vector store as a retriever (default settings); it is passed to the
# conversational chain as `retriever`.
retriever = qdrant.as_retriever()
# Print the retriever for a quick visual check of its configuration
print(f"Retriever: {retriever}")

In [None]:
%%time

# Create the chain.
# Two chat models share the default temperature but use different model names:
# `llm_settings.chat_model` is the chain's main LLM, while
# `llm_settings.condense_model` is supplied as `condense_question_llm`.
# NOTE(review): the condense model presumably rewrites follow-up questions using
# the chat history - confirm against the ConversationalRetrievalChain docs.
answer_llm = ChatOpenAI(
    temperature=llm_settings.default_temperature, model=llm_settings.chat_model
)
condense_llm = ChatOpenAI(
    temperature=llm_settings.default_temperature, model=llm_settings.condense_model
)
qa = ConversationalRetrievalChain.from_llm(
    answer_llm,
    retriever=retriever,
    condense_question_llm=condense_llm,
)

In [None]:
# Conversation history passed as `chat_history` to each `qa(...)` call below; starts empty.
chat_history = []

In [None]:
%%time

query = "When did Airbnb IPO?"
result = qa({"question": query, "chat_history": chat_history})
# Record the exchange as a (question, answer) pair so that later follow-up
# questions are asked with this context instead of an empty history.
# NOTE: re-running this cell appends the same exchange again.
chat_history.append((query, result["answer"]))
print(result["answer"])

In [None]:
%%time

# Follow-up question: "that year" can only be resolved through `chat_history`.
query = "What was Airbnb's net income that year?"
result = qa({"question": query, "chat_history": chat_history})
# Record the exchange as a (question, answer) pair so the conversation context
# keeps growing for subsequent questions.
# NOTE: re-running this cell appends the same exchange again.
chat_history.append((query, result["answer"]))
print(result["answer"])

In [None]:
%%time

# Follow-up question: "that year" can only be resolved through `chat_history`.
query = "How many shares were issued that year?"
result = qa({"question": query, "chat_history": chat_history})
# Record the exchange as a (question, answer) pair so any further questions
# are asked with the full conversation context.
# NOTE: re-running this cell appends the same exchange again.
chat_history.append((query, result["answer"]))
print(result["answer"])