In [None]:
import dropbox
import shutil
import getpass
import os

# Local directory that downloaded Paper docs are written to (and later loaded from).
DOWNLOAD_FOLDER = "content"
# Name of the Dropbox Paper folder whose docs are selected for download.
TEST_CONTENT_DROPBOX_FOLDER = "LLM Doc Exp Test Content"

In [None]:
# Prompt for the Dropbox credential at runtime (getpass does not echo input) and
# build the API client used by every Dropbox call below.
# NOTE(review): dropbox.Dropbox's first positional argument is an OAuth2 access
# token rather than an app key — confirm the prompt wording is what users expect.
dbx = dropbox.Dropbox(getpass.getpass("Dropbox API Key:"))

In [None]:
# Prompt for the OpenAI key and expose it via the environment. The OpenAI-backed
# objects created later are constructed without an explicit key, so presumably
# they read it from this variable — verify against the installed langchain version.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Open AI API Key:")

In [None]:
# Ensure local download folder exists, and delete its contents

def create_download_folder():
    """Create DOWNLOAD_FOLDER (including missing parents) if it is not there yet.

    Using ``exist_ok=True`` makes the call idempotent and removes the
    check-then-create race of a separate ``os.path.exists`` test.

    Raises:
        FileExistsError: if DOWNLOAD_FOLDER exists but is not a directory.
    """
    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

def clear_downloads_folder():
    """Remove every entry directly inside DOWNLOAD_FOLDER.

    Each entry's path is printed before removal is attempted. Files and
    symlinks are unlinked; directories are removed recursively. A failure on
    one entry is reported and does not stop the remaining entries from being
    processed.
    """
    for entry_name in os.listdir(DOWNLOAD_FOLDER):
        file_path = os.path.join(DOWNLOAD_FOLDER, entry_name)
        print(file_path)
        try:
            # Symlinks are handled here too, so a link to a directory is
            # unlinked rather than having its target tree removed.
            is_file_like = os.path.isfile(file_path) or os.path.islink(file_path)
            if is_file_like:
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as err:
            print('Failed to delete %s. Reason: %s' % (file_path, err))

# Order matters: the folder must exist before its contents can be listed and removed.
# Running both on every execution means each download starts from an empty folder.
create_download_folder()
clear_downloads_folder()

# Filter Dropbox Paper docs and download them to the content folder

Download selected test paper docs using the [Dropbox API](https://www.dropbox.com/developers/documentation/http/documentation#paper-docs-download) and [Python SDK](https://dropbox-sdk-python.readthedocs.io/en/latest/index.html)

The dashboard for the Dropbox App used to do this can be found [here](https://www.dropbox.com/developers/apps/info/la3hq2wkhl5wx4m)

In [None]:
from dropbox.paper import ExportFormat, ListPaperDocsFilterBy

def get_file_path(doc_id, titles=None, folder=None):
    """Return the local markdown path for a Paper doc.

    Args:
        doc_id: ID of the Paper doc to look up.
        titles: Optional mapping of doc ID -> title. Defaults to the
            notebook-level ``doc_titles`` mapping.
        folder: Optional target directory. Defaults to ``DOWNLOAD_FOLDER``.

    Returns:
        ``<folder>/<title>.md``, with any path separators in the title
        replaced by ``_``.

    Raises:
        KeyError: if ``doc_id`` has no entry in the title mapping.
    """
    if titles is None:
        titles = doc_titles
    if folder is None:
        folder = DOWNLOAD_FOLDER
    # A title such as "Q1/Q2 plan" would otherwise be interpreted by
    # os.path.join as a nested directory that does not exist, and writing the
    # file would fail.
    safe_title = titles[doc_id].replace("/", "_").replace("\\", "_")
    return os.path.join(folder, f"{safe_title}.md")

def download_doc(doc_id):
    """Export one Paper doc as markdown to its local file.

    The destination comes from ``get_file_path(doc_id)``. The title reported
    by the SDK result is printed, and the result object is returned.
    """
    target_path = get_file_path(doc_id)
    markdown_format = ExportFormat('markdown')
    export_result = dbx.paper_docs_download_to_file(target_path, doc_id, markdown_format)
    print(f"- downloaded '{export_result.title}'")
    return export_result

def _is_in_test_folder(doc_id):
    """Return True if the doc sits in the TEST_CONTENT_DROPBOX_FOLDER Paper folder."""
    # `folders` may be None for docs that are not filed anywhere.
    folders = dbx.paper_docs_get_folder_info(doc_id).folders or []
    return any(folder.name == TEST_CONTENT_DROPBOX_FOLDER for folder in folders)


print("Retrieving document IDs")
# The listing is paginated: keep following the cursor until the API reports
# there is nothing more, otherwise only the first page of docs is considered.
list_response = dbx.paper_docs_list(filter_by=ListPaperDocsFilterBy.docs_created)
doc_ids = list(list_response.doc_ids)
while list_response.has_more:
    list_response = dbx.paper_docs_list_continue(list_response.cursor.value)
    doc_ids.extend(list_response.doc_ids)
print(f"- {len(doc_ids)} documents found")

print("Filtering documents in folder")
docs_ids_in_folder = [doc_id for doc_id in doc_ids if _is_in_test_folder(doc_id)]
print(f"- {len(docs_ids_in_folder)} documents found in folder")

print("Downloading documents")
# Export each doc exactly once: the same response provides both the title
# (needed to build the file name) and the markdown body. Previously every doc
# was exported twice — once just to read its title and once to save it.
doc_titles = {}
results = []
for doc_id in docs_ids_in_folder:
    export_result, response = dbx.paper_docs_download(doc_id, ExportFormat('markdown'))
    doc_titles[doc_id] = export_result.title
    with open(get_file_path(doc_id), "wb") as markdown_file:
        markdown_file.write(response.content)
    print(f"- downloaded '{export_result.title}'")
    results.append(export_result)
print("Download complete")

# Simple RAG Q&A using the downloaded files

Uses [this repo](https://github.com/AI-Maker-Space/LLM-Ops-Cohort-1/blob/main/Week%201/Tuesday/Barbie_Retrieval_Augmented_Question_Answering_(RAQA)_Assignment%20(Assignment%20Version).ipynb) as a reference.


Additional resources:

- https://github.com/zylon-ai/private-gpt/issues/358#issuecomment-1563663500
- https://python.langchain.com/docs/integrations/vectorstores/starrocks/
- https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter/

In [None]:
# from langchain_community.document_loaders.markdown import UnstructuredMarkdownLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Load every markdown file written by the download step. Reusing DOWNLOAD_FOLDER
# (instead of repeating the "content" literal) keeps the loader pointed at the
# right directory if the constant is ever changed.
# To experiment with a single document instead, use TextLoader(<path to .md file>).
markdown_path = DOWNLOAD_FOLDER

# glob="**/*.md" also picks up markdown files in sub-directories.
loader = DirectoryLoader(markdown_path, glob="**/*.md", loader_cls=TextLoader)

raw_documents = loader.load()

In [None]:

# Break the loaded documents into overlapping chunks sized for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # upper bound on a chunk's length, as measured by length_function
    chunk_overlap=100,    # length shared between neighbouring chunks
    length_function=len,  # measure length in characters
)

documents = text_splitter.split_documents(raw_documents)

In [None]:
# Preview the first two chunks. Slicing (rather than indexing [0] and [1])
# avoids an IndexError when the splitter produced fewer than two chunks.
for chunk in documents[:2]:
    print(chunk)

# Index Creation

https://python.langchain.com/docs/modules/data_connection/vectorstores/

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.storage import LocalFileStore

In [None]:
# File-backed byte store rooted at ./cache/, used below as the embedding cache.
store = LocalFileStore("./cache/")

# Underlying embedding model; constructed with defaults (no explicit key or model name).
core_embeddings_model = OpenAIEmbeddings()

# Wrap the model so document embeddings go through the byte store. The model
# name is used as the cache namespace — presumably so entries produced by
# different embedding models do not collide; confirm against the langchain docs.
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=core_embeddings_model.model
)

# Build the FAISS index from the split chunks using the cache-backed embedder.
vector_store = FAISS.from_documents(documents, embedder)

In [None]:
# Sanity-check the index with a sample question.

query = "What do I need to send in my application for urgent citizenship processing?"

# The question is embedded with the core model directly (not the cache-backed
# wrapper), then the index is asked for the 4 nearest chunks.
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k=4)

for doc in docs:
    print(doc.page_content)


# Retrieval Chain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler

In [None]:
# Chat model used to answer questions; temperature=0 asks for the least random sampling.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [None]:
# Expose the FAISS index as a retriever with default search settings (no search kwargs are passed).
retriever = vector_store.as_retriever()

In [None]:
# Callback handler attached to the chain below; by its name it reports chain activity on stdout.
handler = StdOutCallbackHandler()

# Retrieval QA chain: chunks come from `retriever` and the answer from `llm`.
# return_source_documents=True asks the chain to include the retrieved chunks
# in its result alongside the answer.
qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[handler],
    return_source_documents=True,
)

In [None]:
# Calling a chain object directly (Chain.__call__) is deprecated in LangChain 0.1+;
# invoke() is the supported entry point and takes the same input dict.
qa_with_sources_chain.invoke({"query": "What do I need to send in my application for urgent citizenship processing?"})