In [None]:
import dropbox
import shutil
import getpass
import os
from enum import Enum

class Model(Enum):
    """Identifiers for the model backends selectable in this notebook.

    Members are used as keys of the `embeddings` and `llms` factory tables,
    and as the values of the EMBEDDINGS_MODEL / LLM_MODEL settings.
    """
    OPEN_AI = 1
    LLAMA = 2
    NOMIC = 3

# Sample questions used to exercise the pipeline; TEST_QUERY below picks one by index.
QUERIES = [
    "How to deal with a memory leak in prod?",
    "What are the responsibilities of the First Captain during a fire?",
    "What are the First Captain's responsibilities during a fire?",
    "What should I do if there are problems with the haskell quiz engine?",
    "What are my responsibilites when on call?",
    "What do I do if my time off conflicts with being on call?",
]

# Local folder the Paper docs are downloaded into and later loaded from.
DOWNLOAD_FOLDER = "content"
# Name of the Dropbox Paper folder whose documents are selected for download.
TEST_CONTENT_DROPBOX_FOLDER = "LLM Doc Exp Test Content"
# Key into the `embeddings` factory table.
EMBEDDINGS_MODEL = Model.NOMIC
# Key into the `llms` factory table.
LLM_MODEL = Model.LLAMA
# When True, a Dropbox client is created, the download folder is emptied and
# the documents are downloaded again.
RESET_DROPBOX = False
# NOTE(review): presumably controls whether the vector store is rebuilt —
# confirm where this flag is consumed.
RESET_VECTOR_STORE = False
# Question used by the example retrieval and QA cells.
TEST_QUERY = QUERIES[1]

In [86]:
# import requests
# import sseclient
# import json


# def get_stream(app, query):
#     full_response = ""
#     url = "http://127.0.0.1:8000/query/stream_events/"
#     response = requests.post(url, json={"input": {"query": query}}, stream=True)
#     client = sseclient.SSEClient(response)
#     for event in client.events():
#         output = json.loads(event.data)
#         if output["event"] == "on_llm_stream":
#             if chunk := output["data"].get("chunk"):
#                 full_response += chunk
#                 yield full_response
#         elif output["event"] == "on_chain_end":
#             if source_documents := output["data"]["output"].get("source_documents"):
#                 yield output["data"]["output"]["result"]

# for chunk in get_stream(None, TEST_QUERY):
#     print(chunk, flush=True)


# from langserve import RemoteRunnable
# query_chain = RemoteRunnable("http://127.0.0.1:8000/query/")

# async for msg in query_chain.astream({"query": QUERIES[0]}):
#     print(msg, end="", flush=True)


{'query': 'How to deal with a memory leak in prod?', 'result': "Based on the provided context, it appears that the memory leak issue was resolved by identifying and fixing the root cause in a specific pull request (https://github.com/NoRedInk/NoRedInk/pull/13550) and deploying the fix in https://github.com/NoRedInk/NoRedInk/pull/13703. The exact steps taken to resolve the issue are not specified, but it seems that a combination of investigation, testing, and deployment were involved.\n\nIt's worth noting that the context also mentions some potential solutions or workarounds that were tried, such as running free memory usage then re-running the application, disabling Sass, and increasing the thread pool. However, these attempts did not ultimately resolve the issue.\n\nIn general, dealing with a memory leak in production can be challenging and may require a combination of detective work, testing, and deployment to identify and fix the root cause.", 'source_documents': [Document(page_cont

In [None]:
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings

# Factory table: maps each Model to a zero-argument callable that builds the
# embeddings client for it. Values are callables so that no client is
# constructed until a model is actually selected.
embeddings = {
    # The class itself serves as the factory (it is later called with no arguments).
    Model.OPEN_AI: OpenAIEmbeddings,
    Model.LLAMA: lambda: OllamaEmbeddings(model="llama3"),
    Model.NOMIC: lambda: OllamaEmbeddings(model="nomic-embed-text"),
}

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain_community.llms import Ollama

# Factory table: maps each Model to a zero-argument callable that builds the
# LLM for it. Every entry uses temperature=0.
llms = {
    Model.OPEN_AI: lambda: ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    Model.LLAMA: lambda: Ollama(model="llama3", temperature=0),
    # NOMIC maps to the same llama3 construction as LLAMA.
    # NOTE(review): presumably because nomic-embed-text is embedding-only — confirm.
    Model.NOMIC: lambda: Ollama(model="llama3", temperature=0),
}


In [None]:
# A Dropbox client is only created when documents are to be re-downloaded, so
# the credential is prompted for only when RESET_DROPBOX is set. `dbx` is left
# undefined otherwise; every later use of it is behind the same flag.
if RESET_DROPBOX:
    dbx = dropbox.Dropbox(getpass.getpass("Dropbox API Key:"))

In [None]:
# Only ask for an OpenAI key when an OpenAI-backed model is actually selected
# and no key is already present in the environment. This keeps
# "Restart & Run All" from blocking on an interactive prompt when the notebook
# is configured for local (Ollama) models only.
if Model.OPEN_AI in (EMBEDDINGS_MODEL, LLM_MODEL) and not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Open AI API Key:")

In [None]:
# Ensure the local download folder exists; delete its contents only when RESET_DROPBOX is set

def create_download_folder():
    """Create DOWNLOAD_FOLDER (including missing parents) if it does not exist.

    Uses `exist_ok=True` rather than a separate existence check, so the call
    is idempotent and cannot fail if the folder appears between check and
    creation.
    """
    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

def clear_downloads_folder():
    """Remove every entry directly inside DOWNLOAD_FOLDER.

    Each entry's path is printed before removal is attempted. Real directories
    are removed recursively; regular files and symlinks (including symlinks to
    directories) are unlinked. A failure on one entry is reported and does not
    stop the remaining entries from being processed.
    """
    for entry_name in os.listdir(DOWNLOAD_FOLDER):
        target = os.path.join(DOWNLOAD_FOLDER, entry_name)
        print(target)
        try:
            is_symlink = os.path.islink(target)
            if os.path.isdir(target) and not is_symlink:
                # A real directory: remove it together with its contents.
                shutil.rmtree(target)
            elif is_symlink or os.path.isfile(target):
                # Files and any kind of symlink are removed as a single entry.
                os.unlink(target)
        except Exception as error:
            print('Failed to delete %s. Reason: %s' % (target, error))


# Always make sure the download folder exists; only empty it when a fresh
# Dropbox download has been requested via RESET_DROPBOX.
create_download_folder()
if RESET_DROPBOX:
    clear_downloads_folder()

# Filter Dropbox Paper docs and download them to the content folder

Download selected test paper docs using the [Dropbox API](https://www.dropbox.com/developers/documentation/http/documentation#paper-docs-download) and [Python SDK](https://dropbox-sdk-python.readthedocs.io/en/latest/index.html)

The dashboard for the Dropbox App used to do this can be found [here](https://www.dropbox.com/developers/apps/info/la3hq2wkhl5wx4m)

In [None]:
# from dropbox.paper import ExportFormat
# folders_to_match = set(["Engineering", "Fires"])

# print("Getting doc ids")
# doc_ids = dbx.paper_docs_list().doc_ids

# print("Getting docs in matching folders")
# docs_ids_in_folder = [doc_id for doc_id in doc_ids if folders_to_match.issubset(set([folder.name for folder in dbx.paper_docs_get_folder_info(doc_id).folders or []]))]

# print("Getting doc titles")
# doc_titles = {doc_id: dbx.paper_docs_download(doc_id, ExportFormat('markdown'))[0].title for doc_id in docs_ids_in_folder}


In [None]:
from dropbox.paper import ExportFormat, ListPaperDocsFilterBy

def get_file_path(doc_id, titles=None, folder=None):
    """Return the local Markdown path a Paper document is downloaded to.

    Path separators inside the title are replaced with "_" so that a title
    such as "On-call / Fires" yields a file directly inside the target folder
    instead of a path into a non-existent subfolder.

    Args:
        doc_id: Paper document id; must be a key of the title mapping.
        titles: Optional mapping of document id to title. Defaults to the
            notebook-level `doc_titles`.
        folder: Optional destination folder. Defaults to DOWNLOAD_FOLDER.

    Returns:
        The path "<folder>/<sanitized title>.md".

    Raises:
        KeyError: if `doc_id` is not present in the title mapping.
    """
    # The globals are only looked up when no explicit override is given.
    title_by_id = doc_titles if titles is None else titles
    target_folder = DOWNLOAD_FOLDER if folder is None else folder

    safe_title = title_by_id[doc_id]
    # os.altsep is None on POSIX, hence the truthiness check.
    for separator in (os.sep, os.altsep):
        if separator:
            safe_title = safe_title.replace(separator, "_")
    return os.path.join(target_folder, f"{safe_title}.md")

def download_doc(doc_id):
    """Export one Paper document as Markdown into the download folder.

    The destination comes from `get_file_path(doc_id)`. The title reported by
    Dropbox is printed once the download call returns.

    Returns:
        The result object returned by `dbx.paper_docs_download_to_file`.
    """
    destination = get_file_path(doc_id)
    export_result = dbx.paper_docs_download_to_file(
        destination,
        doc_id,
        ExportFormat('markdown'),
    )
    print(f"- downloaded '{export_result.title}'")
    return export_result

if RESET_DROPBOX:
    # 1. List the ids of all Paper docs created by the authenticated user.
    print("Retrieving document IDs")
    doc_ids = dbx.paper_docs_list(filter_by=ListPaperDocsFilterBy.docs_created).doc_ids
    print(f"- {len(doc_ids)} documents found")

    # 2. Keep only the docs that sit in a folder named TEST_CONTENT_DROPBOX_FOLDER.
    #    `folders` may be None, hence the `or []`.
    print("Filtering documents in folder")
    docs_ids_in_folder = [
        doc_id
        for doc_id in doc_ids
        if any(
            folder.name == TEST_CONTENT_DROPBOX_FOLDER
            for folder in dbx.paper_docs_get_folder_info(doc_id).folders or []
        )
    ]
    print(f"- {len(docs_ids_in_folder)} documents found in folder")

    # 3. Look up each title (used by get_file_path to name the local file).
    # NOTE(review): this issues a download call per document just to read the
    # title, and download_doc below issues a second one — confirm whether a
    # single call could serve both.
    print("Retrieving document titles")
    doc_titles = {
        doc_id: dbx.paper_docs_download(doc_id, ExportFormat('markdown'))[0].title
        for doc_id in docs_ids_in_folder
    }

    # 4. Write every selected document to the download folder.
    print("Downloading documents")
    results = list(map(download_doc, docs_ids_in_folder))
    print("Download complete")

# Simple RAG Q&A using the downloaded files

Uses [this repo](https://github.com/AI-Maker-Space/LLM-Ops-Cohort-1/blob/main/Week%201/Tuesday/Barbie_Retrieval_Augmented_Question_Answering_(RAQA)_Assignment%20(Assignment%20Version).ipynb) as a reference.


Additional resources:

- https://github.com/zylon-ai/private-gpt/issues/358#issuecomment-1563663500
- https://python.langchain.com/docs/integrations/vectorstores/starrocks/
- https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter/

In [None]:
from langchain_community.document_loaders.markdown import UnstructuredMarkdownLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownTextSplitter
from langchain_text_splitters import Language


In [None]:
# Load every Markdown file found under DOWNLOAD_FOLDER (the `**/*.md` glob
# also matches files in subfolders), parsing each with UnstructuredMarkdownLoader.
loader = DirectoryLoader(DOWNLOAD_FOLDER, glob="**/*.md", loader_cls=UnstructuredMarkdownLoader)

# Unsplit documents; these are chunked by the text splitter in a later cell.
raw_documents = loader.load()


In [None]:

# headers_to_split_on = [
#     ("#", "Header 1"),
#     ("##", "Header 2"),
# ]

# # MD splits
# markdown_splitter = MarkdownHeaderTextSplitter(
#     headers_to_split_on=headers_to_split_on, strip_headers=False
# )

# loader.load_and_split(text_splitter=markdown_splitter)
# MarkdownTextSplitter()

# # Char-level splits
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# chunk_size = 250
# chunk_overlap = 30
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=chunk_size, chunk_overlap=chunk_overlap
# )

# # Split
# splits = text_splitter.split_documents(md_header_splits)
# splits

In [None]:

# text_splitter = MarkdownTextSplitter(
#     chunk_size = 1000, # the character length of the chunk
#     chunk_overlap = 100, # the character length of the overlap between chunks
#     length_function = len, # the length function
# )

# Chunking parameters, measured in characters because `len` is the length function.
splitter_options = dict(
    chunk_size=1000,      # the character length of the chunk
    chunk_overlap=100,    # the character length of the overlap between chunks
    length_function=len,  # the length function
)

# Recursive splitter configured for the Markdown language preset.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.MARKDOWN,
    **splitter_options,
)

# Chunked documents that get embedded and indexed later on.
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Preview the first two chunks as a sanity check. Slicing (rather than
# indexing [0] and [1]) keeps this cell from raising IndexError when the
# split produced fewer than two chunks.
for preview_chunk in documents[:2]:
    print(preview_chunk)

# Index Creation

https://python.langchain.com/docs/modules/data_connection/vectorstores/

In [None]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.storage import LocalFileStore

In [None]:
# Location of the persisted FAISS index.
VECTOR_STORE_PATH = "./vector_store"

# On-disk cache of computed embeddings, so re-embedding unchanged text is cheap.
store = LocalFileStore("./cache/")

core_embeddings_model = embeddings[EMBEDDINGS_MODEL]()

# Cache entries are namespaced by model name so that vectors produced by
# different embedding models never collide.
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=core_embeddings_model.model
)

# Honour RESET_VECTOR_STORE: rebuild when asked to (or when nothing has been
# persisted yet); otherwise reuse the index saved by a previous run.
# NOTE: a persisted index is only valid for the embedding model and documents
# it was built with — set RESET_VECTOR_STORE = True after changing either.
if RESET_VECTOR_STORE or not os.path.isdir(VECTOR_STORE_PATH):
    vector_store = FAISS.from_documents(documents, embedder)
    vector_store.save_local(VECTOR_STORE_PATH)
else:
    # load_local unpickles the stored docstore; this is acceptable only
    # because the files were written by this notebook. Never point it at
    # an index obtained from an untrusted source.
    vector_store = FAISS.load_local(
        VECTOR_STORE_PATH,
        embedder,
        allow_dangerous_deserialization=True,
    )

In [None]:
# Example query on the vector store

query = TEST_QUERY
# Embed the question with the core (uncached) embeddings model, then fetch the
# 4 nearest chunks from the vector store by vector similarity.
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

# Wrap each hit in >>> <<< markers so chunk boundaries are visible in the output.
for page in docs:
  print(f">>>{page.page_content}<<<")


# Retrieval Chain

In [None]:
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain

In [None]:
# Build the LLM selected by LLM_MODEL by calling its factory from `llms`.
llm = llms[LLM_MODEL]()

In [None]:
# Expose the vector store through the retriever interface; no search options
# are passed, so the library defaults apply.
retriever = vector_store.as_retriever()

In [None]:
# Callback handler that writes chain activity to stdout.
handler = StdOutCallbackHandler()

# Question-answering chain over the retriever. Despite the variable name, this
# is a plain RetrievalQA chain; return_source_documents=True makes it include
# the retrieved documents in its output.
qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[handler],
    return_source_documents=True,
)

In [None]:
# result = qa_with_sources_chain({"query" : TEST_QUERY})
# Run the QA chain on TEST_QUERY and print the raw output. A later cell reads
# the 'query', 'result' and 'source_documents' keys of `result`.
result = qa_with_sources_chain.invoke(TEST_QUERY)
print(result)

In [None]:
from langchain_core.prompts import PromptTemplate, MessagesPlaceholder, ChatPromptTemplate
from langchain_core.runnables import RunnableBranch, RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter
from langchain_core.messages import AIMessage, HumanMessage

# System prompt for answer synthesis. `{context}` is filled with the formatted
# retrieved documents; `${{number}}` renders literally as `${number}`.
# The context block is closed with a proper `</context>` tag (it previously used
# the self-closing `<context/>`), so the "between the context blocks"
# instructions refer to a well-formed delimiter.
# NOTE(review): the prompt says the assistant answers questions "about
# Langchain", while the sample queries in this notebook are about engineering
# processes — confirm the intended domain wording.
RESPONSE_TEMPLATE = """\
You are an expert programmer and problem-solver, tasked with answering any question \
about Langchain.

Generate a comprehensive and informative answer of 80 words or less for the \
given question based solely on the provided search results (URL and content). You must \
only use information from the provided search results. Use an unbiased and \
journalistic tone. Combine search results together into a coherent answer. Do not \
repeat text. Cite search results using [${{number}}] notation. Only cite the most \
relevant results that answer the question accurately. Place these citations at the end \
of the sentence or paragraph that reference them - do not put them all at the end. If \
different results refer to different entities within the same name, write separate \
answers for each entity.

You should use bullet points in your answer for readability. Put citations where they apply
rather than putting them all at the end.

If there is nothing in the context relevant to the question at hand, just say "Hmm, \
I'm not sure." Don't try to make up an answer.

Anything between the following `context`  html blocks is retrieved from a knowledge \
bank, not part of the conversation with the user. 

<context>
    {context} 
</context>

REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \
not sure." Don't try to make up an answer. Anything between the preceding 'context' \
html blocks is retrieved from a knowledge bank, not part of the conversation with the \
user.\
"""

# Prompt used to condense a follow-up question plus the chat history into a
# single standalone question before retrieval.
REPHRASE_TEMPLATE = """\
Given the following conversation and a follow up question, rephrase the follow up \
question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone Question:"""

def format_docs(docs) -> str:
    """Render retrieved documents as numbered `<doc>` elements, one per line.

    Args:
        docs: Iterable of objects exposing a `page_content` attribute.

    Returns:
        A newline-joined string of "<doc id='N'>content</doc>" entries, where N
        is the zero-based position of the document. Empty input yields "".
    """
    return "\n".join(
        f"<doc id='{position}'>{document.page_content}</doc>"
        for position, document in enumerate(docs)
    )

def serialize_history(request):
    """Convert the request's raw chat history into LangChain message objects.

    Args:
        request: Mapping that may contain a "chat_history" entry: a list of
            dicts with optional "human" and/or "ai" text values.

    Returns:
        A list of HumanMessage / AIMessage objects in conversation order (for
        an entry carrying both keys, the human message comes first). A missing,
        None or empty history yields an empty list.
    """
    # `.get` tolerates requests that omit the key entirely, matching the
    # history check used when routing retrieval.
    chat_history = request.get("chat_history") or []
    converted_chat_history = []
    for message in chat_history:
        human_text = message.get("human")
        if human_text is not None:
            converted_chat_history.append(HumanMessage(content=human_text))
        ai_text = message.get("ai")
        if ai_text is not None:
            converted_chat_history.append(AIMessage(content=ai_text))
    return converted_chat_history


# --- Question condensing -----------------------------------------------------
# Rewrites a follow-up question (plus chat history) into a standalone question.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(REPHRASE_TEMPLATE)
condense_question_chain = (
    CONDENSE_QUESTION_PROMPT | llm | StrOutputParser()
).with_config(
    run_name="CondenseQuestion",
)

# --- Retrieval routing ---------------------------------------------------------
# With chat history: condense first, then retrieve with the standalone question.
# Without chat history: retrieve directly with the raw question.
conversation_chain = condense_question_chain | retriever
retriever_chain = RunnableBranch(
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),
        conversation_chain.with_config(run_name="RetrievalChainWithHistory"),
    ),
    (
        RunnableLambda(itemgetter("question")).with_config(
            run_name="Itemgetter:question"
        )
        | retriever
    ).with_config(run_name="RetrievalChainWithNoHistory"),
).with_config(run_name="RouteDependingOnChatHistory")


# --- Context assembly ------------------------------------------------------------
# Adds `docs` (retrieved documents) and `context` (their formatted text) to the
# input dict while passing the original keys through.
context = (
    RunnablePassthrough.assign(docs=retriever_chain)
    .assign(context=lambda x: format_docs(x["docs"]))
    .with_config(run_name="RetrieveDocs")
)

# --- Answer synthesis ------------------------------------------------------------
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", RESPONSE_TEMPLATE),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)
default_response_synthesizer = prompt | llm

# Full pipeline: normalise the chat history into message objects, retrieve and
# format context, then generate the answer.
chain = RunnablePassthrough.assign(chat_history=serialize_history) | context | default_response_synthesizer

# LCEL runnables are executed with `.invoke`; `.run` exists only on legacy
# Chain objects and raises AttributeError on a RunnableSequence.
chain.invoke({"question": "How to deal with a memory leak in prod?", "chat_history": []})


In [None]:
import pprint
# Show the question followed by a blank line.
print(result['query'], end="\n\n")

# Pretty-print the generated answer, then a separating blank line.
pprint.pp(result['result'])
print()

# List the source path recorded in the metadata of each retrieved document.
source_paths = [document.metadata['source'] for document in result['source_documents']]
pprint.pp(source_paths)
# pprint.pp(result['source_documents'][0].page_content)