# Docs Copilot – A Generative AI App for Searching Documentation

Github: https://github.com/miztiik/docs-copilot

Google Colab: https://colab.research.google.com/github/miztiik/docs-copilot/blob/main/src/docs_copilot_w_chroma.ipynb

![Miztiik Automation: Docs Copilot](../images/miztiik_automation_docs_copilot_using_llm_rag_01.png)


In [None]:
# UNCOMMENT NEXT LINE TO SKIP THIS CELL EXECUTION
# %%script skipping --no-raise-error

# Install the dependencies for the project
# Every line below is an IPython `%pip` magic, so this cell only works when it
# is run from a notebook / IPython kernel. `--quiet` keeps the cell output
# short; chromadb is the only package installed with `--upgrade`.

%pip install --quiet pandas
%pip install --quiet numpy
%pip install --quiet openai
%pip install --quiet python-dotenv
%pip install --quiet tenacity
%pip install --quiet tiktoken 
%pip install --quiet --upgrade chromadb 
%pip install --quiet langchain
%pip install --quiet sentence-transformers

# For progress bar and process time
%pip install --quiet tqdm


In [None]:
import os
import shutil

import tiktoken

import pandas as pd

from dotenv import load_dotenv, dotenv_values

from tqdm.auto import tqdm

# For Vector Embeddings store
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings

from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

from sentence_transformers import SentenceTransformer
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

from openai import AzureOpenAI

# For exponential backoff
from tenacity import retry, wait_random_exponential, stop_after_attempt

### Load the environment variables


In [None]:
# Load the environment variables
# This cell defines the notebook-wide configuration (paths, collection name,
# Azure OpenAI settings, model names) that the later cells read.

# specify the name of the .env file name
# load_dotenv() reads this file so the os.getenv() calls below can see its entries.
env_path = ".env"
load_dotenv(dotenv_path=env_path)
# config = dotenv_values(env_name)
# NOTE(review): the commented-out line above refers to `env_name`, which is not
# defined in this cell (the variable here is `env_path`).

# Raw Data Path
# Directory holding the source documents that get ingested.
RAW_DATA_PATH = "./../data/raw/azure_docs/"

# DB_PATH = os.getenv("DB_PATH")
# Directory of the persistent ChromaDB store, and the collection used inside it.
DB_PATH = "./../data/processed/dbs/azure_docs/"
COLLECTION_NAME = "fn_markdown"

AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")


# Azure OpenAI Models
# The *_deployment_name values are the deployment names passed as `model=` to
# the Azure OpenAI client in later cells. The *_deployment_model values record
# which model each deployment is meant to serve; they are not referenced in the
# cells visible here.
embeddings_deployment_name = "nice"
embeddings_deployment_model = "text-embedding-ada-002"
completions_deployment_name = "hellno"
completions_deployment_model = "gpt-35-turbo-16k"

# Hugging Face Models
hf_model_name = "all-MiniLM-L6-v2"

# Echo the non-secret settings so a misconfigured .env is easy to spot.
print(f"AZURE_OPENAI_ENDPOINT: {os.getenv('AZURE_OPENAI_ENDPOINT')}")
print(f"AZURE_OPENAI_API_VERSION: {os.getenv('AZURE_OPENAI_API_VERSION')}")

# Fail fast when the API key is missing. Only the key is checked here; the
# endpoint and API version are printed above but not validated.
if AZURE_OPENAI_API_KEY is None:
    print("Please set the AZURE_OPENAI_API_KEY environment variable")
    raise EnvironmentError("Please set the AZURE_OPENAI_API_KEY environment variable")

In [None]:
import logging

# Configure the root logger once for the whole notebook: INFO and above, with a
# timestamp, the logger name and the level in every record.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

# Module-level logger. Log through this object (not the root-level
# `logging.info(...)` helpers) so records carry this module's name in %(name)s.
logger = logging.getLogger(__name__)

logger.info("Welcome to Miztiik Automation for Docs Copilot")

### Setup Hugging Face Embeddings & Test them


#### Setup Models Cache


In [None]:
# Setting the Hugging Face Model Cache
# Directory where downloaded Hugging Face models are kept; it is also passed
# as `cache_folder` when the SentenceTransformer model is created further down.
MODEL_CACHE_DIR = "./../models_cache/hf/"

# https://stackoverflow.com/questions/63312859/how-to-change-huggingface-transformers-default-cache-directory
# NOTE(review): this assignment runs after the sentence_transformers / langchain
# imports in the earlier cell. Hugging Face libraries may read HF_HOME at import
# time, in which case setting it here has no effect -- confirm, or move this
# above the imports.
os.environ["HF_HOME"] = MODEL_CACHE_DIR

In [None]:
# Smoke-test the Hugging Face embedding stack before any document is ingested.
sample_txt = ["This is an Miztikal World", "Lets rejoice to together"]

print("Refer Embedding Leaderboard: https://huggingface.co/spaces/mteb/leaderboard")
print(
    f"Refer Model Card: https://huggingface.co/sentence-transformers/{hf_model_name}"
)

# 1) Raw sentence-transformers model, cached under MODEL_CACHE_DIR.
hf_model = SentenceTransformer(
    f"sentence-transformers/{hf_model_name}", cache_folder=MODEL_CACHE_DIR
)
sample_txt_embeddings = hf_model.encode(sample_txt)

if sample_txt_embeddings is None:
    print("Unable to embedd the query")

# 2) LangChain wrapper used to embed document chunks during ingestion.
# The model name and cache folder are passed explicitly so that no other
# (default) model is downloaded and the same on-disk cache is reused.
embeddings_fn_by_hf = HuggingFaceEmbeddings(
    model_name=hf_model_name,
    cache_folder=MODEL_CACHE_DIR,
)

# Initialise the embedding fn for Chroma Document Level Embedding
embeddings_fn_4_collections = SentenceTransformerEmbeddings(
    model_name=hf_model_name,
    cache_folder=MODEL_CACHE_DIR,
)


sample_txt_embeddings = embeddings_fn_by_hf.embed_query(sample_txt[0])

if sample_txt_embeddings is None:
    print("Unable to embedd the query")
else:
    print("Successfully generated embeddings")

### Setup Vector Database


In [None]:
def delete_chromadb(db_path):
    """Remove the on-disk ChromaDB directory at ``db_path`` if there is one.

    Nothing happens when ``db_path`` does not exist or is not a directory.
    """
    # isdir() is already False for a path that does not exist.
    if not os.path.isdir(db_path):
        return

    print("Deleting existing ChromaDB at", db_path)
    shutil.rmtree(db_path)


def write_to_vec_store_collection(
    db_path, collection_name, docs_list, ids_list, embeddings_list, metadatas_list
):
    """Add one batch of pre-embedded documents to a persistent Chroma collection.

    Opens a ``chromadb.PersistentClient`` at ``db_path``, gets or creates the
    collection ``collection_name`` and adds the four lists in a single call.

    Args:
        db_path: Directory of the persistent Chroma store.
        collection_name: Name of the collection to write to.
        docs_list: Document texts.
        ids_list: Ids for the documents.
        embeddings_list: Embedding vectors for the documents.
        metadatas_list: Metadata dicts for the documents.
            (The caller builds these four lists in lockstep, one entry per chunk.)

    Returns:
        True once the batch has been added.

    Raises:
        Exception: Whatever the Chroma client raised; the message is printed
            and the exception is re-raised.
    """
    try:
        store_client = chromadb.PersistentClient(
            path=db_path,
        )
        target_collection = store_client.get_or_create_collection(
            name=collection_name
        )
        target_collection.add(
            documents=docs_list,
            ids=ids_list,
            embeddings=embeddings_list,
            metadatas=metadatas_list,
        )
    except Exception as e:
        print(f"ERROR: {str(e)}")
        raise

    return True

### Ingest Documents to ChromaDB


In [None]:
# Count Tokens in each document
def count_tokens(model_name, docs):
    """Return the token count of each document under ``model_name``'s tokenizer.

    Args:
        model_name: Model name passed to ``tiktoken.encoding_for_model``.
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A list with one integer per document, in input order. The list is
        also printed.
    """
    encoding = tiktoken.encoding_for_model(model_name)

    per_doc_counts = []
    for doc in docs:
        per_doc_counts.append(len(encoding.encode(doc.page_content)))

    print(per_doc_counts)
    return per_doc_counts

In [None]:
# Header levels recognised by the Markdown-aware splitter, as
# (markdown prefix, metadata key) pairs.
md_headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    md_headers_to_split_on, strip_headers=False
)

# Initialize text splitter and embeddings
chunk_size = 1000
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    # length_function=len,
    # Separators are tried in order, coarsest first. They are written as plain
    # literals (no regex) so they behave the same whether or not the splitter
    # treats separators as regular expressions:
    #   - "\n# " / "\n## " / "\n### " only match a header at the start of a
    #     line; a bare "#" would also split on any hash character in the text
    #     and would make the "##" / "###" entries unreachable.
    #   - ", " replaces the former "(?<=\, )" entry, which contained an invalid
    #     "\," escape and, taken literally, never matched anything.
    separators=["\n# ", "\n## ", "\n### ", "\n\n", "\n", ", "],
)


def get_files_with_extension(dir_path, doc_extension):
    """
    Collect the paths of all files under ``dir_path`` (sub-directories included)
    whose file name ends with ``doc_extension``.

    Returns a list of paths built by joining each walked directory with the
    file name; the list is empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dir_path)
        for name in names
        if name.endswith(doc_extension)
    ]


def ingest_docs_in_dir_to_chromadb(docs_path, doc_extension):
    """Chunk, embed and store every matching file directly inside ``docs_path``.

    Each file whose name ends with ``doc_extension`` is split with the
    module-level ``text_splitter`` and its chunks are embedded with
    ``embeddings_fn_by_hf``. The file is then written to the Chroma collection
    ``COLLECTION_NAME`` under ``DB_PATH`` as one batch. Sub-directories are
    not traversed.

    Args:
        docs_path: Directory containing the source documents.
        doc_extension: File-name suffix to ingest, e.g. ``".md"``.

    Raises:
        Exception: Any error raised while reading, splitting, embedding or
            storing a file is printed and re-raised, so ingestion stops at
            the first failing file.
    """
    ingested_count = 0

    # sorted() keeps the processing order deterministic; os.listdir() order
    # is arbitrary.
    for file in sorted(os.listdir(docs_path)):
        if not file.endswith(doc_extension):
            continue

        file_path = os.path.join(docs_path, file)
        print(f"Processing {file_path} file.")

        try:
            # Read as UTF-8 explicitly instead of relying on the platform
            # default, and close the file before the (slow) embedding step.
            with open(file_path, encoding="utf-8") as f:
                file_contents = f.read()

            file_chunks = text_splitter.split_text(file_contents)

            if not file_chunks:
                # Nothing to store for an empty file; sending empty lists to
                # the vector store would presumably be rejected.
                print(f"file: {file} has no content to ingest, skipping.")
                continue

            # Ids and metadata follow the chunk order: "<file name>_<index>".
            ids_list = [f"{file}_{i}" for i, _ in enumerate(file_chunks)]
            metadatas_list = [
                {"source": file, "chunk_id": i} for i, _ in enumerate(file_chunks)
            ]

            # Embed all chunks of the file in one call rather than one by one.
            embeddings_list = embeddings_fn_by_hf.embed_documents(file_chunks)

            # Ingest the documents into the vector store
            vs_ok = write_to_vec_store_collection(
                DB_PATH,
                COLLECTION_NAME,
                file_chunks,
                ids_list,
                embeddings_list,
                metadatas_list,
            )

            if not vs_ok:
                raise Exception(f"Error occurred while processing {file} file.")

        except Exception as e:
            print(f"Error occurred while processing {file} file.")
            print(str(e))
            raise

        ingested_count += 1
        print(f"file: {file} added to vector store.")

    # Report the files actually ingested, not every entry in the directory.
    print(f"{ingested_count} files added to vector store")

### Split the documents into chunks

![Miztiik Automation: Docs Copilot](../images/miztiik_automation_docs_copilot_using_llm_rag_02.png)

### Store the chunks in ChromaDB

![Miztiik Automation: Docs Copilot](../images/miztiik_automation_docs_copilot_using_llm_rag_03.png)


In [None]:
# Ingest every ".md" file located directly in RAW_DATA_PATH into the vector store.
ingest_docs_in_dir_to_chromadb(RAW_DATA_PATH, ".md")

# Load the Database from disk


In [None]:
# Re-open the persistent store at DB_PATH and get the collection that the
# ingestion step wrote to (it is created if it does not exist yet).
# NOTE(review): no embedding function is passed here, so text queries against
# this collection presumably use Chroma's default embedding function --
# confirm it matches the model used to embed the documents at ingest time.
vs_chroma_client = chromadb.PersistentClient(path=DB_PATH)
docs_collection = vs_chroma_client.get_or_create_collection(
    name=COLLECTION_NAME)

# Basic client checks: heartbeat and the list of known collections.
print(f"ChromaDB Heartbeat: {vs_chroma_client.heartbeat()}")
print(f"ChromaDB Collections: {vs_chroma_client.list_collections()}")


# Verify ChromaDB is setup correctly, by checking document count
print(f"ChromaDB has {docs_collection.count()} documents")

Confirm that the data was inserted by looking at the database


In [None]:
# docs_collection.peek(2)
# Render the result of peek(3) as a DataFrame for a quick visual check of
# what was stored.
pd.DataFrame(docs_collection.peek(3))

## Query the database


Query for text matching the query string

![Miztiik Automation: Docs Copilot](../images/miztiik_automation_docs_copilot_using_llm_rag_04.png)


In [None]:
def get_relevant_txt(query, docs_collection):
    """Return the single stored passage that best matches ``query``.

    Args:
        query: Free-text search string.
        docs_collection: Collection object exposing
            ``query(query_texts=..., n_results=..., include=...)``.

    Returns:
        The text of the top hit, or an empty string when the collection
        returns no hit (for example when it is empty).
    """
    result = docs_collection.query(
        query_texts=[query], n_results=1, include=["documents"]
    )
    # "documents" holds one list of hits per query text; only one query is sent.
    hits_per_query = result["documents"] or [[]]
    top_hits = hits_per_query[0]
    return top_hits[0] if top_hits else ""


def get_relevant_docs(query, docs_collection, n_results=5):
    """Query the collection for the passages most similar to ``query``.

    Args:
        query: Free-text search string.
        docs_collection: Collection object exposing
            ``query(query_texts=..., n_results=..., include=...)``.
        n_results: Maximum number of passages to request (default 5).

    Returns:
        The raw query result. Only documents are requested, so the matched
        texts are found under ``result["documents"][0]``.
    """
    return docs_collection.query(
        query_texts=[query], n_results=n_results, include=["documents"]
    )

In [None]:
# Perform embedding search
usr_query_1 = "How to configure Azure Functions with a virtual network"
usr_query = "What are Azure Functions"

matching_txt = get_relevant_txt(usr_query, docs_collection)

print(f"Matching Text: {matching_txt}")

In [None]:
# Fetch the top matching passages for the same question and print them.
matching_docs = get_relevant_docs(usr_query, docs_collection)

# matching_docs["documents"] holds one list of hits per query text; a single
# query was sent, so index 0 is the list of hits for `usr_query`.
print(f"Total docs found: {len(matching_docs['documents'][0])}")

# The "\033[..m" sequences are ANSI colour codes for the terminal output.
print("\033[31m" + "User Input:" + usr_query + "\033[0m")


# Print every hit followed by a coloured separator line.
for result in matching_docs["documents"]:
    for i in result:
        print(i)
        print("\033[32m" + "+++++++++++++++++++++++++++++++++++" + "\033[0m")

## Make a prompt to pass to GPT


Setup Azure OpenAI Client & Test


In [None]:
# Azure OpenAI client shared by the embedding and completion helpers below.
# All three settings are read from the environment, which was populated from
# the .env file in the configuration cell.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)


def embeddings_generator_az_oai(text, model="nice"):
    """Embed ``text`` through the module-level Azure OpenAI ``client``.

    Args:
        text: The string to embed.
        model: Name of the Azure OpenAI deployment to call (the deployment
            name, not the underlying model name).

    Returns:
        The ``embedding`` of the first item in the response data.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

Verify if Azure OpenAI Embeddings are generated


In [None]:
# Smoke test: embed one short sentence through the Azure OpenAI embeddings
# deployment and report whether a result came back.
sample_txt_embeddings = embeddings_generator_az_oai(
    "Welcome to Miztiikal World",
    # model should be set to the deployment name you chose when you deployed the text-embedding-ada-002 (Version 2) model
    model=embeddings_deployment_name,
)

if sample_txt_embeddings is None:
    print("No embeddings found")
else:
    print(f"Successfully generated embeddings")

In [None]:
# This function helps to ground the model with prompts and system instructions.


@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def generate_completion(usr_query, r_data, num_tokens=2000):
    """Ask the chat deployment to answer ``usr_query`` using only ``r_data``.

    The call is retried on any exception with a random exponential wait
    (between 1 and 20 seconds) for at most 10 attempts.

    Args:
        usr_query: The user's question.
        r_data: Retrieved context text (markdown) used to ground the answer.
        num_tokens: Kept for backward compatibility; it is not forwarded to
            the API at the moment (``max_tokens`` is left unset).

    Returns:
        The raw chat-completion response; the answer text is at
        ``resp.choices[0].message.content``.

    Raises:
        tenacity.RetryError: When every retry attempt has failed.
    """
    system_prompt = """You are an intelligent assistant for Microsoft Azure services.
    Use the following pieces of context to answer the question at the end. Question is enclosed in <question></question>.
    Do keep the following things in mind when answering the question:
        - If you don't know the answer, just say that you don't know, don't try to make up an answer.
        - Keep the answer as concise as possible.
        - Use only the context to answer the question. Context is enclosed in <context></context>
        - The context contains one or more paragraph of text that is formatted as markdown. When answering, remove the sentences from the markdown that contain markdown links.
        - If the answer is not found in context, simply output "I'm sorry but I do not know the answer to your question. Please visit Microsoft Learn (https://learn.microsoft.com) or ask a question on StackOverflow (https://stackoverflow.com/questions/tagged/azure)."
        - Do not include the code in output unless the question is asked to produce the code.
        """

    # The system prompt promises that the context arrives inside <context> tags
    # and the question inside <question> tags, with the question at the end, so
    # build the user turn exactly that way. The retrieved text is input for the
    # model, not something the assistant said, so it is not sent as an
    # "assistant" message.
    user_prompt = (
        f"<context>\n{r_data}\n</context>\n"
        f"<question>{usr_query}</question>"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    resp = client.chat.completions.create(
        model=completions_deployment_name,
        messages=messages,
        # max_tokens=num_tokens,
        temperature=0,
        stop="+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n",
    )

    return resp

## Single Query


In [None]:
# Retrieve the passages that best match the question.
matching_docs = get_relevant_docs(usr_query, docs_collection)

# Put a blank line between passages so that the end of one chunk does not run
# straight into the start of the next one in the prompt.
matching_docs_str = "\n\n".join(matching_docs["documents"][0])

# Ask the model to answer the question from the retrieved passages only.
resp = generate_completion(usr_query, matching_docs_str)

print(f"User_Query: \033[32m {usr_query} \033[0m")
print(f"Response: \033[36m {resp.choices[0].message.content} \033[0m")
print(f"total_tokens: \033[36m {resp.usage.total_tokens} \033[0m")

In [None]:
# Simple question/answer loop: retrieve matching passages, ask the model, and
# print the answer until the user types one of the exit words.
# Surrounding whitespace is stripped so that e.g. "stop " still ends the loop.
usr_query = input("Prompt: ").strip()


while usr_query.lower() not in ["end", "quit", "exit", "stop"]:
    if not usr_query:
        # Nothing was typed: ask again instead of querying with an empty string.
        usr_query = input("Question: ").strip()
        continue

    matching_docs = get_relevant_docs(usr_query, docs_collection)

    print(f"Total docs found: {len(matching_docs['documents'][0])}")

    # Put a blank line between passages so adjacent chunks do not run together.
    matching_docs_str = "\n\n".join(matching_docs["documents"][0])

    resp = generate_completion(usr_query, matching_docs_str)

    assistant_response = resp.choices[0].message.content

    print(f"total_tokens: \033[36m {resp.usage.total_tokens} \033[0m")
    print(f"User_Query: \033[32m {usr_query} \033[0m")
    print(f"Assistant: \033[36m {assistant_response} \033[0m")

    print(
        "\033[32m" + "How can I help you? - Type 'stop' when you are done." + "\033[0m"
    )

    usr_query = input("Question: ").strip()

## Clean up the database


In [None]:
# UNCOMMENT NEXT LINE TO SKIP THIS CELL EXECUTION
# %%script skipping --no-raise-error


# To cleanup, you can delete the collection
# delete_collection() has to be told which collection to drop. After this
# call `docs_collection` refers to a collection that no longer exists.
vs_chroma_client.delete_collection(name=COLLECTION_NAME)

# No explicit persist() call: the client was created as a PersistentClient,
# which is expected to write changes to disk on its own.


# Or just nuke the persist directory with the helper defined earlier

# delete_chromadb(DB_PATH)