# Docs Copilot – A Generative AI App for Searching Documentation

Github: https://github.com/miztiik/docs-copilot


In [None]:
# UNCOMMENT NEXT LINE TO SKIP THIS CELL EXECUTION
# %%script skipping --no-raise-error

# Install the dependencies for the project

%pip install --quiet numpy
%pip install --quiet openai
%pip install --quiet python-dotenv
%pip install --quiet tenacity
%pip install --quiet tiktoken 
%pip install --quiet --upgrade chromadb 
%pip install --quiet langchain

# For progress bar and process time
%pip install --quiet tqdm


In [13]:
import os
import shutil

import tiktoken

import pandas as pd

from dotenv import load_dotenv, dotenv_values

from tqdm.auto import tqdm

# For Vector Embeddings store
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings

from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

from sentence_transformers import SentenceTransformer
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

from openai import AzureOpenAI

# For exponential backoff
from tenacity import retry, wait_random_exponential, stop_after_attempt

### Load the environment variables


In [52]:
# Load the environment variables

# specify the name of the .env file name
# env_name = "./env/docs_copilot.env"
# config = dotenv_values(env_name)

load_dotenv()

# Raw Data Path
RAW_DATA_PATH = "./../data/raw/azure_docs/"

# DB_PATH = os.getenv("DB_PATH")
DB_PATH = "./../data/processed/dbs/azure_docs/"
COLLECTION_NAME = "fn_markdown"

AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")


# Azure OpenAI Models
embeddings_deployment_name = "nice"
embeddings_deployment_model = "text-embedding-ada-002"
completions_deployment_name = "hellno"
completions_deployment_model = "gpt-35-turbo-16k"

# Hugging Face Models
hf_model_name = "all-MiniLM-L6-v2"

print(f"AZURE_OPENAI_ENDPOINT: {os.getenv('AZURE_OPENAI_ENDPOINT')}")
print(f"AZURE_OPENAI_API_VERSION: {os.getenv('AZURE_OPENAI_API_VERSION')}")

if AZURE_OPENAI_API_KEY is None:
    print("Please set the AZURE_OPENAI_API_KEY environment variable")
    raise EnvironmentError("Please set the AZURE_OPENAI_API_KEY environment variable")

AZURE_OPENAI_ENDPOINT: https://eastus.api.cognitive.microsoft.com/
AZURE_OPENAI_API_VERSION: 2023-05-15


In [31]:
import logging

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

logger = logging.getLogger(__name__)

logging.info("Welcome to Miztiik Automation for Docs Copilot")

2024-01-21 15:40:45,328 - root - INFO - Welcome to Miztiik Automation for Docs Copilot


### Setup Hugging Face Embeddings & Test them


In [36]:
sample_txt = ["This is an Miztikal World", "Lets rejoice to together"]

# Inititalise the embedding
embeddings_fn_by_hf = HuggingFaceEmbeddings()

print(f"Refer Embedding Leaderboard: https://huggingface.co/spaces/mteb/leaderboard")
print(
    f"Refer Embedding Leaderboard: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
)
hf_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
sample_txt_embeddings = hf_model.encode(sample_txt)

if sample_txt_embeddings is None:
    print("Unable to embedd the query")

embeddings_fn_by_hf = HuggingFaceEmbeddings(model_name=hf_model_name)

# Inititalise the embedding fn for Chroma Document Level Embedding
embeddings_fn_4_collections = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2"
)


sample_txt_embeddings = embeddings_fn_by_hf.embed_query(sample_txt[0])

if sample_txt_embeddings is None:
    print(f"Unable to embedd the query")
else:
    print(f"Successfully generated embeddings")

2024-01-21 15:42:05,211 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2024-01-21 15:42:05,498 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu
2024-01-21 15:42:05,499 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2024-01-21 15:42:05,566 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu


Refer Embedding Leaderboard: https://huggingface.co/spaces/mteb/leaderboard
Refer Embedding Leaderboard: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-01-21 15:42:05,603 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-01-21 15:42:05,684 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu
2024-01-21 15:42:05,703 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-01-21 15:42:05,777 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Successfully generated embeddings


### Setup Vector Database


In [38]:
def delete_chromadb(db_path):
    if os.path.exists(db_path) and os.path.isdir(db_path):
        print("Deleting existing ChromaDB at", db_path)
        shutil.rmtree(db_path)


def write_to_vec_store_collection(
    db_path, collection_name, docs_list, ids_list, embeddings_list, metadatas_list
):
    docs_vs_status = False
    try:
        vs_client = chromadb.PersistentClient(
            path=db_path,
        )
        vs_collection = vs_client.get_or_create_collection(name=collection_name)

        vs_collection.add(
            documents=docs_list,
            ids=ids_list,
            embeddings=embeddings_list,
            metadatas=metadatas_list,
        )

        vs_client = None
        docs_vs_status = True
    except Exception as e:
        print(f"ERROR: {str(e)}")
        raise e
    return docs_vs_status

### Ingest Documents to ChromaDB


In [None]:
# Count Tokens in each document
def count_tokens(model_name, docs):
    token_count = 0
    tokenizer = tiktoken.encoding_for_model(model_name)
    token_count = [len(tokenizer.encode(d.page_content)) for d in docs]
    print(token_count)
    return token_count

In [24]:
md_headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    md_headers_to_split_on, strip_headers=False
)

chunk_size = 250
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    # length_function=len,
    separators=["\n\n", "\n", "(?<=\, )", " ", "", "#", "##", "###"],
)


# Initialize text splitter and embeddings
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)


def get_files_with_extension(dir_path, doc_extension):
    """
    Get a list of files in a directory (including subdirectories) matching a file extension.
    """
    files = []

    for dirpath, dirnames, filenames in os.walk(dir_path):
        for filename in filenames:
            if filename.endswith(doc_extension):
                files.append(os.path.join(dirpath, filename))
    return files


def ingest_docs_in_dir_to_chromadb(docs_path, doc_extension):
    # Process each file in the docs_path directory
    for file in os.listdir(docs_path):
        if file.endswith(doc_extension):
            file_path = os.path.join(docs_path, file)

            print(f"Processing {file_path} file.")

            with open(file_path) as f:
                try:
                    documents_list = []
                    ids_list = []
                    metadatas_list = []
                    embeddings_list = []

                    file_contents = f.read()

                    file_chunks = text_splitter.split_text(file_contents)

                    for i, file_chunk in enumerate(file_chunks):
                        documents_list.append(file_chunk)
                        ids_list.append(f"{file}_{i}")
                        metadatas_list.append({"source": file, "chunk_id": i})
                        # INGEST TO VECTOR STORE
                        doc_vectors = embeddings_fn_by_hf.embed_query(file_chunk)
                        embeddings_list.append(doc_vectors)

                    # Ingest the documents into the vector store
                    __vs_resp = write_to_vec_store_collection(
                        DB_PATH,
                        COLLECTION_NAME,
                        documents_list,
                        ids_list,
                        embeddings_list,
                        metadatas_list,
                    )

                    if not __vs_resp:
                        raise Exception(f"Error occurred while processing {file} file.")

                    print(f"file: {file} added to vector store.")

                except Exception as e:
                    print(f"Error occurred while processing {file} file.")
                    print(str(e))
                    raise e

    print(f"{len(os.listdir(docs_path))} files added to vector store")

In [None]:
ingest_docs_in_dir_to_chromadb(RAW_DATA_PATH, ".md")

# Load the Database from disk


In [55]:
vs_chroma_client = chromadb.PersistentClient(path=DB_PATH)
docs_collection = vs_chroma_client.get_or_create_collection(
    name=COLLECTION_NAME)

print(f"ChromaDB Heartbeat: {vs_chroma_client.heartbeat()}")
print(f"ChromaDB Collections: {vs_chroma_client.list_collections()}")


# Verify ChromaDB is setup correctly, by checking document count
print(f"ChromaDB has {docs_collection.count()} documents")

ChromaDB Heartbeat: 1705849036209515100
ChromaDB Collections: [Collection(name=fn_markdown)]
ChromaDB has 4585 documents


Confirm that the data was inserted by looking at the database


In [56]:
# docs_collection.peek(2)
pd.DataFrame(docs_collection.peek(3))

Unnamed: 0,ids,embeddings,metadatas,documents,uris,data
0,add-bindings-existing-function.md_0,"[-0.04316641017794609, -0.11902757734060287, -...","{'chunk_id': 0, 'source': 'add-bindings-existi...",---\ntitle: Connect functions to other Azure s...,,
1,add-bindings-existing-function.md_1,"[-0.008143081329762936, -0.0859667956829071, -...","{'chunk_id': 1, 'source': 'add-bindings-existi...",## Local development \n\nWhen you develo...,,
2,add-bindings-existing-function.md_2,"[-0.03155401349067688, -0.048116981983184814, ...","{'chunk_id': 2, 'source': 'add-bindings-existi...",### Manually add bindings based on examples\n\...,,


## Query the database


Query for text matching the query string


In [57]:
def get_relevant_txt(query, docs_collection):
    passage = docs_collection.query(query_texts=[query], n_results=1)[
        "documents"][0][0]
    return passage


def get_relevant_docs(query, docs_collection):
    docs = docs_collection.query(
        query_texts=[query], n_results=5, include=["documents"]
    )
    return docs

In [59]:
# Perform embedding search
usr_query_1 = "How to configure Azure Functions with a virtual network"
usr_query = "What are Azure Functions"

matching_txt = get_relevant_txt(usr_query, docs_collection)

print(f"Matching Text: {matching_txt}")

Matching Text: ---
title: Guidance for developing Azure Functions
description: Learn the Azure Functions concepts and techniques that you need to develop functions in Azure, across all programming languages and bindings.
ms.assetid: d8efe41a-bef8-4167-ba97-f3e016fcd39e
ms.topic: conceptual
ms.date: 09/06/2023
ms.custom: ignite-2022, devx-track-extended-java, devx-track-js, devx-track-python
zone_pivot_groups: programming-languages-set-functions
---

# Azure Functions developer guide

In Azure Functions, all functions share some core technical concepts and components, regardless of your preferred language or development environment. This article is language-specific. Choose your preferred language at the top of the article.

This article assumes that you've already read the [Azure Functions overview](functions-overview.md).


In [60]:
matching_docs = get_relevant_docs(usr_query, docs_collection)

print(f"Total docs found: {len(matching_docs['documents'][0])}")

print("\033[31m" + "User Input:" + usr_query + "\033[0m")


for result in matching_docs["documents"]:
    for i in result:
        print(i)
        print("\033[32m" + "+++++++++++++++++++++++++++++++++++" + "\033[0m")

Total docs found: 5
[31mUser Input:What are Azure Functions[0m
---
title: Guidance for developing Azure Functions
description: Learn the Azure Functions concepts and techniques that you need to develop functions in Azure, across all programming languages and bindings.
ms.assetid: d8efe41a-bef8-4167-ba97-f3e016fcd39e
ms.topic: conceptual
ms.date: 09/06/2023
ms.custom: ignite-2022, devx-track-extended-java, devx-track-js, devx-track-python
zone_pivot_groups: programming-languages-set-functions
---

# Azure Functions developer guide

In Azure Functions, all functions share some core technical concepts and components, regardless of your preferred language or development environment. This article is language-specific. Choose your preferred language at the top of the article.

This article assumes that you've already read the [Azure Functions overview](functions-overview.md).
[32m+++++++++++++++++++++++++++++++++++[0m
---
title: Azure Functions Scenarios 
description: Identify key scenar

## Make a prompt to pass to GPT


Setup Azure OpenAI Client & Test


In [61]:
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)


def embeddings_generator_az_oai(text, model="nice"):
    # model = "deployment_name"
    return client.embeddings.create(input=[text], model=model).data[0].embedding

Verify if Azure OpenAI Embeddings are generated


In [62]:
sample_txt_embeddings = embeddings_generator_az_oai(
    "Welcome to Miztiikal World",
    # model should be set to the deployment name you chose when you deployed the text-embedding-ada-002 (Version 2) model
    model=embeddings_deployment_name,
)

if sample_txt_embeddings is None:
    print("No embeddings found")
else:
    print(f"Successfully generated embeddings")

2024-01-21 16:01:12,762 - httpx - INFO - HTTP Request: POST https://eastus.api.cognitive.microsoft.com//openai/deployments/nice/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"


Successfully generated embeddings


In [63]:
# This function helps to ground the model with prompts and system instructions.


@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def generate_completion(usr_query, r_data, num_tokens=2000):
    system_prompt = """You are an intelligent assistant for Microsoft Azure services.
    Use the following pieces of context to answer the question at the end. Question is enclosed in <question></question>.
    Do keep the following things in mind when answering the question:
        - If you don't know the answer, just say that you don't know, don't try to make up an answer.
        - Keep the answer as concise as possible.
        - Use only the context to answer the question. Context is enclosed in <context></context>
        - The context contains one or more paragraph of text that is formatted as markdown. When answering, remove the sentences from the markdown that contain markdown links.
        - If the answer is not found in context, simply output "I'm sorry but I do not know the answer to your question. Please visit Microsoft Learn (https://learn.microsoft.com) or ask a question on StackOverflow (https://stackoverflow.com/questions/tagged/azure).
        - Do not include the code in output unless the question is asked to produce the code.
        """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": usr_query},
        {"role": "assistant", "content": r_data},
    ]

    # print("\033[32m----------------------------------------------\033[0m")
    # print(f"{messages}")
    # print("\033[32m----------------------------------------------\033[0m")

    resp = client.chat.completions.create(
        model=completions_deployment_name,
        messages=messages,
        # max_tokens=num_tokens,
        temperature=0,
        stop="+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n",
    )

    return resp

## Single Query


In [64]:
matching_docs = get_relevant_docs(usr_query, docs_collection)

matching_docs_str = "".join(matching_docs["documents"][0])

resp = generate_completion(usr_query, matching_docs_str)

print(f"User_Query: \033[32m {usr_query} \033[0m")
print(f"Response: \033[36m { resp.choices[0].message.content} \033[0m")
print(f"total_tokens: \033[36m { resp.usage.total_tokens} \033[0m")

2024-01-21 16:01:29,362 - httpx - INFO - HTTP Request: POST https://eastus.api.cognitive.microsoft.com//openai/deployments/hellno/chat/completions?api-version=2023-05-15 "HTTP/1.1 200 OK"


User_Query: [32m What are Azure Functions [0m
Response: [36m Azure Functions is a serverless computing service provided by Microsoft Azure. It allows developers to write and deploy small pieces of code, called functions, that can be triggered by events such as HTTP requests, database changes, or scheduled timers. Azure Functions abstracts away the underlying infrastructure, allowing developers to focus on writing the code that matters most to them. Functions can be written in various programming languages and can integrate with other Azure services to provide feature-rich implementations. [0m
total_tokens: [36m 1126 [0m


In [65]:
usr_query = input("Prompt: ")


while usr_query.lower() not in ["end", "quit", "exit", "stop"]:
    if usr_query.lower() == "stop":
        break

    matching_docs = get_relevant_docs(usr_query, docs_collection)

    print(f"Total docs found: {len(matching_docs['documents'][0])}")

    matching_docs_str = "".join(matching_docs["documents"][0])

    # print("\033[32m----------------------------------------------\033[0m")
    # print(f"{matching_docs}")
    # print("\033[32m----------------------------------------------\033[0m")

    resp = generate_completion(usr_query, matching_docs_str)

    assistant_response = resp.choices[0].message.content

    print(f"total_tokens: \033[36m { resp.usage.total_tokens} \033[0m")
    print(f"User_Query: \033[32m {usr_query} \033[0m")
    print(f"Assistant: \033[36m { assistant_response} \033[0m")

    print(
        "\033[32m" + "How can I help you? - Type 'stop' when you are done." + "\033[0m"
    )

    usr_query = input("Question: ")

Total docs found: 5


2024-01-21 16:02:21,464 - httpx - INFO - HTTP Request: POST https://eastus.api.cognitive.microsoft.com//openai/deployments/hellno/chat/completions?api-version=2023-05-15 "HTTP/1.1 200 OK"


User_Query: [32m trigger blob to functions [0m
Assistant: [36m To trigger a function when a new or updated blob is detected in Azure Blob storage, you can use the Blob storage trigger in Azure Functions. This trigger starts a function whenever a blob is added or updated in a specified container. The blob contents can be provided as input to the function.

Here is an example of how to use the Blob storage trigger in Python:

```python
import logging
import azure.functions as func

@app.blob_trigger(name="BlobTrigger", path="container-name/{blobname}", connection="AzureWebJobsStorage")
def process_blob(myblob: func.InputStream):
    logging.info(f"Blob trigger function processed blob \n"
                 f"Name: {myblob.name}\n"
                 f"Blob Size: {myblob.length} bytes")
```

In this example, the `@app.blob_trigger` decorator is used to define the function as a blob trigger. The `name` parameter specifies the name of the function, the `path` parameter specifies the path pat

## Clean up the database


In [None]:
# UNCOMMENT NEXT LINE TO SKIP THIS CELL EXECUTION
# %%script skipping --no-raise-error


# To cleanup, you can delete the collection

vs_chroma_client.delete_collection()

vs_chroma_client.persist()


# Or just nuke the persist directory

# rm -rf data/processed/dbs/azure_docs/