# Cosmos DB NoSQL

This preprocessing notebook will guide you through chunking documents, embedding them, and uploading them to Cosmos DB NoSQL.

## Prerequisite
- [Create a Cosmos DB NoSQL database according to the Microsoft documentation](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/) in the Azure portal
- Add your connection string to the `.env` file at the root of the repository as `COSMOS_DB_NOSQL_CONN_STRING`. It should look something like \
    "AccountEndpoint=https://{INSERT}.documents.azure.com:443/;AccountKey={INSERT}"

In [None]:
import os
from abc import ABC, abstractmethod
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import pandas as pd

Please make sure you have already run the "../../../preprocessing/step0_data_preprocessor.ipynb" notebook to obtain DATA from the source (e.g. blob storage).

In [None]:
# Loader that matches every .docx file (recursively, via the "**/*.docx" glob)
# under the preprocessing DATA folder.
docx_loader = DirectoryLoader("../../../preprocessing/DATA", glob="**/*.docx")

In [None]:
# Load the matched files; each loaded item is later read through its
# .metadata["source"] and .page_content attributes.
docs = docx_loader.load()

In [None]:
import re


def extract_info_from_filename(filename):
    """
    Pull the stock symbol, fiscal year and fiscal quarter out of a transcript name.

    Input: filename such as "MSFTTranscriptFY23Q4" (it may be part of a longer path)
    Output: tuple (symbol, fiscal_year, fiscal_quarter) of strings,
        e.g. ("MSFT", "23", "4"), or None when the name does not contain
        the "<SYMBOL>TranscriptFY<YY>Q<Q>" pattern
    """
    found = re.search(r"([A-Z]+)TranscriptFY(\d{2})Q(\d)", filename)
    if found is None:
        return None

    # groups() yields the three captures in order: symbol, two-digit year, quarter digit
    return found.groups()

In [None]:
# Split every loaded document into chunks and attach the metadata parsed from
# its file name (symbol, fiscal year, fiscal quarter) plus the chunk index.

# The splitter configuration does not depend on the document, so it is built
# once instead of being re-created on every iteration.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n## ",
        "\n### ",
        "\n#### ",
        "\n##### ",
        "\n###### ",
        "```\n\n",
        "\n\n***\n\n",
        "\n\n---\n\n",
        "\n\n___\n\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]
)

doc_chunks = []

for doc in docs:
    source = doc.metadata["source"]

    # extract_info_from_filename returns None for names that do not follow the
    # "<SYMBOL>TranscriptFY<YY>Q<Q>" pattern; unpacking None directly would
    # abort the whole run with an opaque TypeError.
    file_info = extract_info_from_filename(source)
    if file_info is None:
        print(
            f"Skipping {source}: could not extract symbol, fiscal year and quarter from the file name."
        )
        continue
    symbol, fiscal_year, fiscal_quarter = file_info

    chunks = text_splitter.split_text(doc.page_content)
    for i, chunk in enumerate(chunks):
        # Use a separate name so the outer loop variable `doc` is not shadowed.
        chunk_doc = Document(
            page_content=chunk,
            metadata={
                "source": source,
                "symbol": symbol,
                "fiscal_year": fiscal_year,
                "fiscal_quarter": fiscal_quarter,
                "chunk": i,
            },
        )
        doc_chunks.append(chunk_doc)

In [None]:
# Display how many chunks were produced by the splitting step.
len(doc_chunks)

In [None]:
from dotenv import dotenv_values
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
import openai
import pandas as pd
import numpy as np
import time
import requests

# Path of the .env file that holds the notebook settings
env_name = "../../../../.env"  # change to your own .env file name
config = dotenv_values(env_name)

# KEYS_FROM selects where the OpenAI settings are read from: Azure Key Vault
# when it equals "KEYVAULT", otherwise the .env file itself.
if config["KEYS_FROM"] != "KEYVAULT":
    # Settings are taken directly from the .env file.
    openai.api_type = config["OPENAI_API_TYPE"]
    openai.api_key = config["OPENAI_API_KEY"]
    openai.api_base = config["OPENAI_API_BASE"]
    openai.api_version = config["OPENAI_API_VERSION"]
    deployment_embedding = config["OPENAI_DEPLOYMENT_EMBEDDING"]
else:
    print("keyvault was selected.")
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = f"https://{keyVaultName}.vault.azure.net"

    # Authenticate against the vault and read each setting as a secret.
    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)

    openai.api_type = client.get_secret("OPENAI-API-TYPE").value
    openai.api_key = client.get_secret("OPENAI-API-KEY").value
    openai.api_base = client.get_secret("OPENAI-API-BASE").value
    openai.api_version = client.get_secret("OPENAI-API-VERSION").value
    deployment_embedding = client.get_secret("OPENAI-DEPLOYMENT-EMBEDDING").value


def createEmbeddings(
    text, endpoint, api_key, api_version, embedding_model_deployment, timeout=None
):
    """
    Request embeddings for `text` from the embeddings endpoint of an OpenAI deployment.

    Args:
        text: value sent as the "input" field of the request payload.
        endpoint: base URL of the service; the deployment path is appended to it.
        api_key: key sent in the "api-key" request header.
        api_version: value of the "api-version" query parameter.
        embedding_model_deployment: name of the embedding deployment to call.
        timeout: passed through to requests.post. The default (None) keeps the
            original behaviour of waiting indefinitely for a response.

    Returns:
        A list with one embedding vector per entry of the response "data" array.

    Raises:
        Exception: if the service answers with any status other than HTTP 200.
    """
    request_url = f"{endpoint}/openai/deployments/{embedding_model_deployment}/embeddings?api-version={api_version}"
    headers = {"Content-Type": "application/json", "api-key": api_key}
    request_payload = {"input": text}
    embedding_response = requests.post(
        request_url, json=request_payload, headers=headers, timeout=timeout
    )
    if embedding_response.status_code != 200:
        # Report the raw body: error responses are not guaranteed to be JSON,
        # and calling .json() on them would raise a decoding error that hides
        # the actual failure.
        raise Exception(
            f"failed to get embedding: HTTP {embedding_response.status_code}: {embedding_response.text}"
        )

    data_values = embedding_response.json()["data"]
    return [data_value["embedding"] for data_value in data_values]

In [None]:
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from dotenv import dotenv_values

# Re-read the settings and resolve the Cosmos DB NoSQL connection string from
# the source selected by KEYS_FROM (Azure Key Vault or the .env file).
config = dotenv_values(env_name)


if config["KEYS_FROM"] != "KEYVAULT":
    print(".env was selected.")
    NOSQL_CONN_STRING = config["COSMOS_DB_NOSQL_CONN_STRING"]
else:
    print("keyvault was selected.")
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = f"https://{keyVaultName}.vault.azure.net"

    # Authenticate against the vault and read the connection string secret.
    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)
    NOSQL_CONN_STRING = client.get_secret("COSMOS-DB-NOSQL-CONN-STRING").value

In [None]:
# Display the loaded documents as the cell output.
docs

In [None]:
# Build one record per chunk, embedding both the chunk text and its source path.
data = []
for i, doc in enumerate(doc_chunks):
    # Create embeddings using the provided function
    content_embeddings = createEmbeddings(
        doc.page_content,
        openai.api_base,
        openai.api_key,
        openai.api_version,
        deployment_embedding,
    )[0]
    source_embeddings = createEmbeddings(
        doc.metadata["source"],
        openai.api_base,
        openai.api_key,
        openai.api_version,
        deployment_embedding,
    )[0]
    data.append(
        {
            # Cosmos DB item ids are strings.
            "id": str(i),
            "content": doc.page_content,
            # The vector property names must match the paths declared in the
            # vector embedding policy ("/contentVector", "/sourceVector").
            "contentVector": content_embeddings,
            "sourceVector": source_embeddings,
            "symbol": doc.metadata["symbol"],
            "fiscal_year": doc.metadata["fiscal_year"],
            "fiscal_quarter": doc.metadata["fiscal_quarter"],
            "source": doc.metadata["source"],
            "chunkid": doc.metadata["chunk"],
        }
    )

In [None]:
# Vector embedding policy handed to container creation: declares the two
# document properties that hold embedding vectors.
# NOTE(review): 1536 presumably matches the output size of the embedding
# deployment in use - confirm against the configured model.
# NOTE(review): the two paths use different distance functions ("dotproduct"
# vs "cosine") - confirm this is intentional.
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": "/contentVector",
            "dataType": "float32",
            "distanceFunction": "dotproduct",
            "dimensions": 1536,
        },
        {
            "path": "/sourceVector",
            "dataType": "float32",
            "distanceFunction": "cosine",
            "dimensions": 1536,
        },
    ]
}

Note: in the cell below we exclude `_etag` so that changes to it do not trigger re-indexing. We also exclude the source vector and content vector paths, since a change to either is usually accompanied by a change in source or content, which triggers re-indexing regardless.

In [None]:
# Indexing policy for the container: index every path except "_etag" and the
# two vector properties, and declare a vector index for each vector property.
indexing_policy = {
    "includedPaths": [{"path": "/*"}],
    "excludedPaths": [
        {"path": '/"_etag"/?'},
        # The excluded paths must use the same property names as the vector
        # paths below; with a different spelling the vectors are not excluded
        # and fall under the catch-all "/*" included path.
        {"path": "/sourceVector/*"},
        {"path": "/contentVector/*"},
    ],
    "vectorIndexes": [
        {"path": "/sourceVector", "type": "quantizedFlat"},
        {"path": "/contentVector", "type": "quantizedFlat"},
    ],
}

In [None]:
# Bundle of both policies; NOSQLDBService uses it as the default
# search_index_config and reads the two keys when creating the container.
SEARCH_INDEX_CONFIG = {
    "indexingPolicy": indexing_policy,
    "vectorEmbeddingPolicy": vector_embedding_policy,
}

In [None]:
from azure.cosmos import CosmosClient

# Shared Cosmos DB client built from the connection string resolved earlier;
# NOSQLDBService stores it as self.client.
COSMOS_NOSQL_CLIENT = CosmosClient.from_connection_string(NOSQL_CONN_STRING)

In [None]:
# Cosmos DB imports
from azure.cosmos import CosmosClient
from azure.cosmos.aio import CosmosClient as CosmosAsyncClient
from azure.cosmos import PartitionKey, exceptions
from abc import ABC, abstractmethod
import pandas as pd


class DatabaseService(ABC):
    """Abstract interface for a store that can persist records and answer queries."""

    @abstractmethod
    def store_data(self, data):
        """Persist `data`; concrete subclasses define the storage backend."""

    @abstractmethod
    def retrieve_data(self, query, num_results):
        """Return up to `num_results` results for `query`; implemented by subclasses."""


class NOSQLDBService(DatabaseService):
    """
    DatabaseService implementation backed by a Cosmos DB NoSQL container.

    Constructing an instance creates the database and the container if they
    do not exist yet, using the module-level COSMOS_NOSQL_CLIENT.
    """

    def __init__(
        self, db_name, container_name, search_index_config=SEARCH_INDEX_CONFIG
    ):
        """
        Args:
            db_name: id of the database to create or reuse.
            container_name: id of the container to create or reuse.
            search_index_config: dict with the keys "indexingPolicy" and
                "vectorEmbeddingPolicy" applied when the container is created.
        """
        self.db_name = db_name
        self.container_name = container_name
        self.search_index_config = search_index_config
        self.client = COSMOS_NOSQL_CLIENT
        self._create_db()
        self._create_container()

    def _create_db(self):
        """Create the database if needed and print its properties as JSON."""
        import json

        self.db = self.client.create_database_if_not_exists(id=self.db_name)
        self.db_properties = self.db.read()
        print(json.dumps(self.db_properties))

    def _create_container(self):
        """
        Create the container (partitioned on /id) with the configured policies.

        Raises:
            exceptions.CosmosHttpResponseError: if the container can be neither
                created nor found.
        """
        try:
            self.container = self.db.create_container_if_not_exists(
                id=self.container_name,
                partition_key=PartitionKey(path="/id", kind="Hash"),
                indexing_policy=self.search_index_config["indexingPolicy"],
                vector_embedding_policy=self.search_index_config[
                    "vectorEmbeddingPolicy"
                ],
            )
        except exceptions.CosmosResourceExistsError:
            print(f"Container {self.container_name} already exists.")
            self.container = self.db.get_container_client(self.container_name)
        except exceptions.CosmosHttpResponseError as e:
            print(f"Failed to create container {self.container_name}: {e}")
            # Re-raise: swallowing the error would leave self.container unset
            # and every later call would fail with an unrelated AttributeError.
            raise

    def create_schema(self, doc_chunks):
        """
        Turn chunk documents into the items stored in the container.

        Args:
            doc_chunks: iterable of chunk documents exposing .page_content and
                .metadata with the keys "source", "symbol", "fiscal_year",
                "fiscal_quarter" and "chunk".

        Returns:
            A list of dicts, one per chunk, holding the chunk text, its
            metadata, and embeddings of both the content and the source path.
        """
        data = []
        for i, doc in enumerate(doc_chunks):
            # Create embeddings using the provided function
            content_embeddings = createEmbeddings(
                doc.page_content,
                openai.api_base,
                openai.api_key,
                openai.api_version,
                deployment_embedding,
            )[0]
            source_embeddings = createEmbeddings(
                doc.metadata["source"],
                openai.api_base,
                openai.api_key,
                openai.api_version,
                deployment_embedding,
            )[0]
            data.append(
                {
                    "id": str(i),
                    "content": doc.page_content,
                    "contentVector": content_embeddings,
                    "sourceVector": source_embeddings,
                    "symbol": doc.metadata["symbol"],
                    "fiscal_year": doc.metadata["fiscal_year"],
                    "fiscal_quarter": doc.metadata["fiscal_quarter"],
                    "source": doc.metadata["source"],
                    "chunkid": doc.metadata["chunk"],
                    # NOTE(review): this key is stored as a plain property of
                    # the item; confirm it is still needed.
                    "@search.action": "upload",
                }
            )
        return data

    def store_data(self, data):
        """
        Upsert every item of `data` into the container.

        Best effort: a failing item is reported and the loop continues with
        the next one.
        """
        for item in data:
            print("writing item ", item["id"])
            try:
                self.container.upsert_item(item)
            except Exception as e:
                print(f"Error inserting item: {e}")

    def retrieve_data(self, query, num_results=3):
        """
        Run a vector similarity search for `query` over the stored content vectors.

        Args:
            query: text to embed and search for.
            num_results: value bound to the TOP clause of the query.

        Returns:
            The iterable returned by container.query_items; each result has
            the fields content, symbol, fiscal_year, fiscal_quarter and
            SimilarityScore.
        """
        # Embed the query text with the same deployment used for the stored items
        queryEmbedding = createEmbeddings(
            query,
            openai.api_base,
            openai.api_key,
            openai.api_version,
            deployment_embedding,
        )[0]
        # The vector property must be spelled as stored by create_schema and
        # as declared in the vector embedding policy: contentVector.
        output = self.container.query_items(
            query="SELECT TOP @num_results c.content, c.symbol, c.fiscal_year,c.fiscal_quarter, VectorDistance(c.contentVector,@embedding) AS SimilarityScore  FROM c ORDER BY VectorDistance(c.contentVector,@embedding)",
            parameters=[
                {"name": "@embedding", "value": queryEmbedding},
                {"name": "@num_results", "value": num_results},
            ],
            enable_cross_partition_query=True,
        )

        return output

In [None]:
# Instantiate the service; the constructor creates the database and the
# container if they do not exist yet.
testdb = NOSQLDBService(db_name="promptflow_sample", container_name="ms_transcripts")

In [None]:
# Display the number of chunks that are about to be embedded.
len(doc_chunks)

In [None]:
# Build the items to store (two embedding requests per chunk); this replaces
# any earlier value of `data`.
data = testdb.create_schema(doc_chunks)

In [None]:
# Display the number of items that were built.
len(data)

In [None]:
# Upsert every item into the container; failures are printed per item.
testdb.store_data(data)

In [None]:
# Run a sample similarity search using the default num_results of 3.
results = testdb.retrieve_data("what is the growth rate for azure ml revenue?")

In [None]:
# Print each search result with its position in the result list.
# enumerate binds `i` for this loop; without it the header would reuse a
# stale `i` left over from an earlier cell (or fail if none exists).
for i, result in enumerate(results):
    print(f"result {i}:\n\n")
    print(f"Content: {result['content']}")
    print(f"Symbol: {result['symbol']}")
    print(f"Fiscal Year: {result['fiscal_year']}")
    print(f"Fiscal Quarter: {result['fiscal_quarter']}")