In [None]:
import os
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from abc import ABC, abstractmethod

Prerequisite: run the `../../../preprocessing/step0_data_preprocessor.ipynb` notebook first so that the `DATA` folder has been fetched from the source (e.g. Blob Storage).

In [None]:
# Loader over every .docx file matched by the glob under the preprocessed DATA folder.
docx_loader = DirectoryLoader("../../../preprocessing/DATA", glob="**/*.docx")

In [None]:
# Load the matched files; the chunking cell reads `page_content` and
# `metadata["source"]` from each loaded item.
docs = docx_loader.load()

In [None]:
import re


def extract_info_from_filename(filename):
    """
    Parse stock symbol, fiscal year and fiscal quarter out of a transcript name.

    Input: filename (or path) containing e.g. "MSFTTranscriptFY23Q4"
    Output: tuple of strings (symbol, fiscal_year, fiscal_quarter),
            e.g. ("MSFT", "23", "4"); None when the name does not match
    """
    found = re.search(r"([A-Z]+)TranscriptFY(\d{2})Q(\d)", filename)

    # Guard clause: nothing recognisable in the name.
    if not found:
        return None

    # The three capture groups come back in pattern order:
    # symbol, two-digit fiscal year, single-digit quarter.
    return found.groups()

In [None]:
# Separator list, ordered from markdown headings down to single characters.
# The splitter holds no per-document state, so it is built once rather than
# once per loop iteration.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n## ",
        "\n### ",
        "\n#### ",
        "\n##### ",
        "\n###### ",
        "```\n\n",
        "\n\n***\n\n",
        "\n\n---\n\n",
        "\n\n___\n\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]
)

# One Document per chunk, carrying the metadata parsed from the file name.
doc_chunks = []

for doc in docs:
    source = doc.metadata["source"]

    # extract_info_from_filename returns None for names that do not follow the
    # "<SYMBOL>TranscriptFY<YY>Q<Q>" convention; unpacking that directly would
    # fail with an opaque TypeError, so report the file and move on.
    file_info = extract_info_from_filename(source)
    if file_info is None:
        print(f"Skipping {source!r}: file name does not match <SYMBOL>TranscriptFY<YY>Q<Q>.")
        continue
    symbol, fiscal_year, fiscal_quarter = file_info

    chunks = text_splitter.split_text(doc.page_content)
    for i, chunk in enumerate(chunks):
        # Appended directly (no intermediate `doc = ...`) so the outer loop
        # variable is not shadowed.
        doc_chunks.append(
            Document(
                page_content=chunk,
                metadata={
                    "source": source,
                    "symbol": symbol,
                    "fiscal_year": fiscal_year,
                    "fiscal_quarter": fiscal_quarter,
                    "chunk": i,  # position of the chunk within its source file
                },
            )
        )

In [None]:
# Sanity check: total number of chunks collected across all documents.
len(doc_chunks)

In [None]:
from dotenv import dotenv_values
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
import openai
import pandas as pd
import numpy as np
import time
import requests

# Location of the .env file; its KEYS_FROM entry selects where secrets come from.
env_name = "../../../../.env"  # change to your own .env file name
config = dotenv_values(env_name)

if config["KEYS_FROM"] != "KEYVAULT":
    # Secrets are stored directly in the .env file.
    openai.api_type = config["OPENAI_API_TYPE"]
    openai.api_key = config["OPENAI_API_KEY"]
    openai.api_base = config["OPENAI_API_BASE"]
    openai.api_version = config["OPENAI_API_VERSION"]
    deployment_embedding = config["OPENAI_DEPLOYMENT_EMBEDDING"]
else:
    # Secrets are read from the Key Vault named in the .env file.
    print("keyvault was selected.")
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = "https://{}.vault.azure.net".format(keyVaultName)

    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)

    openai.api_type = client.get_secret("OPENAI-API-TYPE").value
    openai.api_key = client.get_secret("OPENAI-API-KEY").value
    openai.api_base = client.get_secret("OPENAI-API-BASE").value
    openai.api_version = client.get_secret("OPENAI-API-VERSION").value
    deployment_embedding = client.get_secret("OPENAI-DEPLOYMENT-EMBEDDING").value


def createEmbeddings(
    text, endpoint, api_key, api_version, embedding_model_deployment, timeout=60
):
    """Request embedding vectors for ``text`` from an embeddings deployment.

    Args:
        text: Value sent as the ``input`` field of the request body.
        endpoint: Base URL of the service; a trailing "/" is tolerated.
        api_key: Key sent in the ``api-key`` request header.
        api_version: Value of the ``api-version`` query parameter.
        embedding_model_deployment: Deployment name placed in the URL path.
        timeout: Seconds passed to ``requests.post`` as its timeout. Defaults
            to 60 so a stalled connection cannot hang the notebook forever;
            pass ``None`` to wait indefinitely.

    Returns:
        A list with one embedding vector per entry of the response's
        ``data`` array.

    Raises:
        Exception: If the service answers with a status other than 200. The
            message carries the status code and the response body.
    """
    request_url = (
        f"{endpoint.rstrip('/')}/openai/deployments/{embedding_model_deployment}"
        f"/embeddings?api-version={api_version}"
    )
    headers = {"Content-Type": "application/json", "api-key": api_key}
    request_payload = {"input": text}
    embedding_response = requests.post(
        request_url, json=request_payload, headers=headers, timeout=timeout
    )

    if embedding_response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. gateway HTML pages);
        # fall back to the raw text so the real failure is not masked by a
        # decoding error.
        try:
            error_detail = embedding_response.json()
        except ValueError:
            error_detail = embedding_response.text
        raise Exception(
            f"failed to get embedding (HTTP {embedding_response.status_code}): {error_detail}"
        )

    data_values = embedding_response.json()["data"]
    return [data_value["embedding"] for data_value in data_values]

In [None]:
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from dotenv import dotenv_values

# Resolve the Cosmos DB (Mongo API) connection string from the secret source
# selected by KEYS_FROM in the .env file.
config = dotenv_values(env_name)

if config["KEYS_FROM"] == "KEYVAULT":
    print("keyvault was selected.")
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = f"https://{keyVaultName}.vault.azure.net"

    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)
    # Keep this a plain string: a trailing comma after `.value` would wrap the
    # URI in a 1-tuple, so the two branches would yield different types.
    MONGO_CONN = client.get_secret("COSMOS-DB-MONGO-URI").value
else:
    print(".env was selected.")
    MONGO_CONN = config["COSMOS_DB_MONGO_URI"]

In [None]:
import pymongo
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from abc import ABC, abstractmethod


class DatabaseService(ABC):
    """Abstract storage contract: concrete services must implement both methods."""

    @abstractmethod
    def store_data(self, data):
        """Persist ``data`` in the backing store (implemented by subclasses)."""

    @abstractmethod
    def retrieve_data(self, query, num_results):
        """Look up records for ``query``; ``num_results`` is the requested result count."""


# Default name of the vector search index created on the collection.
SEARCH_INDEX_NAME = "vectorSearchIndex"


class MongoDBService(DatabaseService):
    """DatabaseService backed by a MongoDB-API collection with a ``cosmosSearch`` vector index.

    Constructing an instance connects with the module-level ``MONGO_CONN``,
    creates the collection when it is missing, and issues ``createIndexes``
    commands for the vector index and the metadata filter indexes.
    """

    def __init__(
        self,
        db_name,
        collection_name,
        search_index_name=SEARCH_INDEX_NAME,
        vector_dimensions=1536,
        num_lists=1,
        similarity="COS",
    ):
        """Connect and make sure the collection and its indexes exist.

        Args:
            db_name: Name of the database to use.
            collection_name: Name of the collection holding the chunks.
            search_index_name: Name given to the vector search index.
            vector_dimensions: ``dimensions`` option of the vector index; it
                has to equal the length of the vectors stored in
                ``contentVector``.
            num_lists: ``numLists`` option of the ``vector-ivf`` index.
            similarity: ``similarity`` option of the vector index.
        """
        self.db_name = db_name
        self.collection_name = collection_name
        self.search_index_name = search_index_name
        self.vector_dimensions = vector_dimensions
        self.num_lists = num_lists
        self.similarity = similarity
        self.client = MongoClient(MONGO_CONN)
        self.db = self.client[self.db_name]
        self.collection = self.db[self.collection_name]
        self._create_database()
        self._create_search_index()
        self._create_filter_search_index()

    def _create_schema(self, docs):
        """Turn chunk documents into insert-ready records, embedding each chunk.

        Args:
            docs: Iterable of objects exposing ``page_content`` and a
                ``metadata`` mapping with the keys ``symbol``,
                ``fiscal_year``, ``fiscal_quarter``, ``source`` and ``chunk``.

        Returns:
            A list of dicts, one per input document, in input order.
        """
        data = []
        for i, doc in enumerate(docs):
            # One embedding request per chunk; createEmbeddings returns a list
            # of vectors, of which the first one is stored.
            embeddings = createEmbeddings(
                doc.page_content,
                openai.api_base,
                openai.api_key,
                openai.api_version,
                deployment_embedding,
            )[0]
            data.append(
                {
                    "id": i,
                    "content": doc.page_content,
                    "contentVector": embeddings,
                    "symbol": doc.metadata["symbol"],
                    "fiscal_year": doc.metadata["fiscal_year"],
                    "fiscal_quarter": doc.metadata["fiscal_quarter"],
                    "source": doc.metadata["source"],
                    "chunkid": doc.metadata["chunk"],
                }
            )
        return data

    def _create_database(self):
        """Create the collection if it does not exist yet and report which case applied."""
        if self.collection_name not in self.db.list_collection_names():
            self.db.create_collection(self.collection_name)
            print("Created collection '{}'.".format(self.collection_name))
        else:
            print("Using collection: '{}'.".format(self.collection_name))

    def _create_search_index(self):
        """Issue the ``createIndexes`` command for the vector index on ``contentVector``."""
        self.db.command(
            {
                "createIndexes": self.collection_name,
                "indexes": [
                    {
                        "name": self.search_index_name,
                        "key": {"contentVector": "cosmosSearch"},
                        "cosmosSearchOptions": {
                            "kind": "vector-ivf",
                            "numLists": self.num_lists,
                            "similarity": self.similarity,
                            "dimensions": self.vector_dimensions,
                        },
                    }
                ],
            }
        )

    def _create_filter_search_index(self):
        """Issue one ``createIndexes`` command adding an ascending index per metadata field."""
        filter_fields = ("symbol", "fiscal_year", "fiscal_quarter")
        self.db.command(
            {
                "createIndexes": self.collection_name,
                "indexes": [
                    {"key": {field: 1}, "name": f"{field}_filter"}
                    for field in filter_fields
                ],
            }
        )

    def _drop_data(self):
        """Drop the vector search index, then the whole database."""
        self.collection.drop_index(self.search_index_name)
        self.client.drop_database(self.db_name)

    def store_data(self, data):
        """Insert ``data`` (a list of records) into the collection.

        Failures are reported with ``print`` instead of being raised. An empty
        batch is reported as such rather than as an insert failure.
        """
        if not data:
            print("No data to insert.")
            return
        try:
            self.collection.insert_many(data)
            print("Data inserted successfully.")
        except Exception as e:
            print("Failed to insert data: ", e)

    def insert_one(self, entry):
        """Insert a single record; errors propagate to the caller."""
        self.collection.insert_one(entry)

    def retrieve_data(self, query, num_results=1):
        """Run a vector search against ``contentVector``.

        Args:
            query: Query vector passed as the ``vector`` of the search stage.
            num_results: Value passed as ``k`` of the search stage.

        Returns:
            The aggregation results as a list. When the query fails the error
            is printed and an empty list is returned, so the return type is
            the same on both paths.
        """
        pipeline = [
            {
                "$search": {
                    "cosmosSearch": {
                        "vector": query,
                        "path": "contentVector",
                        "k": num_results,
                    },
                    "returnStoredSource": True,
                }
            }
        ]
        try:
            return list(self.collection.aggregate(pipeline))
        except Exception as e:
            print("Search query failed: ", e)
            return []

In [None]:
# Stand-alone client on the same connection string.
# NOTE(review): no later cell in view uses `client`; MongoDBService opens its
# own MongoClient in __init__ — confirm whether this cell is still needed.
client = MongoClient(MONGO_CONN)

In [None]:
# Instantiating the service also creates the collection if it is missing and
# issues the createIndexes commands for the vector and filter indexes.
mongodb = MongoDBService(db_name="earning_calls", collection_name="transcript_filter_vector")

In [None]:
# Build the records to insert; this makes one embedding request per chunk.
data = mongodb._create_schema(doc_chunks)

In [None]:
# Bulk-insert the records; store_data prints the outcome instead of raising.
mongodb.store_data(data)

In [None]:
# Display the collection handle the service writes to.
mongodb.collection

In [None]:
# Reuse the service's database handle for ad-hoc inspection.
mydb = mongodb.db

In [None]:
# Handle to the same collection the service uses, looked up by name.
col = mydb[mongodb.collection_name]

In [None]:
# Print each index definition to confirm the vector and filter indexes exist.
for index_spec in col.list_indexes():
    print(index_spec)