In [8]:
import os
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from abc import ABC, abstractmethod

In [9]:
# Loader over ./DATA; the '**/*.docx' glob selects .docx files at any directory depth.
docx_loader = DirectoryLoader('./DATA', glob='**/*.docx')

In [10]:
# Load the matched files; later cells read each item's `metadata["source"]` and `page_content`.
docs = docx_loader.load()

In [11]:
import re
def extract_info_from_filename(filename):
    '''
    Parse the stock symbol, fiscal year and fiscal quarter out of a transcript name.

    Input: filename, e.g. "MSFTTranscriptFY23Q4". The pattern is searched for
           anywhere in the string, so a longer path containing such a name
           also matches.
    Output: tuple of strings (symbol, fiscal_year, fiscal_quarter), e.g.
            ("MSFT", "23", "4"), or None when the pattern is not found.
    '''
    found = re.search(r'([A-Z]+)TranscriptFY(\d{2})Q(\d)', filename)

    # Guard clause: callers must be prepared for a None result.
    if found is None:
        return None

    # The three capture groups are exactly symbol, two-digit year and quarter digit.
    return found.groups()

In [12]:
# Split every loaded transcript into chunks and tag each chunk with the
# symbol / fiscal year / fiscal quarter parsed from its source file name.
doc_chunks = []

# The splitter configuration is identical for every document, so build it once
# instead of on every loop iteration.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n## ",
        "\n### ",
        "\n#### ",
        "\n##### ",
        "\n###### ",
        "```\n\n",
        "\n\n***\n\n",
        "\n\n---\n\n",
        "\n\n___\n\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]
)

for doc in docs:
    source = doc.metadata["source"]

    # extract_info_from_filename returns None for names that do not follow the
    # <SYMBOL>TranscriptFY<YY>Q<N> convention; unpacking None directly would
    # abort the whole cell with an opaque TypeError, so skip such files instead.
    file_info = extract_info_from_filename(source)
    if file_info is None:
        print(f"Skipping '{source}': file name does not match <SYMBOL>TranscriptFY<YY>Q<N>.")
        continue
    symbol, fiscal_year, fiscal_quarter = file_info

    chunks = text_splitter.split_text(doc.page_content)
    for i, chunk in enumerate(chunks):
        # Use a separate name for the chunk document so the loop variable `doc`
        # is not rebound inside its own loop.
        chunk_doc = Document(
            page_content=chunk,
            metadata={"source": source, "symbol": symbol, "fiscal_year": fiscal_year, "fiscal_quarter": fiscal_quarter, "chunk": i}
        )
        doc_chunks.append(chunk_doc)

In [15]:
# Total number of chunks produced across all loaded documents.
len(doc_chunks)

60

In [16]:
# Spot-check one chunk: first its metadata dict, then its text.
print(doc_chunks[2].metadata)
print(doc_chunks[2].page_content)

{'source': 'DATA\\MSFTTranscriptFY23Q1.docx', 'symbol': 'MSFT', 'fiscal_year': '23', 'fiscal_quarter': '1', 'chunk': 2}
CBRE is optimizing its field service operations, gaining cost efficiencies. Darden is using our solutions to increase both guest frequency and spend at its restaurants. And, Tillamook is scaling its growth and improving supply chain visibility. 



All-up, more than 400,000 organizations now use our business applications. 

Now, on to industry solutions. 

We’re seeing increased adoption of our industry and cross-industry clouds. 

 Bank of Queensland chose our Cloud for Financial Services to deliver new digital experiences for its customers. Our Cloud for Sustainability is off to a fast start, as organizations like Telstra use the solution to track their environmental footprint. 

New updates provide insights on hard to measure Scope 3 carbon emissions. 

And we’re seeing record growth in healthcare, driven in part by our Nuance DAX ambient intelligence solution, whi

In [17]:
from dotenv import dotenv_values
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
import openai
import pandas as pd
import numpy as np
import time
import requests

# Path of the .env file that drives the configuration below.
env_name = "../../.env" # change to your own .env file name
config = dotenv_values(env_name)

if config['KEYS_FROM'] != "KEYVAULT":
    # Settings come straight from the .env file.
    openai.api_type = config["OPENAI_API_TYPE"]
    openai.api_key = config["OPENAI_API_KEY"]
    openai.api_base = config["OPENAI_API_BASE"]
    openai.api_version = config["OPENAI_API_VERSION"]
    deployment_embedding = config["OPENAI_DEPLOYMENT_EMBEDDING"]
else:
    # Settings come from the Key Vault named in the .env file.
    print('keyvault was selected.')
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = f"https://{keyVaultName}.vault.azure.net"

    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)

    # Each openai module attribute is filled from the secret of the matching name,
    # fetched in the same order as listed here.
    for _attribute, _secret_name in (
        ("api_type", "OPENAI-API-TYPE"),
        ("api_key", "OPENAI-API-KEY"),
        ("api_base", "OPENAI-API-BASE"),
        ("api_version", "OPENAI-API-VERSION"),
    ):
        setattr(openai, _attribute, client.get_secret(_secret_name).value)
    deployment_embedding = client.get_secret("OPENAI-DEPLOYMENT-EMBEDDING").value


def createEmbeddings(text, endpoint, api_key, api_version, embedding_model_deployment, timeout=60):
    """Request embeddings for ``text`` from an Azure OpenAI embedding deployment.

    Args:
        text: Value sent as the ``input`` field of the request body.
        endpoint: Base URL of the resource. A trailing ``/`` is tolerated so
            the request URL never contains ``//openai``.
        api_key: Sent in the ``api-key`` request header.
        api_version: Value of the ``api-version`` query parameter.
        embedding_model_deployment: Deployment name placed in the URL path.
        timeout: Seconds passed to ``requests.post``. Defaults to 60 so a
            stalled connection cannot hang the notebook forever; pass ``None``
            to wait indefinitely.

    Returns:
        list: The ``embedding`` value of every item in the response's ``data``
        array, in response order.

    Raises:
        RuntimeError: If the service answers with a status code other than 200.
    """
    request_url = (
        f"{endpoint.rstrip('/')}/openai/deployments/{embedding_model_deployment}"
        f"/embeddings?api-version={api_version}"
    )
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key
    }
    request_payload = {
        'input': text
    }
    embedding_response = requests.post(request_url, json=request_payload, headers=headers, timeout=timeout)
    if embedding_response.status_code == 200:
        data_values = embedding_response.json()["data"]
        return [data_value["embedding"] for data_value in data_values]

    # Error bodies are not guaranteed to be JSON (e.g. gateway HTML pages);
    # fall back to the raw text so the real failure is never masked by a
    # JSON decoding error.
    try:
        error_detail = embedding_response.json()
    except ValueError:
        error_detail = embedding_response.text
    raise RuntimeError(f"failed to get embedding: {error_detail}")





keyvault was selected.


In [19]:
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from dotenv import dotenv_values

# Resolve the MongoDB connection string either from Key Vault or from the .env file,
# depending on the KEYS_FROM setting.
config = dotenv_values('../../.env')

if config['KEYS_FROM'] == "KEYVAULT":
    print('keyvault was selected.')
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = f"https://{keyVaultName}.vault.azure.net"

    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)
    # No trailing comma here: a trailing comma would turn MONGO_CONN into a
    # 1-tuple instead of the connection string.
    # NOTE(review): Key Vault secret names allow only letters, digits and '-',
    # and the OpenAI secrets in this notebook use dashes; this underscore name
    # probably needs to be "COSMOS-DB-MONGODB-URI" -- confirm against the vault.
    MONGO_CONN = client.get_secret("COSMOS_DB_MONGODB_URI").value
else:
    print('.env was selected.')
    MONGO_CONN = config["COSMOS_DB_URI"]


.env was selected.


In [20]:
import pymongo
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from abc import ABC, abstractmethod


class DatabaseService(ABC):
    """Minimal storage interface: persist records and query them back."""

    @abstractmethod
    def store_data(self, data):
        """Persist ``data`` in the backing store."""
        pass

    @abstractmethod
    def retrieve_data(self, query, num_results):
        """Return up to ``num_results`` stored entries matching ``query``."""
        pass


SEARCH_INDEX_NAME = 'vectorSearchIndex'


class MongoDBService(DatabaseService):
    """DatabaseService backed by a MongoDB collection with a ``cosmosSearch`` vector index.

    Constructing an instance connects to the server, makes sure the collection
    exists and issues the ``createIndexes`` command for the vector index.
    """

    # Similarity metrics accepted by _create_search_index.
    _SIMILARITY_METRICS = ('COS', 'L2', 'IP')

    def __init__(self, db_name, collection_name, search_index_name=SEARCH_INDEX_NAME, connection_string=None):
        """Connect and prepare the collection and its vector index.

        Args:
            db_name: Name of the database to use.
            collection_name: Name of the collection to use (created if missing).
            search_index_name: Name given to the vector search index.
            connection_string: MongoDB URI. When omitted, the notebook-level
                ``MONGO_CONN`` value is used, as before.
        """
        self.db_name = db_name
        self.collection_name = collection_name
        self.search_index_name = search_index_name
        self.client = MongoClient(connection_string if connection_string is not None else MONGO_CONN)
        self.db = self.client[self.db_name]
        self.collection = self.db[self.collection_name]
        self._create_database()
        self._create_search_index()

    def _create_schema(self, docs):
        """Turn chunk documents into records ready for ``store_data``.

        Args:
            docs: Iterable of objects exposing ``page_content`` and a
                ``metadata`` mapping with the keys ``symbol``, ``fiscal_year``,
                ``fiscal_quarter``, ``source`` and ``chunk``.

        Returns:
            list[dict]: One record per document; ``id`` is the document's
            position in ``docs`` and ``contentVector`` its embedding.

        Note:
            One ``createEmbeddings`` request is issued per document, using the
            notebook-level ``openai`` settings and ``deployment_embedding``.
        """
        data = []
        for i, doc in enumerate(docs):
            # createEmbeddings returns a list of vectors; a single input yields
            # a single vector, hence the [0].
            embeddings = createEmbeddings(doc.page_content,
                    openai.api_base,
                    openai.api_key,
                    openai.api_version,
                    deployment_embedding
                )[0]
            data.append({
                'id': i,
                'content': doc.page_content,
                'contentVector': embeddings,
                'symbol': doc.metadata['symbol'],
                'fiscal_year': doc.metadata['fiscal_year'],
                'fiscal_quarter': doc.metadata['fiscal_quarter'],
                'source': doc.metadata['source'],
                'chunkid': doc.metadata['chunk']
            })
        return data

    def _create_database(self):
        """Create the collection when it does not exist yet and report which case applied."""
        if self.collection_name not in self.db.list_collection_names():
            self.db.create_collection(self.collection_name)
            print("Created collection '{}'.".format(self.collection_name))
        else:
            print("Using collection: '{}'.".format(self.collection_name))

    def _create_search_index(self, similarity='COS', dimensions=1536, num_lists=1):
        """Issue ``createIndexes`` for the ``vector-ivf`` index on ``contentVector``.

        Args:
            similarity: Similarity metric, one of ``'COS'``, ``'L2'`` or ``'IP'``.
            dimensions: Number of dimensions declared for the indexed vectors.
            num_lists: Value sent as ``numLists`` in the index options.

        Raises:
            ValueError: If ``similarity`` is not one of the supported metrics.

        The defaults reproduce the previously hard-coded options.
        """
        if similarity not in self._SIMILARITY_METRICS:
            raise ValueError(
                "similarity must be one of {}, got {!r}".format(self._SIMILARITY_METRICS, similarity)
            )
        self.db.command({
            'createIndexes': self.collection_name,
            'indexes': [
                {
                    'name': self.search_index_name,
                    'key': {
                        "contentVector": "cosmosSearch"
                    },
                    'cosmosSearchOptions': {
                        'kind': 'vector-ivf',
                        'numLists': num_lists,
                        'similarity': similarity,
                        'dimensions': dimensions
                    }
                }
            ]
        })

    def _drop_data(self):
        """Drop the search index and then the entire database (destructive)."""
        self.collection.drop_index(self.search_index_name)
        self.client.drop_database(self.db_name)

    def store_data(self, data):
        """Insert ``data`` (a list of records) into the collection.

        Best-effort: failures are printed, not raised. An empty batch is
        skipped, because sending it to ``insert_many`` would only produce a
        misleading "Failed to insert data" message.
        """
        if not data:
            print("No data to insert.")
            return
        try:
            # Insert data into the collection
            self.collection.insert_many(data)
            print("Data inserted successfully.")
        except Exception as e:
            print("Failed to insert data: ", e)

    def insert_one(self, entry):
        """Insert a single record; errors propagate to the caller."""
        self.collection.insert_one(entry)

    def retrieve_data(self, query, num_results=1):
        """Run a vector search against ``contentVector``.

        Args:
            query: Query vector passed as the ``vector`` of the search stage.
            num_results: Value passed as ``k`` to the search stage.

        Returns:
            list: The documents produced by the aggregation. On failure the
            error is printed and an empty list is returned, so callers can
            always iterate over the result.
        """
        pipeline = [
            {
                "$search": {
                    "cosmosSearch": {
                        "vector": query,
                        "path": "contentVector",
                        "k": num_results
                    },
                    "returnStoredSource": True
                }
            }
        ]
        try:
            return list(self.collection.aggregate(pipeline))
        except Exception as e:
            print("Search query failed: ", e)
            return []

In [21]:
# NOTE(review): no later cell visible here uses this client (MongoDBService opens
# its own), and the assignment rebinds `client`, which earlier cells use for the
# Key Vault SecretClient -- confirm whether this cell is still needed.
client = MongoClient(MONGO_CONN)

In [22]:
# Instantiating the service connects, ensures 'testcol' exists in 'testdb' and
# issues the vector-index creation command (see MongoDBService.__init__).
mongodb = MongoDBService(db_name='testdb',collection_name='testcol')

Using collection: 'testcol'.


In [23]:
# Build one record per chunk; this makes one createEmbeddings request per chunk.
data = mongodb._create_schema(doc_chunks)


In [24]:
# Insert the records; store_data prints the outcome instead of raising on failure.
mongodb.store_data(data)

Data inserted successfully.


In [25]:
# Display the collection handle held by the service.
mongodb.collection

Collection(Database(MongoClient(host=['c.appliedaimongodb.mongocluster.cosmos.azure.com:10260'], document_class=dict, tz_aware=False, connect=True, tls=True, authmechanism='SCRAM-SHA-256', retrywrites=False, maxidletimems=120000), 'testdb'), 'testcol')

In [26]:
# NOTE(review): MongoDBService.__init__ already issues this command; presumably it is
# repeated here after the data was inserted -- confirm whether the second call is needed.
mongodb._create_search_index()

In [3]:
from pymongo import MongoClient
# Reuse the connection string resolved from Key Vault / .env earlier in the
# notebook instead of embedding a username and password in source.
# SECURITY: the credentials previously hard-coded on this line are exposed in
# the notebook's history and must be rotated.
mongo_client2 = MongoClient(MONGO_CONN)

In [4]:
# Print every stored entry of testdb.testcol.
# `doc` stays bound to the last entry printed once the loop finishes.
db = mongo_client2['testdb']
collection = db['testcol']
cursor = collection.find({})
for doc in cursor:
    print(doc)

{'_id': ObjectId('65b441f46806ea4139c73150'), 'id': 0, 'content': "Microsoft FY23 First Quarter Earnings Conference Call\n\nBrett Iversen, Satya Nadella, Amy Hood\n\nTuesday, October 25, 2022\n\nBRETT IVERSEN: \n\nGood afternoon and thank you for joining us today. On the call with me are Satya Nadella, chairman and chief executive officer, Amy Hood, chief financial officer, Alice Jolla, chief accounting officer, and Keith Dolliver, deputy general counsel.\n\nOn the Microsoft Investor Relations website, you can find our earnings press release and financial summary slide deck, which is intended to supplement our prepared remarks during today’s call and provides the reconciliation of differences between GAAP and non-GAAP financial measures. \n\nOn this call we will discuss certain non-GAAP items. The non-GAAP financial measures provided should not be considered as a\xa0substitute for or superior to the measures of financial performance prepared in accordance with GAAP.\xa0They are include

In [5]:
# Display `doc`, which the cursor loop in the previous cell leaves bound to the last entry it printed.
doc

{'_id': ObjectId('65b83e1e452f81afcc7df762'),
 'id': 59,
 'content': 'The thing that perhaps even in the last quarter, and I had that in my remarks, is most exciting is how, with Microsoft Fabric, especially for the analytics workloads, we’ve brought together compute, storage, governance with a very disruptive business model. \n\nI mean, to give you a flavor for it, right, you have your data in an Azure data lake. You can bring SQL compute to it. You can bring Spark to it. You can bring Azure AI or Azure OpenAI to it, right? The fact is you have storage separated from all these compute meters, and they’re all interchangeable, right? You don’t have to buy each of these separately. That’s the disruptive business model. \n\n\n\nI feel that Microsoft is very well positioned with the way our data architecture lays out, our business model around data, and how people will plan to use data with AI services. That’s kind of what I mean by getting your data estate in order, and it’s just not gett