In [61]:
# Install (or upgrade) the MongoDB driver; `MongoClient` is imported from it further down.
%pip install --upgrade --quiet  pymongo

Note: you may need to restart the kernel to use updated packages.


In [45]:
import os

# Name of the vector index used by every vector store created in this notebook.
INDEX_NAME = "izzy-test-index"

# Database and collection that hold the embedded documents.
DB_NAME = "izzy_test_db"
COLLECTION_NAME = "izzy_test_collection"

# "<database>.<collection>" form of the same location.
NAMESPACE = f"{DB_NAME}.{COLLECTION_NAME}"

In [46]:
# Export the Azure OpenAI settings as environment variables in one step.
# The "XXXX" values are placeholders: substitute your own endpoint and key,
# and avoid committing real secrets with the notebook.
os.environ.update(
    {
        "OPENAI_API_TYPE": "azure",
        "OPENAI_API_VERSION": "2023-05-15",
        "AZURE_OPENAI_ENDPOINT": "https://XXXX.openai.azure.com/",
        "AZURE_OPENAI_API_KEY": "XXXX",
        # Deployment name of the embedding model.
        "OPENAI_EMBEDDINGS_DEPLOYMENT": "smart-agent-embedding-ada",
        # Name of the underlying embedding model.
        "OPENAI_EMBEDDINGS_MODEL_NAME": "text-embedding-ada-002",
    }
)

In [None]:
# Install the LangChain packages that the imports in this cell come from.
%pip install langchain-community
%pip install langchain_openai
%pip install langchain_text_splitters

from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.azure_cosmos_db import (
    AzureCosmosDBVectorSearch,
    CosmosDBSimilarityType,
    CosmosDBVectorSearchType,
)
from langchain_openai import AzureOpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter


SOURCE_FILE_NAME = "./pradap.txt"

# Load the source text file and split it into chunks (chunk_size=1000,
# no overlap); `docs` is the chunk list that gets embedded and stored later.
loader = TextLoader(SOURCE_FILE_NAME)
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Take the deployment name and API version from the environment variables set
# in the configuration cell so there is a single place to change them. The
# fallbacks are the values that were previously hard-coded here.
openai_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.getenv(
        "OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada"
    ),
    openai_api_version=os.getenv("OPENAI_API_VERSION", "2023-05-15"),
)


In [55]:
from pymongo import MongoClient
from urllib.parse import quote_plus

# Percent-encode the credentials before embedding them in the URI.
# "XXXX" are placeholders for the real user name and password.
username, password = quote_plus("XXXX"), quote_plus("XXXX")

# Cluster address followed by the connection options (kept as one string).
host = (
    "mongodbcluster-cosmosconf2024.global.mongocluster.cosmos.azure.com/"
    "?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
)
port = "10255"  # defined, but not part of the connection string built below

CONNECTION_STRING = "mongodb+srv://" + username + ":" + password + "@" + host

# Open the client and pick the target collection.
client: MongoClient = MongoClient(CONNECTION_STRING)
database = client[DB_NAME]
collection = database[COLLECTION_NAME]

# Embedding deployment/model names from the environment (with fallbacks).
# They are read here but not referenced again within this cell.
model_deployment = os.environ.get(
    "OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada"
)
model_name = os.environ.get(
    "OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002"
)

# Embed the chunks in `docs` and store them in the collection.
vectorstore = AzureCosmosDBVectorSearch.from_documents(
    docs,
    openai_embeddings,
    collection=collection,
    index_name=INDEX_NAME,
)

# Index settings; see
# https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search
# for what each of them means.
num_lists, dimensions = 100, 1536
similarity_algorithm = CosmosDBSimilarityType.COS
kind = CosmosDBVectorSearchType.VECTOR_IVF
m, ef_construction = 16, 64
# The next two are defined here but are not passed to create_index.
ef_search = 40
score_threshold = 0.1

# Arguments are passed positionally, in this exact order.
index_args = (num_lists, dimensions, similarity_algorithm, kind, m, ef_construction)
vectorstore.create_index(*index_args)

{'raw': {'defaultShard': {'numIndexesBefore': 2,
   'numIndexesAfter': 2,
   'createdCollectionAutomatically': False,
   'note': 'all indexes already exist',
   'ok': 1}},
 'ok': 1}

In [56]:
# Run a similarity search: the query's embedding is compared against the
# embeddings of the stored documents.
# NOTE: this rebinds `docs`, which previously held the split chunks handed to
# `from_documents`; re-running the ingestion cell after this one would ingest
# these search results instead of the original chunks.
query = "Microsoft Fabric features"
docs = vectorstore.similarity_search(query=query)

In [57]:
# Show the text of the first document returned by the search.
first_result = docs[0]
print(first_result.page_content)

Microsoft Fabric 
Microsoft Fabric is an all-in-one analytics solution for enterprises. It covers everything from data movement to data science, Real-Time Analytics, and business intelligence. It offers a comprehensive suite of services, including data lake, data engineering, and data integration, all in one place. It’s a unified platform that brings together a diverse range of technologies and tools into a single solution.

Key Features:


In [58]:
# Build the vector store straight from the connection string and the
# "<database>.<collection>" namespace instead of an existing collection object.
vectorstore = AzureCosmosDBVectorSearch.from_connection_string(
    CONNECTION_STRING,
    NAMESPACE,
    openai_embeddings,
    index_name=INDEX_NAME,
)

# Same query as before, this time against the store created above.
query = "Microsoft Fabric features"
docs = vectorstore.similarity_search(query=query)

# Show the text of the first document returned.
first_result = docs[0]
print(first_result.page_content)

Microsoft Fabric 
Microsoft Fabric is an all-in-one analytics solution for enterprises. It covers everything from data movement to data science, Real-Time Analytics, and business intelligence. It offers a comprehensive suite of services, including data lake, data engineering, and data integration, all in one place. It’s a unified platform that brings together a diverse range of technologies and tools into a single solution.

Key Features:


In [59]:
# Wrap the already-opened `collection` in a vector store via the constructor.
vectorstore = AzureCosmosDBVectorSearch(
    collection,
    openai_embeddings,
    index_name=INDEX_NAME,
)

# Same query as before, this time against the store created above.
query = "Microsoft Fabric features"
docs = vectorstore.similarity_search(query=query)

# Show the text of the first document returned.
first_result = docs[0]
print(first_result.page_content)

Microsoft Fabric 
Microsoft Fabric is an all-in-one analytics solution for enterprises. It covers everything from data movement to data science, Real-Time Analytics, and business intelligence. It offers a comprehensive suite of services, including data lake, data engineering, and data integration, all in one place. It’s a unified platform that brings together a diverse range of technologies and tools into a single solution.

Key Features:
