In [None]:
from beir.datasets.data_loader import GenericDataLoader
# Relative path of the folder holding the HotpotQA dataset files
data_path = "datasets/hotpotqa"
# Build the loader first, then read the "test" split into the three result objects
hotpotqa_loader = GenericDataLoader(data_folder=data_path)
corpus, queries, qrels = hotpotqa_loader.load(split="test")

In [None]:
from itertools import islice

# Take the first 5 corpus entries as a small working sample.
# islice stops after 5 items instead of copying every (key, value) pair of the
# corpus into a temporary list just to slice it.
corpus_select = dict(islice(corpus.items(), 5))

# Inspect each sampled document and its Python type
for c in corpus_select.values():
    print(c)
    print(type(c))

In [None]:
# Show the sampled (key, document) pairs as a dict view
sample_items = corpus_select.items()
print(sample_items)


In [None]:
import json
# Build one {'id', 'text'} record per sampled document; the key is cast to int
corpus_json_array = []
for doc_id, doc in corpus_select.items():
    record = {'id': int(doc_id), 'text': doc['text']}
    corpus_json_array.append(record)

# Serialise the records as a pretty-printed JSON array
json_output = json.dumps(corpus_json_array, indent=4)

print(json_output)

In [None]:
import os
from openai import AzureOpenAI
from dotenv import load_dotenv

# Load the environment variables from .env
load_dotenv()

# 1. Set up Azure OpenAI credentials and settings for the embeddings client
embeddings_endpoint = os.environ.get('AOAI_ENDPOINT')
# Single source of truth for the API version; this is the value the client was
# already being created with (the old "2024-02-01" assignment was never used).
embeddings_api_version = "2024-12-01-preview"
embeddings_api_key = os.environ.get('AOAI_API_KEY')

# Initialize the Azure OpenAI client.
# Prefer the endpoint configured through AOAI_ENDPOINT; the resource URL that
# used to be hard-coded is kept only as a fallback so existing setups without
# that variable keep working.
endpoint = embeddings_endpoint or "https://swc-aoai-dev-01.openai.azure.com/"
model_name = "text-embedding-3-large"

openai_client = AzureOpenAI(
    api_version=embeddings_api_version,
    azure_endpoint=endpoint,
    api_key=embeddings_api_key
)

# Function to get embeddings from Azure OpenAI
def get_embeddings(text):
    """Return the embedding vector for ``text`` from Azure OpenAI.

    Args:
        text: The input to embed. It is passed unchanged as ``input`` to the
            embeddings endpoint of ``openai_client`` using ``model_name``.

    Returns:
        The ``embedding`` of the first item in the response (requested with
        3072 dimensions), or ``None`` when ``text`` is empty or ``None``.

    Raises:
        Whatever ``openai_client.embeddings.create`` raises for a failed
        request; errors are not swallowed here.
    """
    # The embeddings API rejects empty input, and the callers in this notebook
    # already skip documents whose embedding is None, so report "nothing to
    # embed" as None instead of spending a request that is bound to fail.
    if not text:
        return None
    response = openai_client.embeddings.create(
        model=model_name,
        input=text,
        dimensions=3072
    )
    return response.data[0].embedding
    
# Smoke test: embed one sentence and show only a summary of the result rather
# than printing every value of the vector into the cell output.
sample_embedding = get_embeddings("Hello, how are you?")
print(f"Embedding length: {len(sample_embedding)}; first values: {sample_embedding[:5]}")

In [None]:
# Embed the sampled documents and build records of the form
# {'id', 'text', 'vectorized_text'}. The embedding is stored under
# 'vectorized_text' because that is the path the container's vector policy and
# the corpus insert cells in this notebook use (it was 'vector1' here before).
json_array = []
for key, value in corpus_select.items():
    text = value['text']
    vector_text = get_embeddings(text)
    # get_embeddings may signal "no embedding" with None; skip such documents
    if vector_text is not None:
        json_array.append({'id': key, 'text': text, 'vectorized_text': vector_text})

# Convert the list to a JSON string (full vectors are kept in json_output)
json_output = json.dumps(json_array, indent=4)

# Print a preview with each vector cut to its first 5 values so the cell output
# stays readable; the complete data remains available in json_array/json_output.
preview = [{**doc, 'vectorized_text': doc['vectorized_text'][:5]} for doc in json_array]
print(json.dumps(preview, indent=4))
print(f"{len(json_array)} documents embedded (vectors truncated to 5 values for display)")

In [None]:
from azure.cosmos import CosmosClient, PartitionKey

# Initialize the Cosmos client from the connection string in COSMOSDB_CONN_STR
connection_string = os.environ.get('COSMOSDB_CONN_STR')
# Fail fast with a clear message; otherwise a missing variable reaches the SDK
# as None and surfaces as an unrelated-looking error from its parser.
if not connection_string:
    raise ValueError(
        "COSMOSDB_CONN_STR is not set; add it to the environment or the .env file"
    )
client = CosmosClient.from_connection_string(connection_string)

# Define the database and container
database_name = 'EmbeddingsDB'
container_name = 'hotpotqa'
# Client-side handles for the database and the container
database = client.get_database_client(database_name)
container = database.get_container_client(container_name)

In [None]:
from azure.cosmos import CosmosClient, PartitionKey, exceptions
# Names of the Cosmos DB database and container that hold the embeddings
database_name, container_name = 'EmbeddingsDB', 'hotpotqa'
# Handle to the database, obtained from the already-initialised `client`
database = client.get_database_client(database_name)

# Define the vector embedding policy and the indexing policy for the container.
# The embedding path is declared once so that the embedding policy, the
# excluded path and the vector index cannot drift apart.
vector_path = "/vectorized_text"

# Describes the embedding stored at `vector_path`: 3072 float32 values compared
# with euclidean distance.
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": vector_path,
            "dataType": "float32",
            "distanceFunction": "euclidean",
            "dimensions": 3072
        }
    ]
}

indexing_policy = {
    "indexingMode": "consistent",
    "automatic": True,
    "includedPaths": [
        {
            "path": "/*"
        }
    ],
    # The embedding is excluded from the regular index; it is covered by the
    # dedicated vector index declared below instead.
    "excludedPaths": [
        {
            "path": "/\"_etag\"/?"
        },
        {
            "path": vector_path + "/*"
        }
    ],
    "fullTextIndexes": [],
    "vectorIndexes": [
        {
            "path": vector_path,
            "type": "diskANN",
            "quantizationByteSize": 128,
            # Property name spelled in camelCase like the other policy keys
            # (it was "IndexingSearchListSize" before).
            "indexingSearchListSize": 100
        }
    ]
}

# Create the container if it does not exist yet, partitioned on /id and using
# the indexing and vector embedding policies defined in this cell.
# A CosmosHttpResponseError from the service simply propagates: the previous
# try/except only re-raised it unchanged.
# NOTE(review): if the container already exists it is presumably returned as-is,
# without these policies being applied to it - confirm against the SDK docs.
container = database.create_container_if_not_exists(
    id=container_name,
    partition_key=PartitionKey(path='/id'),
    indexing_policy=indexing_policy,
    vector_embedding_policy=vector_embedding_policy
)
# "ready" rather than "created": the call also succeeds for an existing container
print(f"Container with id '{container_name}' is ready (created if it did not already exist)")

In [None]:
# Embed every corpus document and upsert it into Cosmos DB.
# Documents are embedded and written one batch at a time, so at most
# `batch_size` embeddings are held in memory and everything already written
# survives a failure later in the run. (Previously the whole corpus was embedded
# into a single list before the first write.)
batch_size = 100
total_docs = len(corpus)
batch = []      # documents embedded but not yet written
inserted = 0    # number of documents written so far

for position, (key, value) in enumerate(corpus.items(), start=1):
    text = value['text']
    vector_text = get_embeddings(text)
    # get_embeddings may signal "no embedding" with None; skip such documents
    if vector_text is not None:
        batch.append({'id': key, 'text': text, 'vectorized_text': vector_text})

    # Flush when the batch is full, or at the last document for the remainder
    if batch and (len(batch) == batch_size or position == total_docs):
        for item in batch:
            container.upsert_item(item)
        inserted += len(batch)
        print("Current batch: " + str(inserted))
        batch = []

print("Data inserted into Cosmos DB successfully.")

In [None]:
# Number of entries in the loaded corpus (displayed as the cell output)
corpus_size = len(corpus)
corpus_size

In [None]:
# Embed the corpus documents and upsert them into Cosmos DB in batches of 100,
# retrying a batch when Cosmos DB reports an HTTP error
import time
# Upsert a batch of items, retrying the whole batch on Cosmos HTTP errors
def insert_batch_with_retry(batch, retries=3, delay=5):
    """Upsert every item of ``batch`` into ``container``, with retries.

    Args:
        batch: Iterable of items; each one is passed to ``container.upsert_item``.
        retries: Maximum number of attempts at the batch.
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        None. Returns as soon as one attempt gets through the whole batch.

    Raises:
        exceptions.CosmosHttpResponseError: re-raised when the final attempt
            fails. Each failed attempt is reported with ``print`` first.
    """
    for attempt in range(retries):
        try:
            for item in batch:
                container.upsert_item(item)
        except exceptions.CosmosHttpResponseError as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            # Out of attempts: let the error from this attempt propagate
            if attempt >= retries - 1:
                raise
            time.sleep(delay)
        else:
            # Every item went through on this attempt
            return

# Embed each corpus document exactly once and upsert the documents in batches
# of `batch_size`, using insert_batch_with_retry for every batch.
# Two defects of the previous version are fixed here:
#   * the whole corpus was re-embedded and re-inserted once per batch, because a
#     full pass over the corpus was nested inside the batch-offset loop;
#   * a single document dict was passed as the "batch", so the helper iterated
#     over its keys and tried to upsert the strings 'id', 'text', ...
batch_size = 100
total_docs = len(corpus)
batch = []      # documents embedded but not yet written
inserted = 0    # number of documents written so far

for position, (key, value) in enumerate(corpus.items(), start=1):
    text = value['text']
    vector_text = get_embeddings(text)
    # get_embeddings may signal "no embedding" with None; skip such documents
    if vector_text is not None:
        batch.append({'id': key, 'text': text, 'vectorized_text': vector_text})

    # Flush when the batch is full, or at the last document for the remainder
    if batch and (len(batch) == batch_size or position == total_docs):
        insert_batch_with_retry(batch, 3, 5)
        inserted += len(batch)
        print("Current batch: " + str(inserted))
        batch = []

print("Data inserted into Cosmos DB successfully.")