In [None]:
import csv
import os
import random
import time
from azure.cosmos import CosmosClient        # pip install azure-cosmos

# 1. Connect to Azure Cosmos DB

In [None]:
# Read the connection settings from environment variables so that the account key
# is never stored in the notebook itself. A missing variable raises a KeyError
# that names the variable which still has to be set.
client    = CosmosClient(os.environ['AZURE_COSMOS_DB_ENDPOINT'], os.environ['AZURE_COSMOS_DB_ACCOUNT_KEY'])
database  = client.get_database_client(os.environ['AZURE_COSMOS_DB_DATABASE_NAME'])
container = database.get_container_client(os.environ['AZURE_COSMOS_DB_CONTAINER_NAME'])

# 2. Retrieve all currently available serial numbers from Azure Cosmos DB

In [None]:
# Query the distinct serial numbers of the container (cross-partition query enabled)
datasets = container.query_items(
    query = 'SELECT DISTINCT c.SerialNumber FROM c',
    enable_cross_partition_query = True,
)

# Collect the serial number of every returned record in a list
serialnumbers = [dataset["SerialNumber"] for dataset in datasets]

# 3. Run the delete operation

In [None]:
# One entry per iteration is appended to this list (mean query duration plus counters)
query_durations = []

# Number of data sets contained in the database: 100,000
number_of_all_datasets = 100_000

In [None]:
# The benchmark runs 10 iterations of 10,000 delete operations each
number_of_iterations = 10
deletions_per_iteration = 10000

for _ in range(number_of_iterations):

    # Durations of the individual delete operations of this iteration
    iteration_durations = []

    for _ in range(deletions_per_iteration):

        # Select a random serial number for which the corresponding data record is to be deleted
        random_serialnumber = random.choice(serialnumbers)

        # Record the start time. perf_counter() is a monotonic, high-resolution clock,
        # so the measurement is not distorted by system clock adjustments.
        query_start_time = time.perf_counter()

        # First, the ID and partition key of the data set to be deleted must be retrieved.
        # The serial number is bound to the @serial_number placeholder via `parameters`
        # instead of being formatted into the query text.
        datasets = container.query_items(
            query = 'SELECT c.id, c.ArticleName FROM c WHERE c.SerialNumber = @serial_number',
            parameters = [{'name': '@serial_number', 'value': random_serialnumber}],
            enable_cross_partition_query = True
        )

        # Delete every retrieved data set
        for dataset in datasets:
            container.delete_item(
                dataset['id'],
                partition_key=dataset['ArticleName']
            )

        # Record the end time after the lookup and the deletion have completed
        query_end_time = time.perf_counter()

        # Calculate the duration of this operation and append it to the list
        query_duration = query_end_time - query_start_time
        iteration_durations.append(query_duration)

        # Remove the affected serial number from the list (outside the timed section)
        # so that it cannot be selected again
        serialnumbers.remove(random_serialnumber)

    # Store the mean duration of this iteration together with the number of data sets
    # processed per operation (1) and the database size at the start of the iteration
    mean_duration = sum(iteration_durations) / len(iteration_durations)
    query_durations.append([mean_duration, 1, number_of_all_datasets])

    # The database has shrunk by the number of data sets deleted in this iteration
    number_of_all_datasets -= deletions_per_iteration

# 4. Saving the recorded operation times in the CSV result file

In [None]:
# Define the file path for the CSV file in the "Experiment_Results" directory
filepath = os.path.join("Experiment_Results", "delete_data.csv")

# Make sure the target directory exists; opening the file would otherwise raise
# FileNotFoundError and the measured durations would be lost
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# Check if the file already exists (must happen before open() creates it)
file_exists = os.path.isfile(filepath)

# Open the CSV file in append mode; create the file if it does not exist
with open(filepath, 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Write the header row only when the file is newly created
    if not file_exists:
        writer.writerow(['DurationTime', 'NumberOfProcessedDatasets', 'NumberOfDatasetsInDatabase'])

    # Write the rows of data from the query_durations list
    writer.writerows(query_durations)
