## Multiple device readings - Cross partition or not?
Test dataset to evaluate RU cost for collocating IOT readings in a single document, multiple documents with N readings and cross-partition effect. <br>

- Relies on containers created from [PrepareContainers.ipynb](./PrepareContainers.ipynb)

### Prepare key references for notebook
The settings COSMOSDB_ACCOUNT_URI and COSMOSDB_ACCOUNT_KEY should exist in the `[CosmosDB]` section of `../NotebookConfig.cfg`

In [1]:
from azure.cosmos import CosmosClient, PartitionKey
from configparser import ConfigParser
from faker import Faker

import os
import json
import uuid

# Load the Cosmos DB connection settings from the shared notebook config file.
parser = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail early with a clear message instead of
# hitting a NoSectionError on the first parser.get() below.
if not parser.read('../NotebookConfig.cfg'):
    raise FileNotFoundError("Unable to read configuration file '../NotebookConfig.cfg'")

cosmosAccountURI = parser.get('CosmosDB', 'COSMOSDB_ACCOUNT_URI')
cosmosAccountKey = parser.get('CosmosDB', 'COSMOSDB_ACCOUNT_KEY')

# Names shared by the following cells.
databaseName = 'Models'
containerName = 'Cnt5PhysicalPartitions'
partitionKeypath = '/PartitionKey'
osPath = './OutputFiles/'

# Seed Faker so the generated values are repeatable from run to run.
Faker.seed(0)
fake = Faker(['en-US'])

# Create the output directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(osPath, exist_ok=True)

### Create DB and collection for sample data
- Resources should already exist, so just get references for database and container.
  - If not, a container with a single physical partition will be created.
- Adjust the RUs to your need...

In [2]:
# Partition key definition used if the container has to be created.
pkPath = PartitionKey(path=partitionKeypath)

# Connect to the account and get (or create) the database.
client = CosmosClient(cosmosAccountURI, cosmosAccountKey)
db = client.create_database_if_not_exists(databaseName)

# Get (or create) the container; the throughput only applies on creation.
ctr = db.create_container_if_not_exists(
    id=containerName,
    partition_key=pkPath,
    offer_throughput=1000,
    default_ttl=None,
)
# To adjust the RUs afterwards: ctr.replace_throughput(throughput=2000)

### Repeatable References
Contains documents that should be used as a reference data set across cell/operations. <br>
For example: Patient, IOT devices, tax payer information, ...

In [3]:
from collections import OrderedDict
# Build the reference list of IOT sources used by the load cells below and
# save a copy to the output directory.
maxRange = 10000
IOTSources = []

os.makedirs(os.path.dirname('./OutputFiles/'), exist_ok=True)
with open('./OutputFiles/' + containerName + '_referenceData.json', 'w') as jsonFile:
    for i in range(maxRange):
        # 'Name' is generated before 'Type' so the seeded Faker sequence stays stable.
        entity = {
            'Name': fake.bothify('????_############'),
            'Type': fake.random_element(elements=('Type1', 'Type2', 'Type3', 'Type4', 'Type5', 'Type6')),
        }
        IOTSources.append(entity)

        # Save the IOT source for reference. Entries are written as
        # comma-separated JSON objects (no enclosing brackets).
        json.dump(entity, jsonFile)
        # Write the separator between entries only: the previous test
        # (i < maxRange) was always true and left a trailing comma after
        # the last entry.
        if i < maxRange - 1:
            jsonFile.write(',\n')

### Load documents with Partition key Name_Type_YYYY_MM_DD
- 1440 readings for a single document (1 reading per minute)
- 30 days
- Average RU cost with default indexing: ~ 877 RUs
- Save generated documents in output directory for reference

In [4]:
## Loading daily readings per document, multiple partition key (based on yyyy-mm-dd)
# For each sampled device, one document per day is created with 1440 readings.
# The partition key is Name_Type_YYYY_MM_DD, so every document gets its own key.
from datetime import datetime, timedelta

RUCharges = []    # request charge reported for every create_item call
daysRange = 30    # number of daily documents per device
iotRange = 1000   # number of devices sampled from IOTSources

dtBase = datetime(year=2023, month=1, day=20)

for iot in range(iotRange):
    # Bound the random index by the actual list size rather than a hard-coded
    # 9999, so the cell stays valid if maxRange changes.
    IOTSrc = IOTSources[fake.random_int(min=0, max=len(IOTSources) - 1)]
    deviceKey = IOTSrc['Name'] + '_' + IOTSrc['Type']

    for day in range(daysRange):
        # Midnight of the current day, computed once and reused below.
        dayStart = dtBase + timedelta(days=day)
        dayKey = deviceKey + '_' + dayStart.strftime('%Y_%m_%d')

        # *** Produce 1440 readings - 1 for each minute
        # 'Dimension' is generated before 'Metric' to keep the seeded Faker
        # sequence stable.
        readings = [
            {
                'Dimension': fake.random_element(elements=('D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9')),
                'Metric': fake.random_number(digits=5),
                'Timestamp': (dayStart + timedelta(minutes=m)).isoformat(),
            }
            for m in range(1440)
        ]

        doc = {
            'id': str(uuid.uuid4()),
            'PartitionKey': dayKey,
            'Name': IOTSrc['Name'],
            'Type': IOTSrc['Type'],
            'JobId': 100,
            'Entity': IOTSrc,
            'DateTime': dayStart.isoformat(),
            # NOTE(review): dtBase is a naive datetime, so timestamp() is
            # interpreted in the machine's local time zone - confirm intended.
            'Timestamp': dayStart.timestamp(),
            'Readings': readings,
            # presumably the OrderedDict values act as selection weights - verify against Faker docs
            'class': fake.random_element(elements=OrderedDict([("A", 0.40), ("B", 0.35), ("C", 0.15), ("D", 0.05), ("E", 0.05)])),
        }

        ctr.create_item(doc)
        # Record the RU charge of the write from the last response headers.
        RUCharges.append(float(ctr.client_connection.last_response_headers['x-ms-request-charge']))

        # Save the generated document for reference (one file per device and day).
        with open('./OutputFiles/' + containerName + '_' + dayKey + '_docs.json', 'w') as jf:
            json.dump(doc, jf)

KeyboardInterrupt: 

In [5]:
# Mean request charge over all documents created by the previous cell.
print(f'Average RU cost: {sum(RUCharges) / len(RUCharges)}')

Average RU cost: 876.3230000000002


### Load documents with Partition key Name_Type
- 1440 readings for a single document (1 reading per minute)
- 30 days
- Average RU cost with default indexing: ~ 877 RUs
- Save generated documents in output directory for reference

In [6]:
## Loading daily readings per document, single partition key
# For each sampled device, one document per day is created with 1440 readings.
# The partition key is Name_Type, so all days of a device share one key.
from datetime import datetime, timedelta

RUCharges = []   # request charge reported for every create_item call
daysRange = 30   # number of daily documents per device
iotRange = 1     # number of devices sampled from IOTSources

dtBase = datetime(year=2022, month=10, day=1)

for iot in range(iotRange):
    # Bound the random index by the actual list size rather than a hard-coded
    # 9999, so the cell stays valid if maxRange changes.
    IOTSrc = IOTSources[fake.random_int(min=0, max=len(IOTSources) - 1)]
    deviceKey = IOTSrc['Name'] + '_' + IOTSrc['Type']

    for day in range(daysRange):
        # Midnight of the current day, computed once and reused below.
        dayStart = dtBase + timedelta(days=day)

        # *** Produce 1440 readings - 1 for each minute
        # 'Dimension' is generated before 'Metric' to keep the seeded Faker
        # sequence stable.
        readings = [
            {
                'Dimension': fake.random_element(elements=('D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9')),
                'Metric': fake.random_number(digits=5),
                'Timestamp': (dayStart + timedelta(minutes=m)).isoformat(),
            }
            for m in range(1440)
        ]

        doc = {
            'id': str(uuid.uuid4()),
            'PartitionKey': deviceKey,
            'Name': IOTSrc['Name'],
            'Type': IOTSrc['Type'],
            'JobId': 100,
            'Entity': IOTSrc,
            'DateTime': dayStart.isoformat(),
            # NOTE(review): dtBase is a naive datetime, so timestamp() is
            # interpreted in the machine's local time zone - confirm intended.
            'Timestamp': dayStart.timestamp(),
            'Readings': readings,
            # presumably the OrderedDict values act as selection weights - verify against Faker docs
            'class': fake.random_element(elements=OrderedDict([("A", 0.40), ("B", 0.35), ("C", 0.15), ("D", 0.05), ("E", 0.05)])),
        }

        ctr.create_item(doc)
        # Record the RU charge of the write from the last response headers.
        RUCharges.append(float(ctr.client_connection.last_response_headers['x-ms-request-charge']))

        # Save the generated document for reference (one file per device and day).
        with open('./OutputFiles/' + containerName + '_' + deviceKey + '_' + dayStart.strftime('%Y_%m_%d') + '_docs.json', 'w') as jf:
            json.dump(doc, jf)

In [7]:
# Mean request charge over all documents created by the previous cell.
print(f'Average RU charge: {sum(RUCharges) / len(RUCharges)}')

Average RU charge: 876.2913333333332


### Load documents with Partition key Name_Type_YYYY_MM_DD
- 1440 readings for a single document (1 reading per minute)
- 365 days
- Average RU cost with default indexing: ~ 877 RUs
- Save generated documents in output directory for reference

In [8]:
## Loading daily readings per document, multiple partition key (based on yyyy-mm-dd)
# For each sampled device, one document per day is created with 1440 readings.
# The partition key is Name_Type_YYYY_MM_DD, so every document gets its own key.
from datetime import datetime, timedelta

RUCharges = []    # request charge reported for every create_item call
daysRange = 365   # number of daily documents per device
iotRange = 1      # number of devices sampled from IOTSources

dtBase = datetime(year=2022, month=1, day=1)

for iot in range(iotRange):
    # Bound the random index by the actual list size rather than a hard-coded
    # 9999, so the cell stays valid if maxRange changes.
    IOTSrc = IOTSources[fake.random_int(min=0, max=len(IOTSources) - 1)]
    deviceKey = IOTSrc['Name'] + '_' + IOTSrc['Type']

    for day in range(daysRange):
        # Midnight of the current day, computed once and reused below.
        dayStart = dtBase + timedelta(days=day)
        dayKey = deviceKey + '_' + dayStart.strftime('%Y_%m_%d')

        # *** Produce 1440 readings - 1 for each minute
        # 'Dimension' is generated before 'Metric' to keep the seeded Faker
        # sequence stable.
        readings = [
            {
                'Dimension': fake.random_element(elements=('D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9')),
                'Metric': fake.random_number(digits=5),
                'Timestamp': (dayStart + timedelta(minutes=m)).isoformat(),
            }
            for m in range(1440)
        ]

        doc = {
            'id': str(uuid.uuid4()),
            'PartitionKey': dayKey,
            'Name': IOTSrc['Name'],
            'Type': IOTSrc['Type'],
            'JobId': 100,
            'Entity': IOTSrc,
            'DateTime': dayStart.isoformat(),
            # NOTE(review): dtBase is a naive datetime, so timestamp() is
            # interpreted in the machine's local time zone - confirm intended.
            'Timestamp': dayStart.timestamp(),
            'Readings': readings,
            # presumably the OrderedDict values act as selection weights - verify against Faker docs
            'class': fake.random_element(elements=OrderedDict([("A", 0.40), ("B", 0.35), ("C", 0.15), ("D", 0.05), ("E", 0.05)])),
        }

        ctr.create_item(doc)
        # Record the RU charge of the write from the last response headers.
        RUCharges.append(float(ctr.client_connection.last_response_headers['x-ms-request-charge']))

        # Save the generated document for reference (one file per device and day).
        with open('./OutputFiles/' + containerName + '_' + dayKey + '_docs.json', 'w') as jf:
            json.dump(doc, jf)

In [9]:
# Mean request charge over all documents created by the previous cell.
print(f'Average RU charge: {sum(RUCharges) / len(RUCharges)}')

Average RU charge: 876.3602465753447


### Load documents with Partition key Name_Type
- 1440 readings for a single document (1 reading per minute)
- 365 days
- Average RU cost with default indexing: ~ 877 RUs
- Save generated documents in output directory for reference

In [10]:
## Loading daily readings per document, single partition key
# For each sampled device, one document per day is created with 1440 readings.
# The partition key is Name_Type, so all days of a device share one key.
from datetime import datetime, timedelta

RUCharges = []    # request charge reported for every create_item call
daysRange = 365   # number of daily documents per device
iotRange = 1      # number of devices sampled from IOTSources

dtBase = datetime(year=2022, month=1, day=1)

for iot in range(iotRange):
    # Bound the random index by the actual list size rather than a hard-coded
    # 9999, so the cell stays valid if maxRange changes.
    IOTSrc = IOTSources[fake.random_int(min=0, max=len(IOTSources) - 1)]
    deviceKey = IOTSrc['Name'] + '_' + IOTSrc['Type']

    for day in range(daysRange):
        # Midnight of the current day, computed once and reused below.
        dayStart = dtBase + timedelta(days=day)

        # *** Produce 1440 readings - 1 for each minute
        # 'Dimension' is generated before 'Metric' to keep the seeded Faker
        # sequence stable.
        readings = [
            {
                'Dimension': fake.random_element(elements=('D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9')),
                'Metric': fake.random_number(digits=5),
                'Timestamp': (dayStart + timedelta(minutes=m)).isoformat(),
            }
            for m in range(1440)
        ]

        doc = {
            'id': str(uuid.uuid4()),
            'PartitionKey': deviceKey,
            'Name': IOTSrc['Name'],
            'Type': IOTSrc['Type'],
            'JobId': 100,
            'Entity': IOTSrc,
            'DateTime': dayStart.isoformat(),
            # NOTE(review): dtBase is a naive datetime, so timestamp() is
            # interpreted in the machine's local time zone - confirm intended.
            'Timestamp': dayStart.timestamp(),
            'Readings': readings,
            # presumably the OrderedDict values act as selection weights - verify against Faker docs
            'class': fake.random_element(elements=OrderedDict([("A", 0.40), ("B", 0.35), ("C", 0.15), ("D", 0.05), ("E", 0.05)])),
        }

        ctr.create_item(doc)
        # Record the RU charge of the write from the last response headers.
        RUCharges.append(float(ctr.client_connection.last_response_headers['x-ms-request-charge']))

        # Save the generated document for reference (one file per device and day).
        with open('./OutputFiles/' + containerName + '_' + deviceKey + '_' + dayStart.strftime('%Y_%m_%d') + '_docs.json', 'w') as jf:
            json.dump(doc, jf)

In [11]:
# Mean request charge over all documents created by the previous cell.
print(f'Average RU charge: {sum(RUCharges) / len(RUCharges)}')

Average RU charge: 876.3362739726043


### Checking the result
Executing the following query in Data Explorer will confirm the data load was properly executed...

SELECT SUBSTRING(c.PartitionKey, 0, 23) AS PK, COUNT(c.id) AS Total <br>
FROM c <br>
GROUP BY SUBSTRING(c.PartitionKey, 0, 23) <br>

Example output:
<br>
<br>
    {<br>
        "PK": "vulT_488002475746_Type6",<br>
        "Total": 365<br>
    },<br>
    {<br>
        "PK": "lraA_973264882039_Type3",<br>
        "Total": 30<br>
    },<br>
    {<br>
        "PK": "fYuo_086671929026_Type3",<br>
        "Total": 365<br>
    },<br>
    {<br>
        "PK": "XkZk_119381357891_Type5",<br>
        "Total": 30<br>
    }<br>



### Query cost
- [Check IOTReadingsv2_CSharp.dib](./IOTReadingsv2_CSharp.dib)

### Clean up
- Files in the Output folder
- NOT deleting containers, as those can be reused
  - For document deletion with TTL, refer to [PrepareContainers.ipynb](./PrepareContainers.ipynb)

In [12]:
# Assume objects are instantiated
# db.delete_container(containerName)

import glob

# Remove the files generated in the output folder.
# The pattern is built with os.path.join so it works on any OS; a
# backslash-separated literal only matches on Windows and contains invalid
# escape sequences.
files = glob.glob(os.path.join('.', 'OutputFiles', '*'))
for f in files:
    # Only delete regular files: os.remove() raises on directories.
    if os.path.isfile(f):
        os.remove(f)