### NoSQL Data Store for file metadata

Generating fake file metadata to review design with Cosmos DB

In [1]:
import json
import os
import time
import uuid
from configparser import ConfigParser

from azure.core.exceptions import ServiceRequestError, ServiceResponseError
from azure.cosmos import CosmosClient, PartitionKey
from faker import Faker

# Connection settings are read from a config file kept outside the notebook,
# so the account key is not written in the notebook source.
# The path is relative to the kernel's working directory.
configPath = '../NotebookConfig.cfg'

parser = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of
# letting parser.get() raise a confusing NoSectionError further down.
if not parser.read(configPath):
    raise FileNotFoundError(f'Config file not found or unreadable: {configPath}')

cosmosAccountURI = parser.get('CosmosDB', 'COSMOSDB_SQLSLWD_ACCOUNT_URI')
cosmosAccountKey = parser.get('CosmosDB', 'COSMOSDB_SQLSLWD_ACCOUNT_KEY')

# Target database and container, plus the container's partition key path.
databaseName = 'Learn'
containerName = 'FileMetadata'
partitionKeypath = '/File'

# Seed Faker before creating the generator so the fake values it produces
# are repeatable from a fresh kernel.
Faker.seed(0)
fake = Faker(['en-US'])

In [2]:
# Test environment: the 400 RU/s throughput is requested when the database is
# created (shared database throughput), not on the container.

pkPath = PartitionKey(path=partitionKeypath)

client = CosmosClient(url=cosmosAccountURI, credential=cosmosAccountKey)
db = client.create_database_if_not_exists(id=databaseName, offer_throughput=400)
ctr = db.create_container_if_not_exists(
    id=containerName,
    partition_key=pkPath,
)

In [3]:
# Number of fake documents to load; lower this for a quick trial run.
NUM_DOCUMENTS = 1_000_000

# Bounds for the fake timestamps, in Unix epoch seconds (UTC).
EPOCH_MIN = 946684800   # 2000-01-01 00:00:00
EPOCH_MAX = 1674009647  # 2023-01-18 02:40:47 (a fixed date, not "now" at run time)

# Largest integer an IEEE 754 double represents exactly. JSON numbers above
# this cannot be relied on to round-trip without losing precision.
MAX_SAFE_INTEGER = 2**53 - 1


def make_fake_file_metadata():
    """Build one fake file-metadata document.

    Returns:
        dict: a document with a fresh UUID 'id', a random 'File' path (the
        container's partition key), and random numeric/extension attributes.
        The three timestamps are drawn independently, so they are not
        guaranteed to be in chronological order.
    """
    return {
        'id': str(uuid.uuid4()),
        'File': fake.file_path(depth=fake.random_int(min=1, max=10)),
        'JobId': fake.random_int(min=1, max=9999999999),
        # Bytes; up to ~8 PiB, capped so the value stays an exact double.
        'Size': fake.random_int(min=1, max=MAX_SAFE_INTEGER),
        # Bytes; up to ~90.9 TiB.
        'SizeOnDisk': fake.random_int(min=1, max=99999999999999),
        'CreatedAt': fake.random_int(min=EPOCH_MIN, max=EPOCH_MAX),
        'ModifiedAt': fake.random_int(min=EPOCH_MIN, max=EPOCH_MAX),
        'AccessedAt': fake.random_int(min=EPOCH_MIN, max=EPOCH_MAX),
        # May differ from the extension in 'File'; acceptable for test data.
        'Extension': fake.file_extension(),
    }


def create_item_with_retry(container, item, max_attempts=5, base_delay_seconds=1.0):
    """Insert ``item`` into ``container``, retrying on dropped connections.

    Args:
        container: Cosmos DB container proxy to write to.
        item (dict): document to insert; must carry its own 'id'.
        max_attempts (int): total number of tries, must be >= 1.
        base_delay_seconds (float): wait before the first retry; doubles on
            each further retry.

    Returns:
        The value returned by the SDK's create/upsert call.

    Raises:
        ValueError: if ``max_attempts`` is less than 1.
        ServiceRequestError, ServiceResponseError: if the last attempt still
            fails at the connection level.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    for attempt in range(1, max_attempts + 1):
        try:
            if attempt == 1:
                return container.create_item(item)
            # After a dropped connection we cannot tell whether the previous
            # write was committed; upsert keeps the retry idempotent instead
            # of failing with a conflict on the same id.
            return container.upsert_item(item)
        except (ServiceRequestError, ServiceResponseError):
            if attempt == max_attempts:
                raise
            time.sleep(base_delay_seconds * 2 ** (attempt - 1))


# One request per document. A transient connection drop is retried rather
# than aborting the whole load part-way through.
for i in range(NUM_DOCUMENTS):
    file = make_fake_file_metadata()
    create_item_with_retry(ctr, file)

ServiceResponseError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

Comparing insert cost with default indexing versus an index on the 'File' property only

In [4]:
pkPath = PartitionKey(path=partitionKeypath)

# Index only the 'File' property and exclude every other path, so that the
# second container really differs from the default "index everything" policy.
# Note: create_container_if_not_exists does not alter a container that already
# exists, so this policy only takes effect when 'FileMetadata2' is first created.
fileOnlyIndexingPolicy = {
    'indexingMode': 'consistent',
    'includedPaths': [{'path': '/File/?'}],
    'excludedPaths': [{'path': '/*'}],
}
ctr2 = db.create_container_if_not_exists(
    id='FileMetadata2',
    partition_key=pkPath,
    indexing_policy=fileOnlyIndexingPolicy,
)

# One fake document, inserted unchanged into both containers so the request
# charges are directly comparable.
file = {
    'id': str(uuid.uuid4()),
    'File': fake.file_path(depth=fake.random_int(min=1, max=10)),
    'JobId': fake.random_int(min=1, max=9999999999),
    # Bytes; up to ~8 PiB, capped at 2**53 - 1 so the value stays an exact
    # IEEE 754 double when stored as a JSON number.
    'Size': fake.random_int(min=1, max=2**53 - 1),
    # Bytes; up to ~90.9 TiB.
    'SizeOnDisk': fake.random_int(min=1, max=99999999999999),
    # Unix epoch seconds between 2000-01-01 and 2023-01-18 (UTC).
    'CreatedAt': fake.random_int(min=946684800, max=1674009647),
    'ModifiedAt': fake.random_int(min=946684800, max=1674009647),
    'AccessedAt': fake.random_int(min=946684800, max=1674009647),
    # May differ from the extension in 'File'; acceptable for test data.
    'Extension': fake.file_extension(),
}

# The request charge (RUs) of the most recent call is read from the response
# headers kept on each container's client connection.
ctr.create_item(file)
print('RU - Default indexing: ', float(ctr.client_connection.last_response_headers['x-ms-request-charge']))
ctr2.create_item(file)
print('RU - Minimal indexing: ', float(ctr2.client_connection.last_response_headers['x-ms-request-charge']))

RU - Default indexing:  8.57
RU - Minimal indexing:  5.9


### Sample queries on top of 1M documents (document store)

SELECT c.Extension, SUM((c.Size / 1073741824.0)) AS TotalGB, COUNT(c.id) AS NumFiles <br/>
FROM c GROUP BY c.Extension<br/>
-- 22027 RUs / All documents fetched from backend<br/>

SELECT SUM((c.Size / 1073741824.0)) AS TotalGB, COUNT(c.id) as TotalDocs<br/>
FROM c WHERE startswith(c.File, '/book/')<br/>
-- 35 RUs / 1017 documents retrieved<br/>

SELECT c.Extension, SUM((c.Size / 1073741824.0)) AS TotalGB, COUNT(c.id) as TotalDocs<br/>
FROM c WHERE startswith(c.File, '/book/')<br/>
GROUP BY c.Extension <br/>
-- 38.19 RUs / 1017 documents retrieved<br/>

### Sample queries on top of Analytical Store

SELECT f.Extension, SUM((f.Size / 1073741824.0)) AS TotalGB, COUNT(f.size) AS NumFiles<br/> 
FROM OPENROWSET(<br/> 
       'CosmosDB',<br/>
       'Account=CosmosAccount;Database=Learn;Key=YOURKEY',<br/>
       FileMetadata) as f<br/>
GROUP BY f.Extension<br/>
<br/>
Statement ID: {x} | Query hash: y | Distributed request ID: {z}. Total size of data scanned is 10 megabytes, total size of data moved is 13 megabytes, total size of data written is 0 megabytes. (32 records affected)<br/>
Total execution time: 00:00:06.031<br/>
<br/><br/>
SELECT f.Extension, SUM((f.Size / 1073741824.0)) AS TotalGB, COUNT(f.size) AS NumFiles <br/>
FROM OPENROWSET(<br/> 
       'CosmosDB',<br/>
       'Account=CosmosAccount;Database=Learn;Key=YOURKEY',<br/>
       FileMetadata) as f<br/>
WHERE f.[File] like '/book/%'<br/>
GROUP BY f.Extension<br/><br/>
Statement ID: {x} | Query hash: y | Distributed request ID: {z}. Total size of data scanned is 69 megabytes, total size of data moved is 1 megabytes, total size of data written is 0 megabytes. (1 record affected)<br/>
Total execution time: 00:00:05.991<br/>

In [63]:
# Clean-up code. Left commented out; uncomment the line below to run it.
# Assumes `db` and `containerName` from the earlier cells are still defined in the kernel.
# NOTE(review): this only removes the container named by `containerName`; the
# 'FileMetadata2' container and the database itself are not deleted here.

# db.delete_container(containerName)