In [None]:
# Install required libraries
%pip install azure-cosmos
%pip install faker
%pip install jsonschema

In [1]:
from azure.cosmos import CosmosClient, PartitionKey
from configparser import ConfigParser
from faker import Faker

import os
import json
import uuid

parser = ConfigParser()
parser.read('../NotebookConfig.cfg')

cosmosAccountURI = parser.get('CosmosDB', 'COSMOSDB_ACCOUNT_URI')
cosmosAccountKey = parser.get('CosmosDB', 'COSMOSDB_ACCOUNT_KEY')

databaseName = 'Learn'
containerName = 'MetadataPipeline'
partitionKeypath = '/PartitionKey'
osPath = './OutputFiles/'

Faker.seed(0)
fake = Faker(['en-US'])

if not os.path.exists(osPath):
    os.mkdir(osPath)

In [2]:
client = CosmosClient(cosmosAccountURI, cosmosAccountKey)
db = client.create_database_if_not_exists(databaseName)

pkPath = PartitionKey(path=partitionKeypath)
ctr = db.create_container_if_not_exists(id=containerName, partition_key=pkPath, offer_throughput=400) 

### Load sample documents

In [4]:
mappings = []
for i in range(20):
    mappings.append(
            {
                'mapId': i
                , 'mapSrcTag': 'tag_' + str(i)
                , 'mapSrcColumn': 'column_' + str(i)
                , 'mapTrgTag': 'tag_' + str(i)
                , 'mapTrgColumn': 'destcolumn_' + str(i)
                , 'dataType': 'String'                
            })

for j in range(5):
    doc = {
        'id': 'map.' + str(j)
        , 'PartitionKey': 'dom_example1'
        , 'description': 'my awesome mapper'
        , 'entity': 'what that parquet holds?'
        , 'timestamp': fake.date_time_this_year().timestamp()
        , 'dsMap': mappings    
    }

    ctr.create_item(doc)

In [11]:
# Copy docs across partitions
sourcePartition = 'dom_example1'
destPartition = 'dom_example2'

items = list(ctr.query_items(
        query="SELECT * FROM c WHERE c.PartitionKey= '" + str(sourcePartition) + "'",
        enable_cross_partition_query=False
    ))

for item in items:    
    item['PartitionKey'] = destPartition
    ctr.create_item(item)   


## Schema validation

In [23]:
import jsonschema
import json

mappingSchema = {
    "type": "object",
    "properties": {
        "mapId": {"type": "number"},
        "mapSrcTag": {"type": "string"},
        "mapSrcColumn": {"type": "string"},
        "mapTrgTag": {"type": "string"},
        "mapTrgColumn": {"type": "string"},
        "dataType": {"type": "string"},                        
    },
    "required": ["mapId", "mapSrcTag", "mapSrcColumn", "mapTrgTag", "mapTrgColumn", "dataType"]
}

sourcePartition = 'dom_example1'
items = list(ctr.query_items(
        query="SELECT * FROM c WHERE c.PartitionKey= '" + str(sourcePartition) + "'",
        enable_cross_partition_query=False
    ))

for item in items:
    for map in item['dsMap']:
        try: 
            jsonschema.validate(map, mappingSchema)
            print('All Good!')
        except jsonschema.exceptions.ValidationError as e:
            print ('Not valid!', e)


Not valid! 'dataType' is a required property

Failed validating 'required' in schema:
    {'properties': {'dataType': {'type': 'string'},
                    'mapId': {'type': 'number'},
                    'mapSrcColumn': {'type': 'string'},
                    'mapSrcTag': {'type': 'string'},
                    'mapTrgColumn': {'type': 'string'},
                    'mapTrgTag': {'type': 'string'}},
     'required': ['mapId',
                  'mapSrcTag',
                  'mapSrcColumn',
                  'mapTrgTag',
                  'mapTrgColumn',
                  'dataType'],
     'type': 'object'}

On instance:
    {'WRONG': 'String',
     'mapId': 0,
     'mapSrcColumn': 'column_0',
     'mapSrcTag': 'tag_0',
     'mapTrgColumn': 'destcolumn_0',
     'mapTrgTag': 'tag_0'}
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All Good!
All 