#### Bulk Export Entities to file
This notebook demonstrates an approach to bulk exporting entities by collection. It queries for all collections in a Purview account, iterates through each one, and exports its entities to JSON files on the local filesystem.

#### Configuration

In [1]:
import yaml, requests, json

# Read the service principal and Purview account details from the shared config file.
# NOTE(review): yaml.load with FullLoader is used on a local file here; if the
# config could ever come from an untrusted source, consider yaml.safe_load.
with open("../config/config.yaml", "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# The config is indexed as a list; its first entry holds the settings used below.
settings = config[0]

tenant_id = settings['tenant_id']
client_id = settings['client_id']
client_secret = settings['client_secret']
purview_account_name = settings['purview_account_name']

# Derived values: token scope, Purview REST endpoint and AAD authority URL
scope = f"{settings['resource']}/.default"
purview_endpoint = f"https://{purview_account_name}.purview.azure.com"
authority = f"https://login.microsoftonline.com/{tenant_id}"

# Placeholder; replaced with a real bearer token by the OAuth login cell
access_token = ""

#### Functions

In [7]:
def getHeaders():
    """Build the header dictionary used for the Purview HTTP requests.

    The bearer token is read from the module-level ``access_token`` each time
    the function is called, so the value current at call time is used.

    Parameters:
        None

    Returns:
        dict: ``Authorization`` (bearer token) and ``Content-Type`` headers.
    """
    return {
        'Authorization': f'Bearer {access_token}',
        'Content-Type': 'application/json',
    }

In [2]:
def queryCollection(limit, collection_id, guid):
    """Search the catalog for entities in one collection, starting after a guid.

    Results are requested sorted by id ascending and filtered to ids greater
    than ``guid``, so passing the last id of the previous page returns the
    next page.

    Parameters:
        limit:          Batch size to return (50-1000)
        collection_id:  The collection to search
        guid:           Search offset (only ids greater than this are returned)

    Returns:
        dict: The parsed JSON search result
        (https://learn.microsoft.com/en-us/rest/api/purview/catalogdataplane/discovery/query?tabs=HTTP#searchresult)

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    search_uri = f"{purview_endpoint}/catalog/api/search/query?api-version=2022-03-01-preview"

    # Paging is driven by the "id > guid" filter, so the numeric offset stays at 0.
    id_after_offset = {"id": {"operator": "gt", "value": guid}}
    payload = json.dumps({
        "keywords": None,
        "limit": limit,
        "offset": 0,
        "filter": {
            "and": [
                {"collectionId": collection_id},
                id_after_offset,
            ]
        },
        "orderby": [{
            "id": "asc"
        }]
    })

    response = requests.request("POST", search_uri, data=payload, headers=getHeaders())
    # Surface HTTP failures here rather than returning an error body as if it were a search result
    response.raise_for_status()
    return json.loads(response.content)

In [4]:
def getEntities(guids):
    """Return entity details for a list of guids via the Atlas bulk entity API.

    Parameters:
        guids:  List of guids to return

    Returns:
        list: The ``entities`` array of the bulk response (AtlasEntity
        objects as parsed JSON). An empty list is returned when ``guids`` is
        empty, without calling the service.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    # Nothing to look up: avoid sending a bulk request with no guid parameter
    if not guids:
        return []

    entities_uri = f"{purview_endpoint}/catalog/api/atlas/v2/entity/bulk"

    # A list of tuples lets requests emit one "guid=" pair per guid and handle
    # the URL encoding; minExtInfo is sent as the literal string "true".
    query_params = [("guid", guid) for guid in guids]
    query_params.append(("minExtInfo", "true"))

    response = requests.request("GET", entities_uri, params=query_params, headers=getHeaders())
    response.raise_for_status()

    entity_results = json.loads(response.content)
    return entity_results.get('entities', [])

#### OAuth Login
Authenticate with the Microsoft Authentication Library (MSAL) using the client credentials from the configuration, and obtain a bearer token for subsequent API calls.

In [5]:
from msal import ConfidentialClientApplication

# Sign in as the application itself (client credentials) against the tenant authority
app = ConfidentialClientApplication(client_id, authority=authority, client_credential=client_secret)

# MSAL documents `scopes` as a list of strings, so wrap the single ".default" scope
result = app.acquire_token_for_client(scopes=[scope])

# On failure MSAL returns a dict describing the error instead of a token;
# report that description rather than failing with a bare KeyError.
if "access_token" not in result:
    raise RuntimeError(
        f"Token acquisition failed: {result.get('error')} - {result.get('error_description')}"
    )

access_token = result['access_token']

#### Collection query
Get the available collections and add to a list

In [9]:
uri = f"{purview_endpoint}/account/collections?api-version=2019-11-01-preview"

headers = getHeaders()
collections_response = requests.request("GET", uri, headers=headers)
# Stop on an HTTP error (e.g. expired token, missing permission) instead of
# hitting a KeyError on 'value' when the error body is parsed below
collections_response.raise_for_status()
collections_list = json.loads(collections_response.content)

print(f"Number of collections found: {len(collections_list['value'])} ")

Number of collections found: 7 


#### Extract assets
Extract assets from each collection and save them to files. The batch size determines how many entities are queried at a time and written to each file.

In [14]:
import math, os

# Number of entities requested per search call and written to each output file
batch_size = 100

folder_name = f"export-{purview_account_name}"
# Create the export directory if it doesn't exist
os.makedirs(folder_name, exist_ok=True)

# Iterate through each collection
for collection in collections_list['value']:

    collection_id = collection['name']
    # Use the offset of "0" to get the first batch
    guid_offset = "0"

    print(f"{collection['friendlyName']} ({collection['name']})")

    # Execute a search against the collection to get the first batch
    results = queryCollection(batch_size, collection_id, guid_offset)

    # Get the total number of results from the search
    total_num_of_entities_in_collection = int(results.get("@search.count"))
    print(f"Total number of entities in collection {collection['friendlyName']}: {total_num_of_entities_in_collection}")

    # Calculate the number of batches needed
    number_of_batches = math.ceil(total_num_of_entities_in_collection / batch_size)
    print(f"number of batches needed with batch size of {batch_size}: {number_of_batches}")

    # Loop through each batch
    for batch in range(number_of_batches):
        # Collect the guid of every entity returned for this batch
        batch_guids = [entity['id'] for entity in results['value']]

        # An empty page means there is nothing left to export for this collection
        if not batch_guids:
            break

        print(f"Processing batch {batch+1} of {number_of_batches}")

        # Fetch the full entity details for the guids in this batch
        entities = getEntities(batch_guids)

        # Export the entities as json; os.path.join keeps the path portable across operating systems
        output_path = os.path.join(folder_name, f"{collection['name']}-{batch}.json")
        with open(output_path, "w") as outfile:
            json.dump(entities, outfile)

        # Results are sorted by id ascending, so the last guid of this batch is
        # the offset for the next one (the filter expects the guid string itself)
        guid_offset = batch_guids[-1]

        # Only query again when another batch is expected
        if batch + 1 < number_of_batches:
            results = queryCollection(batch_size, collection_id, guid_offset)

pvdemoarfj5-pv (pvdemoarfj5-pv)
Total number of entities in collection pvdemoarfj5-pv: 65
number of batches needed with batch size of 100: 1
Processing batch 1 of 1
Sales (igvbjq)
Total number of entities in collection Sales: 19
number of batches needed with batch size of 100: 1
Processing batch 1 of 1
Marketing (jwvcaz)
Total number of entities in collection Marketing: 11
number of batches needed with batch size of 100: 1
Processing batch 1 of 1
Share (gefl2r)
Total number of entities in collection Share: 0
number of batches needed with batch size of 100: 0
API Collection (api-collection)
Total number of entities in collection API Collection: 4
number of batches needed with batch size of 100: 1
Processing batch 1 of 1
camden (6anio9)
Total number of entities in collection camden: 50
number of batches needed with batch size of 100: 1
Processing batch 1 of 1
API Sub Collection (j6wacz)
Total number of entities in collection API Sub Collection: 0
number of batches needed with batch size 