#### Bulk Glossary Updates
This notebook demonstrates an approach to bulk updating entities with glossary terms, but could be applied to other types of updates.  This example includes code to synthetically create a number of entities and remove them for the purposes of scale testing.

#### Configuration

In [None]:
import yaml, requests, json

# Read the client (service principal) and account details from the YAML config file.
# NOTE(review): yaml.load is given FullLoader here; if the config only holds plain
# mappings/lists/scalars, yaml.safe_load would be the more conservative choice - confirm.
with open("../config/config.yaml", "r") as yamlfile:
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)

# All settings are taken from the first element of the parsed document
client_id = config[0]["client_id"]
client_secret = config[0]["client_secret"]
tenant_id = config[0]["tenant_id"]
purview_account_name = config[0]["purview_account_name"]

# Values derived from the settings above
scope = "{}/.default".format(config[0]["resource"])
purview_endpoint = "https://{}.purview.azure.com".format(purview_account_name)
authority = "https://login.microsoftonline.com/{}".format(tenant_id)

# Starts empty; the OAuth login cell assigns the bearer token to this name
access_token = ""

In [None]:
# Collection reference
collection_id = 'api-bulk-glossary'
collection_friendly_name = 'API Bulk Glossary'

# Search and glossary parameters

# FQDN prefix: used as the qualifiedName prefix of the generated assets and as the search pattern
fqdn_prefix = 'mssql://test-db.database.windows.net/test-server/test-database/test-schema'

# GUID of the glossary term to apply to the search results
term_guid = 'ea23276b-5d3e-475d-b880-7f8ce3f7eb51'

#### Functions

In [None]:
# Function to build the headers sent with every HTTP request in this notebook
# Parameters:  None
# Returns: Dictionary with the bearer Authorization header and a JSON Content-Type
##
def getHeaders():
    """Return the request headers, reading the current value of the global access_token."""
    return {
        'Authorization': 'Bearer {}'.format(access_token),
        'Content-Type': 'application/json',
    }

In [None]:
# Function to search catalog for a given fqdn pattern and a guid.  Returns a searchresult object
# Parameters:
#   limit:   Batch size to return (50-1000)
#   fqdn:    The FQDN prefix to search for
#   guid:    search offset (will filter for results greater than this)
# Returns:  Dict parsed from the JSON search response, with results sorted by guid (https://learn.microsoft.com/en-us/rest/api/purview/catalogdataplane/discovery/query?tabs=HTTP#searchresult)
# Raises:   requests.HTTPError if the search endpoint answers with a 4xx/5xx status
##
def queryForNextBatch(limit, fqdn, guid):
    """Return the next page of search results whose id is greater than `guid`.

    Paging is keyset-style: the request always uses offset 0, orders by id
    ascending and filters on id > guid, so the caller passes the last id it
    has already processed to obtain the following page.
    """
    search_uri = f"{purview_endpoint}/catalog/api/search/query?api-version=2022-03-01-preview"
    payload = json.dumps({
        "keywords": None,
        "limit": limit,
        "filter": {
            "and": [
                {
                    # Only ids after the last one already processed
                    "id": {
                        "operator": "gt",
                        "value": guid
                    }
                },
                {
                    # Only assets whose qualified name starts with the given prefix
                    "attributeName": "qualifiedName",
                    "operator": "startswith",
                    "attributeValue": fqdn
                }
            ]
        },
        "offset": 0,
        "orderby": [{
            "id": "asc"
        }]
    })

    response = requests.request("POST", search_uri, data=payload, headers=getHeaders())
    # Fail here with the HTTP status rather than handing an error body to the
    # caller, which would otherwise trip over a missing '@search.count'/'value' key.
    response.raise_for_status()
    return json.loads(response.content)

#### OAuth Login
Perform Authentication using the Microsoft Authentication Library and get a bearer token for subsequent API calls

In [None]:
from msal import ConfidentialClientApplication

# Client-credentials flow: authenticate as the application itself (no user involved)
app = ConfidentialClientApplication(client_id, authority=authority, client_credential=client_secret)

# MSAL expects the scopes as a list/tuple/set; a bare string is rejected
result = app.acquire_token_for_client(scopes=[scope])

# On failure MSAL returns a dict without 'access_token'; surface its error details
# instead of failing with a bare KeyError
if 'access_token' not in result:
    raise RuntimeError(
        f"Token acquisition failed: {result.get('error')} - {result.get('error_description')}"
    )

access_token = result['access_token']

#### Initial Setup
(Optional) Create a collection and populate it with some assets

In [None]:
# Initial setup:  register the collection that the test assets will be placed in

# PUT the collection definition; its parentCollection references the Purview account name
uri = f"{purview_endpoint}/account/collections/{collection_id}?api-version=2019-11-01-preview"
payload = json.dumps(dict(
    friendlyName=collection_friendly_name,
    parentCollection=dict(referenceName=purview_account_name),
))
response = json.loads(requests.put(uri, headers=getHeaders(), data=payload).content)

In [None]:
num_entities_to_create = 10000         # Define the number of entities to create (creates Azure SQL tables)
entities = []                         # Initialise a list to store each entity JSON

# Build one azure_sql_table definition per entity.
# Each entity gets a negative placeholder guid ("-1", "-2", ...) and a
# zero-padded, five-digit suffix in its name and qualified name.
for entity_number in range(1, num_entities_to_create + 1):
    # Named table_suffix (not `id`) so the built-in id() is not shadowed in the kernel
    table_suffix = str(entity_number).zfill(5)
    entities.append({
      "typeName": "azure_sql_table",
      "guid": f"-{entity_number}",
      "attributes": {
        "name": f"Table {table_suffix}",
        "qualifiedName": f"{fqdn_prefix}/Test-Table-{table_suffix}",
        "description": f"Bulk created asset Test-Table-{table_suffix}"
      }
    })

# Define the (collection) bulk create or update URI and push the JSON payload to it
uri = f'{purview_endpoint}/catalog/api/collections/{collection_id}/entity/bulk?api-version=2022-03-01-preview'
payload = json.dumps({
  "referredEntities": {},
  "entities": entities
})
bulk_response = requests.request("POST", uri, headers=getHeaders(), data=payload)
# Stop here if the bulk call was rejected (auth, throttling, payload size, ...)
# rather than continuing with assets that were never created
bulk_response.raise_for_status()
response = json.loads(bulk_response.content)

Create a glossary term

In [None]:
# NOTE(review): the markdown above this cell says it creates a glossary term, but the
# code below only re-issues the collection PUT (same URI and payload as the initial
# setup cell); no glossary term is created here. TODO confirm the intended call.
# add_term_uri is assigned but not used by any statement in this cell.
add_term_uri = f"{purview_endpoint}/account/collections/{collection_id}?api-version=2019-11-01-preview"


# PUT the collection definition for the assets (parentCollection references the account name)
uri = f"{purview_endpoint}/account/collections/{collection_id}?api-version=2019-11-01-preview"
payload = json.dumps({
    "friendlyName": collection_friendly_name,
    "parentCollection": {
        "referenceName": purview_account_name
    }
})
response = json.loads(requests.request("PUT", uri, headers=getHeaders(), data=payload).content)

#### Search, retrieve and bulk update
Retrieve each asset in the collection and update with a glossary term

In [None]:
import math

# Endpoint that assigns the glossary term to a list of entities
uri = f"{purview_endpoint}/catalog/api/atlas/v2/glossary/terms/{term_guid}/assignedEntities"
batch_size = 500

# Get the first batch (search using the fqdn and using zero guid, effectively starting from the first ordered guid)
results = queryForNextBatch(batch_size, fqdn_prefix, "0")

# A response without a count is not a search result (e.g. an error body); fail clearly
if "@search.count" not in results:
    raise RuntimeError(f"Search did not return a result count: {results}")

# Get the total number of results from the search
total_num_of_entities = int(results["@search.count"])
print(f"Total number of entities to update: {total_num_of_entities}")

# Calculate the number of batches needed
number_of_batches = math.ceil(total_num_of_entities / batch_size)

print(f"number of batches needed with batch size of {batch_size}: {int(number_of_batches)}")

# Loop through each batch
for batch in range(number_of_batches):
    print(f"Processing batch {batch+1} of {int(number_of_batches)}")

    # Collect the guids returned by the current query
    batch_guids = [{"guid": f"{entity['id']}"} for entity in results.get('value', [])]

    # The search can return fewer pages than the count suggested; stop rather
    # than posting an empty list or paging from an undefined/stale guid
    if not batch_guids:
        print("No more entities returned by the search; stopping")
        break

    # For this guid batch, call the bulk assign API
    payload = json.dumps(batch_guids)
    response = requests.request("POST", uri, headers=getHeaders(), data=payload)

    # Throttling or timeouts show up as a non-success status. The guids for this
    # batch are still in batch_guids, so retry logic could be added here; for now
    # report the failure and carry on with the next batch.
    if not response.ok:
        print(f"Batch {batch+1} failed with HTTP {response.status_code}: {response.text}")

    # Query for the next batch passing in the last guid processed
    # (skipped after the final batch, where the result would never be used)
    if batch + 1 < number_of_batches:
        results = queryForNextBatch(batch_size, fqdn_prefix, batch_guids[-1]['guid'])




#### Cleanup
Delete the assets that were created and the collection

In [None]:
num_entities_for_deletion = 1
delete_batch_size = 100

# Repeatedly search from the start of the id ordering and delete whatever is
# returned, until the search reports nothing left for the prefix.
while num_entities_for_deletion > 0:
    entities_for_deletion = queryForNextBatch(delete_batch_size, fqdn_prefix, "0")
    num_entities_for_deletion = entities_for_deletion['@search.count']

    print(f"{num_entities_for_deletion} remaining for deletion")

    guids_to_delete = [entity['id'] for entity in entities_for_deletion['value']]

    # Nothing returned: do not send a DELETE without any guid, and do not keep
    # looping on a count that no longer has matching results
    if not guids_to_delete:
        break

    # Bulk delete takes the guids as repeated 'guid' query parameters
    delete_uri = (
        f"{purview_endpoint}/catalog/api/atlas/v2/entity/bulk?"
        + "&".join(f"guid={guid}" for guid in guids_to_delete)
    )

    delete_response = requests.request("DELETE", delete_uri, headers=getHeaders())
    # A failing delete would otherwise leave the same entities in the next
    # search and make this loop run forever
    delete_response.raise_for_status()

In [None]:
# Delete the collection
# Uses the same /account/collections route the collection was created with;
# the original URI omitted the /account segment.
delete_uri = f"{purview_endpoint}/account/collections/{collection_id}?api-version=2019-11-01-preview"
headers = getHeaders()
# Left as the last expression so the cell displays the response status
requests.request("DELETE", delete_uri, headers=headers)