# IBM Cloud - Watson Assistant and Discovery Backup

In [1]:
# The code was removed by Watson Studio for sharing.

In [11]:
# Insert project credentials here: open the three-dot menu on the toolbar above, then choose 'Insert project token'

In [7]:
%%capture
!pip install ibm-watson
import csv
import io
import json
import threading
import time

import ibm_watson
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import ApiException
# ^This section installs/imports necessary Python packages

### Credentials

In [13]:
############################
# Do not delete this block.
# Every credentials variable starts out as an empty string. The Watson Assistant
# and Discovery backup cells compare their variable against '' and do nothing
# when it has not been replaced with real credentials.
wa_credentials = disc_credentials = cos_credentials = ''
############################

In [14]:
################# CREDENTIALS #################
# Each dict wrapped in braces {} holds the credentials for ONE service instance.
# - Several instances: add one dict per instance to the list.
# - A single instance: keep only one dict in the list and delete the others.
# - Skipping a service: delete its assignment from this cell, so the variable
#   keeps the empty default assigned in the "Do not delete this block" cell.

wa_credentials = [
    {
        'wa_version': 'yyyy-mm-dd',
        'wa_apikey': '123mykey',
        'wa_url': 'https://something.com/something',
    },
    {
        'wa_version': 'yyyy-mm-dd',
        'wa_apikey': '123mykey',
        'wa_url': 'https://something.com/something',
    },
]

disc_credentials = [
    {
        'disc_version': 'yyyy-mm-dd',
        'disc_apikey': '123mykey',
        'disc_url': 'https://something.com/something',
    },
    {
        'disc_version': 'yyyy-mm-dd',
        'disc_apikey': '123mykey',
        'disc_url': 'https://something.com/something',
    },
]

In [4]:
# The code was removed by Watson Studio for sharing.

### Helper functions for retrieving every document ID in a given Discovery collection (source: https://github.ibm.com/ba/all-the-disco-ids)

In [5]:
def pmap_helper(fn, output_list, input_list, i):
    """Apply `fn` to a single input item and record the result in place.

    Reads the element at position `i` of `input_list`, calls `fn` on it and
    writes the return value into the same position of `output_list`.
    Nothing is returned; `output_list` is mutated.
    """
    item = input_list[i]
    output_list[i] = fn(item)

def pmap(fn, input):
    """Apply `fn` to every item of `input` concurrently, one thread per item.

    Args:
        fn: One-argument callable; it is invoked once per item, each call on
            its own daemon thread.
        input: Any iterable. It is materialised into a list first, so
            generators are accepted.

    Returns:
        A list with the result of each call, in the same order as the items
        of `input`.

    Raises:
        Exception: If one or more calls raised, the exception of the
            lowest-indexed failing call is re-raised here, after every thread
            has finished.
    """
    input_list = list(input)
    output_list = [None] * len(input_list)
    # One slot per item so that a failure can be reported to the caller.
    # Without this, an exception inside a thread would only leave `None` in
    # the result slot, indistinguishable from a call that returned `None`.
    errors = [None] * len(input_list)

    def run(i):
        try:
            output_list[i] = fn(input_list[i])
        except Exception as exc:
            errors[i] = exc

    threads = [threading.Thread(target=run, args=(i,), daemon=True)
               for i in range(len(input_list))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    for exc in errors:
        if exc is not None:
            raise exc
    return output_list

def all_document_ids(discovery,
                     environmentId,
                     collectionId):
    """Return the IDs of the documents in one Discovery collection.

    Documents are enumerated by filtering on `extracted_metadata.sha1` with a
    hexadecimal prefix. Each round takes a prefix, extends it with each of the
    16 hex digits, and queries the 16 resulting prefixes concurrently through
    `pmap`. A prefix whose `matching_results` exceeds `chunk_size` is queued
    to be split further; otherwise the `id` of every returned result is
    collected.

    NOTE(review): documents without an `extracted_metadata.sha1` value
    presumably never match the filter and would not be returned - confirm.

    Args:
        discovery: Discovery client; must provide
            `query(environment_id, collection_id, count=, filter=,
            return_fields=)` returning an object with `get_result()`.
        environmentId: ID of the environment that owns the collection.
        collectionId: ID of the collection to enumerate.

    Returns:
        A list with the `id` of every matched document.

    Raises:
        ApiException: When a query fails with a client error (HTTP 4xx other
            than 408/429). Such errors cannot succeed on retry, so they are
            raised instead of being retried forever.
    """
    doc_ids = []
    alphabet = "0123456789abcdef"
    chunk_size = 10000
    # Pause between retries so a failing service is not hit in a tight loop.
    retry_delay_seconds = 1

    def is_fatal(error):
        # Client errors (bad credentials, unknown IDs, malformed request) will
        # fail identically on every retry; timeouts and rate limits may not.
        code = getattr(error, "code", None)
        return (isinstance(error, ApiException)
                and isinstance(code, int)
                and 400 <= code < 500
                and code not in (408, 429))

    def maybe_some_ids(prefix):
        while True:
            try:
                response = discovery.query(environmentId,
                                           collectionId,
                                           count=chunk_size,
                                           filter="extracted_metadata.sha1::"
                                           + prefix + "*",
                                           return_fields="extracted_metadata.sha1").get_result()
                break
            except Exception as e:
                if is_fatal(e):
                    # Hand the error back as a value: this function runs on a
                    # worker thread, so it is raised by the caller below.
                    return e
                print("will retry after error", e)
                time.sleep(retry_delay_seconds)

        if response["matching_results"] > chunk_size:
            # Too many matches for one query: the caller splits this prefix.
            return prefix
        return [item["id"] for item in response["results"]]

    prefixes_to_process = [""]
    while prefixes_to_process:
        prefix = prefixes_to_process.pop(0)
        prefixes = [prefix + letter for letter in alphabet]
        # `pmap` here does the requests to Discovery concurrently to save time.
        results = pmap(maybe_some_ids, prefixes)
        for result in results:
            if isinstance(result, Exception):
                raise result
            if isinstance(result, list):
                doc_ids += result
            else:
                prefixes_to_process.append(result)

    return doc_ids
############################################

## Watson Assistant Backup

In [None]:
def _wa_intents_to_csv(intents):
    """Render exported intents as CSV text, one `example,intent` row per example.

    Fields are written with `csv.writer`, so example text containing commas,
    quotes or line breaks is quoted instead of corrupting the row.
    """
    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator='\n')
    for intent in intents:
        for example in intent.get('examples') or []:
            writer.writerow([example['text'], intent['intent']])
    return buffer.getvalue()


def _wa_entities_to_csv(entities):
    """Render exported entities as CSV text, one row per entity value.

    Each row is `entity,value` followed by the value's synonyms, or by its
    patterns wrapped in slashes (`/pattern/`). Every pattern is written, and
    a value without synonyms or patterns yields just `entity,value`.
    """
    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator='\n')
    for entity in entities:
        for value in entity.get('values') or []:
            row = [entity['entity'], value['value']]
            value_type = value.get('type')
            if value_type == 'synonyms':
                row.extend(value.get('synonyms') or [])
            elif value_type == 'patterns':
                row.extend('/' + pattern + '/' for pattern in value.get('patterns') or [])
            writer.writerow(row)
    return buffer.getvalue()


# Back up every workspace of every configured Watson Assistant instance as
# project assets: the full workspace export (JSON) plus intents and entities CSVs.
# `wa_credentials` is still '' when no credentials were supplied for this service.
if wa_credentials != '':
    for creds in wa_credentials:
        wa_version = creds.get('wa_version', '')
        wa_apikey = creds.get('wa_apikey', '')
        wa_url = creds.get('wa_url', '')

        if not (wa_version and wa_apikey and wa_url):
            print("No or invalid Watson Assistant credentials detected for this instance. Skipping.")
            continue

        print("Starting Watson Assistant backup...")

        authenticator = IAMAuthenticator(wa_apikey)
        assistant_service = ibm_watson.AssistantV1(
            version=wa_version,
            authenticator=authenticator
        )
        assistant_service.set_service_url(wa_url)

        # Get all workspace IDs.
        # NOTE(review): list_workspaces() is called with its default paging; if an
        # instance holds more workspaces than one page returns, the remainder
        # would be missed - confirm against the API's page limit.
        try:
            list_wrkspc_response = assistant_service.list_workspaces().get_result()['workspaces']
        except ApiException as ex:
            print("Method failed with status code " + str(ex.code) + ": " + ex.message)
            # Skip this instance: continuing would reuse the previous instance's
            # workspace list (or fail on an undefined name).
            continue

        print("Getting workspace IDs...")
        all_wrkspc_ids = []
        for space in list_wrkspc_response:
            print("Backing up Workspace " + space['workspace_id'] + "...")
            all_wrkspc_ids.append(space['workspace_id'])

        for workspace_id in all_wrkspc_ids:
            try:
                workspace_response = assistant_service.get_workspace(
                    workspace_id=workspace_id,
                    export='true'
                ).get_result()
            except ApiException as ex:
                print("Method failed with status code " + str(ex.code) + ": " + ex.message)
                # Nothing was fetched for this workspace, so there is nothing to save.
                continue

            intentsCSV = _wa_intents_to_csv(workspace_response.get('intents') or [])
            entitiesCSV = _wa_entities_to_csv(workspace_response.get('entities') or [])

            project.save_data("wa_" + workspace_id + "_workspace.json", json.dumps(workspace_response), set_project_asset=True, overwrite=True)
            project.save_data("wa_" + workspace_id + "_intents.csv", intentsCSV, set_project_asset=True, overwrite=True)
            project.save_data("wa_" + workspace_id + "_entities.csv", entitiesCSV, set_project_asset=True, overwrite=True)

            print("Workspace " + workspace_id + " done.")
        print("Completed Watson Assistant backup.")
######## End Watson Assistant Backup ########

Starting Watson Assistant backup...
Getting workspace IDs...
Backing up Workspace 267787d9-476e-41c8-97e6-00af092d04a7...
Backing up Workspace 6ba5b98d-9a5e-4108-9b83-95a2fb10f9fc...
Backing up Workspace 7810f37b-d4f2-4c51-ab61-21db10ce89d1...
Backing up Workspace 4a80c2a9-5369-4eb4-a197-8f33aaadd6dc...
Backing up Workspace 41f1f401-0314-46d5-8197-d85fbf067ea9...
Workspace 267787d9-476e-41c8-97e6-00af092d04a7 done.
Workspace 6ba5b98d-9a5e-4108-9b83-95a2fb10f9fc done.
Workspace 7810f37b-d4f2-4c51-ab61-21db10ce89d1 done.
Workspace 4a80c2a9-5369-4eb4-a197-8f33aaadd6dc done.
Workspace 41f1f401-0314-46d5-8197-d85fbf067ea9 done.
Completed Watson Assistant backup.
Starting Watson Assistant backup...
Getting workspace IDs...
Backing up Workspace 3d3b2f7b-5690-4509-ad38-a5e505bffa44...
Backing up Workspace 029ce3e9-cc0f-41b3-96f9-76cf9f6f2468...
Backing up Workspace 4ab087d3-45a9-4d09-86b2-82fee0e5d859...
Backing up Workspace 5518b7f4-2fbe-4769-8857-f78a832b398d...
Backing up Workspace 72345af3

## Discovery Backup

In [None]:
def _backup_discovery_collection(discovery_service, environmentId, collectionId):
    """Save one collection's training data and documents as project assets.

    Writes `wds_<collectionId>_trainingdata.json` and one
    `wds_document_<documentId>.json` per document. An item whose request fails
    is reported and skipped, so data left over from a previous item is never
    saved under the wrong name.

    Returns:
        True when every item was saved, False when anything was skipped.
    """
    print("Backing up collection " + collectionId + "...")
    complete = True

    try:
        allDocIds = all_document_ids(discovery_service, environmentId, collectionId)
    except ApiException as ex:
        print("Discovery query failed with status code " + str(ex.code) + ": " + ex.message)
        return False

    try:
        training_data = discovery_service.list_training_data(environmentId, collectionId).get_result()
    except ApiException as ex:
        print("Discovery query failed with status code " + str(ex.code) + ": " + ex.message)
        complete = False
    else:
        project.save_data("wds_" + collectionId + "_trainingdata.json", json.dumps(training_data), set_project_asset=True, overwrite=True)

    for documentId in allDocIds:
        filterId = '_id:' + documentId
        try:
            results = discovery_service.query(environmentId, collectionId, filter=filterId).get_result()['results']
        except ApiException as ex:
            print("Discovery query failed with status code " + str(ex.code) + ": " + ex.message)
            complete = False
            continue

        if not results:
            # The ID was listed but the lookup returned nothing.
            print("Document " + documentId + " was not returned by Discovery. Skipping.")
            complete = False
            continue

        project.save_data("wds_document_" + documentId + ".json", json.dumps(results[0]), set_project_asset=True, overwrite=True)

    return complete


# Back up every collection of every configured Discovery instance.
# `disc_credentials` is still '' when no credentials were supplied for this service.
if disc_credentials != '':
    for creds in disc_credentials:
        disc_version = creds.get('disc_version', '')
        disc_apikey = creds.get('disc_apikey', '')
        disc_url = creds.get('disc_url', '')

        if not (disc_version and disc_apikey and disc_url):
            print("No or invalid Discovery credentials detected for this instance. Skipping.")
            continue

        print("Beginning Discovery backup...")

        authenticator = IAMAuthenticator(disc_apikey)
        discovery_service = ibm_watson.DiscoveryV1(
            version=disc_version,
            authenticator=authenticator
        )
        discovery_service.set_service_url(disc_url)

        try:
            environments = discovery_service.list_environments().get_result()["environments"]
        except ApiException as ex:
            print("Discovery query failed with status code " + str(ex.code) + ": " + ex.message)
            continue

        # Select environments by their properties rather than by list position:
        # a fixed index fails when the list is shorter or ordered differently.
        writable_environments = [
            env for env in environments
            if env.get("environment_id") != "system" and not env.get("read_only", False)
        ]
        if not writable_environments:
            print("No writable Discovery environment found for this instance. Skipping.")
            continue

        for environment in writable_environments:
            environmentId = environment["environment_id"]
            try:
                allCollections = discovery_service.list_collections(environmentId).get_result()['collections']
            except ApiException as ex:
                print("Discovery query failed with status code " + str(ex.code) + ": " + ex.message)
                continue

            # This script will loop through every collection in the given instance and save each document.
            # If you only want a specific collection to be backed up, remove the loop (below) and manually specify the collection ID.
            for collection in allCollections:
                collectionId = collection['collection_id']
                if _backup_discovery_collection(discovery_service, environmentId, collectionId):
                    print("Collection " + collectionId + " successfully backed up.")
                else:
                    print("Collection " + collectionId + " backed up with errors; see messages above.")

        print("Completed Discovery backup.")
######## End Discovery Backup ########