In [None]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# NOTE(review): `{'project-id'}` and `{'project-access-token'}` look like template
# placeholders - presumably they must be replaced with the real project id and
# access token before this cell is run; confirm against the project settings.
from project_lib import Project
# `project` is used later in this notebook to stage the transformed document
# (`save_data`) and read it back (`get_file`) before it is pushed to Discovery.
project = Project(project_id={'project-id'}, project_access_token={'project-access-token'})
# `pc` is not referenced by any other cell visible in this notebook.
pc = project.project_context


# Watson Discovery Service Document Reingestion
By [Morgan Langlais](https://github.com/modlanglais/)


# TODO: Directions

In [None]:
%%capture
!pip install ibm-watson

In [None]:
from ibm_watson import DiscoveryV1

import json
import csv
import json
import threading

# @hidden_cell
# Insert the appropriate Discovery credentials here
# NOTE(review): the `{'...'}` values below look like template placeholders -
# presumably each must be replaced with a plain string before running
# (the later cells concatenate `collectionId` with strings); confirm.
environmentId = {'environment-id'}
collectionId = {'collection-id'}
# First positional argument is the service version; the API key is passed as
# `iam_apikey`. `discovery` is the client used by every cell below.
discovery = DiscoveryV1(
    {'version'},
    iam_apikey={'iam-apikey'})

# Fetch the collection details and report how many documents are currently
# available, as a baseline before any document is re-ingested.
collection = discovery.get_collection(
    environmentId,
    collectionId,
).get_result()
totalDocuments = collection['document_counts']['available']
print(
    "**Total number of documents in collection "
    + collectionId
    + ": "
    + str(totalDocuments)
)

## This section gets a list of all the document IDs in a given collection

In [None]:
def pmap_helper(fn, output_list, input_list, i):
    """
    Apply `fn` to the element at position `i` of `input_list` and store the
    result at the same position of `output_list`.

    Used as the thread target by `pmap`, so each worker writes to its own slot.
    """
    item = input_list[i]
    result = fn(item)
    output_list[i] = result

def pmap(fn, input):
    """
    Concurrent `map`: call `fn` on every element of `input`, each call on its
    own daemon thread, and return the results as a list in input order.

    All threads are started before any is joined, and the function returns
    only after every thread has finished. A slot whose call did not store a
    result stays `None`.
    """
    items = list(input)
    results = [None] * len(items)

    workers = []
    for index in range(len(items)):
        worker = threading.Thread(
            target=pmap_helper,
            args=(fn, results, items, index),
            daemon=True,
        )
        workers.append(worker)

    # Start everything first so the calls overlap, then wait for all of them.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    return results

def all_document_ids(discovery,
                     environmentId,
                     collectionId,
                     max_retries=None):
    """
    Return a list of all of the document ids found in a
    Watson Discovery collection.

    The ids are gathered by filtering on hexadecimal prefixes of
    `extracted_metadata.sha1`. Whenever a prefix matches more documents than
    one query can return, it is split into 16 longer prefixes and retried.

    NOTE(review): documents that have no `extracted_metadata.sha1` value
    presumably never match any prefix filter and would be missing from the
    result - verify against your collection.

    The arguments to this function are:
    discovery     - an instance of DiscoveryV1
    environmentId - an environment id found in your Discovery instance
    collectionId  - a collection id found in the environment above
    max_retries   - optional; maximum number of retries per query after a
                    failure. `None` (the default) retries indefinitely.

    Raises the last query exception if `max_retries` is set and exhausted.
    """
    import time  # local import so this cell does not depend on the import cell

    doc_ids = []
    alphabet = "0123456789abcdef"   # Hexadecimal digits, lowercase
    chunk_size = 10000
    max_delay_seconds = 30          # upper bound for the retry back-off

    def maybe_some_ids(prefix):
        """
        A helper function that does the query and returns either:
        1) A list of document ids
        2) The `prefix` that needs to be subdivided into more focused queries
        3) The exception of the last attempt, once `max_retries` is exhausted
           (returned rather than raised, because this runs on a worker thread
           where a raised exception would not reach the caller)
        """
        failures = 0
        while True:
            try:
                response = discovery.query(environmentId,
                                           collectionId,
                                           count=chunk_size,
                                           filter="extracted_metadata.sha1::"
                                           + prefix + "*",
                                           return_fields="extracted_metadata.sha1").get_result()
                break
            except Exception as e:
                failures += 1
                if max_retries is not None and failures > max_retries:
                    return e
                print("will retry after error", e)
                # Back off (1s, 2s, 4s, ... capped) instead of retrying in a
                # tight loop that hammers the service while it is failing.
                time.sleep(min(max_delay_seconds, 2 ** min(failures - 1, 5)))

        if response["matching_results"] > chunk_size:
            return prefix
        return [item["id"] for item in response["results"]]

    prefixes_to_process = [""]
    while prefixes_to_process:
        prefix = prefixes_to_process.pop(0)
        prefixes = [prefix + letter for letter in alphabet]
        # `pmap` here does the requests to Discovery concurrently to save time.
        results = pmap(maybe_some_ids, prefixes)
        for result in results:
            if isinstance(result, list):
                doc_ids += result
            elif isinstance(result, Exception):
                # Surface a permanently failing query in the calling thread.
                raise result
            else:
                prefixes_to_process.append(result)

    return doc_ids


# Collect every document id in the collection once; the cells below iterate
# over this list.
allDocIds = all_document_ids(discovery, environmentId, collectionId)

# Echo the ids, one per line.
for current_id in allDocIds:
    print(current_id)


## Iterate through each document in the collection and transform it according to the block of code below

In [None]:
# Re-ingest every document: fetch it, transform it, and push it back to
# Discovery under the same document id.
totalToUpdate = len(allDocIds)
for position, documentId in enumerate(allDocIds, start=1):
    print("**Updating document #" + documentId + "...")
    # Progress counter: position of this document in the run (1-based).
    print("(" + str(position) + "/" + str(totalToUpdate) + ")")
    filterId = '_id:' + documentId

    # 1.) Get document from Discovery collection
    queryResults = discovery.query(environmentId, collectionId, filter=filterId).get_result()['results']
    if not queryResults:
        # The document may have been removed since the ids were collected;
        # skip it instead of aborting the whole run with an IndexError.
        print("**Document #" + documentId + " was not found; skipping.")
        continue
    discQuery = queryResults[0]
    # print("**Document #" + documentId + " query results: ")
    # print(json.dumps(discQuery, indent=2))
    filename = discQuery['extracted_metadata']['filename']

    #****************TRANSFORM HERE****************#
    # 2.) Apply changes to your document however you would like. In this example, I move metadata fields up one level
    transformedDoc = discQuery
    if 'metadata' in transformedDoc:
        metadatafield = transformedDoc['metadata']
        for field in metadatafield:
            transformedDoc[field] = metadatafield[field]
        del transformedDoc['metadata']

    # Deleting reserved, Watson-populated fields (tolerate their absence)
    transformedDoc.pop('id', None)
    transformedDoc.pop('result_metadata', None)

    print("**Transformed document: ")
    print(json.dumps(transformedDoc, indent=2))
    #******************END TRANSFORM******************#

    # Stage the transformed JSON as a project asset so it can be read back as
    # a file object for the upload below; the same asset is overwritten on
    # every iteration.
    projectFileName = 'currentDiscoveryDoc.json'
    project.save_data(projectFileName, json.dumps(transformedDoc), set_project_asset=True, overwrite=True)

    # 3.) Push updated document to Discovery.
    discUpdate = discovery.update_document(environmentId, collectionId, documentId, file=project.get_file(projectFileName), filename=filename, accept_json=True).get_result()
    # print(json.dumps(discUpdate, indent=2))

    print("**Finished updating document #" + documentId + "...")
    print()
    print("**************************************************************************")
    print()

## Get each document status
#### Note: The filename may appear as `null` in the results. This is normal and expected, because the Discovery `update_document()` call takes some time to finish processing.

In [None]:
# Print the status Discovery reports for every document that was re-submitted.
for documentId in allDocIds:
    print("**After Reingestion Document Status: ")
    # The status is looked up directly by document id, so no query filter is needed.
    discQueryStatus = discovery.get_document_status(environmentId, collectionId, documentId).get_result()
    print(json.dumps(discQueryStatus, indent=2))

In [None]:
# Re-read the collection after re-ingestion and report the available document
# count again, for comparison with the count printed at the start.
collection = discovery.get_collection(environmentId, collectionId).get_result()
totalDocuments = collection['document_counts']['available']
summaryLine = "**Total number of documents in collection " + collectionId + ": " + str(totalDocuments)
print(summaryLine)