# Loading JSON Output of Speech-to-Text from Cloud Object Storage

In [1]:
# @hidden_cell
# Connection settings for the IBM Cloud Object Storage bucket that holds the
# Speech-to-Text output.
#
# The two secrets (service ID and API key) are read from environment variables
# instead of being written into the notebook, so the notebook can be shared or
# committed without leaking them. Set COS_IAM_SERVICE_ID and COS_IBM_API_KEY_ID
# in the kernel's environment before running this cell; a missing variable
# raises KeyError here rather than failing later with an opaque auth error.
# The non-secret settings keep their previous values as defaults and can be
# overridden through the corresponding COS_* variables.
import os

json_credentials = {
    'IAM_SERVICE_ID': os.environ['COS_IAM_SERVICE_ID'],
    'IBM_API_KEY_ID': os.environ['COS_IBM_API_KEY_ID'],
    'ENDPOINT': os.environ.get('COS_ENDPOINT', 'https://s3-api.us-geo.objectstorage.service.networklayer.com'),
    'IBM_AUTH_ENDPOINT': os.environ.get('COS_IBM_AUTH_ENDPOINT', 'https://iam.bluemix.net/oidc/token'),
    'BUCKET': os.environ.get('COS_BUCKET', 'watsonmltutorial-donotdelete-pr-avritkhxkv7wgw'),
    'FILE': os.environ.get('COS_FILE', 'nixon_speech.json')
}

In [2]:
from ibm_botocore.client import Config
import ibm_boto3

# Gather the client settings in one mapping so each value's origin is easy to scan,
# then build the Cloud Object Storage client (S3-compatible API) from it.
cos_client_settings = {
    'service_name': 's3',
    'ibm_api_key_id': json_credentials['IBM_API_KEY_ID'],
    # the client's "service instance id" is filled from the IAM_SERVICE_ID entry
    'ibm_service_instance_id': json_credentials['IAM_SERVICE_ID'],
    'ibm_auth_endpoint': json_credentials['IBM_AUTH_ENDPOINT'],
    'config': Config(signature_version='oauth'),
    'endpoint_url': json_credentials['ENDPOINT'],
}
cos = ibm_boto3.client(**cos_client_settings)

In [3]:
# Download the Speech-to-Text output from the bucket into the working directory.
# The object key comes from json_credentials['FILE'] so the object name is
# configured in one place; the local filename stays 'nixon_speech.json' because
# the parsing cell below opens that exact path.
cos.download_file(
    Bucket=json_credentials['BUCKET'],
    Key=json_credentials['FILE'],
    Filename='nixon_speech.json',
)

# Parsing out and constructing JSON data into a useful dictionary (only 1 transcript)

Store and print the different parts of the data returned by Watson's Speech-to-Text (STT) service. From this data we can start to run analytics. We would want to save the "transcript" portion back out to the PCP; the rest of the data we will probably not want to store. The "confidence" and the "words" will be used for creating and testing the custom language and acoustic models.

In [4]:
import json

# Load the downloaded Speech-to-Text response. The context manager closes the
# file handle (the previous open(...).read() never did), and the encoding is
# pinned to UTF-8 so parsing does not depend on the machine's locale.
with open("nixon_speech.json", encoding="utf-8") as stt_file:
    json_dict = json.load(stt_file)

# dictionary full of nested dicts because of meta-data, parsing out
result = json_dict.get("result")
results = result.get("results")

# watson's json response breaks data up into partial dictionaries and needs to be restitched
word_confidence_hub = []      # flat list of every per-word confidence entry
partial_confidence_hub = []   # one confidence score per partial result
transcript_parts = []         # partial transcripts, in order
chunks = []                   # the untouched partial dicts, kept for reference

# commence the stitching
for partial_dict in results:
    # only the first alternative of each partial result is used
    alternatives = partial_dict.get("alternatives")[0]

    partial_confidence_hub.append(alternatives.get("confidence"))
    transcript_parts.append(alternatives.get("transcript"))
    # the word confidence is nested per partial result... un-nesting it
    word_confidence_hub.extend(alternatives.get("word_confidence"))

    # add the partial_dict for reference purposes
    chunks.append(partial_dict)

# join once at the end instead of growing a string inside the loop
full_transcript = "".join(transcript_parts)

# overall confidence is the plain mean of the partial confidences; with no
# partial results there is nothing to average, so report None instead of
# raising ZeroDivisionError
if partial_confidence_hub:
    confidence = sum(partial_confidence_hub) / len(partial_confidence_hub)
else:
    confidence = None

# reconstructing the dictionary into something more useful and returning
return_dict = {
    "transcript": full_transcript,
    "confidence": confidence,
    "words": word_confidence_hub,
    "chunks": chunks,
}

In [5]:
# show the stitched transcript text
transcript_text = return_dict["transcript"]
print(transcript_text)



In [6]:
# show the overall confidence computed from the partial confidences
overall_confidence = return_dict["confidence"]
print(overall_confidence)

0.8109411764705887


In [7]:
# would print out all of the words, with their confidence... commenting out to save space
# print(return_dict["words"])

In [8]:
# Would print out all of the sub-dictionaries. The transcription service segments and scores the audio to preserve accuracy; we would want these chunks for testing and for determining the speaker.
# print(return_dict["chunks"])

# Using Watson's Tone Analyzer service

https://cloud.ibm.com/apidocs/tone-analyzer?language=python

In [9]:
import os

from watson_developer_cloud import ToneAnalyzerV3 as TA

# Build the Tone Analyzer client. The API key is read from the
# TONE_ANALYZER_APIKEY environment variable instead of being written into the
# notebook, so the notebook can be shared without leaking it; a missing
# variable raises KeyError here.
tone_analyzer = TA(
    version='2017-09-21',
    iam_apikey=os.environ['TONE_ANALYZER_APIKEY'],
    url='https://gateway.watsonplatform.net/tone-analyzer/api',
)

# send the stitched transcript as plain text; the response is parsed in the next cell
json_data = tone_analyzer.tone(tone_input=return_dict["transcript"], content_type="text/plain")

## Formatting the returned dictionary into a readable dict (each tone is a key)

In [10]:
# Round-trip the Tone Analyzer response through its string form to obtain a plain dictionary.
# NOTE(review): the target name `dict` shadows the Python builtin for the rest of the
# kernel session; it is kept because the following cell reads it.
result_string = str(json_data)
# swap any doubled single quotes for one double quote before handing the text to the parser
json_acceptable_string = result_string.replace("''", '"')
dict = json.loads(json_acceptable_string)

In [11]:
# Flatten the parsed tone response into {tone name: score}.
# (`dict` here is the parsed response from the previous cell, not the builtin.)
result = dict.get("result")
# list of tone entries found under "document_tone" -> "tones"
tone_dict = result["document_tone"]["tones"]
new_dict = {entry["tone_name"]: entry["score"] for entry in tone_dict}

print(new_dict)

{'Analytical': 0.578542, 'Anger': 0.628574}


# Using Watson's Personality Insights service

https://cloud.ibm.com/apidocs/personality-insights

In [12]:
import os

from watson_developer_cloud import PersonalityInsightsV3 as PI

# Build the Personality Insights client. The API key is read from the
# PERSONALITY_INSIGHTS_APIKEY environment variable instead of being written
# into the notebook, so the notebook can be shared without leaking it; a
# missing variable raises KeyError here.
personality_insights = PI(
    version='2017-10-13',
    iam_apikey=os.environ['PERSONALITY_INSIGHTS_APIKEY'],
    url='https://gateway.watsonplatform.net/personality-insights/api',
)

# profile the stitched transcript as plain text, requesting raw scores and
# consumption preferences; note this rebinds json_data to the new response
json_data = personality_insights.profile(
    return_dict["transcript"],
    content_type="text/plain",
    raw_scores=True,
    consumption_preferences=True,
)

In [13]:
# Round-trip the Personality Insights response through its string form to obtain a plain dictionary.
# NOTE(review): the target name `dict` shadows the Python builtin for the rest of the
# kernel session; it is kept because the following cell reads it.
result_string = str(json_data)
# swap any doubled single quotes for one double quote before handing the text to the parser
json_acceptable_string = result_string.replace("''", '"')
dict = json.loads(json_acceptable_string)

In [14]:
# list the top-level sections present in the Personality Insights result
# (`dict` here is the parsed response from the previous cell, not the builtin)
result = dict.get("result")
for section_name in result.keys():
    print(section_name)

values
word_count
personality
needs
consumption_preferences
processed_language


In [15]:
# Bind one variable per result section in a single unpacking assignment.
# .get() is used, so a section that is absent from the response becomes None
# instead of raising KeyError.
(
    values,
    personality,
    consumption_preferences,
    warnings,
    word_count,
    needs,
    processed_language,
) = [
    result.get(section)
    for section in (
        "values",
        "personality",
        "consumption_preferences",
        "warnings",
        "word_count",
        "needs",
        "processed_language",
    )
]

In [16]:
# Report every trait in the "values" section: its name, then score, percentile and significance.
for trait in values:
    # identifying fields; per the original author's notes, category is always 'values'
    # and trait_id is the name written as 'value_' + name
    name, category, trait_id = trait["name"], trait["category"], trait["trait_id"]
    # measurements; significant is a true/false flag
    raw_score, percentile, significant = trait["raw_score"], trait["percentile"], trait["significant"]

    print(name)
    print("\t", "score: ", raw_score, "\t", "percentile: ", percentile, "\t", "is significant: ", significant, "\n")

Conservation
	 score:  0.6360536368389874 	 percentile:  0.25703308781549816 	 is significant:  True 

Openness to change
	 score:  0.7896335121987585 	 percentile:  0.5553296069536555 	 is significant:  True 

Hedonism
	 score:  0.5916598630822453 	 percentile:  0.010919858577356112 	 is significant:  True 

Self-enhancement
	 score:  0.6237624348849744 	 percentile:  0.029916994024364763 	 is significant:  True 

Self-transcendence
	 score:  0.8114295445612572 	 percentile:  0.09311628707734587 	 is significant:  True 



In [17]:
# personality sub-dictionary (contains another sub-dict)
for p in personality:
    name = p["name"]
    category = p["category"] # will always be personality
    children = p["children"]
    raw_score = p["raw_score"]
    trait_id = p["trait_id"]
    significant = p["significant"]
    percentile = p["percentile"]
    for child in children:
        child_name = child["name"]
        child_category = child["category"] # will always be personality
        child_significant = child["significant"]
        child_raw_score = child["raw_score"]
        child_trait_id = child["trait_id"]
        child_percentile = child["percentile"]