# Install required libraries

In [4]:
#imports.... Run this each time after restarting the Kernel
!pip install watson_developer_cloud
# Run the pip installs only the first time; once the packages are installed on your Spark machine there is no need to re-run them unless you want to upgrade.
!pip install --upgrade --force-reinstall wordcloud
!pip install --user --upgrade pixiedust

Collecting wordcloud
Collecting matplotlib (from wordcloud)
  Using cached https://files.pythonhosted.org/packages/81/31/4e261379e0cd4e9bbacfc96b124ebac0706b44374bd1d34ef899796f741b/matplotlib-2.2.2-cp35-cp35m-manylinux1_x86_64.whl
Collecting numpy>=1.6.1 (from wordcloud)
  Using cached https://files.pythonhosted.org/packages/ea/31/991207e6234b46a1228be970735ead9d6f06a298917d6f718c5e32e835bb/numpy-1.14.2-cp35-cp35m-manylinux1_x86_64.whl
Collecting pillow (from wordcloud)
  Using cached https://files.pythonhosted.org/packages/07/52/8e27b9c54cb70d379244771a58483928b3a02db3c657d466ed84eb18f22b/Pillow-5.1.0-cp35-cp35m-manylinux1_x86_64.whl
Collecting six>=1.10 (from matplotlib->wordcloud)
  Using cached https://files.pythonhosted.org/packages/67/4b/141a581104b1f6397bfa78ac9d43d8ad29a7ca43ea90a2d863fe3056e86a/six-1.11.0-py2.py3-none-any.whl
Collecting kiwisolver>=1.0.1 (from matplotlib->wordcloud)
  Using cached https://files.pythonhosted.org/packages/7e/31/d6fedd4fb2c94755cd101191e581af30e

# Import libraries

In [5]:
# Cloud Object Storage 
from botocore.client import Config
import ibm_boto3

# General Python libraries
import requests
import json
import io
from os.path import join, dirname

# Import Watson services
import watson_developer_cloud as watson
from watson_developer_cloud import SpeechToTextV1, NaturalLanguageUnderstandingV1, NaturalLanguageClassifierV1, ToneAnalyzerV3
from watson_developer_cloud.natural_language_understanding.features import (
    v1 as Features)


# Import libraries for analytics part
import pixiedust
from pixiedust.display import *

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import matplotlib.pyplot as plt

from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Cloud Object Storage read-only credentials

In [6]:
# Service credentials for the Cloud Object Storage instance that holds the demo files.
# Only 'endpoints', 'apikey', 'resource_instance_id' and 'BUCKET' are read by the visible cells.
# SECURITY NOTE(review): these values are committed in plain text. The section heading calls them
# read-only, but secrets should normally be loaded from the environment or a secrets store
# rather than stored in the notebook — consider rotating them and doing so.
credentials_os = {
  "apikey": "kfJa4az-NCI5itJaVJOmr2V4T3JnhS7E-0dxMact1HY2",
  "cos_hmac_keys": {
    "access_key_id": "d333abb1cb604803a16a1de30258be1c",
    "secret_access_key": "ee495ee9dd54eda842bf3b00f7a00d1a26c6bcb0cd7f9934"
  },
  "endpoints": "https://cos-service.bluemix.net/endpoints",
  "iam_apikey_description": "Auto generated apikey during resource-key operation for Instance - crn:v1:bluemix:public:cloud-object-storage:global:a/8739a0c318b37263a932b45c1947965d:7ce353a1-fa6f-4e25-a311-e29b0b2a8ad8::",
  "iam_apikey_name": "auto-generated-apikey-d333abb1-cb60-4803-a16a-1de30258be1c",
  "iam_role_crn": "crn:v1:bluemix:public:iam::::serviceRole:Reader",
  "iam_serviceid_crn": "crn:v1:bluemix:public:iam-identity::a/8739a0c318b37263a932b45c1947965d::serviceid:ServiceId-cb9435ef-c3f6-42ed-9f8e-855d3345a4b8",
  "resource_instance_id": "crn:v1:bluemix:public:cloud-object-storage:global:a/8739a0c318b37263a932b45c1947965d:7ce353a1-fa6f-4e25-a311-e29b0b2a8ad8::"
}

# Bucket name, stored alongside the credentials so the cells below can pass it as Bucket=...
credentials_os['BUCKET'] = 'watsoncallcenterthink18c8855d42de924dc38ccd02f3a8a50d7f'

# Burner credentials for Watson Services

These credentials are only valid until April 26, 2018. After that date you need to create your own Watson service instances and train your own classifier.


In [12]:

# Username/password credentials for the three Watson services used below.
# SECURITY NOTE(review): these are committed in plain text. The surrounding text describes them as
# temporary "burner" credentials; replace them with your own and load those from the
# environment or a secrets store instead of hard-coding them.

# Natural Language Understanding
credentials_nlu = {
  "url": "https://gateway.watsonplatform.net/natural-language-understanding/api",
  "username": "f3c6267a-e3d0-4a1c-86fa-1d88b43df149",
  "password": "AafrKOBcU5mX"
}

# Natural Language Classifier; 'classifier_id' identifies the trained classifier to call.
# Ground truth used - simple tester "call_center_gt_NLC_V2.csv"
# https://github.com/rustyoldrake/call_center_instrumentation_analytics/blob/master/call_center_gt_NLC_V2.csv
credentials_nlc = {
    "classifier_id": "f7ea68x308-nlc-917",
    "url": "https://gateway.watsonplatform.net/natural-language-classifier/api",
    "username": "8f94db77-2cec-45ad-aa07-a76ff9c2a3d5",
    "password": "tzdUqQIeVwCZ",
}

# Tone Analyzer
credentials_tone = {
  "url": "https://gateway.watsonplatform.net/tone-analyzer/api",
  "username": "a21a52c9-f61a-42c1-babd-5e164ac05355",
  "password": "SzFi0IaqUek6"
}



# Sample Audio Files

In [7]:
# List of files which were transcribed by STT and enriched with NLU
# Audio objects in the bucket; the cells below derive companion object keys
# (e.g. '<name>_text.json') from the part of each name before the first '.'.
file_list = [
    'sample1-addresschange-positive.ogg',
    'sample2-address-negative.ogg',
    'sample3-shirt-return-weather-chitchat.ogg',
    'sample4-angryblender-sportschitchat-recovery.ogg',
    'sample5-calibration-toneandcontext.ogg',
    'jfk_1961_0525_speech_to_put_man_on_moon.ogg',
    'May 1 1969 Fred Rogers testifies before the Senate Subcommittee on Communications.ogg',
]

# Setup Object Storage

In [8]:
def set_up_object_storage(credentials_object_storage):
    """Build a Cloud Object Storage client from a service-credentials dict.

    Args:
        credentials_object_storage: dict providing the keys 'endpoints' (URL of the
            endpoint-discovery document), 'apikey' and 'resource_instance_id'.

    Returns:
        The client object returned by ``ibm_boto3.client('s3', ...)``.

    Raises:
        requests.HTTPError: if the endpoint-discovery request returns an error status.
        KeyError: if the discovery document lacks the expected endpoint entries.
    """
    # Fail fast with the HTTP error instead of trying to parse an error page as JSON.
    response = requests.get(credentials_object_storage['endpoints'])
    response.raise_for_status()
    endpoints = response.json()

    # Host names are looked up in the discovery document: the IAM token host and the
    # public us-geo cross-region COS host.
    iam_host = endpoints['identity-endpoints']['iam-token']
    cos_host = endpoints['service-endpoints']['cross-region']['us']['public']['us-geo']

    auth_endpoint = "https://" + iam_host + "/oidc/token"
    service_endpoint = "https://" + cos_host

    return ibm_boto3.client(
        's3',
        ibm_api_key_id = credentials_object_storage['apikey'],
        ibm_service_instance_id = credentials_object_storage['resource_instance_id'],
        ibm_auth_endpoint = auth_endpoint,
        config = Config(signature_version='oauth'),
        endpoint_url = service_endpoint
    )

# Module-level Cloud Object Storage client; the functions in the following cells read objects through it.
client = set_up_object_storage(credentials_os)

# Set up Watson Speech To Text

In [10]:

# speech_to_text = SpeechToTextV1(
#     username = credentials_stt['username'],
#     password = credentials_stt['password'],
#     url = 'https://stream.watsonplatform.net/speech-to-text/api',
# )

def get_transcript(audio):
    """Send audio to Speech to Text and return the recognition result as a JSON string.

    The audio is submitted as 'audio/ogg' with timestamps and word confidence requested,
    and the response is serialised with a two-space indent.

    NOTE: relies on a module-level ``speech_to_text`` client. Its construction is
    commented out in this cell, so it must be created before this function is called.
    """
    recognition = speech_to_text.recognize(
        audio=audio,
        content_type='audio/ogg',
        timestamps=True,
        word_confidence=True,
    )
    return json.dumps(recognition, indent=2)

def analyze_sample(sample):
    """Download one audio object from the bucket and return its transcript JSON string.

    Args:
        sample: object key of the audio file in the bucket.
    """
    stored_object = client.get_object(Bucket = credentials_os['BUCKET'], Key=sample)
    audio = stored_object['Body'].read()
    # Uploading the transcript back to the bucket was already done for demo purposes:
    # client.put_object(Bucket = credentials_os['BUCKET'], Key = sample.split('.')[0] + '_text.json', Body = text)
    return get_transcript(audio)



# Setup Watson Natural Language Understanding

In [14]:
# NLU features requested for every chunk; each feature maps to an empty options dict.
features = { "concepts":{},"entities":{},"keywords":{},"categories":{},"emotion":{},"sentiment":{},"semantic_roles":{} }

# NLU client authenticated with the username/password from credentials_nlu.
# NOTE(review): credentials_nlu['url'] is not passed here, so the SDK's default endpoint is presumably used — confirm.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version = '2017-02-27',
    username = credentials_nlu['username'],
    password = credentials_nlu['password']
)

chunk_size=25 # Number of words per chunk when a transcript is disaggregated
# e.g. a 290-word transcript yields 12 chunks - 11 with 25 words and 1 with 15 words - which approximates the 'time domain' for this lab

def chunk_transcript(transcript, chunk_size):
    """Split a transcript into consecutive chunks of at most ``chunk_size`` words.

    Args:
        transcript: transcript text; words are separated by whitespace.
        chunk_size: maximum number of words per chunk; must be a positive integer.

    Returns:
        A list of word lists. Every chunk holds ``chunk_size`` words except possibly
        the last one. An empty or whitespace-only transcript yields ``[]``.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer, got %r" % (chunk_size,))
    # split() with no argument collapses runs of whitespace and ignores leading/trailing
    # whitespace, so no empty-string "words" end up in (or make up) a chunk.
    words = transcript.split()
    return [words[start:start + chunk_size] for start in range(0, len(words), chunk_size)]

def process_text_chunks(text):
    """Turn a Speech to Text JSON response into word chunks.

    Args:
        text: JSON string with a top-level 'results' list, each entry carrying an
            'alternatives' list whose first element has a 'transcript' string.

    Returns:
        The chunks produced by ``chunk_transcript`` for the concatenated transcript,
        using the module-level ``chunk_size``.
    """
    stt_results = json.loads(text)['results']
    # Concatenate the first alternative of every result, in order.
    full_transcript = ''.join(entry['alternatives'][0]['transcript'] for entry in stt_results)
    return chunk_transcript(full_transcript, chunk_size)

def analyze_transcript_chunks(features, file_name):
    """Run Natural Language Understanding on every chunk of a stored transcript.

    Args:
        features: NLU features specification passed through to ``analyze``.
        file_name: original audio file name; the transcript is read from the object
            whose key is the part before the first '.' plus '_text.json'.

    Returns:
        dict mapping each chunk's text to the NLU response for that chunk.
    """
    transcript_key = file_name.split('.')[0]+'_text.json'
    stored_object = client.get_object(Bucket = credentials_os['BUCKET'], Key=transcript_key)
    transcript_json = stored_object['Body'].read().decode("utf-8")

    nlu_analysis = {}
    for words in process_text_chunks(transcript_json):
        chunk_text = ' '.join(words)
        nlu_analysis[chunk_text] = natural_language_understanding.analyze(
            features, chunk_text, return_analyzed_text=True, language='en')

    # Writing the result back to Cloud Object Storage was already done for demo purposes:
    # outfilename = file_name.split('.')[0]+'_NLUchunks.json'
    # print("writing file: ", outfilename, " to cloud object storage" )
    # res=client.put_object(Bucket = credentials_os['BUCKET'], Key=outfilename, Body= json.dumps(nlu_analysis))
    return nlu_analysis


def post_analysis_chunks(result):
    """Print each analysed chunk followed by the label and score of its NLU categories.

    Args:
        result: dict mapping chunk text to an NLU response containing a 'categories' list.
    """
    # TODO: render a table instead of printing.
    for chunk, analysis in result.items():
        categories = analysis['categories']
        print('\nchunk: ', chunk)
        for entry in categories:
            print('label: ', entry['label'], ', score: ', entry['score'])

In [38]:
# Run chunk-level NLU on the second sample (file_list[1]) and print the categories found for each chunk.
result = analyze_transcript_chunks(features, file_list[1])
post_analysis_chunks(result)

writing file:  sample2-address-negative_NLUchunks.json  to cloud object storage

chunk:  I am yeah I want to change my address naw you know this is the third time this week I've had to call in to
label:  /finance/personal finance/lending/credit cards , score:  0.923957
label:  /technology and computing/hardware/computer , score:  0.262557
label:  /shopping/resources/contests and freebies , score:  0.151283

chunk:  it changed and hopefully you guys don't screwed up again okay yeah it's Bob Smith in Sacramento and my old addresses one twenty three oak
label:  /family and parenting/children , score:  0.43009
label:  /education/school , score:  0.286429
label:  /business and industrial , score:  0.233864

chunk:  street and my new address is four five six pine street yep that's right and is it is nine zero two one zero and I
label:  /business and industrial , score:  0.695409
label:  /art and entertainment , score:  0.284013
label:  /travel/tourist facilities/hotel , score:  0.138168

chu

# Setup Watson Natural Language Classifier

In [34]:

# NLC client authenticated with the username/password from credentials_nlc.
natural_language_classifier = NaturalLanguageClassifierV1(
    username = credentials_nlc['username'],
    password = credentials_nlc['password'])

# Number of words per chunk (re-assigns the value already set in the NLU section).
chunk_size = 25
# Used to split ("chunk") the aggregate transcript into smaller pieces

def process_text(text):
    """Turn a Speech to Text JSON response into word chunks.

    This function used to be a line-for-line copy of ``process_text_chunks``; it now
    delegates to it so the transcript-assembly logic lives in one place.

    Args:
        text: JSON string holding the Speech to Text response.

    Returns:
        The list of word chunks produced by ``process_text_chunks(text)``.
    """
    return process_text_chunks(text)

def classify(file_name):
    """Classify every chunk of a stored transcript with the Natural Language Classifier.

    Args:
        file_name: original audio file name; the transcript is read from the object
            whose key is the part before the first '.' plus '_text.json'.

    Returns:
        dict mapping each chunk's text to the classifier response for that chunk.
    """
    transcript_key = file_name.split('.')[0]+'_text.json'
    stored_object = client.get_object(Bucket = credentials_os['BUCKET'], Key = transcript_key)
    transcript_json = stored_object['Body'].read().decode("utf-8")

    chunk_texts = (' '.join(words) for words in process_text(transcript_json))
    # Storing the result under '<name>_nlc' in the bucket was already done for demo purposes:
    # client.put_object(Bucket = credentials_os['BUCKET'], Key = file_name.split('.')[0]+'_nlc', Body= json.dumps(analysis))
    return {
        chunk_text: natural_language_classifier.classify(credentials_nlc['classifier_id'], chunk_text)
        for chunk_text in chunk_texts
    }


def classify_transcript(file_name):
    """Classify a stored transcript once the classifier reports it is available.

    Args:
        file_name: original audio file name, passed through to ``classify``.

    Returns:
        The dict returned by ``classify(file_name)``.

    Raises:
        RuntimeError: if the classifier status is anything other than 'Available'.
            Previously this path fell through to ``return classes`` with ``classes``
            unassigned and failed with an UnboundLocalError.
    """
    classifier_id = credentials_nlc['classifier_id']
    status = natural_language_classifier.get_classifier(classifier_id)
    if status['status'] != 'Available':
        raise RuntimeError(
            "Classifier %s is not available (status: %s)" % (classifier_id, status['status']))
    return classify(file_name)

def post_analysis_nlc(result):
    """Print each classified chunk followed by the name and confidence of every class.

    Args:
        result: dict mapping chunk text to a classifier response containing a 'classes' list.
    """
    # TODO: render a table instead of printing.
    for chunk, response in result.items():
        classes = response['classes']
        print('\nchunk: ', chunk)
        for entry in classes:
            print('Class Name: ', entry['class_name'], ', Confidence: ', entry['confidence'])

In [36]:
# Classify the second sample (file_list[1]) chunk by chunk and print the class confidences.
analysis = classify_transcript(file_list[1])
post_analysis_nlc(analysis)


chunk:  I am yeah I want to change my address naw you know this is the third time this week I've had to call in to
Class Name:  request-change-info , Confidence:  0.7420795123387997
Class Name:  request-disconnect , Confidence:  0.0794980547604088
Class Name:  social-cue-exit , Confidence:  0.06097694501924905
Class Name:  standard-conversation , Confidence:  0.03937208079223737
Class Name:  request-strong-signal-manager-request , Confidence:  0.021583128302510794
Class Name:  social-cue-uncomfortable , Confidence:  0.01363430645104178
Class Name:  strong-signal-anger , Confidence:  0.011877407330384978
Class Name:  strong-signal-miscommunication , Confidence:  0.010010336017601321
Class Name:  chit-chat , Confidence:  0.0074814516333400185
Class Name:  strong-signal-joy , Confidence:  0.006816879256816359

chunk:  it changed and hopefully you guys don't screwed up again okay yeah it's Bob Smith in Sacramento and my old addresses one twenty three oak
Class Name:  standard-conversation

# Setup Watson Tone Analyzer


In [18]:

# Tone Analyzer client authenticated with the username/password from credentials_tone.
tone_analyzer = ToneAnalyzerV3(version = '2016-05-19',
                               username = credentials_tone['username'],
                               password = credentials_tone['password'])


# Number of words per chunk (re-assigns the value already set in the earlier sections).
chunk_size=25

def analyze_transcript(file_name):
    """Run the Tone Analyzer on every chunk of a stored transcript.

    Chunks of two words or fewer are skipped.

    Args:
        file_name: original audio file name; the transcript is read from the object
            whose key is the part before the first '.' plus '_text.json'.

    Returns:
        dict mapping each analysed chunk's text to the Tone Analyzer response.
    """
    transcript_key = file_name.split('.')[0]+'_text.json'
    stored_object = client.get_object(Bucket = credentials_os['BUCKET'], Key = transcript_key)
    transcript_json = stored_object['Body'].read().decode("utf-8")

    tone_analysis = {}
    for words in process_text(transcript_json):
        if len(words) <= 2:
            continue
        chunk_text = ' '.join(words)
        tone_analysis[chunk_text] = tone_analyzer.tone(chunk_text, content_type='text/plain')
    # Optional upload of the result, left disabled:
    # res=client.put_object(Bucket = credentials_os['BUCKET'], Key= file_name.split('.')[0]+'_tone.json', Body = json.dumps(tone_analysis))
    return tone_analysis

def print_tones(tones):
    """Print every entry of ``tones`` on its own line, in order."""
    for entry in tones:
        print(entry)

def post_analysis(result):
    """Print each analysed chunk followed by the tones of every tone category.

    Args:
        result: dict mapping chunk text to a Tone Analyzer response with
            ['document_tone']['tone_categories'], each category carrying a 'tones' list.
    """
    for chunk, response in result.items():
        tone_categories = response['document_tone']['tone_categories']
        print('\nchunk: ', chunk)
        for category in tone_categories:
            print_tones(category['tones'])

In [40]:
# Run the Tone Analyzer on the last sample (file_list[6]) and print the tones of each chunk.
# Note: this re-binds `result`, which earlier held the NLU chunk analysis.
result = analyze_transcript(file_list[6])
post_analysis(result) 


chunk:  some clay or some dough do you round up friends for a game of tag or see how fast you go it's great to be
{'tone_name': 'Anger', 'tone_id': 'anger', 'score': 0.023173}
{'tone_name': 'Disgust', 'tone_id': 'disgust', 'score': 0.030529}
{'tone_name': 'Fear', 'tone_id': 'fear', 'score': 0.059977}
{'tone_name': 'Joy', 'tone_id': 'joy', 'score': 0.743975}
{'tone_name': 'Sadness', 'tone_id': 'sadness', 'score': 0.108564}
{'tone_name': 'Analytical', 'tone_id': 'analytical', 'score': 0.0}
{'tone_name': 'Confident', 'tone_id': 'confident', 'score': 0.0}
{'tone_name': 'Tentative', 'tone_id': 'tentative', 'score': 0.939968}
{'tone_name': 'Openness', 'tone_id': 'openness_big5', 'score': 0.430547}
{'tone_name': 'Conscientiousness', 'tone_id': 'conscientiousness_big5', 'score': 0.986883}
{'tone_name': 'Extraversion', 'tone_id': 'extraversion_big5', 'score': 0.786764}
{'tone_name': 'Agreeableness', 'tone_id': 'agreeableness_big5', 'score': 0.784756}
{'tone_name': 'Emotional Range', 'tone_id':

# Analytics

In [21]:
# Method to parse NLU response file from Cloud Object Storage
# and return sentiment score, sentiment label, and keywords
# This method works for the scenario of one NLU call per call (file)
def getNLUresponse(COSclient, bucket, files):
    """Read one stored NLU response per file and extract sentiment and keywords.

    For each name in ``files`` the object '<part before the first dot>_NLU.json' is
    fetched from ``bucket`` and parsed as JSON.

    Args:
        COSclient: object-storage client exposing ``get_object(Bucket=..., Key=...)``.
        bucket: name of the bucket holding the NLU response objects.
        files: iterable of original audio file names.

    Returns:
        A list of ``(filename, sentiment_score, sentiment_label, keywords)`` tuples.
        When the response has no document sentiment label the tuple is
        ``(filename, 0.0, None, None)``.
    """
    nlu_results = []
    for filename in files:
        # Derive the NLU-enriched object key from the original file name.
        nlu_filename = filename.split('.')[0]+'_NLU.json'
        print("Processing NLU response from file: ", nlu_filename)
        streaming_body = COSclient.get_object(Bucket=bucket, Key=nlu_filename)['Body']
        nlu_response = json.loads(streaming_body.read().decode("utf-8"))
        if not isinstance(nlu_response, dict):
            nlu_response = {}

        # .get() chains tolerate responses where 'sentiment' or 'document' is absent or null.
        document = (nlu_response.get('sentiment') or {}).get('document') or {}
        if document.get('label'):
            sentiment_score = document.get('score', 0.0)
            sentiment_label = document['label']
            keywords = list(nlu_response.get('keywords') or [])
        else:
            sentiment_score = 0.0
            sentiment_label = None
            # The original assigned the undefined name `null` here, raising NameError.
            keywords = None
        nlu_results.append((filename, sentiment_score, sentiment_label, keywords))
    return nlu_results

In [22]:
# Method to parse NLU Emotion Tone response file from Cloud Object Storage
def getChunkNLU(nlu_response):
    """Extract document sentiment and keywords from one chunk's NLU response.

    Args:
        nlu_response: parsed NLU response for a single chunk (a dict), or None.

    Returns:
        A ``(sentiment_score, sentiment_label, keywords)`` tuple. When the response
        carries no document sentiment label the result is ``(0.0, None, None)``.
    """
    if not isinstance(nlu_response, dict):
        return 0.0, None, None

    # .get() chains tolerate responses where 'sentiment' or 'document' is absent or null.
    document = (nlu_response.get('sentiment') or {}).get('document') or {}
    if not document.get('label'):
        # The original assigned the undefined name `null` on this path, raising NameError.
        return 0.0, None, None

    keywords = list(nlu_response.get('keywords') or [])
    return document.get('score', 0.0), document['label'], keywords

In [25]:
# Method to parse NLU response file from Cloud Object Storage
# and return sentiment score, sentiment label, and keywords
# This method handles the scenario when call is broken into multiple chunks
def getNLUresponseChunks(COSclient, bucket, files):
    """Read the chunked NLU responses of several calls and flatten them into rows.

    For each name in ``files`` the object '<part before the first dot>_NLUchunks.json'
    is fetched from ``bucket``; every chunk in it is summarised with ``getChunkNLU``.

    Args:
        COSclient: object-storage client exposing ``get_object(Bucket=..., Key=...)``.
        bucket: name of the bucket holding the chunked NLU responses.
        files: iterable of original audio file names.

    Returns:
        A list of rows ``[filename, chunkidx, *getChunkNLU(...)]`` where ``chunkidx``
        starts at 1 for each file.
    """
    rows = []
    for filename in files:
        # Derive the chunked-NLU object key from the original file name.
        nlu_filename = filename.split('.')[0]+'_NLUchunks.json'
        print("Processing NLU response from file: ", nlu_filename)
        stored_object = COSclient.get_object(Bucket=bucket, Key=nlu_filename)
        nlu_chunks_response = json.loads(stored_object['Body'].read().decode("utf-8"))
        if not (nlu_chunks_response and len(nlu_chunks_response)>0):
            continue
        for chunkidx, chunk in enumerate(nlu_chunks_response, start=1):
            chunk_nlu = getChunkNLU(nlu_chunks_response[chunk])
            print('chunk nlu: ', chunk_nlu)
            print('type of chunk nlu: ', type(chunk_nlu))
            rows.append([filename, chunkidx] + list(chunk_nlu))

    return rows

In [26]:
## Alternative call to handle the case when the NLU response has been broken into chunks of 25 words each
# Column names matching the row layout returned by getNLUresponseChunks:
# [filename, chunkidx] followed by the (score, label, keywords) tuple from getChunkNLU.
nlu_header=['filename','chunkidx','sentiment_score','sentiment_label','keywords']
nlu_results = getNLUresponseChunks(client, credentials_os['BUCKET'], file_list)

Processing NLU response from file:  sample1-addresschange-positive_NLUchunks.json
chunk nlu:  (0.0, 'neutral', [{'text': 'old address', 'relevance': 0.989977}, {'text': 'phone number', 'relevance': 0.832675}, {'text': 'oak', 'relevance': 0.632994}])
type of chunk nlu:  <class 'tuple'>
chunk nlu:  (0.693769, 'positive', [{'text': 'phone number', 'relevance': 0.949776}, {'text': 'thing', 'relevance': 0.808227}, {'text': 'options', 'relevance': 0.638675}, {'text': 'changes', 'relevance': 0.635845}])
type of chunk nlu:  <class 'tuple'>
chunk nlu:  (0.712912, 'positive', [{'text': 'good morning', 'relevance': 0.928237}, {'text': 'Ryan Smith', 'relevance': 0.864158}, {'text': 'Sacramento California', 'relevance': 0.717062}, {'text': 'help', 'relevance': 0.268779}, {'text': 'address', 'relevance': 0.266876}])
type of chunk nlu:  <class 'tuple'>
chunk nlu:  (0.991182, 'positive', [{'text': 'yep', 'relevance': 0.978438}, {'text': 'thanks', 'relevance': 0.790658}, {'text': 'address', 'relevance'

In [27]:
# Build a Spark DataFrame with one row per (file, chunk), using nlu_header as the column names.
# NOTE(review): `spark` is not defined in the visible cells; presumably it is the session provided by the kernel — confirm.
callcenterlogs_nluDF = spark.createDataFrame(nlu_results, nlu_header)

### Sentiment plots using PixieDust
Leverage PixieDust to plot sentiment labels as a pie-chart showing how many positive, negative, and neutral calls are received.

In [28]:
## Ignore any records with null sentiment label
# Note: this re-binds callcenterlogs_nluDF to the filtered frame.
callcenterlogs_nluDF = callcenterlogs_nluDF.where(col('sentiment_label').isNotNull())
# Count rows per sentiment label.
# NOTE(review): each row is one chunk of a call, so 'num_calls' presumably counts chunks rather than calls — confirm.
perlabel_sentimentDF = callcenterlogs_nluDF.groupBy('sentiment_label')\
                              .agg(F.count('filename')\
                              .alias('num_calls'))

In [29]:
# Call Pixiedust to visualize sentiment data
# NOTE(review): this displays the per-chunk DataFrame; perlabel_sentimentDF from the previous cell
# is not displayed in the visible cells — confirm which frame is intended.
display(callcenterlogs_nluDF)