
# Notebook 3 – Natural Language Classifier (NLC)
IBM Watson Natural Language Classifier uses machine learning algorithms to return the top matching predefined classes for short text input. 

*YOU* Create and train a classifier to connect predefined classes to example texts so that the service can apply those classes to new inputs.

https://www.ibm.com/watson/services/natural-language-classifier/ 
https://www.ibm.com/watson/developercloud/natural-language-classifier/api/v1 


## Install dependencies

In [None]:
#imports.... Run this each time after restarting the Kernel
#!pip install watson_developer_cloud
import watson_developer_cloud as watson
import json
from botocore.client import Config
import ibm_boto3
import requests

### Add Credentials

Copy paste the following snippet to next cell, and add your own set of crdentials there:

```code
credentials_os = {
  "apikey": "",
  "cos_hmac_keys": {
    "access_key_id": "",
    "secret_access_key": ""
  },
  "endpoints": "",
  "iam_apikey_description": "",
  "iam_apikey_name": "",
  "iam_role_crn": "",
  "iam_serviceid_crn": "",
  "resource_instance_id": ""
}

credentials_os['BUCKET'] = '<bucket_name_from_your_COS' # copy bucket name from COS


credentials_nlc = {
    "classifier_id": "",
    "url": "",
    "username": "",
    "password": ""
}

```

In [None]:
credentials_os = {
    
}

credentials_os['BUCKET'] = '<bucket_name_from_your_COS' # copy bucket name from COS


### Training the NLC classifier

Training a classifier can take 10m for small ground truth CSV's and longer for more complex. You can use credentials and pre-trained classifier provide by us for this tutorial.



In [23]:
# Temp BURNER - "call_center_gt_NLC_V2.csv"
# Credentials will only be available till March 23, 2018; afterward you need to train your own classifier
credentials_nlc = {
    "classifier_id": "f7ea68x308-nlc-917",
    "url": "https://gateway.watsonplatform.net/natural-language-classifier/api",
    "username": "280b9633-d8c0-4ed2-9ee6-1b2c139516fb",
    "password": "xeDbLU87jHZZ"
}


In [24]:
# The code was removed by DSX for sharing.

In [25]:
endpoints = requests.get(credentials_os['endpoints']).json()

iam_host = (endpoints['identity-endpoints']['iam-token'])
cos_host = (endpoints['service-endpoints']['cross-region']['us']['public']['us-geo'])

auth_endpoint = "https://" + iam_host + "/oidc/token"
service_endpoint = "https://" + cos_host


client = ibm_boto3.client(
    's3',
    ibm_api_key_id = credentials_os['apikey'],
    ibm_service_instance_id = credentials_os['resource_instance_id'],
    ibm_auth_endpoint = auth_endpoint,
    config = Config(signature_version='oauth'),
    endpoint_url = service_endpoint
   )




### NLC

- `process_text()` goes throught the text and fetch sentences and concatenate transcript based on chunk size
- `classify()` calls natural language classifier endpoint and classify the text fields in transcript

In [26]:
#NLC

from watson_developer_cloud import NaturalLanguageClassifierV1

natural_language_classifier = NaturalLanguageClassifierV1(
    username = credentials_nlc['username'],
    password = credentials_nlc['password'])

chunk_size = 25
# Used to SPLIT up - "CHUNK" the aggregate transcript into smaller pieces

def chunk_transcript(transcript, chunk_size):
    transcript = transcript.split(' ')
    return [ transcript[i:i+chunk_size] for i in range(0, len(transcript), chunk_size) ] # chunking data
    

def process_text(text):
    transcript=''
    for sentence in json.loads(text)['results']:
        transcript = transcript + sentence['alternatives'][0]['transcript'] # concatenate sentences
    transcript = chunk_transcript(transcript, chunk_size) # chunk the transcript
    return transcript

def classify(file_name):
    streaming_body = client.get_object(Bucket = credentials_os['BUCKET'], Key = file_name.split('.')[0]+'_text.json')['Body']
    transcript=streaming_body.read().decode("utf-8")
    analysis = {}
    for chunk in process_text(transcript):
        chunk = ' '.join(chunk)
        analysis[chunk] = natural_language_classifier.classify(credentials_nlc['classifier_id'], chunk)
    client.put_object(Bucket = credentials_os['BUCKET'], Key = file_name.split('.')[0]+'_nlc', Body= json.dumps(analysis))
    return analysis


def classify_transcript(file_name):
    status = natural_language_classifier.get_classifier(credentials_nlc['classifier_id'])
    if status['status'] == 'Available':
        classes = classify(file_name)
    return classes


In [27]:
file_list = ['sample1-addresschange-positive.ogg',
             'sample2-address-negative.ogg',
             'sample3-shirt-return-weather-chitchat.ogg',
             'sample4-angryblender-sportschitchat-recovery.ogg',
             'sample5-calibration-toneandcontext.ogg',
             'jfk_1961_0525_speech_to_put_man_on_moon.ogg',
             'May 1 1969 Fred Rogers testifies before the Senate Subcommittee on Communications.ogg'
            ]


classify_transcript(file_list[0])

{'bye bye ': {'classes': [{'class_name': 'standard-conversation',
    'confidence': 0.9322813077613376},
   {'class_name': 'strong-signal-joy', 'confidence': 0.015946408018823546},
   {'class_name': 'strong-signal-satisfaction',
    'confidence': 0.011253473850558509},
   {'class_name': 'social-cue-exit', 'confidence': 0.008802616238827679},
   {'class_name': 'chit-chat', 'confidence': 0.007338829116471957},
   {'class_name': 'strong-signal-anger', 'confidence': 0.007010593800530975},
   {'class_name': 'strong-signal-miscommunication',
    'confidence': 0.004514950562333546},
   {'class_name': 'request-strong-signal-manager-request',
    'confidence': 0.0039424477335365075},
   {'class_name': 'request-disconnect', 'confidence': 0.0034904947081333022},
   {'class_name': 'social-cue-uncomfortable',
    'confidence': 0.0027621427078705625}],
  'classifier_id': 'f7ea68x308-nlc-917',
  'text': 'bye bye ',
  'top_class': 'standard-conversation',
  'url': 'https://gateway.watsonplatform.net/n

In [28]:
for filename in file_list:
    print("\n\nprocessing file: ", filename)
    analysis = classify_transcript(filename)
    print(analysis)



processing file:  sample1-addresschange-positive.ogg
{"that's right my phone number is five five five one two one two yes that's me my old address is number one two three oak": {'text': "that's right my phone number is five five five one two one two yes that's me my old address is number one two three oak", 'url': 'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/f7ea68x308-nlc-917', 'classifier_id': 'f7ea68x308-nlc-917', 'top_class': 'standard-conversation', 'classes': [{'class_name': 'standard-conversation', 'confidence': 0.9563011926937023}, {'class_name': 'request-change-info', 'confidence': 0.015043552630646849}, {'class_name': 'strong-signal-satisfaction', 'confidence': 0.007370224143331118}, {'class_name': 'request-strong-signal-manager-request', 'confidence': 0.005135413774182448}, {'class_name': 'chit-chat', 'confidence': 0.0030000239663056268}, {'class_name': 'social-cue-uncomfortable', 'confidence': 0.0026690398804625226}, {'class_name': 's