In [None]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# SECURITY: the token is a credential, so it is read from the PROJECT_ACCESS_TOKEN
# environment variable (or prompted for) instead of being stored in the notebook.
# Any token that was ever saved in a copy of this notebook should be revoked and regenerated.
import os
from getpass import getpass

from project_lib import Project

project = Project(
    project_id='537e4f12-aeaa-4390-940a-8a2c406af632',
    project_access_token=os.environ.get('PROJECT_ACCESS_TOKEN') or getpass('Project access token: '),
)
pc = project.project_context

# Watson Assistant Anonymous Log Metrics

In [None]:
!curl -O https://raw.githubusercontent.com/cognitive-catalyst/WA-Testing-Tool/master/log_analytics/getAllLogs.py
!curl -O https://raw.githubusercontent.com/cognitive-catalyst/WA-Testing-Tool/master/log_analytics/extractConversations.py

%load_ext autoreload
%autoreload 2
import warnings
warnings.simplefilter("ignore")

!pip install ibm-watson

import json
import pandas as pd
import getAllLogs
import extractConversations

In [None]:
# Custom functions to re-use code throughout notebook
def turn_dict_to_df(df,col_names):
    """Build a DataFrame from a dict and expose its index as the first column.

    Args:
        df: Data accepted by ``pd.DataFrame.from_dict`` (despite the name, this is the
            input mapping, not a DataFrame).
        col_names: Labels for the resulting columns; the first one names the column
            created from the former index.

    Returns:
        The DataFrame with a fresh integer index and columns relabelled to ``col_names``.
    """
    frame = pd.DataFrame.from_dict(df).reset_index(level=0)
    frame.columns = col_names
    return frame

## 1. Configuration and log collection <a class="anchor" id="config"></a>
This section will configure your DB2 connection, log query parameters, and will extract the logs from your Watson Assistant instance.

> **Action Required:** Update each of the variables marked with 'XXXXXXXX'.  The comments in the cells guide you in the configuration.

In [None]:
# Customer name. Used as the filename prefix for the CSV & JSON files saved by this notebook.
custName = "csmbot"

# Start date for the log fetch.
# If you are using the DB2 connection in Section 1.1, this will be defined automatically.
log_fetch_start = "2020-05-15"

### 1.2 Retrieve logs from the Watson Assistant instance
This section will retrieve the user logs from the Assistant `/logs` API.

> **Action Required:** Update the fields below marked 'XXXXXXXX' based on the credentials of your Assistant. 
Solutions using an Assistant layer (v2 API) should set `workspace_id=None` and provide `assistant_id`. Otherwise, define workspace and comment out assistant_id.




In [None]:
# Connection details for the assistant whose logs will be extracted. Complete this information.
iam_apikey = "myapikey"
# Set the URL to the region, e.g. https://api.us-east.assistant.watson.cloud.ibm.com
url = "https://gateway.watsonplatform.net/assistant/api"
assistant_id = "myassistantid"
workspace_id = None

In [None]:
# @hidden_cell
# Extract logs from your assistant. Complete this information.
# SECURITY: credentials are read from environment variables (or prompted for) rather
# than stored in the notebook. Any API key that was ever saved in a copy of this
# notebook should be revoked and regenerated.
import os
from getpass import getpass

iam_apikey = os.environ.get('WA_IAM_APIKEY') or getpass('Watson Assistant IAM API key: ')
# Set the URL to the region, e.g. https://api.us-east.assistant.watson.cloud.ibm.com
url = os.environ.get('WA_URL', "https://gateway.watsonplatform.net/assistant/api")
assistant_id = os.environ.get('WA_ASSISTANT_ID') or input('Watson Assistant assistant_id: ')
workspace_id = None

In [None]:
# Build the filter for the /logs query. The start date comes from `log_fetch_start`
# (set in the configuration cell) instead of a hard-coded date, so changing the
# configuration actually changes which logs are fetched.
# If not using assistant_id, comment out the 2nd line below.
log_filter="language::en,response_timestamp>=" + log_fetch_start \
+",request.context.system.assistant_id::" + assistant_id

#Change the number of logs retrieved, default settings will return 100,000 logs (200 pages of 500)
page_size_limit=500
page_num_limit=200

version="2020-09-24" # Watson Assistant API version

rawLogsJson = getAllLogs.getLogs(iam_apikey, url, workspace_id, log_filter, page_size_limit, page_num_limit, version)
rawLogsPath= custName + "_logs.json"

# getAllLogs.writeLogs(rawLogsJson, rawLogsPath) # Saves the logs locally
project.save_data(file_name = rawLogsPath,data = json.dumps(rawLogsJson),overwrite=True); # Saves the logs in Studio/COS
print('\nSaved log data to {}'.format(rawLogsPath))

### 1.3 Load logs from JSON file (Defunct)
If you have previously saved the JSON file, you can uncomment this section to load the logs. Otherwise, comment this section out and continue.

In [None]:
# #If you have previously stored your logs on the file system, you can reload them here by uncommenting these lines
# rawLogsPath= custName+"_logs.json"
# rawLogsJson = extractConversations.readLogs(rawLogsPath)

### 1.4 Format logs
Now that the logs have been retrieved, this section will transform the data out of JSON format and into a Pandas dataframe. 

> **Optional:** If you wish to add any custom fields (such as a context variable), add them to `customFieldNames` on the first line below. Otherwise, run this cell as-is.

In [None]:
# Optionally provide a comma-separated list of custom fields you want to extract, in addition to the default fields
customFieldNames = ''

# Unique conversation identifier across all records. This is default. For a multi-skill assistant you will need to provide your own key.
primaryLogKey = "response.context.conversation_id"
# Name of the correlating key as it appears in the data frame columns (remove 'response.context.')
conversationKey = 'conversation_id'

# Fields the notebook's metrics rely on are appended to the custom list. Do not change these.
customFieldNames = ",".join([
    customFieldNames,
    "response.context.vgwSIPFromURI",
    "response.context.vgwSessionID",
    "request.context.vgwSMSFailureReason",
    "request.context.vgwSMSUserPhoneNumber",
    "response.output.vgwAction.parameters.transferTarget",
    "response.context.language",
    "response.context.metadata.user_id",
    "response.output.generic",
])

allLogsDF = extractConversations.extractConversationData(rawLogsJson, primaryLogKey, customFieldNames)
conversationsGroup = allLogsDF.groupby(conversationKey, as_index=False)

# Parse response_timestamp once, then expose month, day, and year as separate columns
# for easier data filtering/visualizations
allLogsDF["full_date"] = pd.to_datetime(allLogsDF["response_timestamp"])
for part in ("month", "day", "year"):
    allLogsDF[part] = getattr(allLogsDF["full_date"].dt, part)

print("Total log events:", len(allLogsDF))
allLogsDF.head()

In [None]:
# Print the column names
# allLogsDF.columns

## 2.1 Core Metrics <a class="anchor" id="core-metrics"></a>
These metrics apply to all Watson Assistant solutions. For voice solutions, additional metrics are in the next section.
* [2.1.2 Coverage Metric](#coverage-metric)
* [2.1.3 Search Skill Responses](#search-skill)
* [2.1.4 Escalation Requests](#escalation-metric)
* [2.1.5 Active Users](#active-users)
* [2.1.6 Top Intents & Average Confidence Scores](#top-intents-scores)
* [2.1.7 Top Entities](#top-entities)

In [None]:
# Collects the KPIs computed in the cells below; it is later written to CSV
# for use in Watson Studio Cognos Dashboard.
metrics_dict = dict()

# These should match the count in the Watson Assistant Analytics tooling.
totalConvs = len(
    allLogsDF[conversationKey].unique()
)
print("Total messages:     ", len(allLogsDF))
print("Total conversations:", totalConvs)

### 2.1.2 Coverage Metric <a class="anchor" id="coverage-metric"></a>
Coverage is the measurement of the portion of total user messages that your assistant is attempting to respond to. For example, any message that is answered with "Sorry I didn't understand" from the anything_else node is considered uncovered.

> **Action Required:** Define the node ids in `anything_else_nodes` list that represent any responses for uncovered messages. This can be found by exporting the Skill from the Assistant tooling, and searching the JSON for the relevant `dialog_node`. 

In [None]:
# Define the node_id for anything_else and other uncovered nodes
anything_else_nodes = ['Anything else']

# Inputs treated as system messages: greetings (empty text) and Voice Gateway actions
system_inputs = ('', 'vgwHangUp', 'vgwPostResponseTimeout')


def classify_coverage(nodes_visited, input_text):
    """Label one log event as 'uncovered', 'system_message' or 'covered'.

    Args:
        nodes_visited: Iterable of dialog node ids visited while answering the message.
        input_text: The text the message was sent with.

    Returns:
        'uncovered' if any visited node is listed in `anything_else_nodes` (this takes
        precedence), 'system_message' if the input is one of `system_inputs`,
        otherwise 'covered'.
    """
    if any(node in anything_else_nodes for node in nodes_visited):
        return 'uncovered'
    if input_text in system_inputs:
        return 'system_message'
    return 'covered'


# Read the two columns directly instead of temporarily renaming 'input.text' in place,
# so an interrupted run cannot leave the frame with a renamed column.
allLogsDF['coverage'] = [
    classify_coverage(nodes, text)
    for nodes, text in zip(allLogsDF['nodes_visited'], allLogsDF['input.text'])
]

coveredDF = allLogsDF[allLogsDF['coverage'] == 'covered']
uncoveredDF = allLogsDF[allLogsDF['coverage'] == 'uncovered']

# Denominator for the rate metrics in this and later cells.
# NOTE(review): `filteredLogsDF` / `filteredMessages` were referenced by the metric cells
# but never assigned anywhere in the notebook (NameError on a fresh kernel). They are
# defined here as every message that is not a system message - confirm this matches
# the intended filter.
filteredLogsDF = allLogsDF[allLogsDF['coverage'] != 'system_message']
filteredMessages = len(filteredLogsDF)

# Avoid ZeroDivisionError when the logs contain no user messages
coverageMetric = len(coveredDF) / filteredMessages if filteredMessages else 0.0

print('Covered messages:   ', len(coveredDF))
print('Uncovered messages: ', len(uncoveredDF))
print('System messages:    ', len(allLogsDF) - filteredMessages)
print('\nCoverage metric:    ','{:.0%}'.format(coverageMetric))

metrics_dict['coverage'] = [coverageMetric] # Put into metrics dict

In [None]:
#uncoveredDF[['input.text','output.text']].head(10)

### 2.1.3 Search Skill Responses <a class="anchor" id="search-skill"></a>
Watson Assistant has multiple response types including `text`, `option`, `image`, `pause`, or `search skill`. Each of these types is marked within `output.generic.response_type` inside the log data. This cell will calculate the number of Search Skill responses.

In [None]:
# Run this cell
# Label every log event: 'search_skill' as soon as one of its responses (each output
# can have multiple responses) has that response_type, otherwise 'other'.
response_type = [
    'search_skill'
    if any(response['response_type'] == 'search_skill' for response in responses)
    else 'other'
    for responses in allLogsDF['output.generic']
]

# Add in response_type column to allLogsDF, then keep only the search skill events
allLogsDF['response_type'] = response_type
searchSkillDF = allLogsDF[allLogsDF['response_type'] == 'search_skill']
print('Total Search Skill responses:', len(searchSkillDF))
print('Percentage of total messages: {:.0%}'.format(len(searchSkillDF) / len(allLogsDF)))

# Show the user inputs that caused a search skill response
searchSkillDF[['input.text','response_type']].head().reset_index(drop=True)

In [None]:
# Saves the search skill inputs as CSV in COS. Comment out if running locally.
project.save_data(
    file_name=custName + "_search-skill-inputs.csv",
    data=searchSkillDF.to_csv(index=False),
    overwrite=True,
);

### 2.1.4 Escalation Requests <a class="anchor" id="escalation-metric"></a>

Escalation refers to any time a user is prompted to contact a live person (e.g. 1-800 number). If the assistant has an integration with a live handoff service (e.g. ZenDesk), this is considered escalation. For Voice Interaction solutions, we calculate `call containment` in the next section by counting the number of actual call transfers in the logs.

> **Action Required:** Define the node id in `escalation_node` for a node that represents any responses to an escalation request (e.g. `#General-Agent-Escalation`). This can be found by exporting the Skill from the Assistant tooling, and searching the JSON for the relevant dialog_node.
 

In [None]:
# Define the escalation node
escalation_node = "XXXXXXXX"

# Keep the log events whose visited-node list contains the escalation node
node_visits_escalated = allLogsDF[
    [escalation_node in visited for visited in allLogsDF['nodes_visited']]
]

# Share of the filtered messages that reached the escalation node
escalationMetric = len(node_visits_escalated) / filteredMessages
metrics_dict['escalation'] = [escalationMetric] # Put into metrics dict
print("Total visits to escalation node:", len(node_visits_escalated))
print("Percent of total messages escalated:", format(escalationMetric, '.0%'))

### 2.1.5 Active Users <a class="anchor" id="active-users"></a>
How many unique users used the assistant?

In [None]:
# Number of distinct user ids seen in the logs
uniqueUsers = allLogsDF["metadata.user_id"].nunique()
metrics_dict['uniqueUsers'] = [uniqueUsers] # inserts into metrics dict
print('Total unique users:', uniqueUsers)

### 2.1.6 Top Intents & Average Confidence Scores <a class="anchor" id="top-intents-scores"></a>

In [None]:
# Count how often each intent is selected and average its confidence, per intent
intentsDF = filteredLogsDF.groupby('intent',as_index=False).agg({
   'input.text': ['count'],
   'intent_confidence': ['mean']
})

# Flatten the two-level column headers for ease of use
intentsDF.columns = ["intent", "count", "confidence"]

# Remove blanks (usually VGW tags + greetings), then rank by frequency
intentsDF = (
    intentsDF[intentsDF['intent'] != '']
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)
intentsDF.head(5) # If you want specific number shown, edit inside head(). If you want to show all, remove head()

### 2.1.7 Top Entities (Defunct) <a class="anchor" id="top-entities"></a>

In [None]:
# Keep the log events whose `entities` value is not the empty string,
# then display the entities of the first such event.
entityDF = allLogsDF.loc[allLogsDF["entities"] != ""]
entityDF["entities"].iloc[0]

# 3. Export Logs  <a class="anchor" id="export-logs"></a>
The transformed log data inside the Pandas dataframe will be saved to CSV files and DB2 on Cloud database. These logs can be used for further data exploration and for creating visualizations in Cognos Dashboard in Watson Studio.

* [3.1 Saving CSV files to Cloud Object Storage](#export-to-csv)  CSV files will be saved to the project's Data Assets and Cloud Object Storage.
* [3.2 Loading into DB2 on Cloud database](#export-to-db2) The data will be saved to a table on your DB2 instance. 

## 3.1 Saving CSV files to Cloud Object Storage <a class="anchor" id="export-to-csv"></a>
The data will be saved into a CSV file in Cloud Object Storage, accessible via your project's assets folder in Watson Studio. There will be three distinct CSV files saved:
* `_logs.csv` will contain all of the data within the allLogs dataframe
* `_KeyMetrics.csv` will contain the calculated metrics such as coverage, escalation, containment rate, etc.
* `_uncovered_msgs.csv` will contain the selection of uncovered messages. This file can be used for making improvements to intent training and dialog responses.


### 3.1.1 Save all logs to CSV

In [None]:
# allLogsDF.to_csv(custName+'_logs.csv',index=False) # This saves if running notebook locally. Comment out for Studio.
print('Saving all logs to {}'.format(custName + "_logs.csv"))
# This saves in COS. Comment out if running locally
project.save_data(
    file_name=custName + "_logs.csv",
    data=allLogsDF.to_csv(index=False),
    overwrite=True,
);

### 3.1.2 Save KPIs to CSV

In [None]:
# One-row frame holding every KPI collected in metrics_dict
metricsDF = pd.DataFrame(metrics_dict)
# metricsDF.to_csv(custName + "_KeyMetrics.csv",index=False) # This saves if running notebook locally. Comment out for Studio.
# This saves in COS. Comment out if running locally
project.save_data(
    file_name=custName + "_KeyMetrics.csv",
    data=metricsDF.to_csv(index=False),
    overwrite=True,
);
print('Saving key metrics to {}'.format(custName + "_KeyMetrics.csv"))
metricsDF

### 3.1.3 Save uncovered messages to CSV
Improve Coverage by analyzing these uncovered messages. This might require adding training data to Intents or customizing STT models.

In [None]:
# uncoveredDF.to_csv(custName + "_uncovered_msgs.csv",index=False, header=['Utterance','Response','Intent','Confidence'])

# This saves in COS. Comment out if running locally
project.save_data(file_name = custName + "_uncovered_msgs.csv",data = uncoveredDF.to_csv(index=False),overwrite=True);

# Report success only after the save has completed, so a failed save is never announced as "Saved"
print('\nSaved', len(uncoveredDF), 'messages to:', custName + "_uncovered_msgs.csv")

### End of Notebook v2.1 (last modified on 7-2-20)