In [17]:
%%capture
!pip install wordcloud
!pip install ibm-watson

In [18]:
import watson_nlp
import tensorflow as tf
# Quieten TensorFlow: restrict its Python logger to ERROR-level messages and
# set AutoGraph verbosity to 0.
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(0)

In [19]:
import json
import pandas as pd
from time import process_time
# Show up to 400 characters per cell when DataFrames are displayed.
pd.set_option('display.max_colwidth', 400)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import watson_nlp
from datetime import datetime

In [20]:
from watson_core.data_model.streams.resolver import DataStreamResolver
from watson_core.toolkit import fileio
from watson_nlp.blocks.classification.svm import SVM
from watson_nlp.workflows.classification import Ensemble
from watson_core.toolkit.quality_evaluation import QualityEvaluator, EvalTypes

In [21]:
# Remote CSV holding the labelled text snippets used for training and testing.
url = "https://ibm.box.com/shared/static/sb0yx48iapdoo1x1quf39gokqujxxqix.csv"
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its exact replacement: malformed rows are skipped
# and a warning is emitted for each one (requires pandas >= 1.3).
complaint_df = pd.read_csv(url, on_bad_lines='warn')
# Name of the column that carries the raw text to classify.
text_col = 'text'
complaint_df.head(5)

Unnamed: 0,workspace_id,category_name,document_id,dataset,text,uri,element_metadata,label,label_type
0,heidloff_net,WatsonEmbed,page6_elem0,heidloff_net,TensorFlow Object Detection is a powerful technology to recognize different objects in images including their positions.,heidloff_net-page6_elem0-1,{},NotWatsonEmbed,Standard
1,heidloff_net,WatsonEmbed,page1_elem1,heidloff_net,* NLP [https://www.ibm.com/docs/en/watson-libraries?topic=models-catalog],heidloff_net-page1_elem1-12,{},WatsonEmbed,Standard
2,heidloff_net,WatsonEmbed,page2_elem1,heidloff_net,The NLP containers also provides a gRCP interface [https://github.com/IBM/watson-automation#grpc].,heidloff_net-page2_elem1-69,{},WatsonEmbed,Standard
3,heidloff_net,WatsonEmbed,page1_elem6,heidloff_net,Serving Watson NLP on Kubernetes with KServe ModelMesh.,heidloff_net-page1_elem6-0,{},WatsonEmbed,Standard
4,heidloff_net,WatsonEmbed,page2_elem5,heidloff_net,"$ curl ""http://localhost:1080/text-to-speech/api/v1/voices""",heidloff_net-page2_elem5-32,{},WatsonEmbed,Standard


In [22]:
# Alias only (no copy is made): train_test_df refers to the same DataFrame
# object as complaint_df.
train_test_df = complaint_df

In [23]:
# Per-label split: sample 80% of the rows of every label (fixed seed) for
# training; every row that was not sampled forms the 20% test set.
train_orig_df = train_test_df.groupby('label').sample(frac=0.8, random_state=6)
test_orig_df = train_test_df.drop(train_orig_df.index)

# Size and label balance of the training split.
print("Training data:")
print("Number of training samples: {}".format(train_orig_df.shape[0]))
print("Samples by product group:\n{}".format(train_orig_df['label'].value_counts()))

# Size and label balance of the test split.
print("\nTest data:")
print("Number of test samples: {}".format(test_orig_df.shape[0]))
print("Samples by product group:\n{}".format(test_orig_df['label'].value_counts()))

# Replace the row labels inherited from the source frame with a fresh 0..n-1
# index on both splits.
train_orig_df, test_orig_df = (
    train_orig_df.reset_index(drop=True),
    test_orig_df.reset_index(drop=True),
)

Training data:
Number of training samples: 541
Samples by product group:
WatsonEmbed       278
NotWatsonEmbed    263
Name: label, dtype: int64

Test data:
Number of test samples: 135
Samples by product group:
WatsonEmbed       69
NotWatsonEmbed    66
Name: label, dtype: int64


In [24]:
def prepare_data(df, text_col='text', label_col='label'):
    """Reduce a labelled frame to the two-column text/labels training layout.

    Args:
        df: DataFrame containing at least the text and label columns.
        text_col: Name of the column holding the raw text (default 'text').
        label_col: Name of the column holding the single label per row
            (default 'label').

    Returns:
        A new DataFrame with a fresh 0..n-1 index and exactly two columns:
        'text' (unchanged values) and 'labels', where each label is wrapped
        in a one-element list. The input frame is not modified.
    """
    # Column selection plus reset_index yields a new frame, so `df` stays intact.
    df_out = df[[text_col, label_col]].reset_index(drop=True)
    df_out = df_out.rename(columns={text_col: 'text', label_col: 'labels'})
    # Each record carries a list of labels, so wrap the single label in a list.
    df_out['labels'] = df_out['labels'].map(lambda label: [label])
    return df_out
    
# Destinations of the JSON record files for the two splits.
train_file = './train_data.json'
test_file = './test_data.json'

# Bring both splits into the text/labels layout.
train_df = prepare_data(train_orig_df)
test_df = prepare_data(test_orig_df)

# Write each split as a JSON array with one object per row.
train_df.to_json(train_file, orient='records')
test_df.to_json(test_file, orient='records')

train_df.head(2)

Unnamed: 0,text,labels
0,Building machine and deep learning models from scratch is often not trivial not for developers and sometimes not even for data scientists.,[NotWatsonEmbed]
1,However the curated catalog doesn’t have to be used or can be used in addition to a custom catalog.,[NotWatsonEmbed]


In [25]:
# Display the test split with each one-element `labels` list expanded back
# into its plain label value.
test_df.explode('labels')

Unnamed: 0,text,labels
0,It’s also possible to deploy in addition to the Watson NLP runtime multiple models both predefined models as well as custom models.,WatsonEmbed
1,FROM ${EMOTION_MODEL} as model2,NotWatsonEmbed
2,- name: NAMESPACE,NotWatsonEmbed
3,Standalone containers: One pod with one container including the NLP runtime and the models,WatsonEmbed
4,* ibmcloud CLI documentation [https://cloud.ibm.com/docs/codeengine?topic=codeengine-cli#cli-secret-create],NotWatsonEmbed
...,...,...
130,* Running IBM Watson Text to Speech in Containers [http://heidloff.net/article/running-ibm-watson-text-to-speech-in-containers/],WatsonEmbed
131,stopwords = watson_nlp.download_and_load('text_stopwords_classification_ensemble_en_stock'),WatsonEmbed
132,For example the ‘deploy’ stage cannot be handled generically which is why custom specific scripts need to be provided.,NotWatsonEmbed
133,df_out['labels'] = df_out['labels'].map(lambda label: [label]),NotWatsonEmbed


In [26]:
import plotly.express as px
import plotly.io as pio
# Dark template used for the chart; it is also registered under a second name.
plotly_template = pio.templates["plotly_dark"]
pio.templates["plotly_dark_custom"] = pio.templates["plotly_dark"]

# Bar chart of how many test samples carry each label.
complaints_total_figure = px.bar(test_df.explode('labels')['labels'].value_counts())
# The previous title ('Financial news dataset') did not describe this data;
# the chart shows the label counts of the test split.
complaints_total_figure.update_layout(template=plotly_template,barmode='stack',title_text='Test set label distribution', title_x=0.5)
complaints_total_figure.show()

In [27]:
# Download and load the stock syntax model and the stock embedding model by
# name; both are used when the training streams are built.
syntax_model = watson_nlp.load(watson_nlp.download('syntax_izumo_en_stock'))
use_model = watson_nlp.load(watson_nlp.download('embedding_use_en_stock'))

In [28]:
# Build the training streams from the JSON file written earlier.
training_data_file = train_file

# Resolve the JSON records into a data stream; each record is expected to
# provide a string `text` and a list `labels`.
data_stream_resolver = DataStreamResolver(target_stream_type=list, expected_keys={'text': str, 'labels': list})
training_data = data_stream_resolver.as_data_stream(training_data_file)

# NOTE(review): assumes index 0 is the text and index 1 the labels, following
# the order of expected_keys — confirm against the DataStreamResolver docs.
text_stream, labels_stream = training_data[0], training_data[1]
# Run the syntax model over the text stream.
syntax_stream = syntax_model.stream(text_stream)

# Embed the syntax output, then pair every embedding with its labels.
# NOTE(review): use_svm_train_stream is not consumed by any cell visible here
# (no SVM is trained); confirm whether this cell is still needed.
use_train_stream = use_model.stream(syntax_stream, doc_embed_style='raw_text')
use_svm_train_stream = watson_nlp.data_model.DataStream.zip(use_train_stream, labels_stream)

In [29]:
def predict_product(text, model=None):
    """Classify one text and return the first predicted class name.

    Args:
        text: The raw text to classify.
        model: Optional classifier exposing ``run(text)`` whose result offers
            ``to_dict()``. When omitted, the notebook-level ``ensemble_model``
            is looked up at call time, so this function may be defined before
            that model has been trained.

    Returns:
        A 2-tuple holding the same class name twice. The caller labels the
        first entry 'Predicted SVM', but no separate SVM model is run here,
        so both entries are the ensemble's prediction.

    Raises:
        IndexError: If the prediction contains no classes.
    """
    # Fall back to the global only when no model is passed, so the function
    # can also be exercised with any compatible classifier.
    classifier = ensemble_model if model is None else model
    # NOTE(review): takes the first entry of "classes"; presumably the list
    # is ordered by descending confidence — confirm.
    top_class = classifier.run(text).to_dict()["classes"][0]["class_name"]
    return (top_class, top_class)

In [30]:
# Stop-word list handed to the ensemble trainer.
stopwords = watson_nlp.download_and_load('text_stopwords_classification_ensemble_en_stock')

# Train the ensemble straight from the JSON training file; the stock models
# are referenced by name, and the CNN is trained for 5 epochs.
ensemble_model = Ensemble.train(
    train_file,
    'syntax_izumo_en_stock',
    'embedding_glove_en_stock',
    'embedding_use_en_stock',
    stopwords=stopwords,
    cnn_epochs=5,
)

Epoch 1/5
9/9 - 13s - loss: 4.6545 - categorical_accuracy: 0.6562 - 13s/epoch - 1s/step
Epoch 2/5
9/9 - 11s - loss: 3.6832 - categorical_accuracy: 0.8762 - 11s/epoch - 1s/step
Epoch 3/5
9/9 - 11s - loss: 2.9498 - categorical_accuracy: 0.9409 - 11s/epoch - 1s/step
Epoch 4/5
9/9 - 11s - loss: 2.3606 - categorical_accuracy: 0.9575 - 11s/epoch - 1s/step
Epoch 5/5
9/9 - 11s - loss: 1.8939 - categorical_accuracy: 0.9667 - 11s/epoch - 1s/step


In [31]:
# Classify every test text; each call returns a 2-tuple of class names.
predictions = test_orig_df[text_col].apply(predict_product)

# Spread the tuples over two named columns. Both columns hold the same value
# because predict_product returns the ensemble prediction twice.
predictions_df = pd.DataFrame.from_records(
    predictions,
    columns=('Predicted SVM', 'Predicted Ensemble'),
)

# Put text, ground-truth label and predictions side by side, matched on the
# row index.
result_df = test_orig_df[[text_col, "label"]].merge(
    predictions_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [33]:
# Store the trained ensemble in project storage under the name
# 'ensemble_model', overwriting any file already saved under that name.
# NOTE(review): `project` is not defined in any cell visible here and the
# execution counter jumps from 31 to 33 — presumably a project-token cell was
# run and then removed; confirm, otherwise this fails on a fresh kernel.
project.save_data('ensemble_model', data=ensemble_model.as_file_like_object(), overwrite=True)

# Kept for reference: reload the stored model instead of retraining.
#ensemble_model = watson_nlp.load(project.get_file('ensemble_model'))

{'file_name': 'ensemble_model',
 'message': 'File saved to project storage.',
 'bucket_name': 'consumercomplaintsclassification3-donotdelete-pr-63vpnrjflig8vl',
 'asset_id': '9c2aab6e-737d-444f-9368-4ff1872d4e3e'}

In [34]:
from sklearn.metrics import classification_report
# Ground-truth labels of the test rows, used as the reference when scoring
# the predictions.
actual = result_df['label']

In [35]:
# Score the ensemble predictions against the ground truth, naming both
# classes explicitly so the report rows appear in this order.
predicted_ensemble = result_df['Predicted Ensemble']
matrix = classification_report(
    actual,
    predicted_ensemble,
    labels=['WatsonEmbed', 'NotWatsonEmbed'],
)
print('Classification report for Ensemble classifier: \n', matrix)

Classification report for Ensemble classifier: 
                 precision    recall  f1-score   support

   WatsonEmbed       0.99      0.99      0.99        69
NotWatsonEmbed       0.98      0.98      0.98        66

      accuracy                           0.99       135
     macro avg       0.99      0.99      0.99       135
  weighted avg       0.99      0.99      0.99       135

