# End-to-end NLP: News Headline classifier

### Set up execution role and session

In [1]:
import numpy as np
import pandas as pd

In [2]:
%%time
# The cell magic above must stay on the first line; it reports CPU and wall time for this cell.
import sagemaker
from sagemaker import get_execution_role

# IAM role of this notebook instance; passed as `role=` to the estimators further down.
role = get_execution_role()
# NOTE(review): this prints the full role ARN (including the account id) into the saved output.
print(role)
# SageMaker session; used below for default_bucket() and upload_data().
sess = sagemaker.Session()

arn:aws:iam::349934754982:role/service-role/AmazonSageMaker-ExecutionRole-20190314T102350
CPU times: user 543 ms, sys: 55.5 ms, total: 598 ms
Wall time: 1.69 s


### Download the News Aggregator dataset from the public UCI Machine Learning Repository

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip

In [None]:
!unzip NewsAggregatorDataset.zip

In [None]:
# Remove the __MACOSX/ directory from the working directory (no error if it is absent).
!rm -rf __MACOSX/

In [6]:
#ls

#### Let's visualize the dataset

In [3]:
import pandas as pd
import mxnet
import re
import numpy as np
import os

In [4]:
# newsCorpora.csv is tab-separated and has no header row, so the column names are supplied here.
column_names = [
    "TITLE",
    "URL",
    "PUBLISHER",
    "CATEGORY",
    "STORY",
    "HOSTNAME",
    "TIMESTAMP",
]
news_dataset = pd.read_csv('newsCorpora.csv', sep='\t', header=None, names=column_names)
# Preview the first rows.
news_dataset.head()

Unnamed: 0,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


#### For this exercise we'll use only the title (headline) of each news story as input, and its category as the target variable

In [5]:
# Keep only the headline text (TITLE) and the label column (CATEGORY).
df=news_dataset[['TITLE',"CATEGORY"]]

In [6]:
from collections import Counter
# Number of headlines per category label.
Counter(df['CATEGORY'])

Counter({'b': 115967, 't': 108344, 'e': 152469, 'm': 45639})

The dataset has four categories: Business (b), Science & Technology (t), Entertainment (e) and Health & Medicine (m).

#### Dummy encode the labels

In [7]:
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical

# Raw headline strings; they are tokenized in a later cell.
docs = df["TITLE"].values

# Learn the category -> integer id mapping and apply it in one step.
encoder = preprocessing.LabelEncoder()
encoded_Y = encoder.fit_transform(df["CATEGORY"].values)

# One-hot encode the integer ids (one column per class).
dummy_y = to_categorical(encoded_Y)

Using MXNet backend


In [8]:
#bucket = <bucket> # custom bucket name.
# Use the session's default bucket; every object this notebook uploads goes under the 'news' prefix.
s3_bucket = sess.default_bucket()
s3_prefix = 'news'

In [9]:
# Class labels known to the fitted encoder.
list(encoder.classes_)

['b', 'e', 'm', 't']

In [10]:
# Integer-encoded category of every headline.
encoded_Y

array([0, 0, 0, ..., 2, 2, 2])

#### Tokenize documents and set fixed sequence lengths for input feature dimension.

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer vocabulary on all headlines.
t = Tokenizer()
t.fit_on_texts(docs)
# One extra slot on top of the fitted word ids (index 0 is used as the padding value below).
vocab_size = 1 + len(t.word_index)
print(vocab_size)

# Turn every headline into a list of word ids, then post-pad each list
# to a fixed length of 40 ids so all inputs share one shape.
max_length = 40
encoded_docs = t.texts_to_sequences(docs)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(len(padded_docs))

75286
422419


In [12]:
# First raw headline, for reference.
docs[0]

'Fed official says weak data caused by weather, should not slow taper'

### Import word embeddings

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip && unzip glove.6B.zip

In [None]:
rm 2pageSessions.csv glove.6B.200d.txt glove.6B.50d.txt glove.6B.300d.txt glove.6B.zip

##### Create embedding matrix

In [None]:
# Load the GloVe vectors from glove.6B.100d.txt into memory as {word: float32 vector}.
# Each line is "<word> <v1> <v2> ...", separated by whitespace.
# NOTE: a later cell rebuilds `embeddings_index` from ./vectors.txt, which replaces this result.
embeddings_index = dict()
# `with` closes the file even if a line fails to parse; the encoding is explicit so that
# decoding does not depend on the platform default.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [13]:
# Load the word vectors from ./vectors.txt into memory as {word: float32 vector}.
# Each line is "<word> <v1> <v2> ...", separated by whitespace.
# NOTE(review): how ./vectors.txt is produced is not shown in this notebook.
embeddings_index = dict()
# `with` closes the file even if a line fails to parse.
with open('./vectors.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 71291 word vectors.


In [14]:
#embeddings_index

In [15]:
#print(t.word_index)

In [16]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [18]:
# Create ./data/ and ./data/embeddings/. exist_ok=True makes the cell safe to re-run,
# and using os avoids depending on IPython automagic for a bare `mkdir`.
os.makedirs('./data/embeddings', exist_ok=True)

In [19]:
# Write the embedding matrix in NumPy's .npy format (pickling disabled), then show its shape.
np.save("./data/embeddings/docs-embedding-matrix", embedding_matrix, allow_pickle=False)
print(embedding_matrix.shape)

(75286, 100)


### Train, test split

In this section we prepare the data for ingestion by the algorithm: split the dataset into train and test samples, then upload the data to S3.

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded headlines (and their one-hot labels) for testing.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs,
    dummy_y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Create the local folders for the train and test arrays. exist_ok=True makes the cell
# safe to re-run, and makedirs also creates data/ itself when it is missing.
for split_dir in ('data/train', 'data/test'):
    os.makedirs(split_dir, exist_ok=True)

In [22]:
# Persist each split as a .npy file inside the folder of its channel.
for npy_path, split_array in (
    ('./data/train/train_X.npy', X_train),
    ('./data/train/train_Y.npy', y_train),
    ('./data/test/test_X.npy', X_test),
    ('./data/test/test_Y.npy', y_test),
):
    np.save(npy_path, split_array)

In [23]:
# Key prefixes (inside the bucket) for the three input channels.
traindata_s3_prefix = '%s/data/train' % s3_prefix
testdata_s3_prefix = '%s/data/test' % s3_prefix
embeddings_s3_prefix = '%s/data/embeddings' % s3_prefix

# Full S3 URIs for model artifacts and uploaded source code.
# NOTE(review): no later cell visible in this notebook references these two names.
output_s3 = 's3://%s/%s/models/' % (s3_bucket, s3_prefix)
code_location_s3 = 's3://%s/%s/codes' % (s3_bucket, s3_prefix)

In [24]:
# Upload each local folder to S3; upload_data returns the S3 URI that is later passed to fit().
train_s3 = sess.upload_data(
    path='./data/train/',
    bucket=s3_bucket,
    key_prefix=traindata_s3_prefix,
)
test_s3 = sess.upload_data(
    path='./data/test/',
    bucket=s3_bucket,
    key_prefix=testdata_s3_prefix,
)
embeddings_s3 = sess.upload_data(
    path='./data/embeddings/',
    bucket=s3_bucket,
    key_prefix=embeddings_s3_prefix,
)


In [25]:
# Channel name -> S3 URI mapping that is handed to estimator.fit() / tuner.fit().
inputs = dict(train=train_s3, test=test_s3, embeddings=embeddings_s3)
print(inputs)

{'train': 's3://sagemaker-ap-southeast-1-349934754982/news/data/train', 'test': 's3://sagemaker-ap-southeast-1-349934754982/news/data/test', 'embeddings': 's3://sagemaker-ap-southeast-1-349934754982/news/data/embeddings'}


In [26]:
import sagemaker
from sagemaker.mxnet import MXNet

### Define hyperparameters to push to algorithm

In [27]:
# Values forwarded to the training script as hyperparameters.
hyperparameters = {
    'epochs': 5,
    'vocab_size': vocab_size,                # tokenizer vocabulary size computed above
    'num_classes': encoder.classes_.size,    # number of distinct category labels
}

In [28]:
# Training job definition: run keras_script_mxnet.py (from ./tf-src) on one ml.p2.xlarge
# instance with the MXNet 1.3.0 / Python 3 container.
# NOTE(review): output_s3 and code_location_s3 are defined in an earlier cell but are not passed here.
mxnet_estimator = MXNet(
    entry_point='keras_script_mxnet.py',
    source_dir='./tf-src',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.p2.xlarge',
    framework_version='1.3.0',
    py_version='py3',
    hyperparameters=hyperparameters,
)
# Start the training job; `inputs` maps channel names to their S3 locations.
mxnet_estimator.fit(inputs)

INFO:sagemaker:Creating training-job with name: sagemaker-mxnet-2019-04-02-16-54-36-116


2019-04-02 16:54:39 Starting - Starting the training job...
2019-04-02 16:54:40 Starting - Launching requested ML instances......
2019-04-02 16:55:47 Starting - Preparing the instances for training......
2019-04-02 16:57:05 Downloading - Downloading input data...
2019-04-02 16:57:23 Training - Downloading the training image..
[31m2019-04-02 16:57:47,898 sagemaker-containers INFO     Imported framework sagemaker_mxnet_container.training[0m
[31m2019-04-02 16:57:47,944 sagemaker_mxnet_container.training INFO     MXNet training environment: {'SM_MODULE_NAME': 'keras_script_mxnet', 'SM_MODULE_DIR': 's3://sagemaker-ap-southeast-1-349934754982/sagemaker-mxnet-2019-04-02-16-54-36-116/source/sourcedir.tar.gz', 'SM_OUTPUT_DIR': '/opt/ml/output', 'SM_INPUT_CONFIG_DIR': '/opt/ml/input/config', 'SM_FRAMEWORK_MODULE': 'sagemaker_mxnet_container.training:main', 'SM_CHANNEL_TEST': '/opt/ml/input/data/test', 'SM_FRAMEWORK_PARAMS': '{}', 'SM_MODEL_DIR': '/opt/ml/model', 'SM_LOG_LEVEL': '20', 'SM_CHAN


  force_init=force_init)[0m
[31m[16:57:56] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:109: Running performance tests to find the best convolution algorithm, this can take a while... (setting env variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)[0m
[31m - 68s - loss: 0.2403 - acc: 0.9040[0m
[31mEpoch 2/5[0m
[31m - 63s - loss: 0.2278 - acc: 0.9120[0m
[31mEpoch 3/5[0m
[31m - 64s - loss: 0.2253 - acc: 0.9143[0m
[31mEpoch 4/5[0m
[31m - 64s - loss: 0.2260 - acc: 0.9149[0m
[31mEpoch 5/5[0m
[31m - 63s - loss: 0.2267 - acc: 0.9155[0m
[31m[17:03:14] src/executor/../common/exec_utils.h:475: Bucketing: data /out_1_target1 has a shape [32,4], which is larger than already allocated shape [16,4]. Need to re-allocate. Consider putting default bucket key to be the bucket taking the largest input for better memory sharing.[0m
[31m[17:03:14] src/executor/../common/exec_utils.h:475: Bucketing: data /embed_input1 has a shape [32,40], which is larger than already allocated s

In [29]:
import boto3
from urllib.parse import urlparse

# model_data is an "s3://<bucket>/<key>" URI. Parse it instead of slicing at a hard-coded
# offset, and download from the bucket named in the URI rather than assuming it is s3_bucket.
model_uri = urlparse(mxnet_estimator.model_data)
key = model_uri.path.lstrip('/')

s3 = boto3.resource('s3')
# Save the trained model archive next to the notebook.
s3.Bucket(model_uri.netloc).download_file(key, 'model.tar.gz')

In [30]:
from sagemaker.mxnet import MXNet, MXNetModel

# Model object built from the downloaded archive, served by default_classifier.py.
# NOTE(review): model_data points at a local file here, and no later cell visible in this
# notebook uses `sagemaker_model` (the endpoint below is deployed from mxnet_estimator) — confirm intent.
model_path = 'model.tar.gz'
sagemaker_model = MXNetModel(
    model_data=model_path,
    role=role,
    entry_point='default_classifier.py',
    py_version='py3',
)

In [31]:
# Create a real-time endpoint from the trained estimator (one ml.t2.medium instance).
predictor = mxnet_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
)

INFO:sagemaker:Creating model with name: sagemaker-mxnet-2019-04-02-16-54-36-116
INFO:sagemaker:Creating endpoint with name sagemaker-mxnet-2019-04-02-16-54-36-116


---------------------------------------------------------------------------!

In [32]:
# Show how the predictor talks to the endpoint: content types, (de)serializers, endpoint name, session.
display(
    predictor.accept,
    predictor.content_type,
    predictor.deserializer,
    predictor.endpoint,
    predictor.sagemaker_session,
    predictor.serializer,
)

'application/json'

'application/json'

<sagemaker.predictor._JsonDeserializer at 0x7f1cca0c2eb8>

'sagemaker-mxnet-2019-04-02-16-54-36-116'

<sagemaker.session.Session at 0x7f1c2d747908>

<sagemaker.predictor._JsonSerializer at 0x7f1cca0c2e48>

In [None]:
example_doc = ['Senate prepares to vote on dueling plans to end shutdown']

# Apply the same preprocessing as for the training data: word ids from the fitted
# tokenizer, post-padded to a fixed length of 40 ids.
max_length = 40
encoded_example = t.texts_to_sequences(example_doc)
padded_example = pad_sequences(encoded_example, maxlen=max_length, padding='post')

In [35]:
# Send the padded example to the endpoint as a nested list; the response is displayed below.
predictor.predict(padded_example.tolist())

[[0.9064793586730957,
  0.005665271542966366,
  0.006633899174630642,
  0.08122153580188751]]

## Hyperparameter optimization (HPO)

In [41]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

In [42]:
#hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.01, 0.2)}
# Search space for the tuning job: only the number of epochs, as an integer from 5 to 10.
hyperparameter_ranges = {'epochs': IntegerParameter(5, 10)}

In [43]:
# Objective of the tuning job: minimize the 'loss' metric scraped from the training logs.
objective_metric_name = 'loss'
objective_type = 'Minimize'
# The training log captured earlier in this notebook prints the metric as "loss: 0.2403",
# which the previous pattern ("loss = <n>") never matches. Accept either ":" or "=" as the
# separator, with optional single spaces around it, so both forms are captured.
metric_definitions = [{'Name': 'loss',
                       'Regex': 'loss ?[:=] ?([0-9\\.]+)'}]

In [47]:
# Static hyperparameters for the tuning jobs ('epochs' is also listed in hyperparameter_ranges).
hyperparameters = {
    'epochs': 5,
    'vocab_size': vocab_size,                # tokenizer vocabulary size computed above
    'num_classes': encoder.classes_.size,    # number of distinct category labels
}

In [48]:
# Estimator used as the template for the tuning jobs: same script and container as the
# training run above, but on one ml.m4.xlarge instance. Note that this rebinds `mxnet_estimator`.
mxnet_estimator = MXNet(
    entry_point='keras_script_mxnet.py',
    source_dir='./tf-src',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    framework_version='1.3.0',
    py_version='py3',
    hyperparameters=hyperparameters,
)

In [51]:
# Tuning job definition: at most 5 training jobs, 2 at a time, searching
# hyperparameter_ranges for the best value of the objective metric.
tuner = HyperparameterTuner(
    estimator=mxnet_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=5,
    max_parallel_jobs=2,
)

In [52]:
# Start the hyperparameter tuning job on the same input channels used for training.
tuner.fit(inputs)

INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-mxnet-190402-1720
