# End-to-end NLP: News Headline classifier

### Setup execution role and session

In [None]:
import numpy as np
import pandas as pd

In [None]:
%%time
import sagemaker
from sagemaker import get_execution_role

role = get_execution_role()
print(role)
sess = sagemaker.Session()

### Download News Aggregator Dataset available at the public UCI dataset repository

In [3]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip

--2019-03-12 22:53:26--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29224203 (28M) [application/zip]
Saving to: ‘NewsAggregatorDataset.zip’


2019-03-12 22:53:40 (2.43 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203/29224203]



In [4]:
!unzip NewsAggregatorDataset.zip

Archive:  NewsAggregatorDataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   


In [5]:
!rm -rf __MACOSX/

In [6]:
#ls

#### Let's visualize the dataset

In [8]:
import pandas as pd
import mxnet
import re
import numpy as np
import os

In [9]:
column_names = ["TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"]
news_dataset = pd.read_csv('newsCorpora.csv', names=column_names, header=None, delimiter='\t')
news_dataset.head()

Unnamed: 0,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


#### For this exercice we'll only use the title (Headline) of the news story and the category as our target variable

In [10]:
df=news_dataset[['TITLE',"CATEGORY"]]

In [11]:
from collections import Counter
Counter(df['CATEGORY'])

Counter({'b': 115967, 't': 108344, 'e': 152469, 'm': 45639})

The dataset has four categories: Business (b), Science & Technology (t), Entertainment (e) and Health & Medicine (m).

#### Dummy encode the labels

In [12]:
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
encoder = preprocessing.LabelEncoder()

docs = df["TITLE"].values

encoder.fit(df["CATEGORY"].values)
encoded_Y = encoder.transform(df["CATEGORY"].values)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = to_categorical(encoded_Y)

Using MXNet backend


In [13]:
#bucket = <bucket> # custom bucket name.
s3_bucket = sess.default_bucket()
s3_prefix = 'news'

In [14]:
list(encoder.classes_)

['b', 'e', 'm', 't']

In [15]:
encoded_Y

array([0, 0, 0, ..., 2, 2, 2])

#### Tokenize documents and set fixed sequence lengths for input feature dimension.

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(vocab_size)
# pad documents to a max length of 4 words
max_length = 40
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(len(padded_docs))

75286
422419


In [17]:
docs[0]

'Fed official says weak data caused by weather, should not slow taper'

### Import word embeddings

In [18]:
!wget http://nlp.stanford.edu/data/glove.6B.zip && unzip glove.6B.zip

--2019-03-12 22:54:25--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-03-12 22:54:25--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-03-12 22:57:18 (4.79 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [19]:
rm 2pageSessions.csv glove.6B.200d.txt glove.6B.50d.txt glove.6B.300d.txt glove.6B.zip

##### Create embedding matrix

In [20]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [21]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('./vectors.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

FileNotFoundError: [Errno 2] No such file or directory: './vectors.txt'

In [118]:
#embeddings_index

In [23]:
#print(t.word_index)

In [22]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [26]:
mkdir ./data/embeddings/

In [27]:
#embedding_matrix.dump("ingredients-embedding-matrix.dat")
np.save(file="./data/embeddings/docs-embedding-matrix",
        arr=embedding_matrix,
        allow_pickle=False)
print(embedding_matrix.shape)

(75286, 100)


### Train, test split

In this section we will prep the data for ingestion for the algortihm. Split the data set in train and test samples and uplad the data to S3

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_docs, dummy_y, test_size=0.2, random_state=42)

In [29]:
!mkdir data/train/ data/test/ data/embeddings/

mkdir: cannot create directory ‘data/embeddings/’: File exists


In [30]:
np.save('./data/train/train_X.npy', X_train)
np.save('./data/train/train_Y.npy', y_train)
np.save('./data/test/test_X.npy', X_test)
np.save('./data/test/test_Y.npy', y_test)

In [31]:
traindata_s3_prefix = '{}/data/train'.format(s3_prefix)
testdata_s3_prefix = '{}/data/test'.format(s3_prefix)
embeddings_s3_prefix='{}/data/embeddings'.format(s3_prefix)
output_s3 = 's3://{}/{}/models/'.format(s3_bucket, s3_prefix)
code_location_s3 = 's3://{}/{}/codes'.format(s3_bucket, s3_prefix)

In [32]:
train_s3 = sess.upload_data(path='./data/train/', bucket=s3_bucket, key_prefix=traindata_s3_prefix)
test_s3 = sess.upload_data(path='./data/test/', bucket=s3_bucket, key_prefix=testdata_s3_prefix)
embeddings_s3 = sess.upload_data(path='./data/embeddings/', bucket=s3_bucket, key_prefix=embeddings_s3_prefix)


In [33]:
inputs = {'train':train_s3, 'test': test_s3, 'embeddings': embeddings_s3}

print(inputs)

{'train': 's3://sagemaker-ap-southeast-1-349934754982/news/data/train', 'test': 's3://sagemaker-ap-southeast-1-349934754982/news/data/test', 'embeddings': 's3://sagemaker-ap-southeast-1-349934754982/news/data/embeddings'}


In [35]:
import sagemaker
from sagemaker.mxnet import MXNet

### Define hyperparameters to push to algorithm

In [36]:
hyperparameters = {'epochs': 5, 'vocab_size':vocab_size, 'num_classes':encoder.classes_.size}

In [69]:
mxnet_estimator = MXNet(entry_point='keras_script_mxnet.py',
                       source_dir='./tf-src',
                        role=role,
                        train_instance_type='ml.p3.8xlarge',
                        train_instance_count=1,
                        framework_version='1.3.0',
                        py_version='py3',
                        hyperparameters=hyperparameters)
mxnet_estimator.fit(inputs)

INFO:sagemaker:Creating training-job with name: sagemaker-mxnet-2019-03-13-00-52-10-529


2019-03-13 00:52:12 Starting - Starting the training job...
2019-03-13 00:52:14 Starting - Launching requested ML instances......
2019-03-13 00:53:18 Starting - Preparing the instances for training......
2019-03-13 00:54:30 Downloading - Downloading input data
2019-03-13 00:54:30 Training - Downloading the training image...
2019-03-13 00:54:56 Training - Training image download completed. Training in progress.
[31m2019-03-13 00:54:57,349 sagemaker-containers INFO     Imported framework sagemaker_mxnet_container.training[0m
[31m2019-03-13 00:54:57,396 sagemaker_mxnet_container.training INFO     MXNet training environment: {'SM_TRAINING_ENV': '{"additional_framework_parameters":{},"channel_input_dirs":{"embeddings":"/opt/ml/input/data/embeddings","test":"/opt/ml/input/data/test","train":"/opt/ml/input/data/train"},"current_host":"algo-1","framework_module":"sagemaker_mxnet_container.training:main","hosts":["algo-1"],"hyperparameters":{"epochs":5,"num_classes":4,"vocab_size":75286},"in

  force_init=force_init)[0m
[31m[00:55:10] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:109: Running performance tests to find the best convolution algorithm, this can take a while... (setting env variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)[0m
[31m - 57s - loss: 0.5388 - acc: 0.7500[0m
[31mEpoch 2/5[0m
[31m - 48s - loss: 0.5384 - acc: 0.7500[0m
[31mEpoch 3/5[0m
[31m - 48s - loss: 0.5384 - acc: 0.7500[0m
[31mEpoch 4/5[0m
[31m - 48s - loss: 0.5384 - acc: 0.7500[0m
[31mEpoch 5/5[0m
[31m - 48s - loss: 0.5384 - acc: 0.7500[0m
[31m[00:59:10] src/executor/../common/exec_utils.h:475: Bucketing: data /out_1_target1 has a shape [32,4], which is larger than already allocated shape [16,4]. Need to re-allocate. Consider putting default bucket key to be the bucket taking the largest input for better memory sharing.[0m
[31m[00:59:10] src/executor/../common/exec_utils.h:475: Bucketing: data /embed_input1 has a shape [32,40], which is larger than already allocated sh

In [70]:
import boto3
s3 = boto3.resource('s3')

key = mxnet_estimator.model_data[mxnet_estimator.model_data.find("/", 5)+1:]
s3.Bucket(s3_bucket).download_file(key, 'model.tar.gz')

In [None]:
from sagemaker.mxnet import MXNet, MXNetModel

sagemaker_model = MXNetModel(model_data = model_path,
                             role = role,
                             entry_point = 'default_classifier.py',
                             py_version='py3')

In [48]:
predictor = mxnet_estimator.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-mxnet-2019-03-12-23-46-12-811
INFO:sagemaker:Creating endpoint with name sagemaker-mxnet-2019-03-12-23-46-12-811


---------------------------------------------------------------!

In [67]:
display(predictor.accept, predictor.content_type, predictor.deserializer, predictor.endpoint, predictor.sagemaker_session, predictor.serializer)

'application/json'

'text/csv'

<sagemaker.predictor._JsonDeserializer at 0x7fd7c4546f98>

'sagemaker-mxnet-2019-03-12-23-46-12-811'

<sagemaker.session.Session at 0x7fd73da5ab70>

<sagemaker.predictor._JsonSerializer at 0x7fd7c4546f28>

In [76]:
#predictor.predict(padded_example.tolist())

In [45]:
import boto3
s3 = boto3.resource('s3')

key = mxnet_estimator.model_data[mxnet_estimator.model_data.find("/", 5)+1:]
s3.Bucket(s3_bucket).download_file(key, 'model.tar.gz')

In [71]:
!tar -xvzf model.tar.gz

model-0000.params
model-symbol.json
model-shapes.json
model.hd5


In [73]:
from keras.models import load_model
loaded_model = load_model("model.hd5")

  train_symbol = func(*args, **kwargs)
  test_symbol = func(*args, **kwargs)


In [141]:
example_doc=['Senate prepares to vote on dueling plans to end shutdown']
# integer encode the document
encoded_example = t.texts_to_sequences(example_doc)

# pad documents to a max length of 4 words
max_length = 40
padded_example = pad_sequences(encoded_example, maxlen=max_length, padding='post')

In [75]:
loaded_model.predict(padded_example)

array([[ 0.27366856,  0.36816129,  0.10714847,  0.25102162]], dtype=float32)