# End-to-end NLP: News Headline classifier

### Setup execution role and session

In [1]:
import numpy as np
import pandas as pd

In [2]:
%%time
import sagemaker
from sagemaker import get_execution_role

role = get_execution_role()
print(role)
sess = sagemaker.Session()

arn:aws:iam::932240083933:role/service-role/AmazonSageMaker-ExecutionRole-20181128T130896
CPU times: user 594 ms, sys: 0 ns, total: 594 ms
Wall time: 714 ms


### Download News Aggregator Dataset available at the public UCI dataset repository

In [2]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip

--2019-01-24 18:09:38--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29224203 (28M) [application/zip]
Saving to: ‘NewsAggregatorDataset.zip’


2019-01-24 18:09:40 (21.0 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203/29224203]



In [3]:
!unzip NewsAggregatorDataset.zip

Archive:  NewsAggregatorDataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   


In [4]:
!rm -rf __MACOSX/

In [5]:
#ls

#### Let's visualize the dataset

In [3]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np
import os

In [4]:
column_names = ["TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"]
news_dataset = pd.read_csv('newsCorpora.csv', names=column_names, header=None, delimiter='\t')
news_dataset.head()

Unnamed: 0,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


#### For this exercice we'll only use the title (Headline) of the news story and the category as our target variable

In [5]:
df=news_dataset[['TITLE',"CATEGORY"]]

In [6]:
from collections import Counter
Counter(df['CATEGORY'])

Counter({'b': 115967, 't': 108344, 'e': 152469, 'm': 45639})

The dataset has four categories: Business (b), Science & Technology (t), Entertainment (e) and Health & Medicine (m).

#### Dummy encode the labels

In [7]:
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
encoder = preprocessing.LabelEncoder()

docs = df["TITLE"].values

encoder.fit(df["CATEGORY"].values)
encoded_Y = encoder.transform(df["CATEGORY"].values)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = to_categorical(encoded_Y)

Using TensorFlow backend.


In [8]:
#bucket = <bucket> # custom bucket name.
s3_bucket = sess.default_bucket()
s3_prefix = 'news'

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-932240083933


In [9]:
list(encoder.classes_)

['b', 'e', 'm', 't']

In [10]:
encoded_Y

array([0, 0, 0, ..., 2, 2, 2])

#### Tokenize documents and set fixed sequence lengths for input feature dimension.

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(vocab_size)
# pad documents to a max length of 4 words
max_length = 40
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(len(padded_docs))

75286
422419


In [12]:
docs[0]

'Fed official says weak data caused by weather, should not slow taper'

### Import word embeddings

In [19]:
!wget http://nlp.stanford.edu/data/glove.6B.zip && unzip glove.6B.zip

--2019-01-24 18:10:54--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-01-24 18:10:54--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-01-24 18:11:10 (54.0 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [20]:
rm 2pageSessions.csv glove.6B.200d.txt glove.6B.50d.txt glove.6B.300d.txt glove.6B.zip

##### Create embedding matrix

In [13]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [15]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('./vectors.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 71291 word vectors.


In [118]:
#embeddings_index

In [23]:
#print(t.word_index)

In [14]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [15]:
#embedding_matrix.dump("ingredients-embedding-matrix.dat")
np.save(file="./data/embeddings/docs-embedding-matrix",
        arr=embedding_matrix,
        allow_pickle=False)
print(embedding_matrix.shape)

(75286, 100)


### Train, test split

In this section we will prep the data for ingestion for the algortihm. Split the data set in train and test samples and uplad the data to S3

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_docs, dummy_y, test_size=0.2, random_state=42)

In [17]:
!mkdir data/train/ data/test/ data/embeddings/

mkdir: cannot create directory ‘data/train/’: File exists
mkdir: cannot create directory ‘data/test/’: File exists
mkdir: cannot create directory ‘data/embeddings/’: File exists


In [18]:
np.save('./data/train/train_X.npy', X_train)
np.save('./data/train/train_Y.npy', y_train)
np.save('./data/test/test_X.npy', X_test)
np.save('./data/test/test_Y.npy', y_test)

In [19]:
traindata_s3_prefix = '{}/data/train'.format(s3_prefix)
testdata_s3_prefix = '{}/data/test'.format(s3_prefix)
embeddings_s3_prefix='{}/data/embeddings'.format(s3_prefix)
output_s3 = 's3://{}/{}/models/'.format(s3_bucket, s3_prefix)
code_location_s3 = 's3://{}/{}/codes'.format(s3_bucket, s3_prefix)

In [20]:
train_s3 = sess.upload_data(path='./data/train/', bucket=s3_bucket, key_prefix=traindata_s3_prefix)
test_s3 = sess.upload_data(path='./data/test/', bucket=s3_bucket, key_prefix=testdata_s3_prefix)
embeddings_s3 = sess.upload_data(path='./data/embeddings/', bucket=s3_bucket, key_prefix=embeddings_s3_prefix)


In [21]:
inputs = {'train':train_s3, 'test': test_s3, 'embeddings': embeddings_s3}

print(inputs)

{'train': 's3://sagemaker-us-east-1-932240083933/news/data/train', 'test': 's3://sagemaker-us-east-1-932240083933/news/data/test', 'embeddings': 's3://sagemaker-us-east-1-932240083933/news/data/embeddings'}


In [22]:
import sagemaker
from sagemaker.tensorflow import TensorFlow

### Define hyperparameters to push to algorithm

In [24]:
hyperparameters = {'epochs': 5, 'vocab_size':vocab_size, 'num_classes':encoder.classes_.size}

In [29]:
estimator = TensorFlow(entry_point='keras_script.py',
                       source_dir='./tf-src',
                       role=role,
#                        training_steps=1,
#                        evaluation_steps=1,
                       #train_volume_size=10,
                       train_instance_count=1,
                       train_instance_type='ml.p3.8xlarge',
                       hyperparameters=hyperparameters,
                       framework_version='1.11.0',
                       py_version='py3',
                       script_mode=True,
                       output_path=output_s3,
                       code_location=code_location_s3,
                       base_job_name='tf-scriptmode'
#                        requirements_file='./requirements.txt'
                      )

In [30]:
%%time

estimator.fit(inputs)

INFO:sagemaker:Creating training-job with name: tf-scriptmode-2019-02-24-19-58-27-963


2019-02-24 19:58:28 Starting - Starting the training job...
2019-02-24 19:58:29 Starting - Launching requested ML instances......
2019-02-24 19:59:35 Starting - Preparing the instances for training......
2019-02-24 20:00:46 Downloading - Downloading input data
2019-02-24 20:00:46 Training - Downloading the training image.....
[31m2019-02-24 20:01:41,952 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[31m2019-02-24 20:01:42,390 sagemaker-containers INFO     Invoking user script
[0m
[31mTraining Env:
[0m
[31m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "embeddings": "/opt/ml/input/data/embeddings",
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {
        "num_classes": 4,
        "vocab_size": 

[31m - 62s - loss: 0.1498 - acc: 0.9468[0m
[31mEpoch 3/5[0m
[31m - 62s - loss: 0.1472 - acc: 0.9501[0m
[31mEpoch 4/5[0m
[31m - 62s - loss: 0.1462 - acc: 0.9520[0m
[31mEpoch 5/5[0m
[31m - 62s - loss: 0.1470 - acc: 0.9534[0m
[31m------ save model to /opt/ml/model/my_model.h5[0m
[31m2019-02-24 20:07:20,307 sagemaker-containers INFO     Reporting training SUCCESS[0m

2019-02-24 20:07:28 Uploading - Uploading generated training model
2019-02-24 20:07:28 Completed - Training job completed
Billable seconds: 410
CPU times: user 1.25 s, sys: 0 ns, total: 1.25 s
Wall time: 9min 15s


In [33]:
import boto3
s3 = boto3.resource('s3')

key = estimator.model_data[estimator.model_data.find("/", 5)+1:]
s3.Bucket(s3_bucket).download_file(key, 'model.tar.gz')

In [34]:
!tar -xvzf model.tar.gz

my_model.h5


In [37]:
!pip install --upgrade keras

Collecting keras
[?25l  Downloading https://files.pythonhosted.org/packages/5e/10/aa32dad071ce52b5502266b5c659451cfd6ffcbf14e6c8c4f16c0ff5aaab/Keras-2.2.4-py2.py3-none-any.whl (312kB)
[K    100% |████████████████████████████████| 317kB 39.1MB/s ta 0:00:01
[?25hRequirement not upgraded as not directly required: numpy>=1.9.1 in /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from keras) (1.14.5)
Collecting keras-preprocessing>=1.0.5 (from keras)
  Downloading https://files.pythonhosted.org/packages/fc/94/74e0fa783d3fc07e41715973435dd051ca89c550881b3454233c39c73e69/Keras_Preprocessing-1.0.5-py2.py3-none-any.whl
Requirement not upgraded as not directly required: scipy>=0.14 in /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from keras) (1.1.0)
Requirement not upgraded as not directly required: h5py in /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from keras) (2.8.0)
Requirement not upgraded as not directly r

In [35]:
print(tf.__version__)

1.10.0


In [36]:
from keras.models import load_model
loaded_model = load_model("my_model.h5")

ValueError: Unknown layer:name

In [114]:
#predictor = estimator.deploy(initial_instance_count=1,
#                             instance_type='ml.c5.xlarge')

IndentationError: unexpected indent (<ipython-input-114-04c0c94ec65d>, line 2)

In [141]:
example_doc=['Senate prepares to vote on dueling plans to end shutdown']
# integer encode the document
encoded_example = t.texts_to_sequences(example_doc)

# pad documents to a max length of 4 words
max_length = 40
padded_example = pad_sequences(encoded_example, maxlen=max_length, padding='post')

In [None]:
input = {
  'instances': padded_example
}
result = predictor.predict(input)

In [112]:
### Serving model from S3
from sagemaker.tensorflow.serving import Model

model = Model(model_data='s3://sagemaker-us-west-2-349934754982/news/models/tf-scriptmode-mnist-2019-01-24-21-07-37-051/output/model.tar.gz', role=role)

predictor = model.deploy(initial_instance_count=1, instance_type='ml.c5.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-tensorflow-serving-2019-01-24-21-29-25-836
INFO:sagemaker:Creating endpoint with name sagemaker-tensorflow-serving-2019-01-24-21-29-25-836


--------------------------------------------------------------------------------------------------*

ValueError: Error hosting endpoint sagemaker-tensorflow-serving-2019-01-24-21-29-25-836: Failed Reason:  The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.