## Inspecting and processing data manually

In [1]:
%%sh
# Upgrade pip itself, then install the two text-processing libraries used
# below: gensim (dictionary / bag-of-words) and nltk (stop words, lemmatiser).
# NOTE(review): no versions are pinned, so a later re-run may pick up
# different releases than the ones that produced the outputs shown here.
pip -q install --upgrade pip
pip -q install gensim nltk

In [2]:
import pandas as pd

In [3]:
# Upper bound on the number of CSV rows read; lower it for a quicker run.
num_lines = 1000000

# Load the gzip-compressed headlines with every column kept as text.
# Malformed rows are skipped instead of aborting the load. pandas renamed the
# switch for this: `on_bad_lines='skip'` replaces `error_bad_lines=False`,
# which current releases no longer accept. Try the current spelling first and
# fall back for older installs, which reject the new keyword with a TypeError.
try:
    data = pd.read_csv('abcnews-date-text.csv.gz', compression='gzip',
                       on_bad_lines='skip', dtype='str', nrows=num_lines)
except TypeError:
    data = pd.read_csv('abcnews-date-text.csv.gz', compression='gzip',
                       error_bad_lines=False, dtype='str', nrows=num_lines)

# frac=1 returns every row in random order, i.e. a full shuffle. No seed is
# set, so the row order differs from run to run.
data = data.sample(frac=1)

In [4]:
# Preview a few shuffled rows to check the columns that were loaded.
data.head()

Unnamed: 0,publish_date,headline_text
351196,20071217,us rejects turkish air strike approval claims
958513,20150616,berg tpp not the bogey treaty that we think it is
394335,20080702,irrigators get water allocations cut
603197,20110325,four guilty over thurston uncles bashing death
128865,20041121,fear prevents sudanese returning to darfur


In [5]:
# Only the headline text is used from here on; discard the date column.
data = data.drop(columns=['publish_date'])

In [6]:
import string
import nltk
# Fetch the two NLTK corpora needed below: 'wordnet' backs WordNetLemmatizer
# and 'stopwords' backs stopwords.words('english').
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
    
# English stop words, held in a set. process_text tests every token of every
# headline for membership; a set answers that in constant time, whereas the
# plain list returned by stopwords.words() is scanned linearly on each test.
stop_words = set(stopwords.words('english'))
# One lemmatiser instance, shared by every call to process_text.
wnl = WordNetLemmatizer()

def process_text(text):
    """Turn a raw headline string into a list of cleaned, lemmatised tokens.

    Steps, in order: delete every character listed in ``string.punctuation``,
    delete every digit character, lower-case, split on whitespace, drop tokens
    found in the module-level ``stop_words``, and lemmatise what remains with
    the module-level ``wnl``.

    Args:
        text: the headline as a ``str``.

    Returns:
        A list of token strings (possibly empty).
    """
    # A single translate() pass removes all punctuation characters at once.
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    without_digits = ''.join(ch for ch in without_punctuation if not ch.isdigit())
    tokens = without_digits.lower().split()
    # Stop words are filtered before lemmatisation, so the check is made on
    # the surface form of each token.
    return [wnl.lemmatize(token) for token in tokens if token not in stop_words]

[nltk_data] Downloading package wordnet to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
%%time
# Replace each raw headline string with the token list from process_text.
# The column is overwritten in place, so this cell is meant to run once per
# load: on a second run process_text would receive lists instead of strings.
data['headline_text'] = data['headline_text'].apply(process_text)

CPU times: user 41.2 s, sys: 442 ms, total: 41.7 s
Wall time: 41.9 s


In [8]:
# Same rows as before, now holding token lists instead of raw strings.
data.head()

Unnamed: 0,headline_text
351196,"[u, reject, turkish, air, strike, approval, cl..."
958513,"[berg, tpp, bogey, treaty, think]"
394335,"[irrigators, get, water, allocation, cut]"
603197,"[four, guilty, thurston, uncle, bashing, death]"
128865,"[fear, prevents, sudanese, returning, darfur]"


In [9]:
%%time

from gensim import corpora
# Build a gensim Dictionary over all tokenised headlines. It is used below to
# turn token lists into (token_id, count) pairs via doc2bow.
dictionary = corpora.Dictionary(data['headline_text'])



CPU times: user 11.5 s, sys: 35.3 ms, total: 11.6 s
Wall time: 12.9 s


In [10]:
# Show the vocabulary size before any filtering.
print(dictionary)

Dictionary(74759 unique tokens: ['air', 'approval', 'claim', 'reject', 'strike']...)


In [11]:
# Shrink the vocabulary in place to at most 512 tokens. Its final length is
# what the training cell passes to the algorithm as feature_dim.
# NOTE(review): no_above=0.5 presumably discards tokens that occur in more
# than half of the headlines, and filter_extremes' remaining defaults still
# apply - confirm against the gensim documentation.
dictionary.filter_extremes(keep_n=512, no_above=0.5)
print(dictionary)

Dictionary(512 unique tokens: ['air', 'claim', 'reject', 'strike', 'u']...)


In [12]:
# Write the vocabulary file for the auxiliary channel: one token per line,
# with line i holding the token whose id is i, so line numbers line up with
# the column indices of the training matrix.
# The encoding is fixed to UTF-8 because open() otherwise falls back to the
# platform's locale encoding, which would make the file's bytes (and whether
# a non-ASCII token can be written at all) depend on where the notebook runs.
with open('vocab.txt', 'w', encoding='utf-8') as f:
    for index in range(len(dictionary)):
        f.write(dictionary.get(index) + '\n')

In [13]:
%%time

# Convert each token list to bag-of-words form: a list of (token_id, count)
# pairs. Applying doc2bow directly to the one column it needs avoids building
# a row object for every headline, which the previous row-wise
# DataFrame.apply(axis=1) lambda did; the resulting column is the same.
data['tokens'] = data['headline_text'].apply(dictionary.doc2bow)

CPU times: user 12.2 s, sys: 204 ms, total: 12.4 s
Wall time: 12.4 s


In [14]:
# The token lists have served their purpose once the bag-of-words column
# exists; keep only 'tokens' and preview the result.
data = data.drop(columns=['headline_text'])
data.head()

Unnamed: 0,tokens
351196,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]"
958513,[]
394335,"[(5, 1), (6, 1), (7, 1)]"
603197,"[(8, 1), (9, 1), (10, 1)]"
128865,"[(11, 1)]"


In [15]:
import io, boto3
import sagemaker
import sagemaker.amazon.common as smac
from scipy.sparse import lil_matrix

# Record the SDK version alongside the outputs.
print(sagemaker.__version__)

# Session and default bucket used for every upload below; `prefix` is the key
# prefix under which this notebook's S3 objects are grouped.
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'headlines-lda-ntm-NTM'

2.0.0rc1


In [16]:
def build_protobuf_dataset(data, dictionary):
    """Serialise the bag-of-words column into an in-memory protobuf buffer.

    Args:
        data: DataFrame with a 'tokens' column whose cells are lists of
            (token_id, count) pairs.
        dictionary: the vocabulary; only its length is used, to set the
            number of matrix columns.

    Returns:
        An ``io.BytesIO`` holding the output of
        ``smac.write_spmatrix_to_sparse_tensor`` for a float32 sparse matrix
        with one row per DataFrame row and one column per vocabulary entry.
        No labels are written.
    """
    shape = (data.shape[0], len(dictionary))
    token_matrix = lil_matrix(shape).astype('float32')

    # Rows are filled in DataFrame order; the position, not the index label,
    # selects the matrix row.
    for row_number, bag in enumerate(data['tokens']):
        for token_id, token_count in bag:
            token_matrix[row_number, token_id] = token_count

    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, token_matrix, None)
    return buf

In [17]:
def upload_protbuf_dataset(buf, bucket, prefix, key):
    """Upload an in-memory buffer to S3 and return its ``s3://`` URI.

    Args:
        buf: seekable binary file-like object holding the data to upload.
        bucket: name of the destination S3 bucket.
        prefix: key prefix; joined to ``key`` with a '/'.
        key: remainder of the object key below ``prefix``.

    Returns:
        The string ``'s3://<bucket>/<prefix>/<key>'``.
    """
    obj = '{}/{}'.format(prefix, key)
    # Rewind so the upload starts at the first byte, not where writing stopped.
    buf.seek(0)
    # Upload the buffer that was passed in. This line used to read the
    # notebook-level variable `training_buf`, so the function ignored its own
    # argument and would upload the wrong data (or raise NameError) when
    # called with any other buffer.
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    path = 's3://{}/{}'.format(bucket,obj)
    return path

In [18]:
%%time
# Serialise the whole dataset and upload it under <prefix>/training/. The
# returned S3 path is what the 'train' channel is pointed at during fit().
training_buf = build_protobuf_dataset(data, dictionary)
s3_training_path = upload_protbuf_dataset(training_buf, bucket, prefix, 'training/training.protobuf')
print(s3_training_path)

s3://sagemaker-us-east-1-886035371869/headlines-lda-ntm-NTM/training/training.protobuf
CPU times: user 2min 31s, sys: 2.06 s, total: 2min 33s
Wall time: 2min 32s


In [19]:
# Upload the vocabulary file written earlier; its S3 path feeds the
# 'auxiliary' channel during fit().
s3_auxiliary_path = session.upload_data(path='vocab.txt', key_prefix=prefix + '/input/auxiliary')
print(s3_auxiliary_path)

s3://sagemaker-us-east-1-886035371869/headlines-lda-ntm-NTM/input/auxiliary/vocab.txt


## Training

In [20]:
# S3 location under this notebook's prefix, named for training output.
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_output)

s3://sagemaker-us-east-1-886035371869/headlines-lda-ntm-NTM/output/


In [21]:
import boto3
from sagemaker import image_uris

# Look up the image URI of the built-in 'ntm' algorithm for the region the
# current boto3 session is configured with.
region = boto3.Session().region_name    
container = image_uris.retrieve('ntm', region)
print(container)

382416733822.dkr.ecr.us-east-1.amazonaws.com/ntm:1


In [22]:
# IAM role the training job runs under.
role = sagemaker.get_execution_role()

# Estimator for the NTM container: a single ml.p3.2xlarge training instance.
# NOTE(review): s3_output is computed in an earlier cell but is not passed
# here (there is no output_path argument), so it currently has no effect on
# where artifacts are stored - confirm whether output_path=s3_output was
# intended.
ntm = sagemaker.estimator.Estimator(container,
                                   role, 
                                   instance_count=1, 
                                   instance_type='ml.p3.2xlarge',
                                   sagemaker_session=session)

In [23]:
# num_topics=10 must match the number of labels in the `topics` list used to
# interpret predictions; feature_dim is tied to the filtered vocabulary size.
# optimizer, epochs and num_patience_epochs override the algorithm defaults
# reported in the training log ('adadelta', 50 and 3); mini_batch_size=256
# equals the logged default.
# NOTE(review): num_patience_epochs presumably controls early stopping -
# confirm against the NTM hyperparameter documentation.
ntm.set_hyperparameters(num_topics=10, 
                        feature_dim=len(dictionary),
                        optimizer='adam', 
                        mini_batch_size=256,
                        epochs=100,
                        num_patience_epochs=10)

In [24]:
# Launch the training job with two channels: 'train' is the protobuf dataset
# and 'auxiliary' is the uploaded vocab.txt.
ntm.fit(inputs={'train': s3_training_path,
                'auxiliary': s3_auxiliary_path})

2021-03-27 18:17:37 Starting - Starting the training job...
2021-03-27 18:17:39 Starting - Launching requested ML instances......
2021-03-27 18:19:08 Starting - Preparing the instances for training.........
2021-03-27 18:20:25 Downloading - Downloading input data
2021-03-27 18:20:25 Training - Downloading the training image...
2021-03-27 18:21:07 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
[34m[03/27/2021 18:21:09 INFO 140212526044992] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/default-input.json: {'encoder_layers': 'auto', 'mini_batch_size': '256', 'epochs': '50', 'encoder_layers_activation': 'sigmoid', 'optimizer': 'adadelta', 'tolerance': '0.001', 'num_patience_epochs': '3', 'batch_norm': 'false', 'rescale_gradient': '1.0', 'clip_gradient':

In [27]:
# Hand-assigned, human-readable labels for the 10 learned topics. Position i
# labels topic_weights[i] in the prediction output, so the list length must
# stay equal to num_topics. The labels describe one specific trained model.
topics = ['justice','finance','local','sports','politics?',
          'unknown1','unknown2','crime','disasters', 'international']

In [26]:
# Deploy the trained model to a single-instance endpoint and keep the
# predictor used by the prediction cells below.
ntm_predictor = ntm.deploy(initial_instance_count=1, instance_type='ml.t2.large')

-----------------!

In [28]:
import numpy as np

def process_samples(samples, dictionary):
    """Encode raw headline strings as a dense bag-of-words matrix.

    Each sample is cleaned with ``process_text`` and converted with
    ``dictionary.doc2bow``; the resulting counts are written into that
    sample's row.

    Args:
        samples: sequence of raw headline strings, indexable by position
            ``0 .. len(samples) - 1``.
        dictionary: the vocabulary; must support ``len()`` and ``doc2bow``.

    Returns:
        A float32 ``numpy`` array of shape ``(len(samples), len(dictionary))``
        where entry ``[i, j]`` is the count of token id ``j`` in sample ``i``.
    """
    row_count, vocab_size = len(samples), len(dictionary)
    sample_matrix = np.zeros((row_count, vocab_size)).astype('float32')
    for row in range(row_count):
        bag = dictionary.doc2bow(process_text(samples[row]))
        for token_id, token_count in bag:
            sample_matrix[row, token_id] = token_count
    return sample_matrix

In [29]:
# Run this cell to try your own samples: edit the list with any raw headline
# strings. No cleaning is needed here, because process_samples applies
# process_text to each one before prediction. Running the random-sample cell
# below afterwards replaces this list.

samples = [
    "Major tariffs expected to end Australian barley trade to China",
    "US woman wanted over fatal crash asks for release after coronavirus halts extradition",
    "Fifty trains out of service as fault forces Adelaide passengers to pack like sardines",
    "Germany's Bundesliga plans its return from lockdown as football world watches",
    "RFS volunteer in custody for allegedly lighting fires"
]

In [None]:
# Run this cell to load random samples from the dataset.
# Note that it rebinds `data` to the freshly loaded raw headlines, so the
# preprocessing cells must be re-run before the dataset-building cells can be
# used again.

# Malformed rows are skipped. `on_bad_lines='skip'` is the current pandas
# spelling of `error_bad_lines=False`, which newer releases reject; older
# installs reject the new keyword with a TypeError, hence the fallback.
try:
    data = pd.read_csv('abcnews-date-text.csv.gz', compression='gzip',
                       on_bad_lines='skip', dtype='str', nrows=num_lines)
except TypeError:
    data = pd.read_csv('abcnews-date-text.csv.gz', compression='gzip',
                       error_bad_lines=False, dtype='str', nrows=num_lines)
# Draw 0.1% of the loaded rows and keep only the headline strings, as an
# array that process_samples can index by position.
samples = data.sample(frac=0.001)
samples = np.array(samples.headline_text)
print(samples)

In [30]:
# Send the encoded samples to the endpoint using the SDK's CSVSerializer.
# The reply printed below is a bytes object containing JSON with one
# 'topic_weights' list per sample.
ntm_predictor.serializer = sagemaker.serializers.CSVSerializer()
response = ntm_predictor.predict(process_samples(samples, dictionary))
print(response)

b'{"predictions":[{"topic_weights":[0.0366471261,0.0508532301,0.0554300584,0.0420520604,0.0667201504,0.2473417073,0.1098412201,0.0530197471,0.0677292496,0.2703654766]},{"topic_weights":[0.3334089518,0.0613059364,0.0527985059,0.0520550422,0.0555767864,0.0837655291,0.0492441952,0.1960011274,0.0574527942,0.0583911054]},{"topic_weights":[0.1040436849,0.1128442958,0.0867721066,0.0695143864,0.0790253133,0.081341207,0.0940319598,0.2365107089,0.0710377619,0.0648785681]},{"topic_weights":[0.058230754,0.0997985378,0.0755585134,0.0814882964,0.124075897,0.0769889504,0.2596221566,0.0771221071,0.0699055642,0.0772091597]},{"topic_weights":[0.094705008,0.1300486177,0.0844352543,0.0948471427,0.0922646448,0.0781801119,0.0744621903,0.1861951351,0.087393932,0.0774679929]}]}'


In [31]:
import json

# Decode the endpoint's JSON payload. `response` is rebound from bytes to a
# dict, so this cell is meant to run once per prediction.
response = json.loads(response)

# For each sample, print its three strongest topics as (label, weight) pairs.
for r in response['predictions']:
    weights = r['topic_weights']
    # argsort orders indices by ascending weight; reverse for strongest first.
    sorted_indexes = np.argsort(weights)[::-1].tolist()
    pairs = [(topics[i], weights[i]) for i in sorted_indexes]
    print(pairs[:3])

[('international', 0.2703654766), ('unknown1', 0.2473417073), ('unknown2', 0.1098412201)]
[('justice', 0.3334089518), ('crime', 0.1960011274), ('unknown1', 0.0837655291)]
[('crime', 0.2365107089), ('finance', 0.1128442958), ('justice', 0.1040436849)]
[('unknown2', 0.2596221566), ('politics?', 0.124075897), ('finance', 0.0997985378)]
[('crime', 0.1861951351), ('finance', 0.1300486177), ('sports', 0.0948471427)]


In [32]:
import pprint

# Dump the full decoded response (all topic weights for every sample).
pprint.pprint(response)

{'predictions': [{'topic_weights': [0.0366471261,
                                    0.0508532301,
                                    0.0554300584,
                                    0.0420520604,
                                    0.0667201504,
                                    0.2473417073,
                                    0.1098412201,
                                    0.0530197471,
                                    0.0677292496,
                                    0.2703654766]},
                 {'topic_weights': [0.3334089518,
                                    0.0613059364,
                                    0.0527985059,
                                    0.0520550422,
                                    0.0555767864,
                                    0.0837655291,
                                    0.0492441952,
                                    0.1960011274,
                                    0.0574527942,
                                    0.0583911054

In [None]:
# Remove the endpoint created by ntm.deploy() once predictions are done.
ntm_predictor.delete_endpoint()