## Inspect and processing data manually

In [39]:
%%sh
pip -q install gensim nltk

In [40]:
import pandas as pd

In [41]:
num_lines = 100000

data = pd.read_csv('abcnews-date-text.csv.gz', compression='gzip',
                      error_bad_lines=False, dtype='str', nrows=num_lines)

data = data.sample(frac=1)

In [42]:
data.head()

Unnamed: 0,publish_date,headline_text
85287,20040416,thousands flee as suva floods
81984,20040331,mitsubishi mulls australian pullout report
11164,20030413,vic govt confronts youth alcohol abuse
77036,20040308,boy dies in wudinna farm tragedy
13852,20030426,polls open for new maryborough mp


In [43]:
data = data.drop(['publish_date'], axis=1)

In [37]:
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
    
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

def process_text(text):
    for p in string.punctuation:
        text = text.replace(p, '')
    text = ''.join([c for c in text if not c.isdigit()])
    text = text.lower().split()
    text = [w for w in text if not w in stop_words] 
    text = [wnl.lemmatize(w) for w in text]
    return text

[nltk_data] Downloading package stopwords to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
%%time
data['headline_text'] = data['headline_text'].apply(process_text)

CPU times: user 4.15 s, sys: 22.4 ms, total: 4.17 s
Wall time: 4.27 s


In [45]:
data.head()

Unnamed: 0,headline_text
85287,"[thousand, flee, suva, flood]"
81984,"[mitsubishi, mull, australian, pullout, report]"
11164,"[vic, govt, confronts, youth, alcohol, abuse]"
77036,"[boy, dy, wudinna, farm, tragedy]"
13852,"[poll, open, new, maryborough, mp]"


In [46]:
%%time

from gensim import corpora
dictionary = corpora.Dictionary(data['headline_text'])



CPU times: user 1.15 s, sys: 17.2 ms, total: 1.17 s
Wall time: 1.66 s


In [47]:
print(dictionary)

Dictionary(22976 unique tokens: ['flee', 'flood', 'suva', 'thousand', 'australian']...)


In [48]:
dictionary.filter_extremes(keep_n=512)
print(dictionary)

Dictionary(512 unique tokens: ['flood', 'thousand', 'australian', 'report', 'abuse']...)


In [None]:
with open('vocab.txt', 'w') as f:
    for index in range(0,len(dictionary)):
        f.write(dictionary.get(index)+'\n')

In [35]:
# To read dictionary from vocab.txt

#--------------------OF no use------------------------

vocab_file = open("vocab.txt", "r")
dictionary =vocab_file.read()
dictionary = dictionary.split("\n")
vocab_file.close()
#dictionary

In [None]:
%%time

data['tokens'] = data.apply(lambda row: dictionary.doc2bow(row['headline_text']), axis=1)

In [None]:
data = data.drop(['headline_text'], axis=1)
data.head()

In [3]:
import io, boto3
import sagemaker
import sagemaker.amazon.common as smac
from scipy.sparse import lil_matrix

print(sagemaker.__version__)

session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'headlines-lda-ntm'

2.0.0rc1


In [None]:
def build_protobuf_dataset(data, dictionary):
    num_lines = data.shape[0]
    num_columns = len(dictionary)
    token_matrix = lil_matrix((num_lines, num_columns)).astype('float32')
    line = 0
    for _, row in data.iterrows():
        for token_id, token_count in row['tokens']:
            token_matrix[line, token_id] = token_count
        line+=1
        
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, token_matrix, None)
    return buf

In [None]:
def upload_protbuf_dataset(buf, bucket, prefix, key):
    obj = '{}/{}'.format(prefix, key)
    buf.seek(0)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(training_buf)
    path = 's3://{}/{}'.format(bucket,obj)
    return path

In [16]:
s3_training_path='s3://sagemaker-us-east-1-886035371869/sagemaker-scikit-learn-2021-03-27-17-18-09-298/output/train_data/training.protobuf'

In [None]:
%%time
training_buf = build_protobuf_dataset(data, dictionary)
s3_training_path = upload_protbuf_dataset(training_buf, bucket, prefix, 'training/training.protobuf')
print(s3_training_path)

In [None]:
s3_auxiliary_path = session.upload_data(path='vocab.txt', key_prefix=prefix + '/input/auxiliary')
print(s3_auxiliary_path)

## Training

In [6]:
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_output)

s3://sagemaker-us-east-1-886035371869/headlines-lda-ntm/output/


In [7]:
import boto3
from sagemaker import image_uris

region = boto3.Session().region_name    
container = image_uris.retrieve('lda', region)
print(container)

766337827248.dkr.ecr.us-east-1.amazonaws.com/lda:1


In [17]:
role = sagemaker.get_execution_role()

lda = sagemaker.estimator.Estimator(container,
                                   role,
                                   instance_count=1, 
                                   instance_type='ml.c5.2xlarge',
                                   output_path=s3_output)

In [18]:
lda.set_hyperparameters(num_topics=10, 
                        #feature_dim=len(dictionary), 
                        feature_dim=512,
                        #mini_batch_size=num_lines,
                        mini_batch_size=97320,
                        alpha0=0.1)

In [19]:
lda.fit(inputs={'train': s3_training_path},wait=False)

In [22]:
sm_client=boto3.client('sagemaker')
response = sm_client.describe_training_job(
    TrainingJobName='lda-2021-03-27-17-52-08-679'
)
print(response)

{'TrainingJobName': 'lda-2021-03-27-17-52-08-679', 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:886035371869:training-job/lda-2021-03-27-17-52-08-679', 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-us-east-1-886035371869/headlines-lda-ntm/output/lda-2021-03-27-17-52-08-679/output/model.tar.gz'}, 'TrainingJobStatus': 'Completed', 'SecondaryStatus': 'Completed', 'HyperParameters': {'alpha0': '0.1', 'feature_dim': '512', 'mini_batch_size': '97320', 'num_topics': '10'}, 'AlgorithmSpecification': {'TrainingImage': '766337827248.dkr.ecr.us-east-1.amazonaws.com/lda:1', 'TrainingInputMode': 'File', 'MetricDefinitions': [{'Name': 'train:progress', 'Regex': '#progress_metric: host=\\S+, completed (\\S+) %'}, {'Name': 'test:pwll', 'Regex': '#quality_metric: host=\\S+, test pwll <score>=(\\S+)'}, {'Name': 'train:throughput', 'Regex': '#throughput_metric: host=\\S+, train throughput=(\\S+) records/second'}], 'EnableSageMakerMetricsTimeSeries': False}, 'RoleArn': 'arn:aws:iam::88603537186

In [23]:
lda_predictor = lda.deploy(initial_instance_count=1, instance_type='ml.t2.large')

-----------------!

In [24]:
import numpy as np

def process_samples(samples, dictionary):
    num_lines = len(samples)
    num_columns = len(dictionary)
    sample_matrix = np.zeros((num_lines, num_columns)).astype('float32')
    for line in range(0, num_lines):
        s = samples[line]
        s = process_text(s)
        s = dictionary.doc2bow(s)
        for token_id, token_count in s:
            sample_matrix[line, token_id] = token_count
        line+=1
    return sample_matrix

In [27]:
# Run this cell to try your own samples

samples = [
    "Major tariffs expected to end Australian barley trade to China",
    "US woman wanted over fatal crash asks for release after coronavirus halts extradition",
    "Fifty trains out of service as fault forces Adelaide passengers to 'pack like sardines",
    "Germany's Bundesliga plans its return from lockdown as football world watches",
    "All AFL players to face COVID-19 testing before training resumes"
]

In [52]:
# Run this cell to load 5 random samples from the dataset
import numpy as np

data = pd.read_csv('abcnews-date-text.csv.gz', compression='gzip',
                      error_bad_lines=False, dtype='str')
samples = data.sample(frac=1)[:5]
samples = np.array(samples.headline_text)
print(samples)

['in praise of the make under'
 'tas considers incentives to cut live exports'
 'locals compare damage caused by cyclone larry to'
 'grieving father confronts health minister at grafton'
 'wild dog detection']


In [53]:
lda_predictor.serializer = sagemaker.serializers.CSVSerializer()
response = lda_predictor.predict(process_samples(samples, dictionary))
print(response)

b'{"predictions": [{"topic_mixture": [0.5590918064117432, 0.0, 0.4409082531929016, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}, {"topic_mixture": [0.6972892880439758, 0.3027106821537018, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}, {"topic_mixture": [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}, {"topic_mixture": [0.6785109043121338, 0.0, 0.0, 0.0, 0.3214890658855438, 0.0, 0.0, 0.0, 0.0, 0.0]}, {"topic_mixture": [0.767515242099762, 0.23248475790023804, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}]}'


In [54]:
import json

response = json.loads(response)
vectors = [r['topic_mixture'] for r in response['predictions']]

In [55]:
for v in vectors:
    top_topic = np.argmax(v)
    print("topic %s, %2.2f" % (top_topic, v[top_topic]))

topic 0, 0.56
topic 0, 0.70
topic 0, 1.00
topic 0, 0.68
topic 0, 0.77


In [None]:
lda_predictor.delete_endpoint()