# Read in Data

- The AG News dataset consists of 120,000 news articles, each with a title, description & label.

In [1]:
import logging
# Send log records (DEBUG and above) from the root logger to the notebook output.
# The per-epoch "INFO:root:Epoch[...]" lines shown in the training cell arrive through this.
logging.basicConfig(level=logging.DEBUG)

In [2]:
import pandas as pd

# Read the train and test splits of AG News (paths are relative to the notebook).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/ag_news/train.csv', '../data/ag_news/test.csv')
)

# Show a small random sample of rows as a sanity check.
train_df.sample(frac=0.00005)

Unnamed: 0,title,description,class
22586,"TiVo, ReplayTV agree to limits (SiliconValley....",SiliconValley.com - The makers of TiVo and Rep...,Sci/Tech
118113,Panthers Punter Sauerbrun Arrested for DWI (Re...,Reuters - Carolina Panthers punter\Todd Sauerb...,Sports
20155,European Envoy Visiting Turkey to Assess Situa...,With Turkey #39;s hopes of joining the Europea...,World
67787,Martha Stewart #39;s Lawyers File Appeal,NEW YORK -- Lawyers for Martha Stewart have to...,Business
92528,Flight attendants union chief wants strike,PITTSBURGH -- The president of the United Stat...,Business
117355,ICANN making available new domain names,The Internet Corporation for Assigned Names an...,Sci/Tech


# Preprocess the data for MXNet

The following `TokenPreprocessor` class can:
- Tokenize text using regular expressions (the `spacy_model` argument is accepted but currently unused)
- Pad/slice tokenized text to a prespecified length
- Convert each token/label to a unique integer

In [3]:
from collections import Counter
import itertools
import regex as re


class TokenPreprocessor:
    """
    Turns raw text and labels into fixed-length lists of integer ids.

    Workflow: call `fit` once on the training utterances/labels to build the
    token and label vocabularies, then call `transform_utterance` /
    `transform_label` on any text or label.

    Attributes created by `fit`:
        token_to_index / index_to_token   -- dict and list for tokens
        intent_to_index / index_to_intent -- dict and list for labels
    """

    # Ordered (pattern, replacement) rewrite rules used by split_utterance.
    # Order matters: the character filter runs first, whitespace collapsing last.
    # Replacements are plain text; a backslash in a replacement string would be
    # kept literally and end up inside the token (e.g. "\(" instead of "(").
    _SPLIT_RULES = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )

    def __init__(self, spacy_model, unseen_token=-1, pad_char='<padded>',max_tokens=20, unseen_label=-1):
        """
        :param spacy_model: accepted for interface compatibility; not used by this class
        :param unseen_token: id returned for tokens that were not seen during `fit`
        :param pad_char: token appended to utterances shorter than `max_tokens`
        :param max_tokens: fixed length every tokenized utterance is padded/sliced to
        :param unseen_label: id returned for labels that were not seen during `fit`
        """
        # NOTE(review): the default unseen_token of -1 is not a valid row index for an
        # embedding table -- confirm how the downstream model treats it.
        self.unseen_token=unseen_token
        self.pad_char = pad_char
        self.max_tokens = max_tokens
        self.unseen_label = unseen_label

    @staticmethod
    def split_utterance(string):
        """
        Normalise and tokenize a raw string.

        Characters outside letters, digits and ( ) , ! ? ' ` become spaces, common
        English clitics ('s, 've, n't, 're, 'd, 'll) and the punctuation marks
        , ! ( ) ? are split into their own tokens, and the result is lower-cased.

        :param string: raw text
        :return: list of string tokens (empty list if nothing survives the filter)
        """
        for pattern, replacement in TokenPreprocessor._SPLIT_RULES:
            string = re.sub(pattern, replacement, string)
        # split() without an argument returns [] for an empty string, whereas
        # split(' ') would return [''] and add a bogus empty token to the vocabulary.
        return string.strip().lower().split()

    def pad_utterance(self, tokenized_utterance):
        """
        Force a token list to exactly `self.max_tokens` entries.

        :param tokenized_utterance: list of string tokens
        :return: new list, sliced to `max_tokens` or right-padded with `self.pad_char`
        """
        kept = tokenized_utterance[:self.max_tokens]
        return kept + [self.pad_char] * (self.max_tokens - len(kept))

    def build_vocab(self, data, depth=1, max_vocab_size=None):
        """
        Map each distinct item to an integer id, most frequent item first.

        :param data: list of items (depth=1) or list of lists of items (depth>1)
        :param depth: nesting depth of `data`; anything above 1 is flattened one level
        :param max_vocab_size: keep only this many most frequent items (None keeps all)
        :return: (dict item -> index, list index -> item)
        """
        if depth > 1:
            data = list(itertools.chain.from_iterable(data))  # flatten to 1D
        data_counts = Counter(data)

        vocabulary_inv = [item for item, _ in data_counts.most_common(max_vocab_size)]
        vocabulary = {item: index for index, item in enumerate(vocabulary_inv)}
        return vocabulary, vocabulary_inv

    def fit(self, utterances, labels):
        """
        Build the token and label vocabularies.

        The token vocabulary is built from the padded utterances, so the padding
        token receives an id like any other token.

        :param utterances: list of raw utterance strings
        :param labels: list of raw labels
        """
        padded_utterances = [
            self.pad_utterance(self.split_utterance(utterance)) for utterance in utterances
        ]
        self.token_to_index, self.index_to_token = self.build_vocab(padded_utterances, depth=2)
        self.intent_to_index, self.index_to_intent = self.build_vocab(labels, depth=1)

    def transform_utterance(self, utterance):
        """
        :param utterance: raw utterance string
        :return: list of `max_tokens` token ids; unknown tokens map to `self.unseen_token`
        """
        padded_tokens = self.pad_utterance(self.split_utterance(utterance))
        return [self.token_to_index.get(token, self.unseen_token) for token in padded_tokens]

    def transform_label(self, label):
        """
        :param label: raw label
        :return: label id, or `self.unseen_label` if the label was not seen during `fit`
        """
        return self.intent_to_index.get(label, self.unseen_label)

We fit the preprocessor to the training set. This builds index mappings for the tokens & labels (shown below).



In [4]:
# The constructor requires a spacy_model argument, although the class never uses it.
preprocessor = TokenPreprocessor(spacy_model='en_core_web_sm')

# Build the token and label vocabularies from the training split only.
preprocessor.fit(
    utterances=train_df['description'].tolist(),
    labels=train_df['class'].tolist(),
)

In [5]:
# Show the full label mapping and the first ten entries of the token mapping.
print("Label to index mappings:\t{}\n\nFirst 10 token to index mappings:\n\n{}".format(
    preprocessor.intent_to_index,
    dict(list(preprocessor.token_to_index.items())[:10]),
))

Label to index mappings:	{'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}

First 10 token to index mappings:

{'the': 0, ',': 1, 'a': 2, 'to': 3, 'of': 4, 'in': 5, 'and': 6, '<padded>': 7, 'on': 8, 'for': 9}


Now we preprocess the train and test sets.

In [6]:
# Token ids (X) and label ids (Y) for the training split.
train_df['X'] = train_df['description'].apply(preprocessor.transform_utterance)
train_df['Y'] = train_df['class'].apply(preprocessor.transform_label)

# Same transformation for the test split, reusing the vocabularies fitted on train.
test_df['X'] = test_df['description'].apply(preprocessor.transform_utterance)
test_df['Y'] = test_df['class'].apply(preprocessor.transform_label)

train_df.sample(frac=0.00005)

Unnamed: 0,title,description,class,X,Y
40227,Resistant Israeli Settlers May Get Prison,"JERUSALEM Sept. 26, 2004 - Armed settlers who ...",World,"[882, 588, 1049, 1, 102, 1318, 3976, 64, 11344...",3
101880,Free credit reports to bear ads,Ordered by Congress to give consumers free acc...,Business,"[1161, 26, 1034, 3, 568, 890, 388, 737, 3, 40,...",0
108476,Reports: Blasts Rock Madrid Gas Stations,"MADRID, Spain - Spanish media are reporting th...",World,"[683, 1, 701, 1177, 481, 35, 2025, 14, 24, 215...",3
93711,Woman #39;blessed by the holy toast #39;,A half-eaten slice of cheese on toast purporte...,Sci/Tech,"[2, 312, 12300, 8754, 4, 5778, 8, 18813, 6163,...",1
36018,Computer Associates to Pay \$225 Million to Av...,The software giant agreed to pay \$225 million...,Business,"[0, 96, 172, 168, 3, 357, 6142, 94, 3, 1794, 1...",0
38080,Rival Technologies Vie for 'Green' Car of Tomo...,Reuters - Carmakers presented new-age automobi...,Sci/Tech,"[15, 9656, 4344, 19, 2175, 14791, 24, 0, 493, ...",1


Finally, let's use the preprocessor to transform some new text. Notice the padding to ensure constant input length & the handling of unknown words (mapped to `-1`).

In [7]:
# Each pair is (output template, raw text to transform).
for demo_template, demo_text in (
    ("The news looks bad today. ==> {}", "The news looks bad today."),
    ("MXNet is awesome. No really... ==>{}", "MXNet is awesome. No really..."),
):
    print(demo_template.format(preprocessor.transform_utterance(demo_text)))

The news looks bad today. ==> [0, 134, 1176, 1378, 78, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
MXNet is awesome. No really... ==>[-1, 17, 18311, 100, 1983, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]


In [8]:
# Vocabulary size, counting the padding token; this is the value later passed
# to sym_gen as vocab_size.
print(len(preprocessor.token_to_index))

49705


# Build Data Iterators

Next we convert the preprocessed data & labels to numpy arrays and build MXNet data iterators to feed batches to the model.

In [9]:
import mxnet as mx
import numpy as np

# Stack the per-row id lists into dense arrays: X holds token ids, Y holds label ids.
X_train, Y_train = (np.array(train_df[column].tolist()) for column in ('X', 'Y'))
X_test, Y_test = (np.array(test_df[column].tolist()) for column in ('X', 'Y'))

batch_n = 120

# Only the training iterator shuffles; the test iterator keeps the file order.
train_iter = mx.io.NDArrayIter(
    data=X_train,
    label=Y_train,
    batch_size=batch_n,
    shuffle=True,
)
test_iter = mx.io.NDArrayIter(
    data=X_test,
    label=Y_test,
    batch_size=batch_n,
)

Let's take a look at the first batch:

In [10]:
# Print only the first batch, then rewind the iterator so that training
# starts from the beginning of an epoch.
for i, batch in enumerate(train_iter):
    print("\nBatch {}\nX:\n{}\n Y:\n{}".format(i, batch.data, batch.label))
    break  # stop here instead of iterating over every remaining batch of the epoch
train_iter.reset()


Batch 0
X:
[
[[4.2100e+02 6.4800e+02 1.7200e+02 ... 1.4440e+04 3.4000e+02 1.7000e+01]
 [3.8388e+04 1.9190e+03 4.4000e+01 ... 1.0000e+00 2.3300e+02 8.0000e+00]
 [3.0020e+03 1.0000e+01 1.5000e+01 ... 4.5360e+03 3.0000e+00 4.5190e+03]
 ...
 [3.9550e+03 6.0000e+00 9.4110e+03 ... 1.6400e+02 3.0000e+00 7.6000e+01]
 [2.8000e+01 2.9100e+03 2.4470e+03 ... 7.0000e+00 7.0000e+00 7.0000e+00]
 [1.4900e+02 4.8700e+02 3.9940e+03 ... 6.5000e+01 9.5270e+03 5.0000e+00]]
<NDArray 120x20 @cpu(0)>]
 Y:
[
[1. 1. 1. 0. 2. 0. 0. 3. 3. 2. 1. 3. 3. 3. 2. 2. 3. 1. 3. 3. 1. 2. 0. 1.
 0. 3. 3. 1. 0. 1. 1. 2. 1. 1. 0. 1. 0. 0. 2. 2. 0. 3. 2. 3. 2. 1. 3. 3.
 1. 1. 3. 0. 3. 0. 0. 1. 1. 3. 3. 3. 2. 3. 1. 2. 1. 0. 1. 1. 1. 2. 3. 3.
 2. 2. 3. 3. 3. 3. 0. 2. 1. 2. 1. 3. 3. 1. 3. 1. 2. 1. 1. 3. 1. 0. 1. 2.
 0. 3. 0. 0. 3. 3. 1. 1. 2. 3. 1. 0. 1. 1. 2. 3. 0. 0. 0. 0. 1. 0. 2. 2.]
<NDArray 120 @cpu(0)>]


# Build the model symbol


In [11]:
def sym_gen(sentence_size, num_embed, vocab_size, num_label, filter_list, num_filter, dropout):
    """
    Assemble the symbolic graph of a text CNN: embedding, parallel
    convolution + max-pooling branches (one per filter height), dropout,
    and a softmax classifier.

    :param sentence_size: number of tokens per utterance
    :param num_embed: embedding size for each token
    :param vocab_size: number of unique tokens in the training set
    :param num_label: number of output classes
    :param filter_list: list of filter heights
    :param num_filter: number of each filter height
    :param dropout: value passed to the Dropout layer as `p`
    :return: network symbol
    """
    tokens = mx.sym.Variable('data')
    targets = mx.sym.Variable('softmax_label')

    embedded = mx.sym.Embedding(data=tokens, input_dim=vocab_size, output_dim=num_embed)

    # Add a singleton channel axis: (batches, channels, height, width)
    as_image = mx.sym.reshape(data=embedded, shape=(0, 1, sentence_size, num_embed))

    def conv_pool_branch(height):
        # The kernel spans the full embedding width, so it only slides along the token axis.
        conv = mx.sym.Convolution(data=as_image, kernel=(height, num_embed), num_filter=num_filter)
        activated = mx.sym.Activation(data=conv, act_type='relu')
        # Max-pool with a window of sentence_size - height + 1 rows.
        return mx.sym.Pooling(data=activated, pool_type='max',
                              kernel=(sentence_size - height + 1, 1), stride=(1, 1))

    # One branch per filter height, joined along the channel axis.
    branches = [conv_pool_branch(height) for height in filter_list]
    merged = mx.sym.Concat(*branches, dim=1)

    # Flatten to (batches, num filters)
    flattened = mx.sym.reshape(data=merged, shape=(0, -1))

    dropped = mx.sym.Dropout(data=flattened, p=dropout)
    scores = mx.sym.FullyConnected(data=dropped, num_hidden=num_label)

    return mx.sym.SoftmaxOutput(data=scores, label=targets, name='softmax')

In [12]:
# Instantiate the network symbol.
# - sentence_size / vocab_size / num_label are taken from the fitted preprocessor so the
#   graph matches the preprocessed data.
# - filter_list=[3, 4, 5] with num_filter=100 gives three convolution branches of 100 filters each.
# - dropout is passed straight through to the Dropout layer's `p`.
symbol = sym_gen(sentence_size=preprocessor.max_tokens, 
                 num_embed=16, 
                 vocab_size=len(preprocessor.token_to_index), 
                 num_label=len(preprocessor.intent_to_index),
                 filter_list=[3, 4, 5], 
                 num_filter=100, 
                 dropout=0.85)

# Train the model

- State of the art test accuracy ~ 92%
- Within 3 epochs, training on a CPU and keeping only the first 20 tokens of each description, we come within ~3% of that (~89% test accuracy).

In [13]:
# Wrap the symbol in a trainable module.
module = mx.mod.Module(symbol)

# Train for 3 epochs with Adam. The test iterator is passed as eval_data, so the
# "Validation-accuracy" lines in the log below are measured on the test set.
module.fit(train_data=train_iter,
           eval_data=test_iter,
           eval_metric=mx.metric.Accuracy(),
           optimizer='Adam',
           optimizer_params={'learning_rate': 0.001},
           initializer=mx.initializer.Uniform(0.1),
           num_epoch=3)

INFO:root:Epoch[0] Train-accuracy=0.731250
INFO:root:Epoch[0] Time cost=23.463
INFO:root:Epoch[0] Validation-accuracy=0.880599
INFO:root:Epoch[1] Train-accuracy=0.887333
INFO:root:Epoch[1] Time cost=32.151
INFO:root:Epoch[1] Validation-accuracy=0.891276
INFO:root:Epoch[2] Train-accuracy=0.908992
INFO:root:Epoch[2] Time cost=32.255
INFO:root:Epoch[2] Validation-accuracy=0.891276


# Example Predictions

In [14]:
def predict(utterance, preprocessor, module):
    """
    Score a single raw utterance against every known class.

    :param utterance: raw string for prediction
    :param preprocessor: fit preprocessor
    :param module: trained mxnet module
    :return: list of (class name, predicted probability) tuples
    """
    token_ids = preprocessor.transform_utterance(utterance)
    single_row = np.array([token_ids])

    # The iterator is built with a placeholder label of 0 and a batch size of one.
    single_iter = mx.io.NDArrayIter(data=single_row, label=np.array([0]), batch_size=1)

    # Take the first (and only) row of the prediction output.
    probabilities = module.predict(single_iter).asnumpy().tolist()[0]

    class_names = preprocessor.index_to_intent
    return [(class_names[position], probability)
            for position, probability in enumerate(probabilities)]

In [15]:
import random

# Draw a row position uniformly from [0, number of rows).
# random.randint(1, n) is inclusive at both ends: it never picks row 0 and can return n,
# which is out of bounds for iloc.
idx = random.randrange(test_df.shape[0])
utterance = test_df.iloc[idx].description
label = test_df.iloc[idx]['class']

print("Text:\n\n{}\n\nLabel:\n\n{}".format(utterance, label))

Text:

Reuters - With oil prices close to  #36;50 a\barrel, the Bush administration is set to allow oil refineries\to borrow crude from the government's emergency petroleum\stockpile to make up for supplies disrupted by Hurricane Ivan,\a congressional source briefed on the pending decision told\Reuters on Thursday.

Label:

Business


In [16]:
# Probability for every class, in the preprocessor's label order.
class_preds = predict(utterance=utterance, preprocessor=preprocessor, module=module)
class_preds

[('Business', 0.952357292175293),
 ('Sci/Tech', 0.00678549287840724),
 ('Sports', 4.061499930685386e-05),
 ('World', 0.04081658646464348)]

Usually we want the highest confidence prediction:

In [17]:
from operator import itemgetter

# Pick the (class, probability) pair with the largest probability.
max(class_preds, key=lambda class_and_prob: class_and_prob[1])

('Business', 0.952357292175293)