# Read in Data

In [117]:
import pandas as pd

# Read the train and test splits of the AG News data from CSV.
train_path, test_path = '../data/ag_news/train.csv', '../data/ag_news/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Preview the first five rows of the test split.
test_df.head(5)

Unnamed: 0,title,description,label
0,Fears for T N pension after talks,Unions representing workers at Turner Newall...,Business
1,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o...",Sci/Tech
2,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...,Sci/Tech
3,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...,Sci/Tech
4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...,Sci/Tech


# Preprocess the data for MXNet

In [102]:
from collections import Counter
import itertools
import spacy
import regex as re


class TokenPreprocessor:
    """
    Turn raw utterances into fixed-length lists of token indices and raw
    labels into label indices.

    Call `fit` once on training data to learn the vocabularies, then use
    `transform_utterance` / `transform_label` on any data.
    """

    def __init__(self, spacy_model, unseen_token=-1, pad_char='<padded>', max_tokens=20, unseen_label=-1):
        """
        :param spacy_model: name of the spaCy model passed to `spacy.load`
        :param unseen_token: index returned for tokens missing from the fitted vocabulary
        :param pad_char: token appended to utterances shorter than `max_tokens`
        :param max_tokens: exact number of tokens every processed utterance will have
        :param unseen_label: index returned for labels missing from the fitted label set
        """
        # NOTE(review): the default of -1 is later fed to an embedding lookup in this
        # notebook; confirm how the framework treats negative indices.
        self.unseen_token = unseen_token
        self.pad_char = pad_char
        self.max_tokens = max_tokens
        self.unseen_label = unseen_label
        self.nlp = spacy.load(spacy_model)

    def split_utterance(self, utterance):
        """
        Tokenise a raw string with the loaded spaCy pipeline.

        :param utterance: string
        :return: list of string
        """
        # Only the token texts are kept; nothing else from the pipeline output is used.
        doc = self.nlp(utterance)
        return [token.text for token in doc]

    def pad_utterance(self, tokenized_utterance):
        """
        Force a token list to exactly `self.max_tokens` entries.

        :param tokenized_utterance: list of string
        :return: new list, truncated to `self.max_tokens` tokens or right-padded
                 with `self.pad_char` up to that length
        """
        clipped = tokenized_utterance[:self.max_tokens]
        return clipped + [self.pad_char] * (self.max_tokens - len(clipped))

    def build_vocab(self, data, depth=1, max_vocab_size=None):
        """
        Build index mappings ordered by descending frequency.

        :param data: list of hashable items, nested `depth` levels deep
        :param depth: nesting depth of `data` (1 = flat list, 2 = list of lists, ...)
        :param max_vocab_size: keep only this many most frequent items (None keeps all)
        :return: (dict mapping item -> index, list mapping index -> item)
        """
        # Flatten one nesting level per step until the data is one-dimensional.
        for _ in range(depth - 1):
            data = itertools.chain.from_iterable(data)
        data_counts = Counter(data)  # Count occurrences of each item

        vocabulary_inv = [x[0] for x in data_counts.most_common(max_vocab_size)]
        vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
        return vocabulary, vocabulary_inv

    def fit(self, utterances, labels):
        """
        Learn the token and label vocabularies from training data.

        :param utterances: list of raw utterances
        :param labels: list of raw labels
        """
        split_utterances = [self.split_utterance(utterance) for utterance in utterances]
        padded_utterances = [self.pad_utterance(utterance) for utterance in split_utterances]
        self.token_to_index, self.index_to_token = self.build_vocab(padded_utterances, depth=2)

        # The pad token only enters the vocabulary above when some training utterance
        # was short enough to be padded. Register it explicitly otherwise, so padding
        # added at transform time never falls back to `unseen_token`.
        if self.pad_char not in self.token_to_index:
            self.token_to_index[self.pad_char] = len(self.index_to_token)
            self.index_to_token.append(self.pad_char)

        self.intent_to_index, self.index_to_intent = self.build_vocab(labels, depth=1)

    def transform_utterance(self, utterance):
        """
        Tokenise, pad/truncate and index a raw utterance. Requires `fit` first.

        :param utterance: raw utterance string
        :return: list of `self.max_tokens` token indices; unknown tokens map to
                 `self.unseen_token`
        """
        split_utterance = self.split_utterance(utterance)
        padded_utterance = self.pad_utterance(split_utterance)
        return [self.token_to_index.get(token, self.unseen_token) for token in padded_utterance]

    def transform_label(self, label):
        """
        Index a raw label. Requires `fit` first.

        :param label: raw intent label
        :return: indexed intent label, or `self.unseen_label` if the label was not
                 seen during `fit`
        """
        return self.intent_to_index.get(label, self.unseen_label)

In [103]:
# Fit the preprocessor's vocabularies on the training descriptions and labels.
preprocessor = TokenPreprocessor(spacy_model='en_core_web_sm')
train_descriptions = train_df['description'].tolist()
train_labels = train_df['label'].tolist()
preprocessor.fit(utterances=train_descriptions, labels=train_labels)

In [125]:
# Show the full label mapping and the first ten entries of the token mapping.
first_token_mappings = dict(itertools.islice(preprocessor.token_to_index.items(), 10))
mapping_report = "Label to index mappings:\t{}\n\nFirst 10 token to index mappings:\n\n{}"
print(mapping_report.format(preprocessor.intent_to_index, first_token_mappings))

Label to index mappings:	{'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}

First 10 token to index mappings:

{'the': 0, ',': 1, '-': 2, 'a': 3, 'to': 4, 'of': 5, 'in': 6, 'and': 7, 'on': 8, ' ': 9}


In [105]:
# Print the index sequences produced for two sample utterances.
for template, sample in (
    ("The news looks bad today. ==> {}", "The news looks bad today."),
    ("MXNet is awesome. No really... ==>{}", "MXNet is awesome. No really..."),
):
    print(template.format(preprocessor.transform_utterance(sample)))

The news looks bad today. ==> [14, 243, 1227, 1467, 79, 10, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]
MXNet is awesome. No really... ==>[-1, 20, 34443, 10, 209, 2117, 536, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]


# Build Data Iterators

In [106]:
import mxnet as mx
import numpy as np

def _encode_split(frame):
    """Index the descriptions and labels of one data split as numpy arrays."""
    features = np.array(list(map(preprocessor.transform_utterance, frame['description'].tolist())))
    targets = np.array(list(map(preprocessor.transform_label, frame['label'].tolist())))
    return features, targets


X_train, Y_train = _encode_split(train_df)
X_test, Y_test = _encode_split(test_df)

batch_n = 120

# Only the training iterator is shuffled.
train_iter = mx.io.NDArrayIter(data=X_train, label=Y_train, batch_size=batch_n, shuffle=True)
test_iter = mx.io.NDArrayIter(data=X_test, label=Y_test, batch_size=batch_n)

In [116]:
# Print the first training batch as a sanity check of the iterator output.
train_iter.reset()
for i, batch in enumerate(train_iter):
    print("\nBatch {}\nX:\n{}\n Y:\n{}".format(i, batch.data, batch.label))
    # Only the first batch is shown, so stop here instead of walking every
    # remaining batch of the training set for no effect.
    break
# Rewind so that training starts from the beginning of the iterator.
train_iter.reset()


Batch 0
X:
[
[[2.9700e+02 2.8400e+03 2.1690e+03 ... 1.0300e+03 1.0000e+01 1.5000e+01]
 [3.6900e+02 6.0000e+00 1.6900e+02 ... 2.9840e+03 7.6000e+01 5.6500e+02]
 [6.1040e+03 3.1910e+03 1.0000e+00 ... 2.1970e+03 2.2000e+01 2.2690e+03]
 ...
 [3.9300e+02 1.0000e+00 9.1000e+01 ... 9.0000e+00 1.3162e+04 0.0000e+00]
 [2.1000e+01 2.0000e+00 1.8100e+02 ... 1.0000e+00 5.0122e+04 7.6380e+03]
 [3.6900e+02 5.0000e+00 0.0000e+00 ... 0.0000e+00 1.7240e+03 3.5840e+03]]
<NDArray 120x20 @cpu(0)>]
 Y:
[
[3. 1. 1. 0. 2. 3. 0. 0. 2. 2. 3. 1. 2. 2. 3. 1. 0. 1. 1. 2. 2. 3. 2. 1.
 1. 0. 3. 1. 3. 3. 3. 1. 2. 1. 3. 1. 0. 2. 0. 3. 3. 2. 2. 3. 3. 3. 0. 0.
 2. 2. 1. 2. 0. 2. 0. 2. 2. 1. 1. 0. 2. 2. 3. 1. 3. 0. 3. 1. 3. 2. 1. 3.
 0. 2. 2. 0. 2. 1. 1. 1. 0. 0. 3. 0. 0. 3. 0. 1. 3. 1. 3. 1. 1. 0. 2. 2.
 0. 0. 2. 2. 2. 0. 0. 3. 0. 1. 3. 1. 2. 1. 0. 0. 0. 1. 1. 1. 0. 3. 3. 3.]
<NDArray 120 @cpu(0)>]


# Build the model symbol

In [126]:
def sym_gen(sentence_size, num_embed, vocab_size, num_label, filter_list, num_filter, dropout, batch_size=120):
    """
    Build the symbol of a convolutional text classifier: embedding, one
    convolution + max-pooling branch per filter size, dropout, a fully
    connected layer and a softmax output. Also prints the intermediate
    output shapes for a batch of `batch_size` rows.

    :param sentence_size: number of token indices per input row
    :param num_embed: embedding dimension
    :param vocab_size: number of rows in the embedding table
    :param num_label: number of output classes
    :param filter_list: convolution kernel heights (in tokens), one branch each
    :param num_filter: number of filters per branch
    :param dropout: dropout fraction applied to the pooled features
    :param batch_size: batch size used only for the printed shape report
    :return: the softmax output symbol
    """
    input_x = mx.sym.Variable('data')
    input_y = mx.sym.Variable('softmax_label')

    # Input shape used solely for the shape report printed below.
    X_shape = (batch_size, sentence_size)

    # embedding layer
    embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed)
    # Add a singleton channel axis so the embedded sentence can be convolved as a 2D input.
    conv_input = mx.sym.reshape(data=embed_layer, shape=(0, 1, sentence_size, num_embed))

    # create convolution + (max) pooling layer for each filter operation
    pooled_outputs = []
    for filter_size in filter_list:
        # The kernel spans the full embedding width, so it only slides along the token axis.
        convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
        relui = mx.sym.Activation(data=convi, act_type='relu')
        # Pool over all sentence_size - filter_size + 1 positions, leaving one value per filter.
        pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
        pooled_outputs.append(pooli)

    # combine all pooled outputs
    concat = mx.sym.Concat(*pooled_outputs, dim=1)
    h_pool = mx.sym.reshape(data=concat, shape=(0, -1))

    # dropout layer
    h_drop = mx.sym.Dropout(data=h_pool, p=dropout)

    fc = mx.sym.FullyConnected(data=h_drop, num_hidden=num_label)

    # softmax output
    sm = mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax')

    # Report the output shape of each stage for the chosen batch size.
    print("Shapes assuming batch size {}:\n".format(batch_size))
    shape_report = (
        ("Data input shape", input_x),
        ("Embed output shape", embed_layer),
        ("Convolutional input shape", conv_input),
        ("Pooled output shape", concat),
        ("Reshaped pooled output shape", h_pool),
        ("Output layer shape", fc),
    )
    for description, layer in shape_report:
        # infer_shape returns (arg_shapes, out_shapes, aux_shapes); take the first output shape.
        print("{}: {}".format(description, layer.infer_shape(data=X_shape)[1][0]))

    return sm

In [141]:
# The network's sentence length must equal the length the preprocessor pads or
# truncates to, so it is taken from the preprocessor rather than repeated as a literal.
symbol = sym_gen(sentence_size=preprocessor.max_tokens,
                 num_embed=16,
                 vocab_size=len(preprocessor.token_to_index),
                 num_label=len(preprocessor.intent_to_index),
                 filter_list=[3, 4, 5],
                 num_filter=100,
                 dropout=0.85)

Shapes assuming batch size 120:

Data input shape: (120, 20)
Embed output shape: (120, 20, 16)
Convolutional input shape: (120, 1, 20, 16)
Pooled output shape: (120, 300, 1, 1)
Reshaped pooled output shape: (120, 300)
Output layer shape: (120, 4)


# Train the model

- State-of-the-art test accuracy on this dataset (AG News) is ~92%.
- Within 5 epochs of training on a CPU, and keeping only the first 20 tokens of each description, we come within ~4% of that.

In [142]:
# Wrap the network symbol in a Module for training and prediction.
module = mx.mod.Module(symbol=symbol)

In [143]:
# Training configuration: Adam, accuracy metric, 5 epochs, evaluated on the test iterator.
fit_settings = dict(
    train_data=train_iter,
    eval_data=test_iter,
    eval_metric=mx.metric.Accuracy(),
    optimizer='Adam',
    optimizer_params={'learning_rate': 0.001},
    initializer=mx.initializer.Uniform(0.1),
    num_epoch=5,
)
module.fit(**fit_settings)

INFO:root:Epoch[0] Train-accuracy=0.614978
INFO:root:Epoch[0] Time cost=11.109
INFO:root:Epoch[0] Validation-accuracy=0.862240
INFO:root:Epoch[1] Train-accuracy=0.872283
INFO:root:Epoch[1] Time cost=31.858
INFO:root:Epoch[1] Validation-accuracy=0.889193
INFO:root:Epoch[2] Train-accuracy=0.906575
INFO:root:Epoch[2] Time cost=37.516
INFO:root:Epoch[2] Validation-accuracy=0.888411
INFO:root:Epoch[3] Train-accuracy=0.925650
INFO:root:Epoch[3] Time cost=39.815
INFO:root:Epoch[3] Validation-accuracy=0.887240
INFO:root:Epoch[4] Train-accuracy=0.938583
INFO:root:Epoch[4] Time cost=39.530
INFO:root:Epoch[4] Validation-accuracy=0.881380


# Example Predictions

In [144]:
def predict(utterance, preprocessor, module):
    """
    Score a single raw utterance against every label known to the preprocessor.

    :param utterance: raw string for prediction
    :param preprocessor: fit preprocessor
    :param module: trained mxnet module
    :return: list of (label, score) tuples in label-index order
    """
    token_ids = preprocessor.transform_utterance(utterance)
    single_example = np.array([token_ids])
    # A one-row iterator; a dummy label of 0 is supplied alongside the example.
    batch_iter = mx.io.NDArrayIter(data=single_example, label=np.array([0]), batch_size=1)
    scores = module.predict(batch_iter).asnumpy().tolist()[0]

    labelled_scores = []
    for label_index, score in enumerate(scores):
        labelled_scores.append((preprocessor.index_to_intent[label_index], score))
    return labelled_scores

In [150]:
# Score one example headline and display the per-label results.
example_utterance = "Elon Musk wants to take Tesla private at $420 per share."
class_preds = predict(example_utterance, preprocessor, module)
class_preds

[('Business', 0.5599420666694641),
 ('Sci/Tech', 0.29694634675979614),
 ('Sports', 0.014404438436031342),
 ('World', 0.1287071704864502)]

# Retrieve Modal Prediction

In [151]:
from operator import itemgetter

# Pick the (label, score) pair with the highest score.
by_score = itemgetter(1)
max(class_preds, key=by_score)

('Business', 0.5599420666694641)