# Read in Data

In [246]:
import pandas as pd

# Only a small sample is used in this walkthrough, so let pandas stop parsing
# after the requested number of rows instead of reading each whole CSV and
# slicing the resulting frame afterwards.
train_df = pd.read_csv('../data/ag_news/train.csv', nrows=1000)
test_df = pd.read_csv('../data/ag_news/test.csv', nrows=300)

# Peek at the first rows of the test sample
test_df.head(10)

Unnamed: 0,title,description,class
0,Fears for T N pension after talks,Unions representing workers at Turner Newall...,Business
1,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o...",Sci/Tech
2,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...,Sci/Tech
3,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...,Sci/Tech
4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...,Sci/Tech
5,Open Letter Against British Copyright Indoctri...,The British Department for Education and Skill...,Sci/Tech
6,Loosing the War on Terrorism,"\\""Sven Jaschan, self-confessed author of the ...",Sci/Tech
7,"FOAFKey: FOAF, PGP, Key Distribution, and Bloo...",\\FOAF/LOAF and bloom filters have a lot of i...,Sci/Tech
8,E-mail scam targets police chief,"Wiltshire Police warns about ""phishing"" after ...",Sci/Tech
9,"Card fraud unit nets 36,000 cards","In its first two years, the UK's dedicated car...",Sci/Tech


# Preprocess the data for MXNet

In [247]:
from collections import Counter
import itertools
import spacy

class TokenPreprocessor:
    """
    Converts raw utterances and labels into integer indices.

    Utterances are tokenized with a spaCy pipeline, cut or padded to exactly
    ``max_tokens`` tokens, and mapped through a vocabulary learned in ``fit``.
    Labels are mapped through a separate vocabulary learned in the same call.
    """

    def __init__(self, spacy_model, unseen_token=-1, pad_char='<padded>', max_tokens=20, unseen_label=-1):
        """
        :param spacy_model: value handed to ``spacy.load``
        :param unseen_token: index returned for tokens missing from the fitted vocabulary
        :param pad_char: filler token appended to utterances shorter than ``max_tokens``
        :param max_tokens: number of tokens every padded/sliced utterance ends up with
        :param unseen_label: index returned for labels missing from the fitted vocabulary
        """
        self.unseen_token = unseen_token
        self.pad_char = pad_char
        self.max_tokens = max_tokens
        self.unseen_label = unseen_label
        self.nlp = spacy.load(spacy_model)

    def split_utterance(self, utterance):
        """
        :param utterance: string
        :return: list of string, one entry per token produced by the spaCy pipeline
        """
        return [token.text for token in self.nlp(utterance)]

    def pad_utterance(self, tokenized_utterance):
        """
        :param tokenized_utterance: list of string
        :return: list of exactly ``self.max_tokens`` items; longer inputs are
                 truncated, shorter ones are right-padded with ``self.pad_char``
        """
        missing = self.max_tokens - len(tokenized_utterance)
        if missing < 0:
            return tokenized_utterance[:self.max_tokens]
        return tokenized_utterance + [self.pad_char] * missing

    def build_vocab(self, data, depth=1, max_vocab_size=None):
        """
        :param data: list of hashable items, nested ``depth`` levels deep
        :param depth: nesting depth of ``data`` (1 = flat list, 2 = list of lists, ...)
        :param max_vocab_size: keep only this many most frequent items (None keeps all)
        :return: dict mapping item to index and list mapping index to item,
                 both ordered from most to least frequent
        """
        # Peel off one nesting level per pass so any declared depth ends up flat
        for _ in range(depth - 1):
            data = list(itertools.chain.from_iterable(data))
        data_counts = Counter(data)  # Count occurrences of each item in the flat list

        vocabulary_inv = [item for item, _ in data_counts.most_common(max_vocab_size)]
        vocabulary = {item: index for index, item in enumerate(vocabulary_inv)}
        return vocabulary, vocabulary_inv

    def fit(self, utterances, labels):
        """
        Learn the token and label vocabularies.

        The token vocabulary is built from the padded utterances, so the pad
        token receives an index of its own.

        :param utterances: list of raw utterances
        :param labels: list of raw labels
        """
        split_utterances = [self.split_utterance(utterance) for utterance in utterances]
        padded_utterances = [self.pad_utterance(utterance) for utterance in split_utterances]
        self.token_to_index, self.index_to_token = self.build_vocab(padded_utterances, depth=2)
        self.intent_to_index, self.index_to_intent = self.build_vocab(labels, depth=1)

    def transform_utterance(self, utterance):
        """
        :param utterance: raw utterance string
        :return: list of ``self.max_tokens`` token indices; tokens not seen
                 during ``fit`` are encoded as ``self.unseen_token``
        """
        split_utterance = self.split_utterance(utterance)
        padded_utterance = self.pad_utterance(split_utterance)
        return [self.token_to_index.get(token, self.unseen_token) for token in padded_utterance]

    def transform_label(self, label):
        """
        :param label: raw intent label
        :return: indexed intent label, or ``self.unseen_label`` when the label
                 was not seen during ``fit``
        """
        return self.intent_to_index.get(label, self.unseen_label)

In [248]:
# Build the preprocessor on the small English spaCy model, then learn the
# token vocabulary from the descriptions and the label vocabulary from the classes.
preprocessor = TokenPreprocessor(spacy_model='en_core_web_sm')
preprocessor.fit(
    utterances=train_df['description'].tolist(),
    labels=train_df['class'].tolist(),
)

In [249]:
# Show the learned label and token vocabularies
print(
    "Label to index mappings:\t{}\n\nToken to index mappings:\n\n{}".format(
        preprocessor.intent_to_index,
        preprocessor.token_to_index,
    )
)

Label to index mappings:	{'Sci/Tech': 0, 'World': 1, 'Business': 2, 'Sports': 3}

Token to index mappings:



In [250]:
# Encode two sample sentences; tokens outside the fitted vocabulary come back
# as the preprocessor's unseen-token index.
print(
    "The news looks bad today. ==> {}".format(
        preprocessor.transform_utterance("The news looks bad today.")
    )
)
print(
    "MXNet is awesome. No really... ==>{}".format(
        preprocessor.transform_utterance("MXNet is awesome. No really...")
    )
)

The news looks bad today. ==> [18, 400, 773, 1615, 324, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
MXNet is awesome. No really... ==>[-1, 19, -1, 9, 297, 1456, 131, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


# Build Data Iterators

In [251]:
import mxnet as mx
import numpy as np

# Encode each description as a fixed-length row of token indices and each
# class name as its label index.
X_train = np.array(list(map(preprocessor.transform_utterance, train_df['description'].tolist())))
Y_train = np.array(list(map(preprocessor.transform_label, train_df['class'].tolist())))

X_test = np.array(list(map(preprocessor.transform_utterance, test_df['description'].tolist())))
Y_test = np.array(list(map(preprocessor.transform_label, test_df['class'].tolist())))

batch_n = 3

# Only the training iterator shuffles its samples
train_iter = mx.io.NDArrayIter(
    data=X_train,
    label=Y_train,
    batch_size=batch_n,
    shuffle=True,
)
test_iter = mx.io.NDArrayIter(
    data=X_test,
    label=Y_test,
    batch_size=batch_n,
)

In [252]:
# Walk the training iterator once, printing every batch, and rewind it before
# and after so later cells start from the first batch.
batch_report = "\nBatch {}\nX:\n{}\n Y:\n{}"

train_iter.reset()
for i, batch in enumerate(train_iter):
    print(batch_report.format(i, batch.data, batch.label))
train_iter.reset()


Batch 0
X:
[
[[1.700e+01 1.000e+00 5.026e+03 6.500e+01 5.027e+03 1.000e+01 2.014e+03
  5.028e+03 3.200e+01 9.200e+01 5.300e+02 8.200e+01 5.000e+00 5.029e+03
  4.000e+00 6.850e+02 5.030e+03 5.000e+00 5.031e+03 5.620e+02]
 [1.500e+01 1.000e+00 4.440e+02 7.000e+00 1.098e+03 3.140e+02 8.150e+02
  1.966e+03 1.400e+01 3.000e+01 9.200e+01 5.000e+00 4.741e+03 6.500e+01
  7.500e+02 1.942e+03 1.967e+03 2.200e+01 3.500e+01 1.968e+03]
 [4.862e+03 4.863e+03 4.864e+03 4.865e+03 7.000e+00 1.989e+03 4.866e+03
  2.000e+00 4.700e+01 3.240e+02 1.770e+02 0.000e+00 5.100e+02 6.000e+00
  2.400e+01 1.989e+03 4.867e+03 4.868e+03 5.120e+02 9.000e+00]]
<NDArray 3x20 @cpu(0)>]
 Y:
[
[0. 1. 0.]
<NDArray 3 @cpu(0)>]

Batch 1
X:
[
[[1.050e+02 3.430e+03 4.780e+02 1.037e+03 7.480e+02 1.600e+02 3.431e+03
  1.000e+01 3.432e+03 2.000e+00 2.300e+01 4.000e+00 2.520e+02 1.045e+03
  9.000e+00 3.000e+00 3.000e+00 3.000e+00 3.000e+00 3.000e+00]
 [1.320e+02 2.000e+00 1.020e+02 1.000e+00 1.854e+03 2.800e+01 1.855e+03
  5.440e+

<NDArray 3 @cpu(0)>]

Batch 173
X:
[
[[1.700e+01 1.000e+00 1.800e+01 6.750e+02 8.890e+02 1.328e+03 1.600e+01
  4.920e+02 3.720e+02 8.900e+02 1.000e+00 8.910e+02 9.400e+01 3.100e+01
  1.329e+03 8.920e+02 3.420e+02 8.000e+00 9.700e+01 4.930e+02]
 [8.000e+00 5.100e+01 5.500e+01 1.200e+01 1.500e+01 1.300e+01 1.000e+00
  4.200e+01 2.780e+02 5.195e+03 1.400e+01 3.000e+01 2.100e+01 8.800e+01
  8.000e+00 7.400e+01 5.196e+03 3.400e+01 6.400e+01 8.240e+02]
 [1.700e+01 1.000e+00 2.600e+01 8.600e+01 5.140e+02 4.000e+00 2.520e+02
  1.000e+00 7.070e+02 2.508e+03 2.300e+01 0.000e+00 8.000e+00 9.700e+01
  9.180e+02 1.090e+02 1.402e+03 3.130e+02 4.940e+02 1.600e+01]]
<NDArray 3x20 @cpu(0)>]
 Y:
[
[0. 2. 0.]
<NDArray 3 @cpu(0)>]

Batch 174
X:
[
[[9.470e+02 4.966e+03 4.100e+01 4.967e+03 4.100e+01 4.968e+03 2.000e+00
  0.000e+00 2.170e+02 1.936e+03 6.000e+00 1.890e+02 4.000e+00 7.490e+02
  1.000e+00 4.969e+03 7.670e+02 1.000e+00 4.970e+03 9.830e+02]
 [1.800e+01 3.291e+03 6.000e+00 3.292e+03 1.000e+01 1.64

# Build the model symbol

In [253]:
def sym_gen(sentence_size, num_embed, vocab_size, num_label=2, filter_list=[3, 4, 5], num_filter=100, dropout=0.0):
    """
    Build the symbolic graph of a convolutional text classifier.

    The graph embeds token indices, applies one convolution + ReLU + max-pool
    branch per entry of ``filter_list``, concatenates the pooled branches,
    applies dropout and a fully connected layer, and ends in a softmax output.
    Intermediate shapes are printed for a dummy batch as a sanity check.

    :param sentence_size: number of token indices per input row
    :param num_embed: size of each embedding vector
    :param vocab_size: ``input_dim`` of the embedding layer
    :param num_label: number of units in the final fully connected layer
    :param filter_list: convolution kernel heights, one branch per entry (read only, never mutated)
    :param num_filter: number of filters in each convolution branch
    :param dropout: dropout probability applied to the pooled features
    :return: the ``SoftmaxOutput`` symbol named 'softmax'
    """
    input_x = mx.sym.Variable('data')
    input_y = mx.sym.Variable('softmax_label')

    # Dummy input shape used only by the diagnostic prints below. Its second
    # dimension must follow sentence_size, otherwise shape inference disagrees
    # with the reshape for any sentence length other than 20.
    X_shape = (120, sentence_size)

    # embedding layer
    embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed)
    print("Embed output shape: {}".format(embed_layer.infer_shape(data=X_shape)[1][0]))
    # add a channel axis so each sentence is treated as a one-channel 2D input
    conv_input = mx.sym.reshape(data=embed_layer, shape=(0, 1, sentence_size, num_embed))
    print("Convolutional input shape: {}".format(conv_input.infer_shape(data=X_shape)[1][0]))

    # create convolution + (max) pooling layer for each filter operation
    pooled_outputs = []
    for filter_size in filter_list:
        # kernel spans the full embedding width and filter_size consecutive tokens
        convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
        relui = mx.sym.Activation(data=convi, act_type='relu')
        # pooling window covers every convolution position along the sentence axis
        pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
        pooled_outputs.append(pooli)

    # combine all pooled outputs
    concat = mx.sym.Concat(*pooled_outputs, dim=1)
    print("Pooled output shape: {}".format(concat.infer_shape(data=X_shape)[1][0]))
    # collapse everything after the batch axis into one feature vector per sample
    h_pool = mx.sym.reshape(data=concat, shape=(0, -1))
    print("Reshaped pooled output shape: {}".format(h_pool.infer_shape(data=X_shape)[1][0]))

    # dropout layer
    h_drop = mx.sym.Dropout(data=h_pool, p=dropout)

    fc = mx.sym.FullyConnected(data=h_drop, num_hidden=num_label)

    # softmax output
    return mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax')

In [254]:
# Size the network from the fitted preprocessor instead of hard-coded numbers:
# the embedding table has to cover every token index the preprocessor can emit,
# and the output layer needs one unit per class (the sym_gen default is 2).
# NOTE(review): unseen tokens are encoded as -1 by default, which lies outside
# [0, vocab_size) — confirm how mx.sym.Embedding treats such indices.
symbol = sym_gen(sentence_size=preprocessor.max_tokens,
                 num_embed=16,
                 vocab_size=len(preprocessor.token_to_index),
                 num_label=len(preprocessor.intent_to_index))

Embed output shape: (120, 20, 16)
Convolutional input shape: (120, 1, 20, 16)
Pooled output shape: (120, 300, 1, 1)
Reshaped pooled output shape: (120, 300)


# Train the model

In [255]:
# Wrap the symbol in a Module; the names match the Variables declared in sym_gen
module = mx.mod.Module(
    symbol,
    data_names=['data'],
    label_names=['softmax_label'],
)

In [256]:
# Train for 10 epochs with Adam, reporting accuracy on the test iterator
module.fit(
    train_data=train_iter,
    eval_data=test_iter,
    eval_metric=mx.metric.Accuracy(),
    optimizer='Adam',
    optimizer_params={'learning_rate': 0.01},
    initializer=mx.initializer.Uniform(0.1),
    num_epoch=10,
)

# Visualize Performance

# Construct Model Class