# Read in Data

In [216]:
import pandas as pd

# Only the first 100 training rows and the first 30 test rows are used below,
# so let the parser stop after that many rows instead of loading each file in
# full and slicing afterwards.
train_df = pd.read_csv('../data/ag_news/train.csv', nrows=100)
test_df = pd.read_csv('../data/ag_news/test.csv', nrows=30)

test_df.head(10)

Unnamed: 0,title,description,class
0,Fears for T N pension after talks,Unions representing workers at Turner Newall...,Business
1,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o...",Sci/Tech
2,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...,Sci/Tech
3,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...,Sci/Tech
4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...,Sci/Tech
5,Open Letter Against British Copyright Indoctri...,The British Department for Education and Skill...,Sci/Tech
6,Loosing the War on Terrorism,"\\""Sven Jaschan, self-confessed author of the ...",Sci/Tech
7,"FOAFKey: FOAF, PGP, Key Distribution, and Bloo...",\\FOAF/LOAF and bloom filters have a lot of i...,Sci/Tech
8,E-mail scam targets police chief,"Wiltshire Police warns about ""phishing"" after ...",Sci/Tech
9,"Card fraud unit nets 36,000 cards","In its first two years, the UK's dedicated car...",Sci/Tech


# Preprocess the data for MXNet

In [217]:
from collections import Counter
import itertools
import spacy

class TokenPreprocessor():
    """Turn raw utterances and labels into integer indices.

    Utterances are tokenized with a spaCy model, truncated or padded to
    ``max_tokens`` tokens and mapped to indices through a vocabulary learned
    in :meth:`fit`. Labels are mapped through a second vocabulary learned from
    the training labels.
    """

    def __init__(self, spacy_model, unseen_token=-1, pad_char='<padded>', max_tokens=20, unseen_label=-1):
        """
        :param spacy_model: name of the spaCy model handed to ``spacy.load``
        :param unseen_token: index returned for tokens missing from the vocabulary
        :param pad_char: filler token appended to utterances shorter than ``max_tokens``
        :param max_tokens: exact number of tokens every utterance is cut or padded to
        :param unseen_label: index returned for labels missing from the label vocabulary
        """
        # NOTE(review): the default unseen index (-1) is not a valid row of an
        # embedding table -- confirm how the downstream model treats it.
        self.unseen_token = unseen_token
        self.pad_char = pad_char
        self.max_tokens = max_tokens
        self.unseen_label = unseen_label
        self.nlp = spacy.load(spacy_model)

    def split_utterance(self, utterance):
        """
        Tokenize a raw utterance.

        :param utterance: string
        :return: list of string
        """
        # make_doc runs only the tokenizer; the remaining pipeline components
        # add annotations that are never read here, so skipping them saves
        # work without changing the token texts.
        doc = self.nlp.make_doc(utterance)
        return [token.text for token in doc]

    def pad_utterance(self, tokenized_utterance):
        """
        Force a token list to exactly ``self.max_tokens`` entries.

        :param tokenized_utterance: list of string
        :return: new list, cut to ``self.max_tokens`` tokens or right-padded
                 with ``self.pad_char``
        """
        missing = self.max_tokens - len(tokenized_utterance)
        if missing < 0:
            return tokenized_utterance[:self.max_tokens]
        return tokenized_utterance + [self.pad_char] * missing

    def build_vocab(self, data, depth=1, max_vocab_size=None):
        """
        Build index mappings ordered by descending frequency.

        :param data: list of data
        :param depth: nesting depth of ``data`` (1 for a flat list, 2 for a
                      list of lists, ...)
        :param max_vocab_size: keep only this many most frequent items
                               (``None`` keeps all)
        :return: dict mapping item -> index, and list mapping index -> item
        """
        # Flatten one nesting level per extra unit of depth so the items
        # reaching the Counter are the hashable leaves.
        for _ in range(depth - 1):
            data = list(itertools.chain.from_iterable(data))
        data_counts = Counter(data)  # Count occurrences of each item

        vocabulary_inv = [item for item, _ in data_counts.most_common(max_vocab_size)]
        vocabulary = {item: index for index, item in enumerate(vocabulary_inv)}
        return vocabulary, vocabulary_inv

    def fit(self, utterances, labels):
        """
        Learn the token and label vocabularies.

        The token vocabulary is built from the padded utterances, so the
        padding token receives an index of its own.

        :param utterances: list of raw utterances
        :param labels: list of raw labels
        """
        split_utterances = [self.split_utterance(utterance) for utterance in utterances]
        padded_utterances = [self.pad_utterance(utterance) for utterance in split_utterances]
        self.token_to_index, self.index_to_token = self.build_vocab(padded_utterances, depth=2)
        self.intent_to_index, self.index_to_intent = self.build_vocab(labels, depth=1)

    def transform_utterance(self, utterance):
        """
        Convert a raw utterance into token indices; requires a prior :meth:`fit`.

        :param utterance: raw utterance string
        :return: list of ``self.max_tokens`` indices, with ``self.unseen_token``
                 for tokens missing from the vocabulary
        """
        split_utterance = self.split_utterance(utterance)
        padded_utterance = self.pad_utterance(split_utterance)
        return [self.token_to_index.get(token, self.unseen_token) for token in padded_utterance]

    def transform_label(self, label):
        """
        Convert a raw label into its index; requires a prior :meth:`fit`.

        :param label: raw intent label
        :return: indexed intent label, or ``self.unseen_label`` when unknown
        """
        return self.intent_to_index.get(label, self.unseen_label)

In [218]:
# Learn the token and label vocabularies from the training slice.
preprocessor = TokenPreprocessor('en_core_web_sm')
preprocessor.fit(utterances=train_df['description'].tolist(),
                 labels=train_df['class'].tolist())

In [219]:
# Show the two vocabularies learned by fit(): label -> index and token -> index.
print("Label to index mappings:\t{}\n\nToken to index mappings:\n\n{}".
      format(preprocessor.intent_to_index, preprocessor.token_to_index))

Label to index mappings:	{'Business': 0, 'Sci/Tech': 1}

Token to index mappings:



In [220]:
# Encode two sample sentences. Tokens missing from the fitted vocabulary come
# back as the preprocessor's unseen_token, and short inputs are padded out to
# max_tokens entries.
print("The news looks bad today. ==> {}".format(preprocessor.transform_utterance("The news looks bad today.")))
print("MXNet is awesome. No really... ==>{}".format(preprocessor.transform_utterance("MXNet is awesome. No really...")))

The news looks bad today. ==> [15, -1, -1, -1, -1, 11, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
MXNet is awesome. No really... ==>[-1, 18, -1, 11, 616, -1, -1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]


# Build Data Iterators

In [221]:
import mxnet as mx
import numpy as np

# Encode each description as a row of token indices and each class name as
# its label index, for both the training and the test slice.
X_train = np.array(list(map(preprocessor.transform_utterance, train_df['description'].tolist())))
Y_train = np.array(list(map(preprocessor.transform_label, train_df['class'].tolist())))

X_test = np.array(list(map(preprocessor.transform_utterance, test_df['description'].tolist())))
Y_test = np.array(list(map(preprocessor.transform_label, test_df['class'].tolist())))

batch_n = 3

# Only the training iterator shuffles; the test iterator keeps file order.
train_iter = mx.io.NDArrayIter(
    data=X_train,
    label=Y_train,
    batch_size=batch_n,
    shuffle=True,
)
test_iter = mx.io.NDArrayIter(
    data=X_test,
    label=Y_test,
    batch_size=batch_n,
)

In [222]:
# Preview only the first three batches: printing every batch of the iterator
# floods the cell output without showing anything new about the batch layout.
train_iter.reset()
for i, batch in itertools.islice(enumerate(train_iter), 3):
    print("\nBatch {}\nX:\n{}\n Y:\n{}".format(i, batch.data, batch.label))
# Rewind so that later cells start from the first batch again.
train_iter.reset()


Batch 0
X:
[
[[171. 172.   6. 173.  78.  36.  93.   3.  72. 174.  11. 100. 175.   1.
  176.   1. 177.   1. 178.   1.]
 [185.   0. 145.   6.  81.   4. 565.  12. 566. 102. 567.   5.  84. 186.
   55. 568.   1. 182. 569. 570.]
 [163. 204.  96.   6. 100. 691.   1. 190.  56. 205.  38. 692.  47. 693.
  694.  14. 695. 205.  38. 206.]]
<NDArray 3x20 @cpu(0)>]
 Y:
[
[0. 0. 0.]
<NDArray 3 @cpu(0)>]

Batch 1
X:
[
[[724.  38.  15. 725.   8. 726. 727. 728.  20. 729.  23.  21. 730. 207.
  731. 732.  28.   5. 733.  43.]
 [165. 734.  35.   8. 211. 212. 735.  46.  83.   5. 213. 736.   6.   0.
  737.   4.  37. 738. 739.   1.]
 [ 13.  51.  52.  20.  10.  23.   2.  15. 307. 308. 309.  17. 137.  13.
   45. 310. 311.   3.  79. 138.]]
<NDArray 3x20 @cpu(0)>]
 Y:
[
[0. 0. 0.]
<NDArray 3 @cpu(0)>]

Batch 2
X:
[
[[ 31.   2. 187. 188.   8. 870. 871. 872. 873.  19. 874.  12.  67. 202.
  189. 875.   3. 876.  25. 221.]
 [ 91.   4.   0. 407. 408. 409.  17.   0. 157. 410. 411.   4. 412.  73.
   74. 413. 414. 415.  12

# Build the model symbol

In [223]:
def sym_gen(sentence_size, num_embed, vocab_size, num_label=2, filter_list=(3, 4, 5), num_filter=100, dropout=0.0):
    """Build the symbol of a convolutional text classifier.

    The network embeds the token indices, runs one convolution + ReLU +
    max-pooling branch per entry of ``filter_list``, concatenates the pooled
    features, applies dropout and a fully connected layer, and ends in a
    softmax output. The output shapes of the intermediate layers are printed
    for an example input.

    :param sentence_size: number of token indices per input row
    :param num_embed: length of each embedding vector
    :param vocab_size: number of rows in the embedding table
    :param num_label: number of output classes
    :param filter_list: convolution window heights (in tokens), one branch each
    :param num_filter: number of filters per convolution branch
    :param dropout: dropout probability applied before the final layer
    :return: the SoftmaxOutput symbol named 'softmax'
    """
    input_x = mx.sym.Variable('data')
    input_y = mx.sym.Variable('softmax_label')

    # Example input shape, used only for the shape printouts below. The row
    # length has to follow sentence_size: with a fixed value of 20 the reshape
    # to (batch, 1, sentence_size, num_embed) cannot be inferred for any other
    # sentence length. The batch dimension (120) is just an example value.
    X_shape = (120, sentence_size)

    # embedding layer
    embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed)
    print("Embed output shape: {}".format(embed_layer.infer_shape(data=X_shape)[1][0]))
    # add a single channel axis so the embedded sentence can feed 2D convolutions
    conv_input = mx.sym.reshape(data=embed_layer, shape=(0, 1, sentence_size, num_embed))
    print("Convolutional input shape: {}".format(conv_input.infer_shape(data=X_shape)[1][0]))

    # create convolution + (max) pooling layer for each filter operation
    pooled_outputs = []
    for filter_size in filter_list:
        # the kernel spans the full embedding width, so it only slides along the tokens
        conv = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
        relu = mx.sym.Activation(data=conv, act_type='relu')
        # pool over every window position, leaving one value per filter
        pool = mx.sym.Pooling(data=relu, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
        pooled_outputs.append(pool)

    # combine all pooled outputs
    concat = mx.sym.Concat(*pooled_outputs, dim=1)
    print("Pooled output shape: {}".format(concat.infer_shape(data=X_shape)[1][0]))
    h_pool = mx.sym.reshape(data=concat, shape=(0, -1))
    print("Reshaped pooled output shape: {}".format(h_pool.infer_shape(data=X_shape)[1][0]))

    # dropout layer
    h_drop = mx.sym.Dropout(data=h_pool, p=dropout)

    fc = mx.sym.FullyConnected(data=h_drop, num_hidden=num_label)

    # softmax output
    sm = mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax')

    return sm

In [224]:
# Size the network from the fitted preprocessor instead of hard-coded numbers,
# so the symbol stays in sync with the data if max_tokens or the training
# slice changes.
symbol = sym_gen(sentence_size=preprocessor.max_tokens,
                 num_embed=16,
                 vocab_size=len(preprocessor.index_to_token))

Embed output shape: (120, 20, 16)
Convolutional input shape: (120, 1, 20, 16)
Pooled output shape: (120, 300, 1, 1)
Reshaped pooled output shape: (120, 300)


# Train the model

In [228]:
# Wrap the symbol in a Module and train it with Adam for 10 epochs, passing
# the test iterator as evaluation data and accuracy as the metric.
module = mx.module.Module(symbol)

module.fit(
    train_data=train_iter,
    eval_data=test_iter,
    eval_metric='acc',
    optimizer='Adam',
    optimizer_params={'learning_rate': 0.01},
    initializer=mx.initializer.Uniform(0.1),
    num_epoch=10,
)

# View Training Curve

# Construct Model Class