# Notes

- Read in the already-preprocessed data and show it as a pandas DataFrame
    - This step has nothing to do with MXNet and is a waste of people's time when the data is character-level
- Build a bucketing iterator


# Read in previously preprocessed data

In [2]:
import pandas as pd

# train_df = pd.read_pickle('../data/ag_news_char/train.pickle')
# test_df = pd.read_pickle('../data/ag_news_char/test.pickle')

# Build a custom bucketing iterator

In [45]:
import bisect
import random
import numpy as np
from mxnet.io import DataIter, DataBatch, DataDesc
from mxnet import ndarray
from sklearn.utils import shuffle


class BucketUtteranceIter(DataIter):
    """
    Bucketing data iterator for variable-length utterances.

    Every utterance is assigned to the smallest bucket whose size is greater than or equal to its
    length and is right-padded with ``data_pad`` up to that bucket size. Utterances longer than the
    largest bucket are truncated to the largest bucket size. Batches are always drawn from a single
    bucket, so all rows of a batch share the same length.
    """
    def __init__(self, utterances, intents, batch_size, buckets, data_pad=-1, label_pad=-1, data_name='utterance',
                 label_name='label', dtype='float32'):
        """
        :param utterances: list of list of int
        :param intents: list of int, ``intents[i]`` is the label of ``utterances[i]``
        :param batch_size: number of utterances per batch
        :param buckets: iterable of int bucket sizes (any order; the caller's object is not modified)
        :param data_pad: value used to fill the unused tail of each padded utterance
        :param label_pad: stored on the iterator; not used when building batches
        :param data_name: name given to the data descriptor
        :param label_name: name given to the label descriptor
        :param dtype: dtype of the padded data, label and index arrays
        :raises ValueError: if ``buckets`` is empty
        """
        super(BucketUtteranceIter, self).__init__()

        # Sort a copy: this leaves the caller's sequence untouched and also accepts tuples.
        buckets = sorted(buckets)
        if not buckets:
            raise ValueError("buckets must contain at least one bucket size")

        nslice = 0  # Keep track of how many utterances are sliced
        self.utterances = [[] for _ in buckets]
        self.intents = [[] for _ in buckets]
        self.indices = [[] for _ in buckets]

        for i, utt in enumerate(utterances):
            # Index of the smallest bucket whose size is >= the utterance length
            buck_idx = bisect.bisect_left(buckets, len(utt))

            # Slice utterances that are too long to the largest bucket size
            if buck_idx == len(buckets):
                buck_idx -= 1
                nslice += 1
                utt = utt[:buckets[buck_idx]]

            # Pad utterances that are too short for their bucket
            buff = np.full((buckets[buck_idx]), data_pad, dtype=dtype)
            buff[:len(utt)] = utt

            # Add data/label/original position to the bucket
            self.utterances[buck_idx].append(buff)
            self.intents[buck_idx].append(intents[i])
            self.indices[buck_idx].append(i)

        # Convert each bucket from a list of rows to a single array
        # NOTE(review): the original positions are stored with ``dtype`` too; with the default
        # 'float32' large positions cannot be represented exactly -- confirm this is acceptable.
        self.utterances = [np.asarray(bucket, dtype=dtype) for bucket in self.utterances]
        self.intents = [np.asarray(bucket, dtype=dtype) for bucket in self.intents]
        self.indices = [np.asarray(bucket, dtype=dtype) for bucket in self.indices]

        if nslice > 0:
            print("Warning, {0} utterances sliced to largest bucket size.".format(nslice))
        print("Utterances per bucket: {}\nBucket sizes: {}".format([arr.shape[0] for arr in self.utterances], buckets))

        self.data_name = data_name
        self.label_name = label_name
        self.batch_size = batch_size
        self.buckets = buckets
        self.dtype = dtype
        self.data_pad = data_pad
        self.label_pad = label_pad
        self.default_bucket_key = max(buckets)
        self.layout = 'NT'

        self.provide_data = [DataDesc(name=self.data_name,
                                      shape=(self.batch_size, self.default_bucket_key),
                                      layout=self.layout)]
        self.provide_label = [DataDesc(name=self.label_name,
                                       shape=(self.batch_size, ),
                                       layout=self.layout)]

        # (bucket index, first row) of every full batch; a trailing partial batch in a bucket is dropped
        self.idx = []
        for i, buck in enumerate(self.utterances):
            self.idx.extend([(i, j) for j in range(0, len(buck) - batch_size + 1, batch_size)])
        self.curr_idx = 0
        self.reset()

    def reset(self):
        """
        Resets the iterator to the beginning of the data.

        The batch order is reshuffled, the rows inside every bucket are reshuffled (data, labels and
        original positions together), and the per-bucket NDArrays are rebuilt from the result.
        """
        self.curr_idx = 0
        random.shuffle(self.idx)

        self.ndindex = []
        self.ndsent = []
        self.ndlabel = []
        for i, _ in enumerate(self.utterances):
            # shuffle the three arrays of this bucket with one shared permutation
            self.indices[i], self.utterances[i], self.intents[i] = shuffle(self.indices[i],
                                                                           self.utterances[i],
                                                                           self.intents[i])
            self.ndindex.append(ndarray.array(self.indices[i], dtype=self.dtype))
            self.ndsent.append(ndarray.array(self.utterances[i], dtype=self.dtype))
            self.ndlabel.append(ndarray.array(self.intents[i], dtype=self.dtype))

    def next(self):
        """
        Returns the next batch of data.

        :return: DataBatch whose ``bucket_key`` is the size of the bucket the batch was drawn from and
                 whose ``index`` holds the original positions of its utterances
        :raises StopIteration: when every batch of the current pass has been returned
        """
        if self.curr_idx == len(self.idx):
            raise StopIteration
        # i = bucket index, j = starting record inside that bucket
        i, j = self.idx[self.curr_idx]
        self.curr_idx += 1

        indices = self.ndindex[i][j:j + self.batch_size]
        utterances = self.ndsent[i][j:j + self.batch_size]
        intents = self.ndlabel[i][j:j + self.batch_size]

        return DataBatch([utterances],
                         [intents],
                         pad=0,
                         index=indices,
                         bucket_key=self.buckets[i],
                         provide_data=[DataDesc(name=self.data_name, shape=utterances.shape, layout=self.layout)],
                         provide_label=[DataDesc(name=self.label_name, shape=intents.shape, layout=self.layout)])


In [44]:
# Toy corpus: four utterances of differing lengths, one intent label per utterance.
utterances = [
    [1, 2, 3, 4],
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    [1, 2],
    [1, 2, 3, 4, 5],
]

intents = [1, 2, 3, 4]
batch_size = 2

# Group the utterances into buckets of size 2, 5 and 8.
iterator = BucketUtteranceIter(
    utterances=utterances,
    intents=intents,
    batch_size=batch_size,
    buckets=[2, 5, 8],
)

# Make one pass over the iterator and print every batch it yields.
for i, batch in enumerate(iterator):
    print("\nBatch {}\nData\n {} \nLabel\n {}\n Bucket Key\n {}".format(
        i, batch.data, batch.label, batch.bucket_key))

Utterances per bucket: [1, 2, 1]
Bucket sizes: [2, 5, 8]

Batch 0
Data
 [
[[ 1.  2.  3.  4.  5.]
 [ 1.  2.  3.  4. -1.]]
<NDArray 2x5 @cpu(0)>] 
Label
 [
[4. 1.]
<NDArray 2 @cpu(0)>]
 Bucket Key
 5


# Define the network symbol

In [None]:
import mxnet as mx

def BucketedTokenCnnModule(token_to_index, intent_to_index, num_embed, filters, num_filter, dropout, default_bucket_key,
                           context, invalid_label, batch_norm, smooth_alpha):
    """
    Builds an mxnet bucketing module around a character-level convolutional classifier.

    NOTE(review): only ``default_bucket_key`` and ``context`` are read from the arguments. Every
    other setting is taken from the module-level names ``hyperparameters``, ``preprocessor`` and
    ``iterator``, which must exist before a symbol is generated; the remaining parameters are
    accepted but currently unused -- confirm whether they should replace the globals.

    :param default_bucket_key: bucket key handed to ``mx.mod.BucketingModule``
    :param context: context handed to ``mx.mod.BucketingModule``
    :return: ``mx.mod.BucketingModule`` that generates one symbol per bucket through ``sym_gen``
    """
    def sym_gen(seq_len):
        """
        :param seq_len: bucket size
        :return: symbol for neural network architecture, data names, label names
        """
        def conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
            # convolution without bias -> batch norm -> relu
            net = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad,
                                     no_bias=True, name='%s%s_conv2d' % (name, suffix))
            net = mx.sym.BatchNorm(data=net, name='%s%s_batchnorm' % (name, suffix), fix_gamma=True)
            return mx.sym.Activation(data=net, act_type='relu', name='%s%s_relu' % (name, suffix))

        def conv_block(data, num_filter, name):
            # two stacked (1, 3) conv units; pad=(0, 1) keeps the last axis length unchanged
            conv1 = conv(data, kernel=(1, 3), num_filter=num_filter, pad=(0, 1), name='conv1'+str(name))
            return conv(conv1, kernel=(1, 3), num_filter=num_filter, pad=(0, 1), name='conv2'+str(name))

        # Shapes must follow the bucket being generated: the batch size comes from the iterator, the
        # sequence length from seq_len. Using the iterator's default-bucket shape for every bucket
        # would size the final pooling kernel for the largest bucket only.
        batch_size = iterator.provide_data[0][1][0]
        X_shape, Y_shape = (batch_size, seq_len), iterator.provide_label[0][1]

        # The variable names must be the names returned at the end of sym_gen.
        # NOTE(review): BucketUtteranceIter defaults to label_name='label'; presumably the training
        # iterator is created with label_name='intent' -- confirm.
        data = mx.sym.Variable(name="utterance")
        softmax_label = mx.sym.Variable(name="intent")
        print("data_input: ", data.infer_shape(utterance=X_shape)[1][0])
        print("label input: ", softmax_label.infer_shape(intent=Y_shape)[1][0])

        # Embed each character to hyperparameters['char_embed'] channels, then move the embedding
        # axis to position 1 and insert a singleton axis before the sequence axis
        embedded_data = mx.sym.Embedding(data, input_dim=len(preprocessor.char_to_index), output_dim=hyperparameters['char_embed'])
        embedded_data = mx.sym.Reshape(mx.sym.transpose(embedded_data, axes=(0, 2, 1)), shape=(0, 0, 1, -1))
        print("embedded output: ", embedded_data.infer_shape(utterance=X_shape)[1][0])

        # Temporal Convolutional Layer (without activation)
        temp_conv_1 = mx.sym.Convolution(embedded_data, kernel=(1, 3), num_filter=hyperparameters['temp_conv_filters'], pad=(0, 1))
        print("temp conv output: ", temp_conv_1.infer_shape(utterance=X_shape)[1][0])

        # Create convolutional blocks with pooling in-between
        channels = (hyperparameters['temp_conv_filters'],
                    2 * hyperparameters['temp_conv_filters'],
                    4 * hyperparameters['temp_conv_filters'],
                    8 * hyperparameters['temp_conv_filters'])

        blocks = (hyperparameters['block1_blocks'],
                  hyperparameters['block2_blocks'],
                  hyperparameters['block3_blocks'],
                  hyperparameters['block4_blocks'])

        # ``net`` is always the most recent layer, so a section configured with zero blocks cannot
        # leave a name unbound
        net = temp_conv_1
        for i, block_size in enumerate(blocks):
            print("section {} ({} blocks)".format(i, block_size))
            for j in range(block_size):
                block_name = 'block' + str(i) + '_' + str(j)
                net = conv_block(net, num_filter=channels[i], name=block_name)
                print('\t' + block_name, net.infer_shape(utterance=X_shape)[1][0])
            if i != len(blocks)-1:
                # pool after each block size, excluding final layer
                net = mx.sym.Pooling(net, kernel=(1, 3), stride=(1, 2), pad=(0, 1), pool_type='max')
                print('\tblock' + str(i) + '_p', net.infer_shape(utterance=X_shape)[1][0])

        # Pool over the whole remaining last axis, whose length depends on seq_len
        pool_k = net.infer_shape(utterance=X_shape)[1][0][3]
        print("{0} pool kernel size {1}, stride 1".format(hyperparameters['pool_type'], pool_k))
        net = mx.sym.flatten(mx.sym.Pooling(net, kernel=(1, pool_k), stride=(1, 1), pad=(0, 0), pool_type=hyperparameters['pool_type']))
        print("flattened pooling output: {0}".format(net.infer_shape(utterance=X_shape)[1][0]))
        net = mx.sym.Dropout(net, p=hyperparameters['dropout'])
        print("dropout output: ", net.infer_shape(utterance=X_shape)[1][0])

        output = mx.sym.FullyConnected(net, num_hidden=len(preprocessor.label_to_index), flatten=True, name='output')
        # smooth_alpha is passed by keyword: the third positional slot of SoftmaxOutput is grad_scale
        sm = mx.sym.SoftmaxOutput(data=output, label=softmax_label, smooth_alpha=hyperparameters['smooth_alpha'])
        print("softmax output: ", sm.infer_shape(utterance=X_shape)[1][0])

        return sm, ('utterance',), ('intent',)

    return mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=default_bucket_key, context=context)

# Train on varying input sequence lengths

Blah