In [None]:
import warnings
warnings.filterwarnings('ignore')

import multiprocessing as mp
import os
import random
import time

import numpy as np

import mxnet as mx
from mxnet import nd, gluon, autograd

import gluonnlp as nlp

# Seed Python's, NumPy's and MXNet's random number generators with the same
# value so that repeated runs start from the same random state.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
mx.random.seed(SEED)

In [None]:
# Dropout probability applied before the output layer of the network.
dropout = 0.5
# Optimizer step size and number of samples per batch.
learning_rate, batch_size = 0.005, 32
# Settings forwarded to the FixedBucketSampler used for the training set.
bucket_num, bucket_ratio = 10, 0.2
# Maximum global gradient norm; a falsy value (None) disables clipping.
grad_clip = None
# Number of batches between two progress messages.
log_interval = 100
# Run on the first GPU when one is visible, otherwise fall back to the CPU so
# the notebook still runs on machines without a GPU.
context = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()

### Pre-trained word embeddings (GloVe)

In [None]:
from mxnet.contrib import text
import collections

In [None]:
# Create the pre-trained 'glove.42B.300d' GloVe embedding through gluonnlp's
# embedding registry (presumably downloaded on first use -- confirm before
# running offline).
glove_42b300d = nlp.embedding.create('glove', source='glove.42B.300d')

In [None]:
# Build a gluonnlp Vocab from every token of the GloVe table (each token is
# counted once) and attach the GloVe vectors to that vocabulary.
# NOTE(review): nlp.Vocab presumably adds its own special tokens and ordering,
# so indices returned by my_embedding.to_indices() are not guaranteed to match
# the row order of glove_42b300d.idx_to_vec -- confirm before mixing the two.
my_embedding = nlp.Vocab(nlp.data.Counter(glove_42b300d.idx_to_token))
my_embedding.set_embedding(glove_42b300d)

### Load sentiment analysis dataset -- IMDB reviews

In [None]:
# spaCy-backed English tokenizer: called with a string, it returns the list of
# tokens of that string.
tokenizer = nlp.data.SpacyTokenizer('en')

# Truncation helper: called with a list, it returns that list cut down to at
# most 500 elements, which bounds the sequence length fed to the network.
length_clip = nlp.data.ClipSequence(500)

def preprocess(x):
    """Turn one raw record into a ``(token_indices, label)`` pair.

    Parameters
    ----------
    x : sequence of exactly four fields ``(data, subj, pos, neg)``
        ``data`` is the raw text and ``subj`` an integer-like value; the last
        two fields are unpacked but not used here.

    Returns
    -------
    tuple
        ``(indices, label)`` where ``indices`` is the result of mapping the
        tokenized, length-clipped text through ``my_embedding.to_indices`` and
        ``label`` is 1 when ``int(subj) == 1`` and 0 otherwise.

    Raises
    ------
    Exception
        Any error raised while processing the record is re-raised after the
        offending record has been printed.  Returning ``None`` instead (as a
        silent fallback) would only postpone the failure to ``get_length``
        with a misleading ``TypeError``.
    """
    try:
        text, subj, _pos, _neg = x
        token_ids = my_embedding.to_indices(length_clip(tokenizer(text)))
        label = int(int(subj) == 1)
        return token_ids, label
    except Exception:
        # Show which record is malformed, then surface the real error.
        # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
        # SystemExit working inside pool workers.
        print(x)
        raise

def get_length(x):
    """Return the length of the first element of sample ``x`` as a float."""
    tokens = x[0]
    n_tokens = len(tokens)
    return float(n_tokens)

In [None]:
# Load the raw train/test splits. Each record is later unpacked by
# preprocess() into four fields (data, subj, pos, neg).
# num_discard_samples=1 presumably skips a header row -- TODO confirm.
# NOTE(review): the files are named *.csv but are read with TSVDataset;
# confirm that the field delimiter matches what TSVDataset expects.
train_dataset = nlp.data.TSVDataset('data/train.csv', num_discard_samples=1)
test_dataset = nlp.data.TSVDataset('data/test.csv', num_discard_samples=1)

In [None]:
print('Tokenize using spaCy...')


def preprocess_dataset(dataset):
    """Preprocess every sample of ``dataset`` with a pool of worker processes.

    Each sample is passed through ``preprocess`` and the length of each
    resulting sample is computed with ``get_length``.  The elapsed time and
    the number of samples are printed once the pool has been closed.

    Returns
    -------
    tuple
        ``(processed, lengths)``, both wrapped in ``gluon.data.SimpleDataset``.
    """
    tic = time.time()
    with mp.Pool() as workers:
        # Samples are distributed over the worker processes of the pool.
        processed = gluon.data.SimpleDataset(workers.map(preprocess, dataset))
        lengths = gluon.data.SimpleDataset(workers.map(get_length, processed))
    elapsed = time.time() - tic
    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(elapsed, len(processed)))
    return processed, lengths

# Preprocess both splits and keep the per-sample lengths (the training lengths
# are used by the bucket sampler in get_dataloader()).
# NOTE: this cell rebinds train_dataset/test_dataset to the preprocessed data,
# so it is not idempotent -- re-run the dataset loading cell before running
# this cell a second time, otherwise already-preprocessed samples are fed back
# into preprocess().
train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
test_dataset, test_data_lengths = preprocess_dataset(test_dataset)

In [None]:
def get_dataloader():
    """Build the training and test DataLoaders from the preprocessed datasets.

    Reads the notebook globals ``train_dataset``, ``test_dataset``,
    ``train_data_lengths``, ``batch_size``, ``bucket_num`` and
    ``bucket_ratio``.  Training batches are drawn by a shuffled
    ``FixedBucketSampler`` (its statistics are printed); test batches are
    taken in order with a fixed batch size.

    Returns
    -------
    tuple
        ``(train_dataloader, test_dataloader)``.
    """
    # Every sample is (token_ids, label): pad the token ids (also returning
    # the lengths) and stack the labels as float32.
    pad_tokens = nlp.data.batchify.Pad(axis=0, ret_length=True)
    stack_labels = nlp.data.batchify.Stack(dtype='float32')
    collate = nlp.data.batchify.Tuple(pad_tokens, stack_labels)

    bucket_sampler = nlp.data.sampler.FixedBucketSampler(
        train_data_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(bucket_sampler.stats())

    train_loader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=bucket_sampler,
        batchify_fn=collate)
    test_loader = gluon.data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=collate)
    return train_loader, test_loader


train_dataloader, test_dataloader = get_dataloader()

### Network

In [None]:
class AveragePooling(gluon.HybridBlock):
    """Mean pooling over the first axis of the LSTM output, using per-sequence lengths."""

    def __init__(self, prefix=None, params=None):
        super().__init__(prefix=prefix, params=params)

    def hybrid_forward(self, F, data, length):
        """Average ``data`` over axis 0 after applying ``F.SequenceMask``.

        ``data`` is expected in (T, N, C) layout and ``length`` holds one
        length per sequence.  The masked sum over axis 0 is divided by
        ``length`` (broadcast over the last axis).
        """
        # Mask the input according to each sequence's length.
        masked = F.SequenceMask(data, sequence_length=length,
                                use_sequence_length=True)
        # Sum over the first axis and divide by the sequence lengths.
        summed = F.sum(masked, axis=0)
        denominator = F.expand_dims(length, axis=1)
        return F.broadcast_div(summed, denominator)

class SentimentNet(gluon.HybridBlock):
    """Bidirectional LSTM encoder, length-aware mean pooling, single-unit output."""

    def __init__(self, dropout, prefix=None, params=None):
        super().__init__(prefix=prefix, params=params)
        with self.name_scope():
            # Encoder: 2-layer bidirectional LSTM with hidden size 200.
            self.encoder = gluon.nn.HybridSequential()
            with self.encoder.name_scope():
                lstm = mx.gluon.rnn.LSTM(200, num_layers=2, bidirectional=True)
                self.encoder.add(lstm)
            # Aggregation: average the encoder states of each sequence.
            self.agg_layer = AveragePooling()
            # Output head: dropout followed by a single-unit dense layer.
            self.output = gluon.nn.HybridSequential()
            with self.output.name_scope():
                self.output.add(gluon.nn.Dropout(dropout),
                                gluon.nn.Dense(1, flatten=False))

    def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
        """Encode ``data``, pool it using ``valid_length`` and apply the output head."""
        states = self.encoder(data)  # Shape(T, N, C)
        pooled = self.agg_layer(states, valid_length)
        return self.output(pooled)

### Instantiate the embedding layer and the network

In [None]:
# Token ids are produced by my_embedding.to_indices(), so the lookup table must
# be the one attached to that vocabulary: its rows follow the vocabulary's own
# index space (including any special tokens it defines), which is not
# guaranteed to match the row order of the raw GloVe table.
pretrained_vectors = my_embedding.embedding.idx_to_vec
# Derive the layer size from the vectors instead of hard-coding it.
vocab_size, embed_dim = pretrained_vectors.shape
emb = gluon.nn.Embedding(vocab_size, embed_dim)
emb.initialize()
emb.weight.set_data(pretrained_vectors)
# The classifier itself is initialised with Xavier on the configured context.
net = SentimentNet(dropout=dropout)
net.hybridize()
net.initialize(mx.init.Xavier(),ctx=context)

In [None]:
def evaluate(net, emb, dataloader, context):
    """Compute the average loss and the accuracy of ``net`` on ``dataloader``.

    Parameters
    ----------
    net : block called as ``net(embedded, valid_length)``; returns one logit
        per sample.
    emb : embedding block applied to the transposed token-id batch.
    dataloader : iterable yielding ``((data, valid_length), label)`` batches.
    context : MXNet context the network inputs and labels are moved to.

    Returns
    -------
    tuple
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the sigmoid
        binary-cross-entropy summed over all samples divided by the number of
        samples, and ``accuracy`` is the fraction of samples whose predicted
        class equals the label.
    """
    loss = gluon.loss.SigmoidBCELoss()
    total_L = 0.0
    total_sample_num = 0
    start_log_interval_time = time.time()
    acc = mx.metric.Accuracy()
    for i, ((data, valid_length), label) in enumerate(dataloader):
        data = mx.nd.transpose(data)
        valid_length = valid_length.as_in_context(context).astype(np.float32)
        label = label.as_in_context(context)
        output = net(emb(data).as_in_context(context).detach(), valid_length)
        L = loss(output, label)
        # ``output`` holds raw logits (the loss applies the sigmoid itself), so
        # the positive class corresponds to logit > 0, i.e. sigmoid > 0.5.
        pred = (output > 0).reshape(-1)
        total_L += L.sum().asscalar()
        total_sample_num += label.shape[0]
        # Pass hard 0/1 predictions with the same shape as the labels.  Passing
        # the (N, 1) logits would make Accuracy take an argmax over the size-1
        # axis, i.e. always predict class 0.
        acc.update(preds=pred, labels=label)
        if (i + 1) % log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, len(dataloader),
                time.time() - start_log_interval_time))
            start_log_interval_time = time.time()
    avg_L = total_L / float(total_sample_num)
    return avg_L, acc.get()[1]

In [None]:
def train(net, emb, context, epochs):
    """Train ``net`` for ``epochs`` epochs, evaluating and checkpointing each epoch.

    Reads the notebook globals ``train_dataloader``, ``test_dataloader``,
    ``learning_rate``, ``grad_clip`` and ``log_interval``.

    Parameters
    ----------
    net : network called as ``net(embedded, length)``; returns one logit per
        sample.
    emb : embedding block; its output is detached, so only ``net`` is trained.
    context : MXNet context used for the forward/backward pass.
    epochs : number of passes over ``train_dataloader``.

    Side effects
    ------------
    Prints progress every ``log_interval`` batches and a summary per epoch, and
    saves the parameters of ``net`` to ``weights/epoch{N}.params`` after every
    epoch (the ``weights`` directory is created if it does not exist).
    """
    trainer = gluon.Trainer(net.collect_params(), 'ftml',
                            {'learning_rate': learning_rate})
    loss = gluon.loss.SigmoidBCELoss()

    parameters = net.collect_params().values()

    # Make sure the checkpoint directory exists before any training time is
    # spent, so saving cannot fail on a missing directory after an epoch.
    os.makedirs('weights', exist_ok=True)

    # Training/Testing
    for epoch in range(epochs):
        # Epoch training stats
        start_epoch_time = time.time()
        epoch_L = 0.0
        epoch_sent_num = 0
        epoch_wc = 0
        # Log interval training stats
        start_log_interval_time = time.time()
        log_interval_wc = 0
        log_interval_sent_num = 0
        log_interval_L = 0.0

        for i, ((data, length), label) in enumerate(train_dataloader):
            wc = length.sum().asscalar()
            log_interval_wc += wc
            epoch_wc += wc
            # One label per sentence: this is the batch size.  (``data`` is
            # still batch-major here, so ``data.shape[1]`` would be the padded
            # sequence length, not the number of sentences.)
            batch_sent_num = label.shape[0]
            log_interval_sent_num += batch_sent_num
            epoch_sent_num += batch_sent_num
            with autograd.record():
                output = net(emb(data.T).as_in_context(context).detach(),
                             length.as_in_context(context)
                                   .astype(np.float32))
                L = loss(output, label.as_in_context(context)).mean()
            L.backward()
            # Clip gradient
            if grad_clip:
                gluon.utils.clip_global_norm(
                    [p.grad(context) for p in parameters],
                    grad_clip)
            # Update parameter
            trainer.step(1)
            # ``L`` is the mean loss of the batch; weight it by the batch size
            # so that dividing by the sentence count below yields a true
            # per-sentence average, comparable with evaluate().
            batch_L = L.asscalar() * batch_sent_num
            log_interval_L += batch_L
            epoch_L += batch_L
            if (i + 1) % log_interval == 0:
                elapsed = time.time() - start_log_interval_time
                print(
                    '[Epoch {} Batch {}/{}] elapsed {:.2f} s, '
                    'avg loss {:.6f}, throughput {:.2f}K wps'.format(
                        epoch, i + 1, len(train_dataloader),
                        elapsed,
                        log_interval_L / log_interval_sent_num,
                        log_interval_wc / 1000 / elapsed))
                # Clear log interval training stats
                start_log_interval_time = time.time()
                log_interval_wc = 0
                log_interval_sent_num = 0
                log_interval_L = 0.0
        end_epoch_time = time.time()
        test_avg_L, test_acc = evaluate(net, emb, test_dataloader, context)
        print('[Epoch {}] train avg loss {:.6f}, test acc {:.2f}, '
              'test avg loss {:.6f}, throughput {:.2f}K wps'.format(
                  epoch, epoch_L / epoch_sent_num, test_acc, test_avg_L,
                  epoch_wc / 1000 / (end_epoch_time - start_epoch_time)))
        file_name = "weights/epoch{}.params".format(epoch)
        net.save_parameters(file_name)

In [None]:
# Train for 10 epochs; train() evaluates on the test loader and saves the
# network parameters under weights/ after every epoch.
train(net, emb, context, 10)