In [15]:
import numpy as np
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import nn, rnn

#let's use cpu for now
context = mx.cpu()

# Create a random sequence

In [25]:
vocab_size = 50

seq = mx.nd.array(np.random.randint(vocab_size, size=100000))

In [26]:
seq


[15. 22.  6. ... 25. 18.  1.]
<NDArray 100000 @cpu(0)>

# DL portion

## RNN Model, we can modify this for more complex model structure

In [8]:
#stolen from https://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html
class RNNModel(gluon.Block):
    """A model with an encoder, recurrent layer, and a decoder."""

    def __init__(self, mode, vocab_size, num_embed, num_hidden,
                 num_layers, dropout=0.5, tie_weights=False, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        with self.name_scope():
            self.drop = nn.Dropout(dropout)
            self.encoder = nn.Embedding(vocab_size, num_embed,
                                        weight_initializer = mx.init.Uniform(0.1))
            if mode == 'rnn_relu':
                self.rnn = rnn.RNN(num_hidden, num_layers, activation='relu', dropout=dropout,
                                   input_size=num_embed)
            elif mode == 'rnn_tanh':
                self.rnn = rnn.RNN(num_hidden, num_layers, dropout=dropout,
                                   input_size=num_embed)
            elif mode == 'lstm':
                self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout,
                                    input_size=num_embed)
            elif mode == 'gru':
                self.rnn = rnn.GRU(num_hidden, num_layers, dropout=dropout,
                                   input_size=num_embed)
            else:
                raise ValueError("Invalid mode %s. Options are rnn_relu, "
                                 "rnn_tanh, lstm, and gru"%mode)
            if tie_weights:
                self.decoder = nn.Dense(vocab_size, in_units = num_hidden,
                                        params = self.encoder.params)
            else:
                self.decoder = nn.Dense(vocab_size, in_units = num_hidden)
            self.num_hidden = num_hidden

    def forward(self, inputs, hidden):
        emb = self.drop(self.encoder(inputs))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.reshape((-1, self.num_hidden)))
        return decoded, hidden

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

## Prepare for training

In [18]:
#model constants
num_embed = 5
num_hidden = 5
num_layers = 1

#training constants
args_lr = 0.01
args_epochs = 50
args_batch_size = 32

In [14]:
#define model and loss
model = RNNModel(mode='lstm', vocab_size=vocab_size, num_embed=num_embed, num_hidden=num_hidden,
                 num_layers=num_layers)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': args_lr, 'momentum': 0, 'wd': 0})
loss = gluon.loss.SoftmaxCrossEntropyLoss()

## training loop

In [20]:
def eval(data_source):
    total_L = 0.0
    ntotal = 0
    hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=context)
    for i in range(0, data_source.shape[0] - 1, args_bptt):
        data, target = get_batch(data_source, i)
        output, hidden = model(data, hidden)
        L = loss(output, target)
        total_L += mx.nd.sum(L).asscalar()
        ntotal += L.size
    return total_L / ntotal

#args_bptt?
def train(train_data, args_epochs, args_batch_size, context,
         args_bptt=5, args_log_interval=10):
    best_val = float("Inf")
    for epoch in range(args_epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx = context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args_bptt)):
            data, target = get_batch(train_data, i)
            #hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [i.grad(context) for i in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it. (why?)
            #gluon.utils.clip_global_norm(grads, args_clip * args_bptt * args_batch_size)

            trainer.step(args_batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args_log_interval == 0 and ibatch > 0:
                cur_L = total_L / args_batch_size / args_log_interval
                #cur_L = total_L / args_bptt / args_batch_size / args_log_interval
                print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % (
                    epoch + 1, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        #val_L = eval(val_data)

        #print('[Epoch %d] time cost %.2fs, validation loss %.2f, validation perplexity %.2f' % (
        #    epoch + 1, time.time() - start_time, val_L, math.exp(val_L)))

        #if val_L < best_val:
        #    best_val = val_L
        #    test_L = eval(test_data)
        #    model.save_parameters(args_save)
        #    print('test loss %.2f, test perplexity %.2f' % (test_L, math.exp(test_L)))
        #else:
        #    args_lr = args_lr * 0.25
        #    trainer._init_optimizer('sgd',
        #                            {'learning_rate': args_lr,
        #                             'momentum': 0,
        #                             'wd': 0})
        #    model.load_parameters(args_save, context)

In [36]:
def batchify(data, batch_size):
    """Reshape data into (num_example, batch_size)"""
    nbatch = data.shape[0] // batch_size
    data = data[:nbatch * batch_size]
    data = data.reshape((batch_size, nbatch)).T
    return data

train_data = batchify(seq, args_batch_size).as_in_context(context)

In [34]:
args_bptt=5
def get_batch(source, i):
    seq_len = min(args_bptt, source.shape[0] - 1 - i)
    data = source[i : i + seq_len]
    target = source[i + 1 : i + 1 + seq_len]
    return data, target.reshape((-1,))

In [37]:
import math
import os
import time


train(train_data=train_data, args_epochs=args_epochs, args_batch_size=args_batch_size,
      context=context, args_bptt=5, args_log_interval=10)

[Epoch 1 Batch 10] loss 21.52, perplexity 2209321622.81
[Epoch 1 Batch 20] loss 19.56, perplexity 312898480.01
[Epoch 1 Batch 30] loss 19.56, perplexity 312721994.45
[Epoch 1 Batch 40] loss 19.56, perplexity 312514670.63
[Epoch 1 Batch 50] loss 19.56, perplexity 312376054.47
[Epoch 1 Batch 60] loss 19.56, perplexity 312548827.56
[Epoch 1 Batch 70] loss 19.56, perplexity 312316717.42
[Epoch 1 Batch 80] loss 19.56, perplexity 312806346.66
[Epoch 1 Batch 90] loss 19.56, perplexity 312933156.39
[Epoch 1 Batch 100] loss 19.56, perplexity 312301408.39
[Epoch 1 Batch 110] loss 19.56, perplexity 313022103.05
[Epoch 1 Batch 120] loss 19.56, perplexity 312535355.09
[Epoch 1 Batch 130] loss 19.56, perplexity 313183047.31
[Epoch 1 Batch 140] loss 19.56, perplexity 312669390.23
[Epoch 1 Batch 150] loss 19.56, perplexity 312441302.48
[Epoch 1 Batch 160] loss 19.56, perplexity 312673385.93
[Epoch 1 Batch 170] loss 19.56, perplexity 313214648.68
[Epoch 1 Batch 180] loss 19.56, perplexity 312904030.36


NameError: name 'val_data' is not defined