# 6. Recurrent Neural Networks and Language Models

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture8.pdf
* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture9.pdf
* http://colah.github.io/posts/2015-08-Understanding-LSTMs/
* https://github.com/pytorch/examples/tree/master/word_language_model
* https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/language_model

In [1]:
import os
import mindspore
from mindspore import nn, ops, Tensor
import random
import numpy as np
from mindnlp.modules import Accumulator
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

  from tqdm.autonotebook import tqdm


In [2]:
gpu = '0'
# 设置使用哪些显卡进行训练
os.environ["CUDA_VISIBLE_DEVICES"] = gpu

In [3]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w]
                    if word2index.get(w) is not None
                    else word2index["<UNK>"], seq))
    sequence = Tensor(idxs, dtype=mindspore.int64)
    return sequence

## Data load and Preprocessing

### Penn TreeBank

In [4]:
def prepare_ptb_dataset(filename, word2index=None):
    corpus = open(filename, 'r', encoding='utf-8').readlines()
    corpus = flatten([co.strip().split() + ['</s>'] for co in corpus])

    if word2index is None:
        vocab = list(set(corpus))
        word2index = {'<unk>': 0}
        for vo in vocab:
            if word2index.get(vo) is None:
                word2index[vo] = len(word2index)

    return prepare_sequence(corpus, word2index), word2index

In [5]:
# borrowed code from https://github.com/pytorch/examples/tree/master/word_language_model

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.shape[0] // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1)
    return data

In [6]:
def getBatch(data, seq_length):
    for i in range(0, data.shape[1] - seq_length, seq_length):
        inputs = data[:, i: i + seq_length]
        targets = data[:, (i + 1): (i + 1) + seq_length]
        yield (inputs, targets)

In [7]:
train_data, word2index = prepare_ptb_dataset('../dataset/ptb/ptb.train.txt')
dev_data , _ = prepare_ptb_dataset('../dataset/ptb/ptb.valid.txt', word2index)
test_data, _ = prepare_ptb_dataset('../dataset/ptb/ptb.test.txt', word2index)

In [8]:
len(word2index)

10000

In [9]:
index2word = {v: k for k, v in word2index.items()}

## Modeling 

<img src="../images/06.rnnlm-architecture.png">
<center>borrowed image from http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture8.pdf</center>

In [10]:
class LanguageModel(nn.Cell):
    def __init__(self, vocab_size, embedding_size, hidden_size, n_layers=1, dropout_p=0.5):

        super(LanguageModel, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embedding_size, embedding_table="XavierUniform")
        self.rnn = nn.LSTM(embedding_size, hidden_size, n_layers, batch_first=True)
        self.dense = nn.Dense(hidden_size, vocab_size, weight_init="XavierUniform", bias_init="Zero")
        self.dropout = nn.Dropout(p=dropout_p)

    def init_hidden(self, batch_size):
        hidden = ops.zeros((self.n_layers, batch_size, self.hidden_size))
        context = ops.zeros((self.n_layers, batch_size, self.hidden_size))
        return (hidden, context)

    def detach_hidden(self, hiddens):
        return tuple([hidden.copy() for hidden in hiddens])

    def construct(self, inputs, hidden, is_training=False):
        embeds = self.embed(inputs)
        if is_training:
            embeds = self.dropout(embeds)
        out, hidden = self.rnn(embeds, hidden)
        return self.dense(out.view(out.shape[0] * out.shape[1], -1)), hidden

## Train 

It takes for a while...

In [11]:
EMBED_SIZE = 128
HIDDEN_SIZE = 1024
NUM_LAYER = 1
LR = 0.0002
SEQ_LENGTH = 30  # for bptt
BATCH_SIZE = 20
EPOCH = 40
RESCHEDULED = False

In [12]:
train_data = batchify(train_data, BATCH_SIZE)
dev_data = batchify(dev_data, BATCH_SIZE // 2)
test_data = batchify(test_data, BATCH_SIZE // 2)

In [13]:
model = LanguageModel(len(word2index), EMBED_SIZE, HIDDEN_SIZE, NUM_LAYER, 0.5)
loss_function = nn.CrossEntropyLoss()
optimizer = nn.Adam(model.trainable_params(), learning_rate=LR)

In [14]:
accumulate_step = 2
accumulator = Accumulator(optimizer, accumulate_step)


def forward_fn(inputs, hidden, targets, is_training):
    """Forward function"""
    preds, hidden = model(inputs, hidden, is_training)
    targets = targets.astype(mindspore.int32)
    loss = loss_function(preds, targets.view(-1))
    return loss / accumulate_step


# Get gradient function
grad_fn = mindspore.value_and_grad(forward_fn, None, model.trainable_params())


# Define function of one-step training
def train_step(inputs, hidden, targets, is_training):
    """Training steps"""
    loss, grads = grad_fn(inputs, hidden, targets, is_training)
    grads = ops.clip_by_value(grads, clip_value_min=-0.5, clip_value_max=0.5)
    loss = ops.depend(loss, accumulator(grads))
    return loss

In [15]:
for epoch in range(EPOCH):
    total_loss = 0
    losses = []
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(getBatch(train_data, SEQ_LENGTH)):
        inputs, targets = batch
        hidden = model.detach_hidden(hidden)
        loss = train_step(inputs, hidden, targets, True)

        losses.append(loss.asnumpy().item(0) * accumulate_step)

        if i > 0 and i % 500 == 0:
            print("[%02d/%d] mean_loss : %0.2f, Perplexity : %0.2f" % (epoch, EPOCH, np.mean(losses), np.exp(np.mean(losses))))
            losses = []

    # learning rate anealing
    # You can use http://pytorch.org/docs/master/optim.html#how-to-adjust-learning-rate
    if RESCHEDULED is False and epoch == EPOCH // 2:
        LR *= 0.1
        optimizer = nn.Adam(model.trainable_params(), learning_rate=LR)
        accumulator = Accumulator(optimizer, accumulate_step)
        RESCHEDULED = True

[00/40] mean_loss : 7.00, Perplexity : 1094.19
[00/40] mean_loss : 6.62, Perplexity : 749.49
[00/40] mean_loss : 6.40, Perplexity : 601.51
[01/40] mean_loss : 6.21, Perplexity : 497.77
[01/40] mean_loss : 6.09, Perplexity : 442.04
[01/40] mean_loss : 5.96, Perplexity : 386.40
[02/40] mean_loss : 5.89, Perplexity : 359.83
[02/40] mean_loss : 5.81, Perplexity : 332.81
[02/40] mean_loss : 5.70, Perplexity : 298.73
[03/40] mean_loss : 5.66, Perplexity : 285.92
[03/40] mean_loss : 5.60, Perplexity : 269.15
[03/40] mean_loss : 5.51, Perplexity : 246.17
[04/40] mean_loss : 5.48, Perplexity : 239.39
[04/40] mean_loss : 5.43, Perplexity : 228.43
[04/40] mean_loss : 5.35, Perplexity : 211.15
[05/40] mean_loss : 5.33, Perplexity : 206.84
[05/40] mean_loss : 5.29, Perplexity : 198.86
[05/40] mean_loss : 5.22, Perplexity : 185.06
[06/40] mean_loss : 5.20, Perplexity : 181.94
[06/40] mean_loss : 5.17, Perplexity : 175.52
[06/40] mean_loss : 5.10, Perplexity : 163.99
[07/40] mean_loss : 5.09, Perplex

### Test 

In [16]:
total_loss = 0
hidden = model.init_hidden(BATCH_SIZE // 2)
for batch in getBatch(test_data, SEQ_LENGTH):
    inputs, targets = batch

    hidden = model.detach_hidden(hidden)
    preds, hidden = model(inputs, hidden)
    targets = targets.astype(mindspore.int32)
    total_loss += inputs.shape[1] * loss_function(preds, targets.view(-1))

total_loss = total_loss / test_data.shape[1]
print("Test Perpelexity : %5.2f" % (np.exp(total_loss)))

Test Perpelexity : 157.29


## Further topics

* <a href="https://arxiv.org/pdf/1609.07843.pdf">Pointer Sentinel Mixture Models</a>
* <a href="https://arxiv.org/pdf/1708.02182">Regularizing and Optimizing LSTM Language Models</a>