In [1]:
%matplotlib inline
import os
import re
import collections
import random
import math

import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F

# 1.text preprocessing

typical preprocess steps:

    1.Load text as strings into memory, with some re.sub.

    2.Split strings into tokens (e.g., words and characters).

    3.Build a table of vocabulary to map the split tokens to numerical indices.

    4.Convert text into sequences of numerical indices so they can be manipulated by models easily.
    
    5.Form data iter

## 1.1 load text

In [2]:
def read_novels():
    """
    read lines of 10 famous novels.
    basic re.
    """
    lines = []
    folder_path = "../data/novels"
    for file in os.listdir(folder_path):
        if not file.startswith("."):
            lines += open(os.path.join(folder_path, file), "r").readlines()
    lines = [re.sub('[^A-Za-z]', ' ', line).strip().lower() for line in lines]
    return [line for line in lines if line]

In [3]:
lines = read_novels()

## 1.2 tokenize

In [4]:
def tokenize(lines, token='char'):
    """
    Split text lines into word or character tokens.
    we focus on char now
    """
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)

In [5]:
tokens = tokenize(lines)

## 1.3 bulid vocabulary

The string type of the token is inconvenient to be used by models, which take numerical inputs. 

Now let us build a dictionary, often called vocabulary as well, to map string tokens into numerical indices starting from 0

In [6]:
def count_corpus(tokens):
    """
    Count token frequencies.
    """
    return collections.Counter([token for line in tokens for token in line])

In [7]:
class Vocab:
    """
    Vocabulary for text.
    """
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort according to frequencies
        counter = count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The index for the unknown token is 0
        self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
        uniq_tokens += [token for token, freq in self.token_freqs
                        if freq >= min_freq and token not in uniq_tokens]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """
        tokens may be list, tuple, string
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """
        indexs may be list, tuple, int 
        """
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

In [8]:
vocab = Vocab(tokens)

## 1.4 putting all things together

In [9]:
def load_corpus_novels():
    """
    Return token indices and the vocabulary of the novels dataset.
    """
    lines = read_novels()
    tokens = tokenize(lines)
    vocab = Vocab(tokens)
    corpus = [vocab[token] for line in tokens for token in line if vocab[token] != 0]
    return corpus, vocab

In [10]:
corpus, vocab = load_corpus_novels()

In [11]:
len(corpus), len(vocab)

(410043, 28)

## 1.5 form data iter

In [12]:
def seq_data_iter(corpus, batch_size, num_steps):
    """
    Generate a minibatch
    result shape: (batch_size, num_steps)
    """
    # Start with a random offset to partition a sequence
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = torch.tensor(corpus[offset: offset + num_tokens]).reshape(batch_size, -1)
    Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens]).reshape(batch_size, -1)
    
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i: i + num_steps]
        Y = Ys[:, i: i + num_steps]
        yield X, Y

In [13]:
class SeqDataLoader:
    """
    An iterator to load sequence data.
    """
    def __init__(self, batch_size, num_steps):
        self.corpus, self.vocab = load_corpus_novels()
        self.batch_size, self.num_steps = batch_size, num_steps
        self.data_iter_fn = seq_data_iter

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

In [14]:
def load_data_novels(batch_size, num_steps):
    """
    Return the iterator and the vocabulary of the novel dataset.
    """
    data_iter = SeqDataLoader(batch_size, num_steps)
    return data_iter, data_iter.vocab

In [15]:
batch_size, num_steps = 32, 40
train_iter, vocab = load_data_novels(batch_size, num_steps)

# 2.basic language model

assume that the tokens in a text sequence of length  $T$  are in turn  $x_{1},x_{2},...,x_{T}$

the goal of language model is to estimate the joint-probability of the sequence:

$$P(x_{1},p_{2},...,x_{T})$$

an ideal language model would be able to generate natural text just on its own, simply by drawing one token at a time:

$$x_{t} \sim P(x_{t}|x_{1},...,x_{t-1})$$

## 2.1 n-gram

how to compute the joint-probabilty? let us start by applying basic probability rules:

$$P(x_{1},...,x_{T}) = \prod_{t=1}^{T}P(x_{t}|x_{1},...,x_{t-1})$$

probability could be estimated by frequency, for example:

$$\hat{P}(learning|deep) = \frac{n(deep, learning)}{n(deep)}$$

addtionally, we commonly perform laplace smoothing:

$$\hat{P(x)} = \frac{n(x) + \epsilon_{1}/m}{n + \epsilon_{1}}$$

$$\hat{P(x'|x)}=\frac{n(x,x') + \epsilon_{2}p(x')}{n(x) + \epsilon_{2}}$$

$$\hat{P(x''|x,x')}=\frac{n(x, x', x'') + \epsilon_{3}p(x'')}{n(x, x') + \epsilon_{3}}$$

with markov assumption, we get n-gram:

$$P(x_{t}|x_{1},...,x_{t-1}) = P(x_{t}|x_{t-n+1},...,x_{t-1})$$

so for unigram, bigram, trigram, we each have:

$$P(x_{1}, x_{2}, x_{3}, x_{4}) = P(x_{1})P(x_{2})P(x_{3})p(x_{4})$$

$$P(x_{1}, x_{2}, x_{3}, x_{4}) = P(x_{1})P(x_{2}|x_{1})P(x_{3}|x_{2})p(x_{4}|x_{3})$$

$$P(x_{1}, x_{2}, x_{3}, x_{4}) = P(x_{1})P(x_{2}|x_{1})P(x_{3}|x_{1},x_{2})p(x_{4}|x_{2},x_{3})$$

## 2.2 perplexity

how to measure the language model quality?

we can ask about predicting the next token given the current set of tokens. A better language model should allow us to predict the next token more accurately

thus can measure it by the cross-entropy loss averaged over all the  $n$  tokens of a sequence:

$$\frac{1}{n}\sum_{t=1}^{n}-log\ P(x_{t}|x_{t-1},...,x_{1})$$

where  𝑃  is given by a language model and  $x_{t}$  is the actual token observed at time step  $t$  from the sequence. 

This makes the performance on documents of different lengths comparable.

For historical reasons, we use exponentials called perplexity:

$$exp\left (\frac{1}{n}\sum_{t=1}^{n}-log\ P(x_{t}|x_{t-1},...,x_{1})\right )$$

# 3. rnn model

in n-gram models, the conditional probability of a word $x_{t}$ only depends on the $n - 1$ previous words.

if we want to incorporate the possible effect of words earlier than time step $t - (n - 1)$, we need to increase $n$
. but the number of model parameters $\left | V \right | ^{n}$ increase exponentially

Hence, rather than modeling  $P(x_{t}|x_{t-n+1},...,x_{t-1})$,  it is preferable to use a latent variable model:

$$P(x_{t}|x_{1},...,x_{t-1}) \approx P(x_{t}|h_{t-1})$$

normal mlp with one hidden layer:

$$H = \phi(XW_{xh} + b_{h})$$

$$O = HW_{hq} + b_{q}$$

for sequence problem, Assume that we have a minibatch of inputs  $X_{t}\in \mathbb{R}^{n\times{d}}$  at time step $t$, then use previous latent variable:

$$H_{t} = \phi(X_{t}W_{xh} + H_{t-1}W_{hh} + b_{h})$$

$$O_{t} = H_{t}W_{hq} + b_{q}$$

often $\phi = tanh$ as default.

RNN-based Character-Level Language Models:

![jupyter](./rnn-train.svg)

## 3.1 build model

In [16]:
num_hiddens = 256
rnn_layer = nn.RNN(len(vocab), num_hiddens)

nn.RNN descrption:

For each element in the input sequence, each layer computes the following function:

$$h_t = \tanh(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})$$

here $h_t$ is the hidden state at time $t$.<br>
$x_t$ is the input at time $t$.<br>
$h_{(t-1)}$ is the hidden state of the previous layer at time $t-1$ or the initial hidden state at time $0$.

In [17]:
class RNNModel(nn.Module):
    """
    The RNN model.
    """
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.num_hiddens = self.rnn.hidden_size
        self.linear = nn.Linear(self.num_hiddens, self.vocab_size)

    def forward(self, inputs, state):
        # (batch_size, num_steps) to (num_steps, batch_size, vocab_size)
        X = F.one_hot(inputs.T.long(), self.vocab_size).type(torch.float32)
        Y, state = self.rnn(X, state)
        # The fully connected layer will first change the shape of `Y` to
        # (`num_steps` * `batch_size`, `num_hiddens`). Its output shape is
        # (`num_steps` * `batch_size`, `vocab_size`).
        output = self.linear(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, batch_size=1):
        if not isinstance(self.rnn, nn.LSTM):
            return torch.zeros((1, batch_size, self.num_hiddens))
        else:
            # lstm has two states
            return (torch.zeros((1, batch_size, self.num_hiddens)),
                   torch.zeros((1, batch_size, self.num_hiddens)))

In [18]:
net = RNNModel(rnn_layer, vocab_size=len(vocab))

## 3.2 predict & train functions

predict function

In [19]:
def predict_ch8(prefix, num_preds, net, vocab):
    """
    Generate new characters following the prefix.
    """
    state = net.begin_state(batch_size=1) # for single predict, use batch_size=1, net need to implement
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.reshape(torch.tensor([outputs[-1]]), (1, 1)) # batch_size, num_steps = 1, 1
    # warm-up period
    for y in prefix[1:]:
        _, state = net(get_input(), state) # call of net
        outputs.append(vocab[y])
    # predict num_preds steps
    for _ in range(num_preds):
        y, state = net(get_input(), state)
        # y shape: (1*1, vocab_size)
        outputs.append(int(y.argmax(dim=1))) # output max-probability
    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [20]:
predict_ch8('time traveller ', 10, net, vocab)

'time traveller xx xx xx x'

train epoch function

In [21]:
def train_epoch_ch8(net, train_iter, loss, updater):
    """
    Train a net within one epoch
    """
    state = None
    metric = [0.0, 0.0] # Sum of training loss, no. of tokens
    for X, Y in train_iter: # shape: (batch_size, num_steps)
        if state is None:
            # Initialize state when it is the first iteration
            state = net.begin_state(batch_size=X.shape[0])
        else:
            if isinstance(state, tuple): # lstm has two states
                for s in state:
                    s.detach_()
            else:
                state.detach_()
        y = Y.T.reshape(-1) # shape: batch_size * num_steps
        y_hat, state = net(X, state) # y_hat shape: (num_steps * batch_size, vocab_size)
        l = loss(y_hat, y.long()).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            updater.step()
        else:
            l.backward()
            grad_clipping(net, 1)
            # Since the mean function has been invoked
            updater(batch_size=1)
        metric[0] += float(l * len(y))
        metric[1] += len(y)
        # print(metric)
    print(metric)
    return metric[0] / metric[1] # perplexity

train function

In [22]:
def train_ch8(net, train_iter, vocab, lr, num_epochs):
    """
    Train a model
    """
    loss = nn.CrossEntropyLoss()
    # Initialize
    if isinstance(net, nn.Module):
        updater = torch.optim.SGD(net.parameters(), lr)
    else:
        updater = lambda batch_size: sgd(net.params, lr, batch_size)
    predict = lambda prefix: predict_ch8(prefix, 30, net, vocab)
    # Train and predict
    for epoch in range(num_epochs):
        ppl = train_epoch_ch8(net, train_iter, loss, updater)
        if (epoch + 1) % 10 == 0:
            print(f"epoch: {epoch + 1}, log ppl: {ppl}, predict: {predict('time traveller')}")
    print(f'log perplexity {ppl:.1f}')
    print(predict('time traveller'))
    print(predict('traveller'))

In [23]:
num_epochs, lr = 50, 0.1
train_ch8(net, train_iter, vocab, lr, num_epochs)

[1133096.4404296875, 409600.0]
[1054988.9448242188, 409600.0]
[1000192.8452148438, 409600.0]
[969105.1059570312, 409600.0]
[950116.5832519531, 409600.0]
[936716.408203125, 409600.0]
[925909.9848632812, 409600.0]
[916766.8342285156, 409600.0]
[907932.7380371094, 409600.0]
[899192.279296875, 409600.0]
epoch: 10, log ppl: 2.195293650627136, predict: time traveller    and the  and the  and the 
[890760.8466796875, 409600.0]
[882635.9470214844, 409600.0]
[874675.4450683594, 409600.0]
[867071.2788085938, 409600.0]
[860329.0942382812, 409600.0]
[853209.9924316406, 409600.0]
[846818.1467285156, 409600.0]
[840788.4501953125, 409600.0]
[834735.4404296875, 409600.0]
[828678.6572265625, 409600.0]
epoch: 20, log ppl: 2.023141252994537, predict: time traveller    and the was the was the wa
[823315.140625, 409600.0]
[817850.8608398438, 409600.0]
[812675.3845214844, 409600.0]
[807695.9619140625, 409600.0]
[802583.1977539062, 409600.0]
[798355.2785644531, 409600.0]
[793553.59765625, 409600.0]
[788857.0

# 4.GRU

when calculate gradient in RNNs, long products of matrices can lead to vanishing or exploding gradients.

A number of methods have been proposed to address this, one is gru.

The key distinction between RNNs and GRUs is that the latter support gating of the hidden state

This means that we have mechanisms for when a hidden state should be updated and also when it should be reset

For instance, if the first token is of great importance we will learn not to update the hidden state after the first observation. Likewise, we will learn to skip irrelevant temporary observations

## 4.1 reset gate and update gate:

![jupyter](./gru-1.svg)

equations:

$$
\begin{aligned}
R_t = \sigma(X_t W_{xr} + H_{t-1} W_{hr} + b_r),\\
Z_t = \sigma(X_t W_{xz} + H_{t-1} W_{hz} + b_z),
\end{aligned}
$$

where $X_{t} \in \mathbb{R}^{n \times d}, W_{xr},W_{xz} \in \mathbb{R}^{d \times h}, H_{t-1} \in \mathbb{R}^{n \times h}, W_{hr},W_{hz} \in \mathbb{R}^{h \times h}, b_r,b_z \in \mathbb{R}^{1 \times h}$.

finally $R_t, Z_t \in \mathbb{R}^{n \times h}$ with elements in $(0, 1)$.

## 4.2 candidate hidden state

integrate the reset gate  $R_{t}$  with the regular latent state updating:

$$\tilde{H}_{t} = tanh(X_{t}W_{xh} + (R_{t} \odot H_{t-1})W_{hh} + b_{h})$$

the difference between RNNs and GRUs:

$$H_{t-1} \to R_{t} \odot H_{t-1}$$

reset gate: control how much of the previous state we might still want to remember, better capture short dependencies.

RNNs: how much to remember is fixed along all steps. 

GRUs: how much to remember is flexible determined by $(X_{t}, H_{t-1})$

## 4.3 hidden state

incorporate the effect of the update gate  $Z_{t}$. 

$$H_{t} = Z_{t} \odot H_{t-1} + (1 - Z_{t}) \odot \tilde{H}_{t}$$

$Z_{t}$  is close to 1: we simply retain the old state, In this case the information from  $X_{t}$  is essentially ignored.

$Z_{t}$ is close to 0: $H_{t}$  approaches the candidate latent state  $\tilde{H}_{t}$

this design: better capture dependencies for sequences with large time step distances.

![jupyter](./gru-3.svg)

reset gate $R_{t}$: inner, short dependencies.

update gate $Z_{t}$: outer, long dependencies.

## 4.4 gru implementation

In [24]:
gru_layer = nn.GRU(len(vocab), num_hiddens)
gru_net = RNNModel(gru_layer, vocab_size=len(vocab))

In [25]:
num_epochs, lr = 50, 1
train_ch8(gru_net, train_iter, vocab, lr, num_epochs)

[1055053.6240234375, 409600.0]
[944187.3481445312, 409600.0]
[895549.3020019531, 409600.0]
[859744.2561035156, 409600.0]
[832119.8303222656, 409600.0]
[809264.7875976562, 409600.0]
[789003.4211425781, 409600.0]
[770029.1254882812, 409600.0]
[752922.2001953125, 409600.0]
[735960.9733886719, 409600.0]
epoch: 10, log ppl: 1.7967797201871871, predict: time traveller  and the sood  and the sood  
[720925.8706054688, 409600.0]
[706715.208984375, 409600.0]
[693712.6833496094, 409600.0]
[681708.9838867188, 409600.0]
[670652.5776367188, 409600.0]
[660365.0559082031, 409600.0]
[650396.6463623047, 409600.0]
[641452.8891601562, 409600.0]
[632858.6715087891, 409600.0]
[624666.8208007812, 409600.0]
epoch: 20, log ppl: 1.5250654804706574, predict: time traveller  and the starder  and the sta
[617160.4501953125, 409600.0]
[609748.3176269531, 409600.0]
[602846.6145019531, 409600.0]
[596218.3585205078, 409600.0]
[589681.0755615234, 409600.0]
[583643.8028564453, 409600.0]
[577619.9357910156, 409600.0]
[5

# 5. LSTM

challenges: long-term information preservation and short-term input skipping.

## 5.1 input gate, forget gate, output gate

equations:

$$
\begin{aligned}
I_t = \sigma(X_t W_{xi} + H_{t-1} W_{hi} + b_i),\\
F_t = \sigma(X_t W_{xf} + H_{t-1} W_{hf} + b_f),\\
O_t = \sigma(X_t W_{xo} + H_{t-1} W_{ho} + b_o)
\end{aligned}
$$

where $X_{t} \in \mathbb{R}^{n \times d}, W_{xi},W_{xf},W_{xo} \in \mathbb{R}^{d \times h}, H_{t-1} \in \mathbb{R}^{n \times h}, W_{hi},W_{hf},W_{ho} \in \mathbb{R}^{h \times h}, b_i,b_f,b_o \in \mathbb{R}^{1 \times h}$.

finally $I_t, F_t, O_t \in \mathbb{R}^{n \times h}$ with elements in $(0, 1)$.

## 5.2 memory cell

we first have the candidate memory cell using the tanh activation:

$$\tilde{C}_{t} = tanh(X_t W_{xc} + H_{t-1} W_{hc} + b_c)$$

![jupyter](./lstm-1.svg)

forget gate $F_{t}$: addresses how much of the old memory cell content  $C_{t-1}$  we retain.

input gate $I_{t}$: how much we take new data into account via  $\tilde{C}_{t}$

$$C_{t} = F_{t} \odot C_{t-1} + I_{t} \odot \tilde{C}_{t}$$

## 5.3 hidden state

output gate: how to use memory in hidden state.

$$H_{t} = O_{t} \odot tanh(C_{t})$$

$H_{t}$ is therefore in the interval $(-1, 1)$.

![jupyter](./lstm-3.svg)

output gate $O_{t}$: outer, long term dependencies.

input gate $I_{t}$, forget gate $F_{t}$: inner, short term dependencies.

## 5.4 lstm implementation

In [26]:
lstm_layer = nn.LSTM(len(vocab), num_hiddens)
lstm_net = RNNModel(lstm_layer, vocab_size=len(vocab))

In [27]:
num_epochs, lr = 50, 1
train_ch8(lstm_net, train_iter, vocab, lr, num_epochs)

[1118939.3466796875, 409600.0]
[988705.4716796875, 409600.0]
[930641.1115722656, 409600.0]
[883755.7602539062, 409600.0]
[848321.111328125, 409600.0]
[822815.9350585938, 409600.0]
[801239.1000976562, 409600.0]
[782676.5886230469, 409600.0]
[766345.4780273438, 409600.0]
[750910.0705566406, 409600.0]
epoch: 10, log ppl: 1.8332765394449233, predict: time traveller  and the could  i was the cou
[737386.3405761719, 409600.0]
[724549.5661621094, 409600.0]
[712904.8166503906, 409600.0]
[701851.7102050781, 409600.0]
[691463.306640625, 409600.0]
[682094.59765625, 409600.0]
[672885.2631835938, 409600.0]
[664293.7593994141, 409600.0]
[656162.9904785156, 409600.0]
[648605.845703125, 409600.0]
epoch: 20, log ppl: 1.583510365486145, predict: time traveller  and the compress  and the co
[641640.5600585938, 409600.0]
[634418.2532958984, 409600.0]
[628169.5251464844, 409600.0]
[621698.1871337891, 409600.0]
[615903.4088134766, 409600.0]
[609829.5883789062, 409600.0]
[604449.1677246094, 409600.0]
[599121