The following implementation relies on PyTorch's computational graph (autograd) for backpropagation; everything else is implemented from scratch.

In [1]:
!pip install d2l

Installing collected packages: d2l
Successfully installed d2l-0.16.6


In [2]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from d2l import torch as d2l
import re
from collections import Counter

Reading the data

Data taken from 
[H. G. Wells’ The Time Machine](http://www.gutenberg.org/ebooks/35)

# Loading the dataset

In [3]:
# Each minibatch holds `batch_size` sequences of `num_steps` tokens each.
batch_size, num_steps = 32, 35
# Returns an iterator over (input, target) minibatches plus the vocabulary.
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

Downloading ../data/timemachine.txt from http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt...


In [4]:
# Vocabulary size and the token -> index mapping.
print(len(vocab))
print(vocab.token_to_idx)

28
{'<unk>': 0, ' ': 1, 'e': 2, 't': 3, 'a': 4, 'i': 5, 'n': 6, 'o': 7, 's': 8, 'h': 9, 'r': 10, 'd': 11, 'l': 12, 'm': 13, 'u': 14, 'c': 15, 'f': 16, 'w': 17, 'g': 18, 'y': 19, 'p': 20, 'b': 21, 'v': 22, 'k': 23, 'x': 24, 'z': 25, 'j': 26, 'q': 27}


In [5]:
# Peek at the first minibatch: x holds the inputs, y the targets.
x,y = next(iter(train_iter))
print(x.shape) # (batch_size x num_steps)
print(y.shape) # (batch_size x num_steps)

torch.Size([32, 35])
torch.Size([32, 35])


In [6]:
# Compare the first input sequence with its target sequence; in the recorded
# output the target is the input shifted left by one token.
print(x[0])
print(y[0])

tensor([ 5,  3,  9,  2,  1,  3,  5, 13,  2,  1,  3, 10,  4, 22,  2, 12, 12,  2,
        10,  1, 16,  7, 10,  1,  8,  7,  1,  5,  3,  1, 17,  5, 12, 12,  1])
tensor([ 3,  9,  2,  1,  3,  5, 13,  2,  1,  3, 10,  4, 22,  2, 12, 12,  2, 10,
         1, 16,  7, 10,  1,  8,  7,  1,  5,  3,  1, 17,  5, 12, 12,  1, 21])


In [7]:
# One-hot encoding demo

# Toy input of shape (batch_size x num_steps) = (2 x 5)
X = torch.arange(10).reshape((2,5))

# Transposing first gives (num_steps x batch_size x len(vocab)), so iterating
# over the first axis yields one time step at a time.

F.one_hot(X.T, len(vocab)).shape

torch.Size([5, 2, 28])

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initializing the model parameters

The number of hidden units num_hiddens is a tunable hyperparameter. When training language models, the inputs and outputs are from the same vocabulary. Hence, they have the same dimension, which is equal to the vocabulary size.



Mini batch of examples
$\textbf{X}_{t} \in \mathbb{R}^{n \times d}$, where $n$ = batch_size and $d$ = input size (vocab_size), is the minibatch at time step $t$. Each row of $\textbf{X}_t$ corresponds to one example at time step $t$.

Hidden layer's output at time $t-1$,
$\textbf{H}_{t-1} \in \mathbb{R}^{n \times h}$, where $n$ = batch_size and $h$ = number of hidden units; it is the hidden state carried over from the previous time step.

Hidden variable of current time step $t$ is calculated as 

$\textbf{H}_t = \phi(\textbf{X}_t\textbf{W}_{xh} + \textbf{H}_{t-1}\textbf{W}_{hh} + \textbf{b}_h)$

where, 
- $\textbf{W}_{xh} \in \mathbb{R}^{d \times h}$
- $\textbf{W}_{hh} \in \mathbb{R}^{h \times h}$
- $\textbf{b}_h \in \mathbb{R}^{1 \times h}$

Output layer at time $t$ is calculated as

$\textbf{O}_t = \textbf{H}_t\textbf{W}_{hy} + \textbf{b}_y$

where $\textbf{W}_{hy} \in \mathbb{R}^{h \times y}$ and
$\textbf{b}_y \in \mathbb{R}^{1 \times y}$, with $y$ = number of outputs (equal to vocab_size here).



In [9]:
def get_params(vocab_size, num_hiddens, device):
  """Create the trainable parameters of the scratch RNN.

  Inputs and outputs share the vocabulary, so both have width `vocab_size`.
  Weight matrices are sampled from a standard normal scaled by 0.01; biases
  start at zero.

  Returns:
    A list `[W_xh, W_hh, b_h, W_hy, b_y]` of tensors on `device`, each with
    `requires_grad` switched on.
  """
  def scaled_normal(rows, cols):
    # Small random initial weights.
    return torch.randn(size=(rows, cols), device=device) * 0.01

  # Hidden layer: input-to-hidden, hidden-to-hidden, and bias.
  input_to_hidden = scaled_normal(vocab_size, num_hiddens)
  hidden_to_hidden = scaled_normal(num_hiddens, num_hiddens)
  hidden_bias = torch.zeros(num_hiddens, device=device)

  # Output layer: hidden-to-output and bias.
  hidden_to_output = scaled_normal(num_hiddens, vocab_size)
  output_bias = torch.zeros(vocab_size, device=device)

  # requires_grad_ works in place and returns the same tensor.
  return [
      tensor.requires_grad_(True)
      for tensor in (input_to_hidden, hidden_to_hidden, hidden_bias,
                     hidden_to_output, output_bias)
  ]

# RNN Model

To define an RNN model, we first need an init_rnn_state function to return the hidden state at initialization. It returns a tensor filled with 0 and with a shape of (batch size, number of hidden units). Using tuples makes it easier to handle situations where the hidden state contains multiple variables.

In [10]:
def init_rnn_state(batch_size, num_hiddens, device):
  """Return the initial hidden state as a 1-tuple.

  The single element is an all-zero tensor of shape
  (batch_size, num_hiddens) placed on `device`.
  """
  initial_hidden = torch.zeros((batch_size, num_hiddens), device=device)
  return (initial_hidden,)

def rnn(inputs, state, params):
  """Run the RNN forward over every time step of `inputs`.

  Args:
    inputs: tensor of shape (num_steps, batch_size, vocab_size).
    state: 1-tuple holding the hidden state from before the first step.
    params: `[W_xh, W_hh, b_h, W_hy, b_y]`.

  Returns:
    `(outputs, (H,))`, where `outputs` stacks the per-step outputs along
    dim 0 (shape (num_steps * batch_size, vocab_size)) and `H` is the hidden
    state after the last step.
  """
  W_xh, W_hh, b_h, W_hy, b_y = params
  (hidden,) = state
  step_outputs = []
  for step_input in inputs:
    # step_input: (batch_size, vocab_size); both biases broadcast over the batch.
    pre_activation = torch.mm(step_input, W_xh) + torch.mm(hidden, W_hh) + b_h
    hidden = torch.tanh(pre_activation)
    # Output for this step: (batch_size, vocab_size)
    step_outputs.append(torch.mm(hidden, W_hy) + b_y)
  return torch.cat(step_outputs, dim=0), (hidden,)

In [12]:
# Testing above function
num_hiddens = 256
params = get_params(len(vocab), num_hiddens=num_hiddens, device=device)
state = init_rnn_state(batch_size=batch_size, num_hiddens=num_hiddens, device=device)
# Random stand-in for a one-hot batch of shape (num_steps, batch_size, vocab_size)
inputs = torch.rand(size=(num_steps, batch_size, len(vocab))).to(device)
output, state = rnn(inputs, state, params)
# Per-step outputs are concatenated along dim 0: (num_steps * batch_size, vocab_size)
print(output.shape)

torch.Size([1120, 28])


In [13]:
class RNNModelScratch():
  """Thin wrapper that bundles parameters, state initialisation and a forward function.

  `forward_fn` is pluggable so the same wrapper can drive the forward
  implementation of an RNN, GRU or LSTM.
  """

  def __init__(self, vocab_size, num_hiddens, device, get_params, init_state, forward_fn):
    """Store the sizes and callables, and build the parameters via `get_params`."""
    self.vocab_size = vocab_size
    self.num_hiddens = num_hiddens
    self.params = get_params(vocab_size, num_hiddens, device)
    self.init_state = init_state
    self.forward_fn = forward_fn

  def __call__(self, X, state):
    """One-hot encode `X` (batch_size x num_steps) and run `forward_fn` on it."""
    # Transpose first so the encoded batch is (num_steps, batch_size, vocab_size).
    encoded = F.one_hot(X.T, self.vocab_size)
    return self.forward_fn(encoded.type(torch.float32), state, self.params)

  def begin_state(self, batch_size, device):
    """Return a fresh initial state produced by `init_state`."""
    return self.init_state(batch_size, self.num_hiddens, device)

In [14]:
# Smoke test of the wrapper on a toy batch of token indices.
# batch_size x num_steps
X = torch.arange(10).reshape((2,5))

num_hiddens = 512
# This `net` is the model that later cells predict with and train.
net = RNNModelScratch(vocab_size=len(vocab), num_hiddens=num_hiddens, device=device, get_params=get_params, init_state=init_rnn_state, forward_fn=rnn)
state = net.begin_state(batch_size=X.shape[0], device=device)
Y, new_state = net(X.to(device), state)
print(Y.shape) # (num_steps x batch_size, vocab_size)
print(len(new_state)) # the state is a 1-tuple
print(new_state[0].shape) # (batch_size, num_hiddens)

torch.Size([10, 28])
1
torch.Size([2, 512])


# Prediction

In [15]:
def predict_ch8(prefix, num_preds, net, vocab, device):
  """Generate `num_preds` new characters following `prefix`.

  The prefix is first fed through the network one token at a time to warm up
  the hidden state (its outputs are discarded). The network is then run
  greedily: each step's argmax becomes the next input.

  Returns:
    The prefix followed by the generated characters, as one string.
  """
  state = net.begin_state(batch_size=1, device=device)  # (batch_size x h)
  token_ids = [vocab[prefix[0]]]

  def latest_token():
    # Most recent token as a (1, 1) batch: one sequence, one time step.
    return torch.tensor([token_ids[-1]], device=device).reshape((1, 1))

  # Warm-up: advance the state over the known prefix.
  for ch in prefix[1:]:
    _, state = net(latest_token(), state)
    token_ids.append(vocab[ch])

  # Prediction: feed each prediction back in as the next input.
  for _ in range(num_preds):
    logits, state = net(latest_token(), state)
    token_ids.append(int(logits.argmax(dim=1).reshape(1)))

  return ''.join(vocab.idx_to_token[idx] for idx in token_ids)

In [17]:
# `net` has not been trained yet at this point, so the 10 generated
# characters are not expected to be meaningful.
predict_ch8('time traveller ', 10, net, vocab, device)

'time traveller jlcqxenlcq'

# Gradient Clipping

In [18]:
def grad_clipping(net, theta):
  """Rescale gradients in place so that their global L2 norm is at most `theta`.

  Args:
    net: either an `nn.Module` (its trainable parameters are used) or an
      object exposing a `params` list of tensors.
    theta: maximum allowed L2 norm over all gradients taken together.

  Parameters whose `.grad` is None (for example, ones that did not take part
  in the last backward pass) are skipped instead of raising. If no parameter
  has a gradient, the call is a no-op.
  """
  if isinstance(net, nn.Module):
    params = [p for p in net.parameters() if p.requires_grad]
  else:
    params = net.params
  # Only parameters that actually received a gradient contribute to the norm.
  grads = [p.grad for p in params if p.grad is not None]
  if not grads:
    return
  norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
  if norm > theta:
    # Scale every gradient by the same factor so the direction is preserved.
    for g in grads:
      g[:] *= theta / norm

# Training

In [19]:
def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter):
  """Train `net` for one pass over `train_iter`.

  Args:
    net: model called as `net(X, state)` and exposing `begin_state`.
    train_iter: iterable of `(X, Y)` minibatches, each (batch_size x num_steps).
    loss: callable applied as `loss(y_hat, y)`; the result is averaged.
    updater: a `torch.optim.Optimizer`, or a callable accepting a
      `batch_size` keyword argument.
    device: device the minibatches are moved to.
    use_random_iter: when true, the hidden state is re-initialised for every
      minibatch; otherwise it is carried over from the previous minibatch.
  """
  # The state must be created before the loop. Resetting it inside the loop
  # would re-initialise it on every minibatch and make the carry-over branch
  # below unreachable.
  state = None
  for X, Y in train_iter:
    # X: (batch_size x num_steps)
    # Y: (batch_size x num_steps)
    if state is None or use_random_iter:
      # First minibatch, or minibatches are not treated as consecutive.
      state = net.begin_state(batch_size=X.shape[0], device=device)
    else:
      # Keep the state's values but cut it off from the previous minibatch's
      # graph, so backprop does not reach past the current minibatch.
      if isinstance(net, nn.Module) and not isinstance(state, tuple):
        # state is a tensor for nn.GRU
        state.detach_()
      else:
        for s in state:
          s.detach_()

    # Transpose before flattening so the labels line up with the
    # (num_steps * batch_size) rows of y_hat.
    y = Y.T.reshape(-1)
    X, y = X.to(device), y.to(device)
    y_hat, state = net(X, state)
    l = loss(y_hat, y.long()).mean()
    if isinstance(updater, torch.optim.Optimizer):
      updater.zero_grad()
      l.backward()
      grad_clipping(net, 1)
      updater.step()
    else:
      l.backward()
      grad_clipping(net, 1)
      # The loss is already a mean, so no further division by the batch size.
      updater(batch_size=1)

In [22]:
def train_ch8(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False):
  """Train `net` for `num_epochs` epochs, printing sample text along the way.

  Uses cross-entropy loss. An `nn.Module` is optimised with `torch.optim.SGD`;
  any other model is updated by `d2l.sgd` on `net.params`. Every 10th epoch
  the epoch number and a 50-character continuation of 'time traveller' are
  printed. After training, continuations of 'time traveller' and 'traveller'
  are printed.
  """
  loss = nn.CrossEntropyLoss()

  if isinstance(net, nn.Module):
    updater = torch.optim.SGD(net.parameters(), lr)
  else:
    def updater(batch_size):
      return d2l.sgd(net.params, lr, batch_size)

  def predict(prefix):
    return predict_ch8(prefix, 50, net, vocab, device)

  # Train and predict
  for epoch in range(num_epochs):
    train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter)

    epochs_done = epoch + 1
    if epochs_done % 10 == 0:
      print(f'Epoch: {epochs_done}')
      print(predict('time traveller'))

  for final_prefix in ('time traveller', 'traveller'):
    print(predict(final_prefix))

In [23]:
# Train the scratch model `net` for 500 epochs with learning rate 1;
# use_random_iter keeps its default value (False).
num_epochs, lr = 500, 1
train_ch8(net, train_iter, vocab, lr, num_epochs, device)

Epoch: 10
time traveller and and the the thate the thate the thate the tha
Epoch: 20
time travellere the the the the the the the the the the the the 
Epoch: 30
time traveller the gratin this this this thith sime that s and t
Epoch: 40
time traveller the pace this the proment dimensions and the thin
Epoch: 50
time travellertour and and the this treeredinet mavely that said
Epoch: 60
time traveller the prome tore ware this thing sion sime thave ti
Epoch: 70
time travellerit t angithe mave at in thisgis tr all ghour and t
Epoch: 80
time traveller the pace theegraveledraccedend sur and have it is
Epoch: 90
time traveller tho ghat s ane her about the room and that sis th
Epoch: 100
time traveller th cereally boct thithing a of thee this four dim
Epoch: 110
time traveller but howhis the forme athing difer man as i chenot
Epoch: 120
time traveller ffor ano hime and this time thaveller ffor any hi
Epoch: 130
time traveller ffreed his psere bling bof comistencefilby bucal 
Epoch: 140
time trave