Character level RNN

In [None]:
!pip install d2l

In [2]:
import math
import torch 
import torch.nn as nn
from torch.nn import functional as F
from d2l import torch as d2l

In [3]:
# Load the Time Machine text as character-level minibatches.
batch_size = 32   # sequences per minibatch
num_steps = 35    # time steps (characters) per sequence
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

Downloading ../data/timemachine.txt from http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt...


In [4]:
# Vocabulary size, then the (token, count) frequency table, one per line.
print(len(vocab), vocab.token_freqs, sep='\n')

28
[(' ', 29927), ('e', 17838), ('t', 13515), ('a', 11704), ('i', 10138), ('n', 9917), ('o', 9758), ('s', 8486), ('h', 8257), ('r', 7674), ('d', 6337), ('l', 6146), ('m', 4043), ('u', 3805), ('c', 3424), ('f', 3354), ('w', 3225), ('g', 3075), ('y', 2679), ('p', 2427), ('b', 1897), ('v', 1295), ('k', 1087), ('x', 236), ('z', 144), ('j', 97), ('q', 95)]


In [5]:
# Pull a single minibatch to inspect the input/target shapes.
batch_iter = iter(train_iter)
x, y = next(batch_iter)
print(x.shape, y.shape)

torch.Size([32, 35]) torch.Size([32, 35])


In [6]:
# First input sequence of the batch, followed by its target sequence.
print(x[0], y[0], sep='\n')

tensor([21, 19,  1,  9,  1, 18,  1, 17,  2, 12, 12,  8,  5,  3,  9,  2,  1,  3,
         5, 13,  2,  1,  3, 10,  4, 22,  2, 12, 12,  2, 10,  1, 16,  7, 10])
tensor([19,  1,  9,  1, 18,  1, 17,  2, 12, 12,  8,  5,  3,  9,  2,  1,  3,  5,
        13,  2,  1,  3, 10,  4, 22,  2, 12, 12,  2, 10,  1, 16,  7, 10,  1])


One Hot Encoding

In [7]:
# One-hot encode three sample token indices over the whole vocabulary.
sample_indices = torch.tensor([0, 3, 5])
F.one_hot(sample_indices, num_classes=len(vocab))

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

The shape of the minibatch that we sample each time is (batch size, number of time steps). The one_hot function transforms such a minibatch into a three-dimensional tensor whose last dimension equals the vocabulary size (len(vocab)). We often transpose the input so that we obtain an output of shape (number of time steps, batch size, vocabulary size). This allows us to loop more conveniently through the outermost dimension when updating the hidden states of a minibatch, time step by time step.

In [8]:
# Toy minibatch of shape (2, 5); transposing before encoding gives a
# (5, 2, vocab_size) one-hot tensor.
X = torch.arange(10).reshape(2, 5)
one_hot_time_major = F.one_hot(X.T, len(vocab))
one_hot_time_major.shape

torch.Size([5, 2, 28])

# Defining the Model

RNN Model

[torch.nn.RNN](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)

Parameters:
- input feature size in x
- number of features in hidden state h
- number of recurrent layers
- batch_first: if **True** then input and output shape: (batch_size, seq_len, feature); if **False** then input and output shape: (seq_len, batch_size, feature)
- bidirectional: True/False




In [15]:
# Single-layer vanilla RNN over one-hot characters, using the time-major layout
# (batch_first=False). Passing batch_first=True instead would make the layer
# expect and return (batch_size, seq_len, feature) tensors.
num_hiddens, num_layers = 256, 1
rnn_layer = nn.RNN(len(vocab), num_hiddens, num_layers, batch_first=False)

Shape of hidden state: (D*num_layers, batch_size, hidden_size)

where D = 2 if bidirectional=True, otherwise D = 1.

In [16]:
# Zero initial hidden state h0 with shape (num_layers, batch_size, num_hiddens).
h0_shape = (num_layers, batch_size, num_hiddens)
state = torch.zeros(h0_shape)
state.shape

torch.Size([1, 32, 256])

Computing hidden state


Input shape:
```
  if batch_first is False:
    (seq_len, batch_size, feature_size)
  else:
    (batch_size, seq_len, feature_size)
```

In [17]:
# rnn_layer was built with batch_first=False, so it is fed a time-major input:
#   X: (num_steps, batch_size, vocab_size)
# With batch_first=True the input would instead be
# (batch_size, num_steps, vocab_size) and Y would come back as
# (batch_size, num_steps, num_hiddens); the state shape would be unchanged.
X = torch.rand(num_steps, batch_size, len(vocab))
Y, state_new = rnn_layer(X, state)
print(Y.shape, state_new.shape, sep='\n')

torch.Size([35, 32, 256])
torch.Size([1, 32, 256])


In [22]:
class RNN(nn.Module):
  """Character-level language model: a recurrent layer plus a linear decoder.

  The recurrent layer is supplied by the caller; a fully connected layer maps
  each of its outputs to one score per vocabulary entry.
  """

  def __init__(self, rnn_layer, vocab_size, **kwargs):
    """Wrap `rnn_layer` and size the output layer from its configuration.

    Args:
      rnn_layer: recurrent module exposing `hidden_size`, `num_layers` and
        `bidirectional`.
      vocab_size: number of distinct tokens (one-hot width and output width).
      **kwargs: forwarded to `nn.Module.__init__`.
    """
    super().__init__(**kwargs)
    self.rnn = rnn_layer
    self.vocab_size = vocab_size
    self.num_hiddens = self.rnn.hidden_size

    # A bidirectional layer concatenates both directions, doubling the
    # feature width seen by the output layer.
    self.num_directions = 2 if self.rnn.bidirectional else 1
    self.linear = nn.Linear(self.num_hiddens * self.num_directions, self.vocab_size)

  def forward(self, inputs, state):
    """Run one minibatch through the recurrent layer and the output layer.

    Args:
      inputs: token indices of shape (batch_size, num_steps).
      state: hidden state accepted by the wrapped recurrent layer.

    Returns:
      A pair `(output, state)` where `output` has shape
      (num_steps * batch_size, vocab_size) and `state` is the updated state.
    """
    # Transpose to time-major, then one-hot encode:
    # (num_steps, batch_size, vocab_size), as float for the recurrent layer.
    encoded = F.one_hot(inputs.T.long(), self.vocab_size).to(torch.float32)
    hidden_seq, state = self.rnn(encoded, state)
    # Collapse the time and batch axes so the linear layer scores every step.
    flattened = hidden_seq.reshape(-1, hidden_seq.shape[-1])
    return self.linear(flattened), state

  def begin_state(self, device, batch_size=1):
    """Return an all-zero initial state on `device`.

    A single tensor is returned unless the wrapped layer is an `nn.LSTM`, in
    which case a pair of tensors is returned. Every tensor has shape
    (num_directions * num_layers, batch_size, num_hiddens).
    """
    shape = (self.num_directions * self.rnn.num_layers, batch_size, self.num_hiddens)
    if isinstance(self.rnn, nn.LSTM):
      # nn.LSTM takes a tuple of two state tensors
      return (torch.zeros(shape, device=device), torch.zeros(shape, device=device))
    # other recurrent layers take a single tensor as hidden state
    return torch.zeros(shape, device=device)

In [77]:
# Smoke-test the architecture with a single token on a fresh, untrained network.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
rnn_layer = nn.RNN(len(vocab), num_hiddens, num_layers)
net = RNN(rnn_layer, len(vocab)).to(device)
# The model expects integer token indices of shape (batch_size, num_steps).
# A torch.rand(1, 1) input holds a float in [0, 1) that forward() truncates to
# index 0, so pass token 0 explicitly instead of a misleading random float.
inputs = torch.zeros((1, 1), dtype=torch.long, device=device)
state = net.begin_state(batch_size=1, device=device)
y, _ = net(inputs, state)
y.shape

torch.Size([1, 28])

In [78]:
# index at which the maximum value occurs
y.argmax(dim=1).item()

3

# Predict Function

In [79]:
def predict(prefix, num_preds, net, vocab, device):
  """Generate `num_preds` characters that continue `prefix`.

  Args:
    prefix: non-empty user-provided text to start from.
    num_preds: number of characters to predict after the prefix.
    net: model exposing `begin_state(device, batch_size=...)` and returning
      `(scores, state)` when called as `net(inputs, state)`.
    vocab: maps a character to its index via `vocab[ch]` and an index back to
      its character via `vocab.idx_to_token[i]`.
    device: device on which the input tensors are created.

  Returns:
    The prefix followed by the predicted characters, as one string.
  """
  state = net.begin_state(device, batch_size=1)
  outputs = [vocab[prefix[0]]]
  # The most recently emitted token, shaped (batch_size=1, num_steps=1).
  get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape(1,1)

  # Generation never backpropagates, so disable autograd: otherwise every step
  # extends a computation graph through the carried hidden state.
  with torch.no_grad():
    # warm-up period: the state is updated on the prefix, predictions are ignored
    for y in prefix[1:]:
      _, state = net(get_input(), state)
      outputs.append(vocab[y])

    # predict 'num_preds' steps, feeding each prediction back in as input
    for _ in range(num_preds):
      y, state = net(get_input(), state)
      outputs.append(int(y.argmax(dim=1).reshape(1)))
  return ''.join([vocab.idx_to_token[i] for i in outputs])

In [80]:
# Sample 10 characters from the network created above.
predict('time traveller ', 10, net, vocab, device)

'time traveller oaoaoaoaoa'

Gradient clipping

In [43]:
def grad_clipping(net, theta):
  """Rescale gradients in place so their global L2 norm is at most `theta`.

  Args:
    net: an `nn.Module` (its trainable parameters are used) or any object
      exposing a `params` list of tensors.
    theta: maximum allowed L2 norm of all gradients taken together.

  Parameters that have no gradient yet (`.grad is None`) are skipped, and the
  call is a no-op when no parameter has a gradient.
  """
  if isinstance(net, nn.Module):
    params = [p for p in net.parameters() if p.requires_grad]
  else:
    params = net.params
  # Parameters that did not take part in the last backward pass have no grad.
  grads = [p.grad for p in params if p.grad is not None]
  if not grads:
    return
  norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
  if norm > theta:
    for g in grads:
      g[:] *= theta / norm

Train the model

In [81]:
def train(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False):
  """Train `net` as a character-level language model with SGD.

  Every 10th epoch a 50-character sample continuing 'time traveller ' is
  printed; the average minibatch loss is printed after every epoch.

  Args:
    net: model exposing `begin_state(batch_size=..., device=...)` and
      returning `(scores, state)` when called as `net(X, state)`.
    train_iter: iterable yielding `(X, y)` minibatches, each of shape
      (batch_size, num_steps).
    vocab: vocabulary handed to `predict` for sample generation.
    lr: SGD learning rate.
    num_epochs: number of passes over `train_iter`.
    device: device the minibatches and the hidden state are placed on.
    use_random_iter: set to True when consecutive minibatches are not
      adjacent in the text; the hidden state is then re-initialised for
      every minibatch instead of being carried over.
  """
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(net.parameters(), lr=lr)

  pred = lambda prefix: predict(prefix, 50, net, vocab, device)

  # train and predict
  for epoch in range(num_epochs):
    # The first minibatch of an epoch does not continue the last minibatch of
    # the previous epoch, so the hidden state starts fresh every epoch.
    state = None
    running_loss = 0.0
    cnt = 0
    for X, y in train_iter:
      cnt += 1
      # shape of X: (batch_size x num_steps)
      # shape of y: (batch_size x num_steps)
      if state is None or use_random_iter:
        state = net.begin_state(batch_size=X.shape[0], device=device)
      elif isinstance(state, tuple):
        # state is a tuple of tensors for nn.LSTM
        for s in state:
          s.detach_()
      else:
        # state is a single tensor for nn.RNN / nn.GRU
        state.detach_()
      # Flatten targets time-major to line up with the rows of y_hat.
      y = y.T.reshape(-1)
      X, y = X.to(device), y.to(device)
      y_hat, state = net(X, state)

      # CrossEntropyLoss already averages over all positions.
      loss = criterion(y_hat, y.long())
      optimizer.zero_grad()
      loss.backward()
      grad_clipping(net, 1)
      optimizer.step()
      # Accumulate a Python float rather than a device tensor.
      running_loss += loss.item()

    if (epoch + 1) % 10 == 0:
      print('****')
      print(pred('time traveller '))
      print('****')
    avg_loss = running_loss / cnt if cnt else float('nan')
    print(f'Epoch: {epoch+1}/{num_epochs}\tLoss: {avg_loss:.6f}')


In [82]:
# Train for 500 epochs with a learning rate of 1.
num_epochs = 500
lr = 1
train(net, train_iter, vocab, lr, num_epochs, device)

Epoch: 1/500	Loss: 3.097515
Epoch: 2/500	Loss: 2.921096
Epoch: 3/500	Loss: 2.813699
Epoch: 4/500	Loss: 2.756021
Epoch: 5/500	Loss: 2.680970
Epoch: 6/500	Loss: 2.606729
Epoch: 7/500	Loss: 2.543646
Epoch: 8/500	Loss: 2.501996
Epoch: 9/500	Loss: 2.459864
****
time traveller the the the the the the the the the the the the th
****
Epoch: 10/500	Loss: 2.411392
Epoch: 11/500	Loss: 2.394263
Epoch: 12/500	Loss: 2.370791
Epoch: 13/500	Loss: 2.336989
Epoch: 14/500	Loss: 2.320216
Epoch: 15/500	Loss: 2.309089
Epoch: 16/500	Loss: 2.278907
Epoch: 17/500	Loss: 2.268442
Epoch: 18/500	Loss: 2.250309
Epoch: 19/500	Loss: 2.257007
****
time traveller the the the the the the the the the the the the th
****
Epoch: 20/500	Loss: 2.228314
Epoch: 21/500	Loss: 2.221901
Epoch: 22/500	Loss: 2.203267
Epoch: 23/500	Loss: 2.185552
Epoch: 24/500	Loss: 2.188191
Epoch: 25/500	Loss: 2.182741
Epoch: 26/500	Loss: 2.144275
Epoch: 27/500	Loss: 2.157930
Epoch: 28/500	Loss: 2.139886
Epoch: 29/500	Loss: 2.142646
****
time travel

In [92]:
# Generate 120 characters continuing a custom prompt.
predict('hi lana wonder how ', 120, net, vocab, device)

'hi lana wonder how stay ie a fised in thithine breall inot butingon is and why cannotwe move in time as we move about in timethat is the ge'