# Introduction

This notebook presents **LSTM** network with character-wise input trained on Shakespeare plays.

Dataset file is included in this repo and consists of all works of Shakespeare concatenated together (4.6MB).

# Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import collections

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
dataset_location = '../Datasets/shakespeare/shakespeare_input.txt'

# Read Data

Open text file

In [5]:
with open(dataset_location, 'r') as f:
    text = f.read()
print(text[:173])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.


Discard bit at the end such that text is divisible by 1024. This allow for batch sizes [1, 2, 4, 8, 16, 32, ..., 1024]

In [6]:
mod1024 = len(text) % 1024
text = text[:-mod1024]

Tokenize

In [17]:
tokens = collections.Counter(text).most_common()
tokens[0:5]

[(' ', 696165), ('e', 386342), ('t', 272672), ('o', 268956), ('a', 230593)]

In [20]:
i2c = {i : c for i, (c, n) in enumerate(tokens)}
c2i = {c : i for i, c in i2c.items()}
print('i2c:', i2c)
print('c2i:', c2i)

i2c: {0: ' ', 1: 'e', 2: 't', 3: 'o', 4: 'a', 5: 'h', 6: 's', 7: 'n', 8: 'r', 9: 'i', 10: '\n', 11: 'l', 12: 'd', 13: 'u', 14: 'm', 15: 'y', 16: ',', 17: 'w', 18: 'f', 19: 'c', 20: 'g', 21: 'I', 22: ':', 23: 'b', 24: 'p', 25: 'A', 26: '.', 27: 'v', 28: 'T', 29: 'k', 30: "'", 31: 'S', 32: 'E', 33: 'O', 34: 'N', 35: 'R', 36: 'L', 37: ';', 38: 'C', 39: 'H', 40: 'W', 41: 'M', 42: 'U', 43: 'B', 44: 'D', 45: '?', 46: 'F', 47: '!', 48: '-', 49: 'G', 50: 'P', 51: 'Y', 52: 'K', 53: 'V', 54: 'j', 55: 'q', 56: 'x', 57: 'J', 58: 'z', 59: 'Q', 60: 'Z', 61: 'X', 62: '3', 63: '&', 64: '[', 65: ']', 66: '$'}
c2i: {' ': 0, 'e': 1, 't': 2, 'o': 3, 'a': 4, 'h': 5, 's': 6, 'n': 7, 'r': 8, 'i': 9, '\n': 10, 'l': 11, 'd': 12, 'u': 13, 'm': 14, 'y': 15, ',': 16, 'w': 17, 'f': 18, 'c': 19, 'g': 20, 'I': 21, ':': 22, 'b': 23, 'p': 24, 'A': 25, '.': 26, 'v': 27, 'T': 28, 'k': 29, "'": 30, 'S': 31, 'E': 32, 'O': 33, 'N': 34, 'R': 35, 'L': 36, ';': 37, 'C': 38, 'H': 39, 'W': 40, 'M': 41, 'U': 42, 'B': 43, 'D': 44

Encode text as tokens, reshape to batches, convert to tensor

In [29]:
batch_size = 128

data = np.array([c2i[c] for c in text])
data = data.reshape((batch_size, -1))
print('data: ', data)
print('shape:', data.shape)

data:  [[46  9  8 ...  3  2  0]
 [ 9  7  0 ... 30  6 16]
 [ 0 17  4 ...  1 19  3]
 ...
 [43  8  3 ...  0  9  2]
 [ 0  9  7 ...  5  1  8]
 [ 0  2  5 ...  0  4  8]]
shape: (128, 35728)


In [42]:
split_index = int(data.shape[1]*.9)  # 90% train, 10% valid
train_data, valid_data = np.split(data, [split_index], axis=1)
print('train_data:', train_data.shape)
print('valid_data:', valid_data.shape)

train_data: (128, 32155)
valid_data: (128, 3573)


Move to GPU if possible

In [43]:
train_x = torch.tensor(train_data).to(device)
valid_x = torch.tensor(valid_data).to(device)
print('train_x:', train_x.shape)
print('valid_x:', valid_x.shape)

train_x: torch.Size([128, 32155])
valid_x: torch.Size([128, 3573])


Model

In [73]:
class CharRNN(nn.Module):
    
    def __init__(self, nb_layers, n_in, n_embed, n_hid, n_out, dropout):
        super(CharRNN, self).__init__()
        self.embed = nn.Embedding(num_embeddings=n_in, embedding_dim=n_embed)
        self.lstm = nn.LSTM(input_size=n_embed, hidden_size=n_hid, num_layers=nb_layers,
                           batch_first=True, dropout=dropout)
        self.drop = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=n_hid, out_features=n_out)
    
    def forward(self, x, hidden):
        x = self.embed(x)                   # shape [n_batch, n_seq, n_embed]
        x, hidden = self.lstm(x, hidden)    # shape [n_batch, n_seq, n_hid]
        x = self.drop(x)
        x = self.fc(x)                      # shape [n_batch, n_seq, n_out]
        return x, hidden

In [74]:
nb_layers = 2
n_in = len(i2c)
n_seq = 256
n_embed = 50
n_hid = 64
n_out = len(i2c)
dropout = .5

model = CharRNN(nb_layers, n_in, n_embed, n_hid, n_out, dropout)
model.to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [75]:
import time

In [76]:
trace = {'tloss': []}

ts = time.time()
for e in range(1):
    
    ### Train ###
    model.train()
    hidden = None  # reset LSTM hidden state
    for i in range(0, train_x.shape[1]-1, n_seq):
        
        # Pick mini-batch (over seqence dimension)
        inputs = train_x[:,i:i+n_seq]        # shape [n_batch, n_seq], less for last batch
        targets = train_x[:,i+1:i+1+n_seq]   # shape [n_batch, n_seq], less for last batch
        if inputs.shape[1] != targets.shape[1]:
            inputs = inputs[:,:-1]           # fix shape disparity for last batch in epoch
        
        assert inputs.shape == targets.shape
        
        # Optimize
        optimizer.zero_grad()
        outputs, hidden = model(inputs, hidden)
        hidden = tuple(h.detach() for h in hidden)
        loss = criterion(outputs.view(-1, n_out), targets.flatten())
        loss.backward()
        optimizer.step()

        # Record per-iteration loss
        trace['tloss'].append( loss.item() )
        
        print(f'Epoch: {e:3} {i*100/train_x.shape[1]:4.1f}%     loss: {loss.item():.4f}')
        
        

print(time.time() - ts)

Epoch:   0  0.0%     loss: 4.2041
Epoch:   0  0.8%     loss: 4.1938
Epoch:   0  1.6%     loss: 4.1847
Epoch:   0  2.4%     loss: 4.1740
Epoch:   0  3.2%     loss: 4.1627
Epoch:   0  4.0%     loss: 4.1503
Epoch:   0  4.8%     loss: 4.1368
Epoch:   0  5.6%     loss: 4.1208
Epoch:   0  6.4%     loss: 4.1018
Epoch:   0  7.2%     loss: 4.0794
Epoch:   0  8.0%     loss: 4.0512
Epoch:   0  8.8%     loss: 4.0220
Epoch:   0  9.6%     loss: 3.9798
Epoch:   0 10.4%     loss: 3.9334
Epoch:   0 11.1%     loss: 3.8809
Epoch:   0 11.9%     loss: 3.8197
Epoch:   0 12.7%     loss: 3.7596
Epoch:   0 13.5%     loss: 3.7039
Epoch:   0 14.3%     loss: 3.6642
Epoch:   0 15.1%     loss: 3.6185
Epoch:   0 15.9%     loss: 3.5918
Epoch:   0 16.7%     loss: 3.5640
Epoch:   0 17.5%     loss: 3.5238
Epoch:   0 18.3%     loss: 3.5069
Epoch:   0 19.1%     loss: 3.4970
Epoch:   0 19.9%     loss: 3.4516
Epoch:   0 20.7%     loss: 3.4465
Epoch:   0 21.5%     loss: 3.4399
Epoch:   0 22.3%     loss: 3.4348
Epoch:   0 23.

In [62]:
inputs[:3, :10]

tensor([[ 3,  8,  1, 16,  0, 23,  1,  0,  9,  2],
        [ 6,  1,  0,  5,  9,  6,  0,  5,  4,  2],
        [ 0, 15,  3, 13, 45, 10, 10, 53,  3, 11]], device='cuda:0')

In [63]:
targets[:3, :10]

tensor([[ 8,  1, 16,  0, 23,  1,  0,  9,  2,  0],
        [ 1,  0,  5,  9,  6,  0,  5,  4,  2,  8],
        [15,  3, 13, 45, 10, 10, 53,  3, 11,  6]], device='cuda:0')

In [261]:
oo = outputs.view(-1, n_out)

In [256]:
targets.view(-1, 1).shape

RuntimeError: invalid argument 2: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Call .contiguous() before .view(). at /opt/conda/conda-bld/pytorch_1544176307774/work/aten/src/THC/generic/THCTensor.cpp:220

In [267]:
tt = targets.flatten()

In [268]:
criterion(oo, tt)

tensor(4.2124, device='cuda:0', grad_fn=<NllLossBackward>)

In [266]:
targets.flatten()

tensor([55, 48,  7, 30, 35, 62, 55, 30, 55, 60,  4, 36,  0, 59, 11,  4, 46, 27,
        48,  4, 35,  9,  4, 35,  5, 48, 27, 31,  4,  4, 42, 35, 35, 12, 27,  2,
         4, 35, 30, 49, 38, 35, 42, 44, 51, 22, 49, 30,  4, 48, 33, 59,  1, 36,
        42, 35, 43,  4, 44, 36, 35, 30, 27, 35, 64,  7, 35, 30, 27, 43, 40,  7,
        39, 59, 59, 62,  1, 57, 23, 34, 54, 26,  0, 59, 66, 35, 49,  4, 44,  2,
         4, 36,  7, 16, 35, 66, 35,  7,  5,  4,  4, 31, 49, 12,  4,  7,  7, 35,
        42, 55, 44, 12,  4, 31, 30, 33, 59, 52, 51, 31, 49, 35, 44,  7, 35, 43,
        27,  2, 35, 46, 44, 12, 12,  7, 59, 23,  5, 27, 36, 35, 38, 27, 51, 48,
        35, 49,  4, 44, 42,  7, 61, 61, 55,  7, 35, 36, 27, 30, 49, 55, 55, 36,
        42,  4,  4, 42, 33, 35, 30,  4, 12, 12, 35, 43,  4, 35, 49, 27,  9, 35,
        43, 51, 31, 49, 39, 59, 59, 10,  1, 21, 37, 35, 35,  9,  4, 12, 12, 35,
        48,  4,  5, 51, 30,  4, 42, 39, 59, 59, 63, 23, 37, 54,  0, 59, 53, 44,
        30, 49, 35, 49,  4, 35, 36, 27, 

In [220]:
y_

tensor([[30, 35,  9, 55, 30, 49, 35, 43, 38, 35,  7, 27, 51, 12, 35,  0],
        [36, 59, 26, 49,  4,  7,  4, 35, 42,  4, 44, 42, 35, 43,  4,  0],
        [ 4, 35, 55,  7, 35, 44, 35,  5, 48, 27, 36,  4, 35, 44, 36,  0],
        [27, 12, 44, 30,  4, 35, 55,  7, 12,  4, 33, 35,  4, 12,  7,  0],
        [ 0, 59, 18, 46, 35, 55, 30, 35, 40,  4, 35, 12, 27,  2,  4,  0],
        [55, 30, 49, 27, 51, 30, 35, 42,  4,  7,  4, 48, 30, 35,  7,  0],
        [30,  7, 41, 59, 59, 17, 66,  1,  3, 35, 34,  1, 35, 57, 23,  0],
        [ 4,  7, 30, 35, 42,  4, 46,  4, 36, 31,  4, 39, 59, 59, 10,  0],
        [ 9, 35,  9, 55, 30, 49, 35, 44, 35, 48, 27, 40, 40,  4, 48,  0],
        [35, 30, 49, 55,  7, 35,  7, 31, 27, 48, 36, 41, 59, 18,  7,  0],
        [44, 48,  7, 33, 35, 27, 48, 35, 44, 36, 38, 35,  7,  4, 36,  0],
        [34,  1, 23, 63, 18, 23, 52,  0, 59, 45, 27, 12, 12, 27,  9,  0],
        [ 7, 35, 44, 36, 38, 35,  7, 30, 27, 36,  4, 33, 35, 44, 36,  0],
        [26, 18, 10, 66,  3,  0, 59, 2