Source code for Python Machine Learning By Example, 4th Edition (Packt Publishing)

Chapter 12 Making Predictions with Sequences Using Recurrent Neural Networks

Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)

# Writing your own War and Peace with RNNs 

## Acquiring and analyzing the training data 

In [1]:
# Load the whole novel into memory as a single lower-cased string.
# NOTE(review): the file is decoded as plain UTF-8, so a leading byte-order
# mark is kept in the text -- it appears as '\ufeff' in the character
# vocabulary printed further down.
with open('warpeace_input.txt', 'r', encoding="utf8") as fp:
    raw_text = fp.read().lower()

In [2]:
# Preview the first 200 characters of the lower-cased corpus.
print(raw_text[:200])

﻿"well, prince, so genoa and lucca are now just family estates of the
buonapartes. but i warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetr


In [3]:
# Whitespace-delimited tokens; punctuation stays attached to the words.
all_words = raw_text.split()
# De-duplicate the tokens to measure the size of a word-level vocabulary.
unique_words = list(set(all_words))
print(f'Number of unique words: {len(unique_words)}')

Number of unique words: 39830


In [4]:
# Length of the corpus in characters (newlines and punctuation included).
n_chars = len(raw_text)
print(f'Total characters: {n_chars}')

Total characters: 3196213


In [5]:
# Character-level vocabulary: every distinct character of the corpus,
# in sorted order so the character <-> index mapping is deterministic.
chars = sorted(set(raw_text))
vocab_size = len(chars)
print(f'Total vocabulary (unique characters): {vocab_size}')
print(chars)

Total vocabulary (unique characters): 57
['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'ä', 'é', 'ê', '\ufeff']


## Constructing the training set for the RNN text generator

In [6]:
# Two-way lookup tables between a character and its position in `chars`.
index_to_char = dict(enumerate(chars))
char_to_index = {ch: idx for idx, ch in enumerate(chars)}
print(char_to_index)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, ';': 23, '=': 24, '?': 25, 'a': 26, 'b': 27, 'c': 28, 'd': 29, 'e': 30, 'f': 31, 'g': 32, 'h': 33, 'i': 34, 'j': 35, 'k': 36, 'l': 37, 'm': 38, 'n': 39, 'o': 40, 'p': 41, 'q': 42, 'r': 43, 's': 44, 't': 45, 'u': 46, 'v': 47, 'w': 48, 'x': 49, 'y': 50, 'z': 51, 'à': 52, 'ä': 53, 'é': 54, 'ê': 55, '\ufeff': 56}


In [7]:
import numpy as np
# Encode the whole corpus as one 1-D array of int32 character ids.
text_encoded = np.array(
    [char_to_index[ch] for ch in raw_text],
    dtype=np.int32)

In [8]:
# Each training example is seq_length input characters plus one extra
# character, so that inputs and targets can be the same chunk shifted by one.
seq_length = 40
chunk_size = seq_length + 1

# One chunk per starting offset of the corpus, giving an array of shape
# (len(text_encoded) - chunk_size + 1, chunk_size) with text_encoded's dtype.
# sliding_window_view builds all windows in vectorised code instead of a
# Python-level loop over every offset; .copy() turns the read-only strided
# view into an ordinary writable, contiguous array.
# Raises ValueError if the corpus is shorter than one chunk.
text_chunks = np.lib.stride_tricks.sliding_window_view(
    text_encoded, chunk_size).copy()


In [9]:
import torch
from torch.utils.data import Dataset

class SeqDataset(Dataset):
    """Dataset of text chunks that yields (input, target) pairs.

    For a chunk of length n, the input is its first n-1 elements and the
    target is its last n-1 elements, i.e. the input shifted by one position.
    Both are returned as int64 tensors.
    """

    def __init__(self, text_chunks):
        # Indexable collection of chunks; each chunk must support slicing
        # and ``.long()`` (e.g. the rows of a 2-D integer tensor).
        self.text_chunks = text_chunks

    def __len__(self):
        """Return the number of chunks."""
        return len(self.text_chunks)

    def __getitem__(self, idx):
        """Return the (input, target) pair built from chunk ``idx``."""
        chunk = self.text_chunks[idx]
        inputs = chunk[:-1].long()
        targets = chunk[1:].long()
        return inputs, targets
    
# Wrap the NumPy chunk matrix as a tensor and build the dataset from it.
seq_dataset = SeqDataset(torch.from_numpy(text_chunks))


In [10]:
from torch.utils.data import DataLoader
 
batch_size = 64

# Seed the global RNG so the shuffling order is reproducible.
torch.manual_seed(0)
# drop_last=True discards a final short batch, so every batch holds exactly
# batch_size sequences -- the training loop sizes the LSTM state with
# model.init_hidden(batch_size).
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

## Building and Training an RNN text generator 

In [11]:
import torch.nn as nn

class RNN(nn.Module):
    """Character-level LSTM language model evaluated one time step per call.

    Pipeline: embedding -> single-layer LSTM (batch_first) -> linear layer
    that maps the LSTM output to one logit per vocabulary entry.

    Args:
        vocab_size: Number of distinct tokens; size of the embedding table
            and of the output logits.
        embed_dim: Dimension of the token embeddings.
        rnn_hidden_dim: Dimension of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_dim = rnn_hidden_dim
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_dim,
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        """Advance the model by a single time step.

        Args:
            x: 1-D tensor of token ids, one per batch element.
            hidden: LSTM hidden state, as returned by ``init_hidden`` or by
                a previous call.
            cell: LSTM cell state, same layout as ``hidden``.

        Returns:
            Tuple ``(logits, hidden, cell)`` where ``logits`` has one row per
            batch element and ``vocab_size`` columns, and ``hidden``/``cell``
            are the updated LSTM states.
        """
        # Add a length-1 sequence axis: the LSTM is batch_first and expects
        # (batch, seq, feature).
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        # Collapse the length-1 sequence axis again -> (batch, vocab_size).
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        """Return all-zero ``(hidden, cell)`` states for ``batch_size`` sequences.

        The states are allocated with the same device and dtype as the
        model's parameters, so they can be passed to ``forward`` directly
        even after the model was moved (``.to(device)``) or cast
        (``.double()``, ``.half()``).
        """
        reference = self.fc.weight
        hidden = reference.new_zeros(1, batch_size, self.rnn_hidden_dim)
        cell = reference.new_zeros(1, batch_size, self.rnn_hidden_dim)
        return hidden, cell

In [12]:
# Model hyperparameters.
embed_dim, rnn_hidden_dim = 256, 512

# Seed before construction so the initial weights are reproducible.
torch.manual_seed(0)
model = RNN(vocab_size=vocab_size,
            embed_dim=embed_dim,
            rnn_hidden_dim=rnn_hidden_dim)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
# Last expression of the cell: displays the module summary.
model

RNN(
  (embedding): Embedding(57, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=57, bias=True)
)

In [13]:
# Cross-entropy between the model's per-character logits and the target ids.
loss_fn = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 0.003.
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

In [14]:
# Number of optimisation steps: each iteration of the loop below trains on
# exactly one batch, not on a full pass over the dataset.
num_epochs = 10000

torch.manual_seed(0)

for epoch in range(num_epochs):
    # Start every batch from an all-zero LSTM state.
    hidden, cell = model.init_hidden(batch_size)
    # NOTE(review): iter(seq_dl) creates a new iterator on every step, so each
    # step takes the first batch of a fresh pass over the shuffled loader
    # instead of continuing through the previous one.
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    # Accumulates the loss of every position of the sequence.
    loss = 0
    for c in range(seq_length):
        # Feed one character column at a time, threading the LSTM state
        # from one position to the next.
        pred, hidden, cell = model(seq_batch[:, c], hidden.to(device), cell.to(device)) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    # From here on `loss` is a plain float: the summed loss divided by the
    # number of positions, used for reporting only.
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} - loss: {loss:.4f}')


Epoch 0 - loss: 4.0255
Epoch 500 - loss: 1.4560
Epoch 1000 - loss: 1.2794
Epoch 1500 - loss: 1.3793
Epoch 2000 - loss: 1.3275
Epoch 2500 - loss: 1.3033
Epoch 3000 - loss: 1.2388
Epoch 3500 - loss: 1.2926
Epoch 4000 - loss: 1.2658
Epoch 4500 - loss: 1.2186
Epoch 5000 - loss: 1.2181
Epoch 5500 - loss: 1.2342
Epoch 6000 - loss: 1.2134
Epoch 6500 - loss: 1.2532
Epoch 7000 - loss: 1.2642
Epoch 7500 - loss: 1.2028
Epoch 8000 - loss: 1.2410
Epoch 8500 - loss: 1.2557
Epoch 9000 - loss: 1.2014
Epoch 9500 - loss: 1.2442


In [15]:
from torch.distributions.categorical import Categorical

def generate_text(model, starting_str, len_generated_text=500):
    """Sample text from a trained character-level model.

    The prompt is fed through the model one character at a time to warm up
    the LSTM state; then characters are drawn one by one from the categorical
    distribution defined by the model's output logits, each sampled character
    becoming the next input.

    Args:
        model: Model exposing ``eval()``, ``parameters()``,
            ``init_hidden(batch_size)`` and ``model(x, hidden, cell)``
            returning ``(logits, hidden, cell)``.
        starting_str: Non-empty prompt; every character must be a key of the
            module-level ``char_to_index`` mapping.
        len_generated_text: Number of characters to sample after the prompt.

    Returns:
        ``starting_str`` followed by ``len_generated_text`` sampled characters.

    Raises:
        ValueError: If ``starting_str`` is empty.
        KeyError: If the prompt contains a character missing from
            ``char_to_index``.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    if not starting_str:
        raise ValueError("starting_str must contain at least one character")

    # Build every tensor on the device the model lives on, so generation
    # works whether or not the model was moved back to the CPU.
    device = next(model.parameters()).device
    encoded_input = torch.tensor(
        [char_to_index[s] for s in starting_str], device=device)
    encoded_input = torch.reshape(encoded_input, (1, -1))

    # Collect the pieces and join once instead of concatenating in the loop.
    pieces = [starting_str]

    model.eval()

    # Inference only: without no_grad the autograd graph would keep growing
    # through the recurrent state on every generated character.
    with torch.no_grad():
        hidden, cell = model.init_hidden(1)
        hidden, cell = hidden.to(device), cell.to(device)

        # Warm up the state on all prompt characters except the last one.
        for c in range(len(starting_str) - 1):
            _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

        # The last prompt character is the first input of the sampling loop.
        last_char = encoded_input[:, -1]
        for _ in range(len_generated_text):
            logits, hidden, cell = model(last_char.view(1), hidden, cell)
            logits = torch.squeeze(logits, 0)
            last_char = Categorical(logits=logits).sample()
            pieces.append(index_to_char[last_char.item()])

    return ''.join(pieces)


# Run generation on the CPU; seed first so the sampled text is reproducible.
model.to('cpu')
torch.manual_seed(0)
print(generate_text(model, 'the emperor', 500))

the emperor!" said he.

"finished! it's all with moscow, it's not get bald hills!" he added the civer with whom and desire to change. they really asked the imperor's field!" she said. alpaty. there happed the cause of the longle matestood itself. "the mercy tiresist between paying so impressions, and till the staff offsicilling petya, the chief dear body, returning quite dispatchma--he turned and ecstatically. "ars doing her dome." said rostov, and the general feelings of the bottom would be the pickled ha


---

Readers may ignore the next cell.

In [16]:
!jupyter nbconvert --to python ch12_part3.ipynb --TemplateExporter.exclude_input_prompt=True

[NbConvertApp] Converting notebook ch12_part3.ipynb to python
[NbConvertApp] Writing 4748 bytes to ch12_part3.py
