# Downloading shakespeare data

Running the cell below creates a folder called 'data' in the session storage and stores the complete works of Shakespeare in it.

In [1]:
import os
import urllib.request
import codecs

DATA_DIR = 'data'
# exist_ok=True keeps the cell re-runnable without a separate
# check-then-create step.
os.makedirs(DATA_DIR, exist_ok=True)

shakespeare_url = 'http://www.gutenberg.org/files/100/100-0.txt'
raw_shakespeare_data = os.path.join(DATA_DIR, 'shakespeare_raw.txt')
# Download only once; later runs reuse the cached copy.
if not os.path.exists(raw_shakespeare_data):
    print('Downloading shakespeare corpus from %s' % shakespeare_url)
    # Download to a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete cached file on the next run.
    partial_download = raw_shakespeare_data + '.part'
    urllib.request.urlretrieve(shakespeare_url, partial_download)
    os.replace(partial_download, raw_shakespeare_data)

# codecs.open works on the file in binary mode, so no newline translation
# happens and every line keeps its original terminator.
with codecs.open(raw_shakespeare_data, 'rb', 'utf-8') as f:
    shakespeare = f.readlines()

# Keep only lines [CORPUS_FIRST_LINE, CORPUS_END_LINE) of the raw file.
# NOTE(review): these offsets are hard-coded for one particular revision of
# the downloaded file; if the upstream file changes they will silently cut at
# the wrong places -- verify against the saved sonnets/plays previews.
CORPUS_FIRST_LINE = 139
CORPUS_END_LINE = 147417
shakespeare = shakespeare[CORPUS_FIRST_LINE:CORPUS_END_LINE]


# Licence-notice lines to strip from the corpus wherever they occur.
# Every entry ends in '\r\n', so only lines with exactly that terminator match.
boilerplate = {
    '<<THIS ELECTRONIC VERSION OF THE COMPLETE WORKS OF WILLIAM\r\n',
    'SHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS\r\n',
    'PROVIDED BY PROJECT GUTENBERG ETEXT OF ILLINOIS BENEDICTINE COLLEGE\r\n',
    'WITH PERMISSION.  ELECTRONIC AND MACHINE READABLE COPIES MAY BE\r\n',
    'DISTRIBUTED SO LONG AS SUCH COPIES (1) ARE FOR YOUR OR OTHERS\r\n',
    'PERSONAL USE ONLY, AND (2) ARE NOT DISTRIBUTED OR USED\r\n',
    'COMMERCIALLY.  PROHIBITED COMMERCIAL DISTRIBUTION INCLUDES BY ANY\r\n',
    'SERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>\r\n',
}
# Set membership makes each per-line check constant time.
shakespeare = [line for line in shakespeare if line not in boilerplate]


# Offsets into the filtered `shakespeare` list that split sonnets from plays.
# Line 2772 itself is written to neither file, exactly as before.
# NOTE(review): presumably a separator line -- confirm against the corpus.
SONNETS_END_LINE = 2772
PLAYS_START_LINE = 2773


def _save_lines_once(path, lines, description):
    """Write `lines` to `path` as UTF-8 unless `path` already exists.

    Args:
        path: destination file path.
        lines: list of strings, each already carrying its line terminator.
        description: word inserted into the progress message.
    """
    if os.path.exists(path):
        return
    print('Saving shakespeare %s file.' % description)
    # Binary mode: the lines are written with the terminators they already have.
    with codecs.open(path, 'wb', 'utf-8') as f:
        f.writelines(lines)


shakespeare_sonnets = os.path.join(DATA_DIR, 'shakespeare_sonnets.txt')
# Slicing (rather than indexing line by line) cannot raise IndexError halfway
# through and leave a truncated file that later runs would treat as complete.
_save_lines_once(shakespeare_sonnets, shakespeare[:SONNETS_END_LINE], 'sonnets')

shakespeare_plays = os.path.join(DATA_DIR, 'shakespeare_plays.txt')
_save_lines_once(shakespeare_plays, shakespeare[PLAYS_START_LINE:], 'plays')
print(f'Data directory \"{DATA_DIR}\" has files {os.listdir(DATA_DIR)}')

Data directory "data" has files ['shakespeare_raw.txt', 'shakespeare_sonnets.txt', 'shakespeare_plays.txt']


Let's see what the first 10 lines of the plays and sonnets look like

In [2]:
# Print the opening ten lines of each saved file as a quick sanity check.
for text in ['shakespeare_plays', 'shakespeare_sonnets']:
    with open(f'{DATA_DIR}/{text}.txt', 'r', encoding="utf8") as f:
        # Pull exactly ten lines off the file iterator and show them as one string.
        print(
            "".join(next(f) for _ in range(10))
        )

ACT I
Scene I. Rossillon. A room in the Countess’s palace.
Scene II. Paris. A room in the King’s palace.
Scene III. Rossillon. A Room in the Palace.


ACT II
Scene I. Paris. A room in the King’s palace.
Scene II. Rossillon. A room in the Countess’s palace.
Scene III. Paris. The King’s palace.

Making a famine where abundance lies,
Thy self thy foe, to thy sweet self too cruel:
Thou that art now the world’s fresh ornament,
And only herald to the gaudy spring,
Within thine own bud buriest thy content,
And, tender churl, mak’st waste in niggarding:
  Pity the world, or else this glutton be,
  To eat the world’s due, by the grave and thee.





# Preprocessing

We have the data now, but it isn't yet in a form usable by a machine learning model.
We'll want to do:


* Tokenization: chopping the text up into individual units. Here, those units will be words
* Vectorization: replacing each token with an index (that will index into an embedding matrix).
* Creating training examples: dividing the list of tokens into sequences of some limited length, with inputs and targets
* Mini-batching: rather than waiting to perform the gradient update after the model sees _all_ the data, you can speed up learning by doing gradient updates after only a few samples.

We can organize the first three of these steps in a `Dataset` class: this will be the interface that goes from text file to vectorized examples.

In [3]:
import os
import torch.nn.functional as F
from torch.utils.data import Dataset
import nltk
nltk.download('punkt')
from nltk import word_tokenize
import operator
import torch
from tqdm import tqdm
UNK_TOKEN = 'UNKNOWN'

class ShakespeareDataset(Dataset):
    """Text corpus served as fixed-length next-token prediction examples.

    The corpus file is tokenized (per word with nltk's ``word_tokenize`` plus
    an explicit ``'\\n'`` token per line, or per character), the most frequent
    tokens are given integer ids, and the whole id sequence is kept in memory.
    Example ``i`` is the pair ``(ids[i*S : (i+1)*S], ids[i*S+1 : (i+1)*S+1])``
    with ``S = seq_length``, i.e. the targets are the inputs shifted by one.

    Args:
        corpus_file: path of the UTF-8 text file to load.
        seq_length: number of tokens per example (must be >= 1).
        word_level: tokenize per (lower-cased) word if True, per character otherwise.
        max_vocab_size: number of most frequent tokens that get their own id.
            The placeholder ``'<unk>'`` is added on top of these, so the
            vocabulary can hold up to ``max_vocab_size + 1`` entries.

    Raises:
        ValueError: if ``seq_length`` is smaller than 1.
    """

    # Placeholder id 0 for every token outside the kept vocabulary.
    # (The module-level UNK_TOKEN constant is not used by this class.)
    _UNK = '<unk>'

    def __init__(self, corpus_file, seq_length,word_level=True,max_vocab_size=10000):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        self.corpus_file = corpus_file
        self.seq_length = seq_length
        self.word_level = word_level
        self.max_vocab_size=max_vocab_size

        self.token_ids = []
        self.token2id = {}
        self.read_and_load_data_into_mem()

        self.vocab = Vocab(self.token2id)

    # Because the datasets are relatively small, we can just load them into memory
    def read_and_load_data_into_mem(self):
        """Tokenize the corpus and fill ``self.token2id`` and ``self.token_ids``.

        Does nothing when ``self.token_ids`` is already populated.
        """
        if len(self.token_ids) != 0:
            return

        print(f"Loading data from {self.corpus_file} into memory")
        temp_tokens = []
        token_counts = {}
        with open(self.corpus_file,'r',encoding='utf-8') as f:
            for line in tqdm(f):
                if self.word_level:
                    # word_tokenize drops the line break, so it is re-added as its own token.
                    token_list = [t.lower() for t in word_tokenize(line)] + ['\n']
                else:
                    token_list = list(line)
                for token in token_list:
                    token_counts[token] = token_counts.get(token, 0) + 1
                    temp_tokens.append(token)

        # Rank tokens by descending count (ties keep first-seen order) and
        # keep at most max_vocab_size of them.
        ranked = sorted(token_counts.items(), key=operator.itemgetter(1), reverse=True)
        if len(ranked) > self.max_vocab_size:
            ranked = ranked[:self.max_vocab_size]
            print('Truncating vocab.')

        # The placeholder always gets id 0. Skipping a corpus token that equals
        # the placeholder keeps the ids dense (0 .. len(token2id) - 1).
        most_common_tokens = [self._UNK] + [t for t, _ in ranked if t != self._UNK]
        self.token2id = {w: i for i, w in enumerate(most_common_tokens)}

        # Finally, convert the token stream into ids; unknown tokens map to the placeholder.
        unk_id = self.token2id[self._UNK]
        self.token_ids = [self.token2id.get(t, unk_id) for t in temp_tokens]

    def __len__(self):
        """Number of examples (not tokens); never negative."""
        # One token is held back because the last example still needs a target
        # for its final position. max() covers the empty-corpus case, where
        # (0 - 1) // seq_length would otherwise be -1.
        return max(0, (len(self.token_ids) - 1) // self.seq_length)

    def __getitem__(self, example_idx):
        """Return ``(inputs, targets)`` as two long tensors of length ``seq_length``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``example_idx`` is out of range. This also lets a
                plain ``for example in dataset`` loop terminate.
        """
        num_examples = len(self)
        idx = operator.index(example_idx)
        if idx < 0:
            idx += num_examples
        if not 0 <= idx < num_examples:
            raise IndexError(
                f"example index {example_idx} out of range for {num_examples} examples"
            )

        start_token_idx = idx * self.seq_length
        end_token_idx = start_token_idx + self.seq_length
        inputs = self.token_ids[start_token_idx:end_token_idx]
        targets = self.token_ids[start_token_idx + 1:end_token_idx + 1]

        # Ids must be int64 ("long") tensors to index an embedding matrix.
        tensor_inputs = torch.tensor(inputs, dtype=torch.long)
        tensor_targets = torch.tensor(targets, dtype=torch.long)

        return tensor_inputs,tensor_targets


# Helper class to easily convert from tokens to indices and back
class Vocab():
    """Two-way lookup between tokens and their integer ids.

    Attributes:
        token2id: the token -> id mapping passed to the constructor (stored as is).
        id2token: the inverse id -> token mapping built from it.
    """

    def __init__(self,token2id):
        self.token2id = token2id
        # Flip every (token, id) pair to obtain the reverse lookup table.
        self.id2token = {token_id: token for token, token_id in token2id.items()}

    def __len__(self):
        # Vocabulary size is the number of known tokens.
        return len(self.token2id)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Let's check out the dataset.


In [4]:
# Build a word-level dataset over the plays, cut into 32-token examples.
corpus = shakespeare_plays
ds = ShakespeareDataset(corpus, seq_length=32)
v = ds.vocab

# Number of examples (not tokens).
print(len(ds))

# Show the first example both as raw ids and decoded back into tokens.
example_input_idxs, example_target_idxs = ds[0]
print(example_input_idxs)
print([v.id2token[idx] for idx in example_input_idxs.tolist()])

844it [00:00, 8435.42it/s]

Loading data from data\shakespeare_plays.txt into memory


144505it [00:22, 6350.18it/s]


Truncating vocab.
37120
tensor([ 269,    6,    1,  117,  732, 2229,    3,   11,  367,   14,    4,  733,
          10,   37,  461,    3,    1,  117,  360,    3,  624,    3,   11,  367,
          14,    4,   53,   10,   37,  461,    3,    1])
['act', 'i', '\n', 'scene', 'i.', 'rossillon', '.', 'a', 'room', 'in', 'the', 'countess', '’', 's', 'palace', '.', '\n', 'scene', 'ii', '.', 'paris', '.', 'a', 'room', 'in', 'the', 'king', '’', 's', 'palace', '.', '\n']


# Model specification


In the code comments we'll use 

* B to denote minibatch size
* L to denote number of timesteps / sequence length
* V to denote the vocabulary size = dimension of the one-hot embeddings
* E to denote (dense) embedding dimension
* H to denote the LSTM hidden dimension
* N to denote the number of LSTM layers



In [5]:
%%capture
!pip install pytorch_lightning

We'll use the pytorch lightning framework to organize our code: this includes creating the logic for training steps, optimization, validation etc. inside your model class. The pytorch lightning library then makes sure to call these at the right time.

In [6]:
import pytorch_lightning as pl
import torch.nn as nn
import torch


class TimeDistributed(nn.Module):
    """Apply a wrapped module to every position of a sequence tensor.

    Inputs with more than two axes are flattened to 2-D (all leading axes
    merged, last axis kept), passed through ``module``, and reshaped back to
    3-D. Inputs with at most two axes are passed to ``module`` unchanged.

    Args:
        module: the module to apply.
        batch_first: selects which input axis is pinned when restoring the
            3-D shape. True keeps ``x.size(0)`` as the first output axis,
            False keeps ``x.size(1)`` as the second output axis.
    """

    def __init__(self, module, batch_first=False):
        super().__init__()
        self.module = module
        self.batch_first = batch_first

    def forward(self, x):
        # Nothing to fold for 1-D / 2-D input.
        if x.dim() <= 2:
            return self.module(x)

        # Merge all leading axes into one so the module sees a 2-D tensor.
        flat_out = self.module(x.contiguous().view(-1, x.size(-1)))
        out_features = flat_out.size(-1)

        # Un-merge the leading axes again.
        if self.batch_first:
            return flat_out.contiguous().view(x.size(0), -1, out_features)
        return flat_out.view(-1, x.size(1), out_features)


class GeneratorLSTM(pl.LightningModule):
    """Embedding -> stacked LSTM -> linear scorer for next-token prediction.

    Shape legend used in the comments: B batch size, L sequence length,
    V vocabulary size, E embedding dim, H LSTM hidden dim, N LSTM layers.

    The recurrent state is cached on ``self.hidden`` whenever ``forward`` is
    called with ``remember_hidden=True``, and is fed back in on the next call.
    It is cleared at the start of every training epoch, around every
    validation run and when fitting ends.

    Args:
        vocab_size: number of distinct token ids (V).
        emb_dim: embedding dimension (E).
        hidden_dim: LSTM hidden dimension (H).
        num_layers: number of stacked LSTM layers (N); defaults to 2.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers=2):
        super().__init__()
        self.embedder = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim,num_layers,batch_first=True)
        self.out = TimeDistributed(nn.Linear(hidden_dim, vocab_size))
        # Cached (h, c) tuple from the previous call, or None for a zero state.
        self.hidden = None

    def reset_hidden(self):
        """Forget the cached recurrent state; the next call starts from zeros."""
        self.hidden = None

    def forward(self, inputs,remember_hidden=False):
        """Score the next token at every position.

        Args:
            inputs: long tensor of token ids, [B, L].
            remember_hidden: if True, cache the (detached) final LSTM state
                on ``self.hidden`` for the next call.

        Returns:
            Unnormalised vocabulary scores, [B, L, V].
        """
        embeddings = self.embedder(inputs)  # step 1: [B,L] -> [B,L,E]

        # step 2: [B,L,E] (+ optional state) -> [B,L,H], (h, c).
        # h and c are [N,B,H] even though the LSTM is batch_first.
        state = self.hidden
        if state is not None and (
            state[0].size(1) != inputs.size(0) or state[0].device != embeddings.device
        ):
            # A cached state from a different batch size or device cannot be
            # fed back into the LSTM, so fall back to a zero state.
            state = None
        if state is None:
            lstm_hidden_states, new_hidden = self.lstm(embeddings)
        else:
            lstm_hidden_states, new_hidden = self.lstm(embeddings, state)

        # step 3: [B,L,H] -> [B,L,V]
        vocab_scores = self.out(lstm_hidden_states)
        if remember_hidden:
            # Detach so gradients do not flow back into earlier batches.
            self.hidden = tuple(el.detach() for el in new_hidden)
        return vocab_scores

    def _sequence_loss(self, batch):
        """Mean cross-entropy over all B*L positions of one (inputs, targets) batch."""
        inputs, targets = batch
        vocab_scores = self(inputs,remember_hidden=True)
        V = vocab_scores.size(-1)
        # Flatten batch and time so every position is scored in parallel.
        return nn.functional.cross_entropy(vocab_scores.reshape(-1, V), targets.reshape(-1))

    def training_step(self, train_batch, batch_idx):
        loss = self._sequence_loss(train_batch)
        self.log('train_loss', loss)
        return loss

    # We can use performance on a held-out validation set to get a feeling for when we are overfitting on the training data.
    def validation_step(self, val_batch, batch_idx):
        loss = self._sequence_loss(val_batch)
        self.log('val_loss', loss)

    # The optimizer doesn't influence the gradient. However, a good optimizer makes good decisions about the learning rate for different parameters: sometimes it's better to take big steps, sometimes small steps are better.
    # The Adam optimizer is a very good default
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    # The generic `on_epoch_end` hook is no longer available in newer
    # pytorch_lightning releases, so the state is reset through the
    # stage-specific hooks instead.
    def on_train_epoch_start(self) -> None:
        self.reset_hidden()

    def on_validation_epoch_start(self) -> None:
        # Validation text does not continue the training text.
        self.reset_hidden()

    def on_validation_epoch_end(self) -> None:
        # Do not let validation state leak back into training.
        self.reset_hidden()

    def on_fit_end(self) -> None:
        # Leave a clean model behind for generation after training.
        self.reset_hidden()

Let's see what an untrained model predicts.


In [7]:
# Model hyper-parameters; the vocabulary size comes from the word-level dataset.
hidden_dim = 128
emb_dim = 50
vocab_size = len(ds.token2id)

model = GeneratorLSTM(vocab_size, emb_dim, hidden_dim)

# Take the inputs of the first example and add a dummy batch dimension,
# since the model expects batched input.
sample_input, _ = ds[0]
output_scores = model(sample_input.unsqueeze(0))

# Decode the input ids and the highest-scoring prediction at every position.
print(
    "INPUT: ",
    " ".join(v.id2token[idx] for idx in sample_input.tolist()),
)
print(
    "PREDICTED NEXT WORDS: ",
    " ".join(v.id2token[idx] for idx in output_scores.argmax(dim=-1)[0].tolist()),
)



INPUT:  act i 
 scene i. rossillon . a room in the countess ’ s palace . 
 scene ii . paris . a room in the king ’ s palace . 

PREDICTED NEXT WORDS:  chop shroud shroud hole shroud hole wart wart wart shroud shroud shroud shroud stubbornness shroud wart wart wart wart wart wart wart wart wart shroud shroud shroud shroud shroud shroud wart wart


We want our model to learn on long consecutive sequences of text.
To speed things up, we'll also want to parallelize: work in minibatches of text at the same time, after each of which a learning step is performed.
We want to make sure the batches don't interfere with having long consecutive sequences.
Hence, let's make a custom sampler

In [8]:
from torch.utils.data import Sampler

class CustomBatchSampler(Sampler):
    """Batch sampler in which each batch position walks through its own run of indices.

    The index range is cut into ``batch_size`` equal runs of
    ``len(dataset) // batch_size`` indices. Batch ``j`` holds the ``j``-th index
    of every run, so position ``i`` of successive batches steps through
    consecutive indices. Leftover indices that do not fill a run are never
    yielded.

    Example: 6 indices (0 1 2 3 4 5) with batch size 2 yield
    [0, 3], [1, 4], [2, 5].
    """

    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size
        # Length of each run == number of batches per pass.
        self.per_batchslice_dataset_len = len(dataset) // batch_size

    def __iter__(self):
        run_length = self.per_batchslice_dataset_len
        for step in range(run_length):
            # One index per run, all at the same offset `step` into their run.
            yield [lane * run_length + step for lane in range(self.batch_size)]

    def __len__(self):
        return len(self.dataset) // self.batch_size


... not great yet. Let's train it!
First, let's split our data into a training dataset and a validation dataset.

In [9]:
import math
from torch.utils.data import DataLoader
batch_size=32
cutoff = math.floor(.95*len(ds))
# Split contiguously (first 95% for training, last 5% for validation) instead
# of with random_split: a random split shuffles the example indices, so the
# "consecutive" examples that CustomBatchSampler lines up for the model's
# carried-over hidden state would no longer be consecutive in the text.
training_ds = torch.utils.data.Subset(ds, range(cutoff))
validation_ds = torch.utils.data.Subset(ds, range(cutoff, len(ds)))
print(len(training_ds),len(validation_ds))
train_loader = DataLoader(training_ds,batch_sampler=CustomBatchSampler(training_ds, batch_size))
val_loader = DataLoader(validation_ds,batch_sampler=CustomBatchSampler(validation_ds, batch_size))



35264 1856


Then, training is simply a matter of creating a trainer object, and calling fit on it

In [10]:
from torch.utils.data import DataLoader
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import math

batch_size=32
# Train until the wall-clock budget runs out, or earlier if the monitored
# validation loss triggers early stopping.
trainer = pl.Trainer(
    max_time={"minutes": 0.5},  # Max minutes of training
    callbacks=[EarlyStopping(monitor="val_loss")],
)
trainer.fit(model, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name     | Type            | Params
---------------------------------------------
0 | embedder | Embedding       | 500 K 
1 | lstm     | LSTM            | 224 K 
2 | out      | TimeDistributed | 1.3 M 
---------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
8.058     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  rank_zero_warn(
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Time limit reached. Elapsed time is 0:00:30. Signaling Trainer to stop.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




With our trained model, let's try to generate some text

In [11]:
def generate_text(model, seed_tokens, temperature, vocab, N):
    """Sample ``N`` tokens from ``model`` after priming it with ``seed_tokens``.

    The model is called with ``remember_hidden=True`` so that its recurrent
    state carries over from one generated token to the next. This relies on
    the model keeping that state on ``model.hidden``; it is cleared before
    priming and again when generation finishes, so leftovers from training or
    from an earlier call cannot leak in.

    Args:
        model: callable as ``model(ids, remember_hidden=True)`` on a [1, L]
            long tensor, returning scores of shape [1, L, V].
        seed_tokens: non-empty list of tokens used to prime the model. Tokens
            missing from the vocabulary fall back to ``'<unk>'`` when the
            vocabulary has that entry.
        temperature: sampling temperature passed to ``sample``.
        vocab: object exposing ``token2id`` and ``id2token`` mappings.
        N: number of tokens to generate; values <= 0 yield an empty list.

    Returns:
        List of the ``N`` generated tokens (the seed is not included).

    Raises:
        ValueError: if ``seed_tokens`` is empty.
        KeyError: if a seed token is unknown and the vocabulary has no ``'<unk>'``.
    """
    if N <= 0:
        return []
    if len(seed_tokens) == 0:
        raise ValueError("seed_tokens must contain at least one token")

    unk_id = vocab.token2id.get('<unk>')
    seed_token_ids = []
    for w in seed_tokens:
        if w in vocab.token2id:
            seed_token_ids.append(vocab.token2id[w])
        elif unk_id is not None:
            seed_token_ids.append(unk_id)
        else:
            raise KeyError(w)

    # Build inputs on the model's device (falls back to the default device
    # for a model without parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    generated_text = []
    model.hidden = None
    try:
        # No gradients are needed while sampling.
        with torch.no_grad():
            # Prime the recurrent state with the whole seed in one call.
            scores = model(torch.tensor([seed_token_ids], device=device), remember_hidden=True)
            next_token_idx = sample(scores[0, -1, :], temperature)
            generated_text.append(vocab.id2token[next_token_idx])

            # Then feed each sampled token back in, one step at a time.
            for _ in range(N - 1):
                step_input = torch.tensor([[next_token_idx]], device=device)  # [1, 1]: batch and sequence axes
                scores = model(step_input, remember_hidden=True)
                next_token_idx = sample(scores[0, -1, :], temperature)
                generated_text.append(vocab.id2token[next_token_idx])
    finally:
        model.hidden = None
    return generated_text

def sample(token_probs, temperature=1.0):
    """Draw one token id from a 1-D tensor of unnormalised scores.

    The scores are divided by ``temperature`` and passed through a softmax;
    one index is then drawn from the resulting distribution. A temperature of
    exactly 0 would divide by zero, so it is treated as greedy decoding and
    returns the index of the highest score.

    Args:
        token_probs: 1-D tensor of scores (softmax is applied here, so these
            are unnormalised scores despite the parameter name).
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it; 0 selects the arg-max.

    Returns:
        The sampled index as a Python int.
    """
    if temperature == 0:
        return int(torch.argmax(token_probs))
    temped_probs = token_probs / temperature
    softmaxed_probs = F.softmax(temped_probs, dim=0)
    sampled_idx = torch.multinomial(softmaxed_probs, 1)
    return int(sampled_idx)


from nltk import word_tokenize
seed_str = '''To be or not to be'''
# Tokenize and lower-case the seed the same way the word-level dataset does.
seed_str = list(map(str.lower, word_tokenize(seed_str)))
generated_text = generate_text(
    model,
    seed_str,
    1.1,       # sampling temperature
    ds.vocab,
    300,       # number of tokens to generate
)

In [12]:
# Show the seed followed by the generated continuation, separated by spaces.
print(' '.join([*seed_str, *generated_text]))

to be or not to be cried daughter him stout 
 . and 
 
 you mowbray have she 
 by ajax withal leader 
 knocking good was can ? lord th having april , hang , so seek turn the out and noble spirits bold king garland 
 norfolk nothing up thou that such thee mother . pyramus first 
 macbeth build business 
 dare ladies , , gossips hear trot naked their mine heard employ 
 ! all ! angel just of prove can 
 ! sound thoughts therefore away his caius be . cold pity the 
 it other and winchester strange my . office us are days virgilia last i comes woman ; he 
 curtis dare liege volumnia 
 even my servant with though timon few stern . duke count each 
 , , thee spies be unless , know make more take world snow ; for can 
 how palace day in 
 so ! say <unk> priam any a of either . should babbling 
 durst 
 doubt sicken those . 
 and nay chamber were ; 
 ? , , extended off , 
 to sampson <unk> merry brutus true fortress what night 
 doth 
 day face i ay call benedick their , breath no ] pucelle fo

Finally, go back and include the option to work at the character level in the ShakespeareDataset class.

In [13]:
# Same corpus, tokenized per character, in 64-token examples.
c_ds = ShakespeareDataset(corpus_file=corpus, seq_length=64, word_level=False)

6129it [00:00, 60691.01it/s]

Loading data from data\shakespeare_plays.txt into memory


144505it [00:02, 62638.07it/s]


In [30]:
# Same architecture as the word-level model, sized for the character vocabulary.
hidden_dim, emb_dim = 128, 50
vocab_size = len(c_ds.token2id)

c_model = GeneratorLSTM(vocab_size, emb_dim, hidden_dim)

In [31]:
# Define the batch size before it is used, so the cell does not depend on a
# value left over from an earlier cell.
batch_size=32
cutoff = math.floor(.95*len(c_ds))
# Contiguous split (first 95% train, last 5% validation): a random split would
# shuffle the example indices and break the consecutive-text ordering that
# CustomBatchSampler sets up for the model's carried-over hidden state.
training_ds = torch.utils.data.Subset(c_ds, range(cutoff))
validation_ds = torch.utils.data.Subset(c_ds, range(cutoff, len(c_ds)))
train_loader = DataLoader(training_ds,batch_sampler=CustomBatchSampler(training_ds,batch_size))
val_loader = DataLoader(validation_ds,batch_sampler=CustomBatchSampler(validation_ds,batch_size))
trainer = pl.Trainer(max_time={"minutes": 5},callbacks=[EarlyStopping(monitor="val_loss")]) # Max minutes of training
trainer.fit(c_model, train_loader,val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name     | Type            | Params
---------------------------------------------
0 | embedder | Embedding       | 5.0 K 
1 | lstm     | LSTM            | 224 K 
2 | out      | TimeDistributed | 13.0 K
---------------------------------------------
242 K     Trainable params
0         Non-trainable params
242 K     Total params
0.969     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Time limit reached. Elapsed time is 0:05:00. Signaling Trainer to stop.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [32]:
seed_str = '''To be or not to be'''
# Character-level model: every character of the seed is one token (case is kept).
seed_str = list(seed_str)
generated_text = generate_text(
    c_model,
    seed_str,
    1.1,          # sampling temperature
    c_ds.vocab,
    300,          # number of characters to generate
)
print(''.join(seed_str + generated_text))



To be or not to bestint treatior,s are bre’fumallfor. Alord, in theich was celions tle his man’lf:
Shukic?

DER THIY. Cowesy wealded be sladI.
:
Thom; as sroutun hiel outs not mounk them the when’s; ald for’d maes an thee, Ancuincle, yor wlule uparcer-sary’G Torat iffelg, le thut it]Cen; by “ings cosciul monds a book
