In [1]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
with open('../PYTORCH_NOTEBOOKS/Data/shakespeare.txt','r',encoding='utf8') as f:
    text = f.read()

In [10]:
text[:1000]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl mak'st waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the world's due, by the grave and thee.\n\n\n                     2\n  When forty winters shall besiege thy brow,\n  And dig deep trenches in thy beauty's field,\n  Thy youth's proud livery so gazed on now,\n  Will be a tattered weed of small worth held:  \n  Then being asked, where all thy beauty lies,\n  Where all the treasure of thy lusty days;\n  To sa

In [11]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [12]:
len(text)

5445609

## Encode Entire Text

In [14]:
all_characters = set(text)
#all_characters

In [77]:
len(all_characters)

84

In [20]:
# num --> letter
decoder = dict(enumerate(all_characters))
#decoder

In [21]:
# letter --> num
encoder = {char: ind for ind, char in decoder.items()} # Dictionary Comprehension
#encoder

In [22]:
encoded_text = np.array([encoder[char] for char in text])
encoded_text[:500]

array([78,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6, 67, 78,  6,  6, 40, 61, 19, 28,  6,  7, 69, 14,
       61, 82, 73, 52,  6, 13, 61, 82, 69, 52, 72, 61, 82, 73,  6, 43, 82,
        6,  3, 82, 73, 14, 61, 82,  6, 14, 38, 13, 61, 82, 69, 73, 82, 16,
       78,  6,  6, 80, 37, 69, 52,  6, 52, 37, 82, 61, 82, 74, 35,  6, 74,
       82, 69, 72, 52, 35, 83, 73,  6, 61, 19, 73, 82,  6, 28, 14, 41, 37,
       52,  6, 38, 82, 33, 82, 61,  6,  3, 14, 82, 16, 78,  6,  6, 79, 72,
       52,  6, 69, 73,  6, 52, 37, 82,  6, 61, 14, 49, 82, 61,  6, 73, 37,
       19, 72, 50,  3,  6, 74, 35,  6, 52, 14, 28, 82,  6,  3, 82, 13, 82,
       69, 73, 82, 16, 78,  6,  6, 81, 14, 73,  6, 52, 82, 38,  3, 82, 61,
        6, 37, 82, 14, 61,  6, 28, 14, 41, 37, 52,  6, 74, 82, 69, 61,  6,
       37, 14, 73,  6, 28, 82, 28, 19, 61, 35,  9, 78,  6,  6, 79, 72, 52,
        6, 52, 37, 19, 72,  6, 13, 19, 38, 52, 61, 69, 13, 52, 82,  3,  6,
       52, 19,  6, 52, 37

## One Hot Encoding

As previously discussed, we need to one-hot encode our data inorder for it to work with the network structure. Make sure to review numpy if any of these operations confuse you!

In [23]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    encoded_text : batch of encoded text
    
    num_uni_chars = number of unique characters (len(set(text)))
    '''
    
    # METHOD FROM:
    # https://stackoverflow.com/questions/29831489/convert-encoded_textay-of-indices-to-1-hot-encoded-numpy-encoded_textay
      
    # Create a placeholder for zeros.
    one_hot = np.zeros((encoded_text.size, num_uni_chars))
    
    # Convert data type for later use with pytorch (errors if we dont!)
    one_hot = one_hot.astype(np.float32)

    # Using fancy indexing fill in the 1s at the correct index locations
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0
    

    # Reshape it so it matches the batch shape
    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))
    
    return one_hot

In [24]:
one_hot_encoder(np.array([1,2,0]),3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

## Creating Training Batches

We need to create a function that will generate batches of characters along with the next character in the sequence as a label.

In [25]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    
    '''
    Generate (using yield) batches for training.
    
    X: Encoded Text of length seq_len
    Y: Encoded Text shifted by one
    
    Example:
    
    X: [[1 2 3]]
    Y: [[ 2 3 4]]
    
    encoded_text : Complete Encoded Text to make batches from
    batch_size : Number of samples per batch
    seq_len : Length of character sequence
       
    '''
    
    # Total number of characters per batch
    # Example: If samp_per_batch is 2 and seq_len is 50, then 100
    # characters come out per batch.
    char_per_batch = samp_per_batch * seq_len
    
    
    # Number of batches available to make
    # Use int() to round to nearest integer
    num_batches_avail = int(len(encoded_text)/char_per_batch)
    
    # Cut off end of encoded_text that
    # won't fit evenly into a batch
    encoded_text = encoded_text[:num_batches_avail * char_per_batch]
    
    
    # Reshape text into rows the size of a batch
    # Rows is number of samples per batch
    encoded_text = encoded_text.reshape((samp_per_batch, -1))
    

    # Go through each row in array.
    for n in range(0, encoded_text.shape[1], seq_len):
        
        # Grab feature characters
        x = encoded_text[:, n:n+seq_len]
        
        # y is the target shifted over by 1
        y = np.zeros_like(x)
       
        #
        try:
            y[:, :-1] = x[:, 1:]
            y[:, -1]  = encoded_text[:, n+seq_len]
            
        # FOR POTENTIAL INDEXING ERROR AT THE END    
        except:
            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, 0]
            
        yield x, y

### Example of generating a batch

In [44]:
sample_text = encoded_text[:40]
sample_text

array([78,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6, 67, 78,  6,  6, 40, 61, 19, 28,  6,  7, 69, 14,
       61, 82, 73, 52,  6, 13])

In [45]:
batch_generator = generate_batches(sample_text,
                                   samp_per_batch=2,
                                   seq_len=5)

In [46]:
# Grab first batch
x, y = next(batch_generator)
print(x)
print("-"*20)
print(y)

[[78  6  6  6  6]
 [ 6  6 67 78  6]]
--------------------
[[ 6  6  6  6  6]
 [ 6 67 78  6  6]]


In [47]:
# Grab second batch
x, y = next(batch_generator)
print(x)
print("-"*20)
print(y)

[[ 6  6  6  6  6]
 [ 6 40 61 19 28]]
--------------------
[[ 6  6  6  6  6]
 [40 61 19 28  6]]


# Creating the LSTM Model

**Note! We will have options for GPU users and CPU users. CPU will take MUCH LONGER to train and you may encounter RAM issues depending on your hardware.**

In [48]:
class CharModel(nn.Module):
    
    def __init__(self,
                 all_chars,
                 num_hidden=256,
                 num_layers=4,
                 drop_prob=0.5,
                 use_gpu=False):
                
        # SET UP ATTRIBUTES
        super().__init__()
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu
        
        #CHARACTER SET, ENCODER, and DECODER
        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char: ind for ind,char in decoder.items()}
        
        # Input is OHE so size is set appropriately
        self.lstm = nn.LSTM(input_size = len(self.all_chars),
                            hidden_size = num_hidden,
                            num_layers = num_layers,
                            dropout = drop_prob,
                            batch_first = True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        # Output is OHE so size is set appropriately
        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))
      
    
    def forward(self, x, hidden):
        """
        The forward method here is different from the time series data. In 
        the TS data, we had a fixed window for X and we were predicting single value of y (1 future value). More importantly. the model was 
        trained with the same length of data as the predictions. Hence we did
        not care about the hidden state (and this was kept internal to the model).
        
        This case is however slightly different. We are training the model
        on a sequence of say 100 characters (so we learn long term memory effects),
        but when generating text, we are not feeding it 100 characters (since we 
        dont have that many to start with anyway). We are only feeding in 1 
        character at a time (previous prediction) along with the previous hidden 
        state to predict the next character. Hence we need to not encapsulate the
        hidden state inside the model, rather pass it to the forward method so 
        that the training and generation can use different number of X sequence 
        values (100 suring training, but only one at a time during generation)
        """
        
        lstm_output, hidden = self.lstm(x, hidden)
                
        drop_output = self.dropout(lstm_output)
        
        # Reshaping it so it can be fed to the FC layer
        drop_output = drop_output.contiguous().view(-1, self.num_hidden)
        
        final_out = self.fc_linear(drop_output)
                
        return final_out, hidden
    
    
    def hidden_state(self, batch_size):
        '''
        Used as separate method to account for both GPU and CPU users.
        '''
        
        if self.use_gpu:
            
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden))
        
        return hidden
        

## Instance of the Model

In [49]:
model = CharModel(
    all_chars=all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [51]:
total_param  = []
for p in model.parameters():
    total_param.append(int(p.numel()))
    
sum(total_param)

5470292

## Optimizer and Loss

In [52]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

## Training Data and Validation Data

In [53]:
# percentage of data to be used for training
train_percent = 0.1

print(len(encoded_text))
print(int(len(encoded_text) * (train_percent)))

5445609
544560


In [54]:
train_ind = int(len(encoded_text) * (train_percent))

In [55]:
train_data = encoded_text[:train_ind]
val_data = encoded_text[train_ind:]

# Training the Network

## Variables

Feel free to play around with these values!

In [57]:
## VARIABLES

# Epochs to train for
epochs = 50
# batch size 
batch_size = 128

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of characters in text
num_char = max(encoded_text)+1

In [91]:
# hidden = (torch.zeros(2,3,4).cuda(),
#           torch.zeros(2,3,4).cuda())
# hidden

In [92]:
# hidden2 = tuple([state.data for state in hidden])
# hidden2

In [81]:
# Set model to train
model.train()

# Check to see if using GPU
if model.use_gpu:
    model.cuda()

for i in range(epochs):
    
    hidden = model.hidden_state(batch_size)
        
    for x,y in generate_batches(train_data,batch_size,seq_len):
        
        tracker += 1
        
        # One Hot Encode incoming data
        x = one_hot_encoder(x,num_char)
        # Note that OHE is not needed for Y for CrossEntropyLoss as per
        # https://pytorch.org/docs/stable/nn.html#crossentropyloss
        
        # Convert Numpy Arrays to Tensor
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        
        # print("Target Shape: {}".format(targets.shape))
        ## Target Shape: torch.Size([128, 100])

        # Adjust for GPU if necessary
        if model.use_gpu: 
            inputs = inputs.cuda()
            targets = targets.cuda()
            
        # Reset Hidden State
        # If we dont' reset we would backpropagate through all training history
        
        # print("Before Shape: {} {}".format(hidden[0].shape, hidden[1].shape))
        hidden = tuple([state.data for state in hidden])
        # print("After Shape: {} {}".format(hidden[0].shape, hidden[1].shape))
        
        model.zero_grad()
        
        lstm_output, hidden = model.forward(inputs,hidden)
        
        # print("LSTM Output Shape: {}".format(lstm_output.shape))
        # # LSTM Output Shape: torch.Size([12800, 84])
        
        # Note that OHE is not needed for Y for CrossEntropyLoss as per
        # https://pytorch.org/docs/stable/nn.html#crossentropyloss
        loss = criterion(lstm_output,
                         targets.view(batch_size*seq_len).long())
        
        loss.backward()
        # POSSIBLE EXPLODING GRADIENT PROBLEM! LET"S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
        optimizer.step()
                
        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################
        
        if tracker % 25 == 0:
            
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()
            
            for x,y in generate_batches(val_data,batch_size,seq_len):
                
                # One Hot Encode incoming data
                x = one_hot_encoder(x,num_char)
                
                # Convert Numpy Arrays to Tensor
                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)

                # Adjust for GPU if necessary

                if model.use_gpu:
                    inputs = inputs.cuda()
                    targets = targets.cuda()
                    
                # Reset Hidden State
                # If we dont' reset we would backpropagate through 
                # all training history
                val_hidden = tuple([state.data for state in val_hidden])
                
                lstm_output, val_hidden = model.forward(inputs,val_hidden)
                val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
                val_losses.append(val_loss.item())
            
            # Reset to training model after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

Epoch: 0 Step: 75 Val Loss: 3.196232318878174
Epoch: 0 Step: 100 Val Loss: 3.081327438354492
Epoch: 1 Step: 125 Val Loss: 3.0036447048187256
Epoch: 1 Step: 150 Val Loss: 2.8926119804382324
Epoch: 2 Step: 175 Val Loss: 2.75602650642395
Epoch: 3 Step: 200 Val Loss: 2.6580865383148193
Epoch: 3 Step: 225 Val Loss: 2.5415375232696533
Epoch: 4 Step: 250 Val Loss: 2.4562277793884277
Epoch: 4 Step: 275 Val Loss: 2.4034595489501953
Epoch: 5 Step: 300 Val Loss: 2.359745979309082
Epoch: 6 Step: 325 Val Loss: 2.315908193588257
Epoch: 6 Step: 350 Val Loss: 2.271934986114502
Epoch: 7 Step: 375 Val Loss: 2.2370591163635254
Epoch: 7 Step: 400 Val Loss: 2.2066919803619385
Epoch: 8 Step: 425 Val Loss: 2.1765236854553223
Epoch: 9 Step: 450 Val Loss: 2.145747184753418
Epoch: 9 Step: 475 Val Loss: 2.122340440750122
Epoch: 10 Step: 500 Val Loss: 2.095566749572754
Epoch: 10 Step: 525 Val Loss: 2.074287176132202
Epoch: 11 Step: 550 Val Loss: 2.0558643341064453
Epoch: 12 Step: 575 Val Loss: 2.036259651184082
E

In [83]:
# Backup model saved as hidden512_layers3_shakes_ng.net
model_name = 'hidden512_layers3_shakes.net'
torch.save(model.state_dict(),model_name)

## Load Model

In [84]:
# MUST MATCH THE EXACT SAME SETTINGS AS MODEL USED DURING TRAINING!

model = CharModel(
    all_chars=all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [85]:
model.load_state_dict(torch.load(model_name))
model.eval()

CharModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

# Generating Predictions

In [86]:
def predict_next_char(model, char, hidden=None, k=1):
        """
        char: Single Character
        k: returns k largest probabilities
        """
        
        # Encode raw letters with model
        encoded_text = model.encoder[char]
        
        # set as numpy array for one hot encoding
        # NOTE THE [[ ]] dimensions!!
        encoded_text = np.array([[encoded_text]])
        
        # One hot encoding
        encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))
        
        # Convert to Tensor
        inputs = torch.from_numpy(encoded_text)
        
        # Check for CPU
        if(model.use_gpu):
            inputs = inputs.cuda()
              
        # Grab hidden states
        hidden = tuple([state.data for state in hidden])
          
        # Run model and get predicted output
        lstm_out, hidden = model(inputs, hidden)

        # Convert lstm_out to probabilities
        probs = F.softmax(lstm_out, dim=1).data
        
        if(model.use_gpu):
            # move back to CPU to use with numpy
            probs = probs.cpu()
            
        # k determines how many characters to consider
        # for our probability choice.
        # https://pytorch.org/docs/stable/torch.html#torch.topk
        
        # Return k largest probabilities in tensor
        probs, index_positions = probs.topk(k)
        
        index_positions = index_positions.numpy().squeeze()
        
        # Create array of probabilities
        probs = probs.numpy().flatten()
        
        # Convert to probabilities per index
        probs = probs/probs.sum()
        
        # randomly choose a character based on probabilities
        char = np.random.choice(index_positions, p=probs)
       
        # return the encoded value of the predicted char and the hidden state
        return model.decoder[char], hidden

In [89]:
def generate_text(model, size, seed='The', k=1):
    """
    size: how many characters we want to predict
    """    
    # CHECK FOR GPU
    if(model.use_gpu):
        model.cuda()
    else:
        model.cpu()
    
    # Evaluation mode
    model.eval()
    
    # begin output from initial seed
    output_chars = [c for c in seed]
    
    # intiate hidden state
    hidden = model.hidden_state(batch_size = 1)
    
    # predict the next character for every character in seed
    for char in seed:
        char, hidden = predict_next_char(model, char, hidden, k=k)
    
    # add initial characters to output
    output_chars.append(char)
    
    # Now generate for size requested
    for i in range(size):
        
        # predict based off very last letter in output_chars
        char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)
        
        # add predicted character
        output_chars.append(char)
    
    # return string of predicted text
    return ''.join(output_chars)

In [90]:
print(generate_text(model, 1000, seed='The ', k=3))

The see her sead against me.
    I have not so best something thee all. They have seen
    And best that strange one to his father word.
  CELIA. There would I spoke a worthy. If thou, I shall not seem the forest
    and ston to her fool. Think in the can that's true too morth to
    the stranger of the court, and this in my soldier winds of his
    sone, and the war of merching of the strights of the counterfore
    hath taunt the can that which time have stoly that has the master, with
    him than I have sent.
  CLOWN. What will you, sir, and she within the content of this
    competies, and stants are all men. I have speak as he hath
    and servance.
  COUNTESS. I am not thine one, and what thou art not so many
    that we shall not be.
  CELIA. If this well beauty, then we see him. If I with me.
  ROSALIND. I will stay to me.
  ROSALIND. What, she it spoke than your braves of the canson more of the world
    as you to her this to him.
  COUNTESS. With him and me, my lord, I will 