# Pride and prejudice text generation

## Imports and loading the data

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
with open('../data/pride_and_prejudice.txt','r',encoding='utf8') as f:
    text = f.read()

In [3]:
text[:1000]

'Chapter 1\n\n\nIt is a truth universally acknowledged, that a single man in possession\nof a good fortune, must be in want of a wife.\n\nHowever little known the feelings or views of such a man may be on his\nfirst entering a neighbourhood, this truth is so well fixed in the minds\nof the surrounding families, that he is considered the rightful property\nof some one or other of their daughters.\n\n“My dear Mr. Bennet,” said his lady to him one day, “have you heard that\nNetherfield Park is let at last?”\n\nMr. Bennet replied that he had not.\n\n“But it is,” returned she; “for Mrs. Long has just been here, and she\ntold me all about it.”\n\nMr. Bennet made no answer.\n\n“Do you not want to know who has taken it?” cried his wife impatiently.\n\n“_You_ want to tell me, and I have no objection to hearing it.”\n\nThis was invitation enough.\n\n“Why, my dear, you must know, Mrs. Long says that Netherfield is taken\nby a young man of large fortune from the north of England; that he came\ndow

In [4]:
print(text[:1000])

Chapter 1


It is a truth universally acknowledged, that a single man in possession
of a good fortune, must be in want of a wife.

However little known the feelings or views of such a man may be on his
first entering a neighbourhood, this truth is so well fixed in the minds
of the surrounding families, that he is considered the rightful property
of some one or other of their daughters.

“My dear Mr. Bennet,” said his lady to him one day, “have you heard that
Netherfield Park is let at last?”

Mr. Bennet replied that he had not.

“But it is,” returned she; “for Mrs. Long has just been here, and she
told me all about it.”

Mr. Bennet made no answer.

“Do you not want to know who has taken it?” cried his wife impatiently.

“_You_ want to tell me, and I have no objection to hearing it.”

This was invitation enough.

“Why, my dear, you must know, Mrs. Long says that Netherfield is taken
by a young man of large fortune from the north of England; that he came
down on Monday in a chaise and fo

In [5]:
len(text)

684743

## Encode entire text

We create an encoder and a decoder for each character in the text:

In [6]:
all_characters = sorted(list(set(text)))

In [7]:
decoder = dict(enumerate(all_characters))

In [8]:
decoder.items()

dict_items([(0, '\n'), (1, ' '), (2, '!'), (3, "'"), (4, '('), (5, ')'), (6, '*'), (7, ','), (8, '-'), (9, '.'), (10, '0'), (11, '1'), (12, '2'), (13, '3'), (14, '4'), (15, '5'), (16, '6'), (17, '7'), (18, '8'), (19, '9'), (20, ':'), (21, ';'), (22, '?'), (23, 'A'), (24, 'B'), (25, 'C'), (26, 'D'), (27, 'E'), (28, 'F'), (29, 'G'), (30, 'H'), (31, 'I'), (32, 'J'), (33, 'K'), (34, 'L'), (35, 'M'), (36, 'N'), (37, 'O'), (38, 'P'), (39, 'R'), (40, 'S'), (41, 'T'), (42, 'U'), (43, 'V'), (44, 'W'), (45, 'Y'), (46, 'Z'), (47, '_'), (48, 'a'), (49, 'b'), (50, 'c'), (51, 'd'), (52, 'e'), (53, 'f'), (54, 'g'), (55, 'h'), (56, 'i'), (57, 'j'), (58, 'k'), (59, 'l'), (60, 'm'), (61, 'n'), (62, 'o'), (63, 'p'), (64, 'q'), (65, 'r'), (66, 's'), (67, 't'), (68, 'u'), (69, 'v'), (70, 'w'), (71, 'x'), (72, 'y'), (73, 'z'), (74, '“'), (75, '”')])

In [9]:
encoder = {char: ind for ind,char in decoder.items()}

In [10]:
encoded_text = np.array([encoder[char] for char in text])
encoded_text[:100]

array([25, 55, 48, 63, 67, 52, 65,  1, 11,  0,  0,  0, 31, 67,  1, 56, 66,
        1, 48,  1, 67, 65, 68, 67, 55,  1, 68, 61, 56, 69, 52, 65, 66, 48,
       59, 59, 72,  1, 48, 50, 58, 61, 62, 70, 59, 52, 51, 54, 52, 51,  7,
        1, 67, 55, 48, 67,  1, 48,  1, 66, 56, 61, 54, 59, 52,  1, 60, 48,
       61,  1, 56, 61,  1, 63, 62, 66, 66, 52, 66, 66, 56, 62, 61,  0, 62,
       53,  1, 48,  1, 54, 62, 62, 51,  1, 53, 62, 65, 67, 68, 61])

In [11]:
decoder[25]

'C'

**We one-hot encode our data:**

In [12]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    encoded_text : batch of encoded text
    
    num_uni_chars = number of unique characters (len(set(text)))
    '''
    
    # METHOD FROM:
    # https://stackoverflow.com/questions/29831489/convert-encoded_textay-of-indices-to-1-hot-encoded-numpy-encoded_textay
      
    # Create a placeholder for zeros.
    one_hot = np.zeros((encoded_text.size, num_uni_chars))
    
    # Convert data type for later use with pytorch
    one_hot = one_hot.astype(np.float32)

    # Using fancy indexing fill in the 1s at the correct index locations
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0
    
    # Reshape it so it matches the batch shape
    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))
    
    return one_hot

In [13]:
# one_hot_encoder(np.array([1,2,0]),3)

## Create the training batches

We create a function that will generate batches of characters along with the next character in the sequence as a label.

In [14]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    
    '''
    Generate batches for training.
    
    X: Encoded Text of length seq_len
    Y: Encoded Text shifted by one
    
    Example:
    
    x:
    
    [[1 2 3]]
    
    y:
    
    [[2 3 4]]
    
    encoded_text : Complete Encoded Text to make batches from
    samp_per_batch : Number of samples (sequences) per batch
    seq_len : Length of character sequence
       
    '''
    
    # Total number of characters per batch:
    char_per_batch = samp_per_batch * seq_len
    
    # Number of batches available to make (rounded down to integer)
    num_batches_avail = int(np.floor(len(encoded_text)/char_per_batch))
    
    # Cut off end of encoded_text that won't fit evenly into a batch
    encoded_text = encoded_text[:num_batches_avail * char_per_batch]
    
    # Reshape text into samp_per_batch rows
    encoded_text = encoded_text.reshape((samp_per_batch, -1))
    
    # Go through each row in array.
    for n in range(0, encoded_text.shape[1], seq_len):
        
        # Grab feature characters
        x = encoded_text[:, n:n+seq_len]
        
        # Go through each row in array.
        y = np.zeros_like(x)
       
        try:
            y[:, :-1] = x[:, 1:]
            y[:, -1]  = encoded_text[:, n+seq_len]
            
        # End of the row:    
        except:
            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, 0]
            
        yield x, y

### Example of generating a batch:

In [15]:
sample_text = encoded_text[:20]
sample_text

array([25, 55, 48, 63, 67, 52, 65,  1, 11,  0,  0,  0, 31, 67,  1, 56, 66,
        1, 48,  1])

In [16]:
batch_generator = generate_batches(sample_text, samp_per_batch=2, seq_len=5)

In [17]:
# Grab first batch
x, y = next(batch_generator)

In [18]:
x

array([[25, 55, 48, 63, 67],
       [ 0,  0, 31, 67,  1]])

In [19]:
y

array([[55, 48, 63, 67, 52],
       [ 0, 31, 67,  1, 56]])

## Creating the LSTM Model


In [20]:
class CharModel(nn.Module):
    
    def __init__(self, all_chars, num_hidden=256, num_layers=4, drop_prob=0.5, use_gpu=False):
        
        
        # SET UP ATTRIBUTES
        super().__init__()
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu
        
        #CHARACTER SET, ENCODER, and DECODER
        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char: ind for ind,char in decoder.items()}
        
        
        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))
      
    
    def forward(self, x, hidden):
                
        lstm_output, hidden = self.lstm(x, hidden)
        drop_output = self.dropout(lstm_output)
        drop_output = drop_output.contiguous().view(-1, self.num_hidden)
        final_out = self.fc_linear(drop_output)
        return final_out, hidden
    
    
    def hidden_state(self, batch_size):
        '''
        Used as separate method to account for both GPU and CPU users.
        '''
        
        if self.use_gpu:
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden))
        
        return hidden
        

## Instance of the model

In [21]:
model = CharModel(
    all_chars=all_characters,
    num_hidden=256,
    num_layers=2,
    drop_prob=0.3,
    use_gpu=False,
)

In [22]:
total_param  = []
for p in model.parameters():
    total_param.append(int(p.numel()))

In [23]:
sum(total_param)

887884

In [24]:
len(encoded_text)

684743

The number of parameters is roughly of the same magnitude of the total number of characters

In [25]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.002)
criterion = nn.CrossEntropyLoss()

## Training Data and Validation Data

In [26]:
train_length = int(len(encoded_text)*0.8)

train_data = encoded_text[:train_length]
val_data = encoded_text[train_length:]

In [27]:
# Epochs to train for
epochs = 80

# batch size 
batch_size = 128

# Length of sequence
seq_len = 200

tracker = 0

# number of characters in text
num_char = max(encoded_text) + 1

In [28]:
# Set model to train
model.train()


# Check to see if using GPU
if model.use_gpu:
    model.cuda()

for i in range(epochs):
    
    hidden = model.hidden_state(batch_size)
    
    
    for x,y in generate_batches(train_data,batch_size,seq_len):
        
        tracker += 1
        
        # One Hot Encode incoming data
        x = one_hot_encoder(x,num_char)
        
        # Convert Numpy Arrays to Tensor
        
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        
        # Adjust for GPU if necessary
        
        if model.use_gpu:
            
            inputs = inputs.cuda()
            targets = targets.cuda()
            
        # Reset Hidden State
        # If we dont' reset we would backpropagate through all training history
        hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        lstm_output, hidden = model.forward(inputs,hidden)
        loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
        loss.backward()
        
        # POSSIBLE EXPLODING GRADIENT PROBLEM!
        # LET"S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        
        optimizer.step()
        
        
        
        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################
        
        if tracker % 25 == 0:
            
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()
            
            for x,y in generate_batches(val_data,batch_size,seq_len):
                
                # One Hot Encode incoming data
                x = one_hot_encoder(x,num_char)
                

                # Convert Numpy Arrays to Tensor

                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)

                # Adjust for GPU if necessary

                if model.use_gpu:

                    inputs = inputs.cuda()
                    targets = targets.cuda()
                    
                # Reset Hidden State
                val_hidden = tuple([state.data for state in val_hidden])
                
                lstm_output, val_hidden = model.forward(inputs,val_hidden)
                val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
                val_losses.append(val_loss.item())
            
            # Reset to training model after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

Epoch: 1 Step: 25 Val Loss: 3.100309133529663
Epoch: 2 Step: 50 Val Loss: 3.093801498413086
Epoch: 3 Step: 75 Val Loss: 3.084357261657715
Epoch: 4 Step: 100 Val Loss: 3.0183935165405273
Epoch: 5 Step: 125 Val Loss: 2.7930691242218018
Epoch: 7 Step: 150 Val Loss: 2.6292505264282227
Epoch: 8 Step: 175 Val Loss: 2.507021903991699
Epoch: 9 Step: 200 Val Loss: 2.4266793727874756
Epoch: 10 Step: 225 Val Loss: 2.367846727371216
Epoch: 11 Step: 250 Val Loss: 2.286653995513916
Epoch: 13 Step: 275 Val Loss: 2.2247824668884277
Epoch: 14 Step: 300 Val Loss: 2.1715362071990967
Epoch: 15 Step: 325 Val Loss: 2.1219828128814697
Epoch: 16 Step: 350 Val Loss: 2.0771384239196777
Epoch: 17 Step: 375 Val Loss: 2.033853530883789
Epoch: 19 Step: 400 Val Loss: 1.9908596277236938
Epoch: 20 Step: 425 Val Loss: 1.9494863748550415
Epoch: 21 Step: 450 Val Loss: 1.912807822227478
Epoch: 22 Step: 475 Val Loss: 1.8778191804885864
Epoch: 23 Step: 500 Val Loss: 1.842667579650879
Epoch: 24 Step: 525 Val Loss: 1.81272387

### Saving the model:

In [29]:
model_name = 'LSTM_prideandprejudice_256_2_0.3.net'

torch.save(model.state_dict(), '../models/'+model_name)

## Loading the model:

In [28]:
# Must match the same parameters as in training

model = CharModel(
    all_chars=all_characters,
    num_hidden=256,
    num_layers=2,
    drop_prob=0.3,
    use_gpu=False,
)

model_name = 'LSTM_prideandprejudice_256_2_0.3.net'

model.load_state_dict(torch.load('../models/'+model_name))

model.eval()

CharModel(
  (lstm): LSTM(76, 256, num_layers=2, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc_linear): Linear(in_features=256, out_features=76, bias=True)
)

## Generating Predictions

In [29]:
def predict_next_char(model, char, hidden=None, k=1, temperature=1.0):
        
        # Encode -> one-hot -> tensor
    
        encoded_text = model.encoder[char]
        encoded_text = np.array([[encoded_text]])
    
        encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))
    
        inputs = torch.from_numpy(encoded_text)
        
        # Check for CPU
        if(model.use_gpu):
            inputs = inputs.cuda()
        
        # Grab hidden states
        # hidden = tuple([state.data for state in hidden])
        hidden = tuple(h.detach() for h in hidden)
        
        # Run model and get predicted output
        lstm_out, hidden = model(inputs, hidden)

    
        # scale logits before softmax: lower temp = more conservative, higher = more random
        if temperature <= 0:
            temperature = 1e-8  # avoid division by zero
        lstm_out = lstm_out / temperature


    
        # Convert lstm_out to probabilities
        probs = F.softmax(lstm_out, dim=1).data
        
        if(model.use_gpu):
            # move back to CPU to use with numpy
            probs = probs.cpu()

        # Return k largest probabilities in tensor
        probs, index_positions = probs.topk(k)
         
        index_positions = index_positions.numpy().reshape(-1)
        
        # Create array of probabilities
        probs = probs.numpy().reshape(-1)
        
        # Convert to probabilities per index
        probs = probs/probs.sum()
        
        # randomly choose a character based on probabilities
        if k==1:
            char = int(index_positions[0])
        else:
            char = np.random.choice(index_positions, p=probs)
       
        # return the decoded value of the predicted char and the hidden state
        return model.decoder[char], hidden

In [30]:
def generate_text(model, size, seed='The', k=1, temperature=1.0):
        
    # CHECK FOR GPU
    if(model.use_gpu):
        model.cuda()
    else:
        model.cpu()
    
    # Evaluation mode
    model.eval()
    
    # begin output from initial seed
    output_chars = [c for c in seed]
    
    # intiate hidden state
    hidden = model.hidden_state(1)
    
    # predict the next character for every character in seed
    for char in seed:
        char, hidden = predict_next_char(model, char, hidden, k=k, temperature=temperature)
    
    # add initial characters to output
    output_chars.append(char)
    
    # Now generate for size requested
    for i in range(size):
        
        # predict based off very last letter in output_chars
        char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k, temperature=temperature)
        
        # add predicted character
        output_chars.append(char)
    
    # return string of predicted text
    return ''.join(output_chars)

## Example generated text

In [44]:
print(generate_text(model, 1000, seed='Mr', k=3, temperature=1.2))

Mr.
Bennet, was not a streat of the passing a servant was no make her to be something a day, and soon something to be some accamily the discovery as so seemed to the suppressed himself to be a family as had no seemed on the particularly, as she, “and his
care on the chaice as he could not have been served, as to this compleshed and such opinion of a family,” replied her, when they having a manner on her family, and and sense of the caming herself.”

“As she was
not assisting the most of her
astermined. To had been to to the sertion of the consideration which she had not befured them, that they had not been a dingryer that the provised
her thought his settliched his accouns a day and a most been as to be, and all her from
anything offer of the part a most day.

“The prising him at a moners at her acterraining to happiness of the present attention to the same on the caming her to the provession,
and she
could he was soon supposing
and such a few
ander and a sense of her family, than thin