# Pride and Prejudice text generation

## Imports and loading the data

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Read the whole novel into a single string. The file is opened explicitly as
# UTF-8 because the text contains non-ASCII characters (e.g. curly quotes).
with open('../data/pride_and_prejudice.txt','r',encoding='utf8') as f:
    text = f.read()

In [10]:
text[:1000]

'Chapter 1\n\n\nIt is a truth universally acknowledged, that a single man in possession\nof a good fortune, must be in want of a wife.\n\nHowever little known the feelings or views of such a man may be on his\nfirst entering a neighbourhood, this truth is so well fixed in the minds\nof the surrounding families, that he is considered the rightful property\nof some one or other of their daughters.\n\n“My dear Mr. Bennet,” said his lady to him one day, “have you heard that\nNetherfield Park is let at last?”\n\nMr. Bennet replied that he had not.\n\n“But it is,” returned she; “for Mrs. Long has just been here, and she\ntold me all about it.”\n\nMr. Bennet made no answer.\n\n“Do you not want to know who has taken it?” cried his wife impatiently.\n\n“_You_ want to tell me, and I have no objection to hearing it.”\n\nThis was invitation enough.\n\n“Why, my dear, you must know, Mrs. Long says that Netherfield is taken\nby a young man of large fortune from the north of England; that he came\ndow

In [11]:
print(text[:1000])

Chapter 1


It is a truth universally acknowledged, that a single man in possession
of a good fortune, must be in want of a wife.

However little known the feelings or views of such a man may be on his
first entering a neighbourhood, this truth is so well fixed in the minds
of the surrounding families, that he is considered the rightful property
of some one or other of their daughters.

“My dear Mr. Bennet,” said his lady to him one day, “have you heard that
Netherfield Park is let at last?”

Mr. Bennet replied that he had not.

“But it is,” returned she; “for Mrs. Long has just been here, and she
told me all about it.”

Mr. Bennet made no answer.

“Do you not want to know who has taken it?” cried his wife impatiently.

“_You_ want to tell me, and I have no objection to hearing it.”

This was invitation enough.

“Why, my dear, you must know, Mrs. Long says that Netherfield is taken
by a young man of large fortune from the north of England; that he came
down on Monday in a chaise and fo

In [12]:
len(text)

684743

## Encode entire text

We create an encoder and a decoder for each character in the text:

In [13]:
# Unique characters that occur in the text; this is the model's vocabulary.
# NOTE(review): a set has no guaranteed iteration order, so any index mapping
# derived from it may differ between interpreter sessions -- confirm before
# reusing an encoder/decoder (or trained weights) across runs.
all_characters = set(text)

In [15]:
# Index -> character lookup; indices follow the iteration order of all_characters.
decoder = dict(enumerate(all_characters))

In [18]:
decoder.items()

dict_items([(0, 't'), (1, '-'), (2, 'R'), (3, 'p'), (4, 's'), (5, 'Z'), (6, 'T'), (7, 'd'), (8, ';'), (9, 'z'), (10, '7'), (11, 'D'), (12, 'j'), (13, '9'), (14, 'c'), (15, '_'), (16, ' '), (17, 'N'), (18, 'q'), (19, "'"), (20, 'b'), (21, '('), (22, 'k'), (23, 'w'), (24, 'M'), (25, 'r'), (26, 'm'), (27, ':'), (28, '4'), (29, '3'), (30, 'L'), (31, 'Y'), (32, 'W'), (33, 'l'), (34, ')'), (35, 'O'), (36, '*'), (37, '5'), (38, '.'), (39, 'v'), (40, 'J'), (41, '1'), (42, 'a'), (43, '!'), (44, '2'), (45, 'B'), (46, 'u'), (47, 'h'), (48, 'e'), (49, '\n'), (50, 'f'), (51, 'F'), (52, 'P'), (53, '“'), (54, 'H'), (55, 'S'), (56, 'C'), (57, 'i'), (58, 'E'), (59, 'G'), (60, 'I'), (61, 'A'), (62, '8'), (63, '?'), (64, 'y'), (65, '6'), (66, 'U'), (67, '0'), (68, 'g'), (69, 'K'), (70, 'V'), (71, ','), (72, '”'), (73, 'o'), (74, 'x'), (75, 'n')])

In [19]:
# Inverse of decoder: character -> index.
encoder = {char: ind for ind,char in decoder.items()}

In [23]:
# Translate every character of the text into its integer index.
encoded_text = np.array([encoder[char] for char in text])
# Show the first 100 encoded characters as a sanity check.
encoded_text[:100]

array([56, 47, 42,  3,  0, 48, 25, 16, 41, 49, 49, 49, 60,  0, 16, 57,  4,
       16, 42, 16,  0, 25, 46,  0, 47, 16, 46, 75, 57, 39, 48, 25,  4, 42,
       33, 33, 64, 16, 42, 14, 22, 75, 73, 23, 33, 48,  7, 68, 48,  7, 71,
       16,  0, 47, 42,  0, 16, 42, 16,  4, 57, 75, 68, 33, 48, 16, 26, 42,
       75, 16, 57, 75, 16,  3, 73,  4,  4, 48,  4,  4, 57, 73, 75, 49, 73,
       50, 16, 42, 16, 68, 73, 73,  7, 16, 50, 73, 25,  0, 46, 75])

In [29]:
decoder[56]

'C'

**We one-hot encode our data:**

In [30]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    One-hot encode an array of integer character indices.

    encoded_text : numpy array of integer indices (any shape, e.g. a batch)

    num_uni_chars : number of unique characters (len(set(text))); this is
                    the size of the one-hot axis appended to the input shape

    Returns a float32 array of shape (*encoded_text.shape, num_uni_chars)
    holding 1.0 at each index position and 0.0 everywhere else.
    '''

    # METHOD FROM:
    # https://stackoverflow.com/questions/29831489/convert-array-of-indices-to-1-hot-encoded-numpy-array

    # Work on a flat view of the indices: one output row per index.
    flat_indices = encoded_text.ravel()
    row_ids = np.arange(flat_indices.size)

    # float32 from the start so the result can be handed to pytorch directly.
    one_hot = np.zeros((flat_indices.size, num_uni_chars), dtype=np.float32)

    # Fancy indexing: in row i, switch on column flat_indices[i].
    one_hot[row_ids, flat_indices] = 1.0

    # Restore the input's shape, with the one-hot axis appended last.
    return one_hot.reshape(encoded_text.shape + (num_uni_chars,))

In [32]:
# one_hot_encoder(np.array([1,2,0]),3)

## Create the training batches

We create a function that generates batches of character sequences, where the label for each character is the next character in the text (i.e. the targets are the inputs shifted by one).

In [56]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):

    '''
    Generate batches for training.

    Yields (x, y) pairs of integer arrays, each of shape
    (samp_per_batch, seq_len):

    X: Encoded Text of length seq_len
    Y: Encoded Text shifted by one (the next character for every position)

    Example:

    x:

    [[1 2 3]]

    y:

    [[2 3 4]]

    encoded_text : Complete Encoded Text to make batches from
    samp_per_batch : Number of samples (sequences) per batch
    seq_len : Length of character sequence

    Trailing characters that do not fill a complete batch are dropped.
    For the last window of a row there is no following character, so the
    final label wraps around to the first character of that same row.
    '''

    # Accept any array-like input; an ndarray is passed through unchanged.
    encoded_text = np.asarray(encoded_text)

    # Total number of characters per batch:
    char_per_batch = samp_per_batch * seq_len

    # Number of batches available to make (rounded down to integer)
    num_batches_avail = len(encoded_text) // char_per_batch

    # Cut off end of encoded_text that won't fit evenly into a batch
    encoded_text = encoded_text[:num_batches_avail * char_per_batch]

    # Reshape text into samp_per_batch rows
    encoded_text = encoded_text.reshape((samp_per_batch, -1))
    row_len = encoded_text.shape[1]

    # Slide a window of seq_len columns across the rows.
    for n in range(0, row_len, seq_len):

        # Grab feature characters
        x = encoded_text[:, n:n+seq_len]

        # Labels: the features shifted left by one position.
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        # The last label is the character right after the window. Check the
        # bound explicitly rather than catching the IndexError, so unrelated
        # errors are never swallowed.
        next_col = n + seq_len
        if next_col < row_len:
            y[:, -1] = encoded_text[:, next_col]
        else:
            # End of the row: wrap around to the row's first character.
            y[:, -1] = encoded_text[:, 0]

        yield x, y

### Example of generating a batch:

In [58]:
sample_text = encoded_text[:20]
sample_text

array([56, 47, 42,  3,  0, 48, 25, 16, 41, 49, 49, 49, 60,  0, 16, 57,  4,
       16, 42, 16])

In [59]:
batch_generator = generate_batches(sample_text, samp_per_batch=2, seq_len=5)

In [64]:
# Grab first batch
x, y = next(batch_generator)

In [65]:
x

array([[48, 25, 16, 41, 49],
       [57,  4, 16, 42, 16]])

In [66]:
y

array([[25, 16, 41, 49, 56],
       [ 4, 16, 42, 16, 49]])

## Creating the LSTM Model


In [67]:
class CharModel(nn.Module):
    '''
    Character-level LSTM: one-hot characters in, per-character scores out.

    all_chars : iterable of the unique characters (the vocabulary); its
                length fixes both the input and the output feature size
    num_hidden : size of the LSTM hidden state
    num_layers : number of stacked LSTM layers
    drop_prob : dropout probability, used between LSTM layers and on the
                LSTM output before the final linear layer
    use_gpu : if True, hidden_state() returns tensors moved to CUDA
    '''

    def __init__(self, all_chars, num_hidden=256, num_layers=4, drop_prob=0.5, use_gpu=False):

        # SET UP ATTRIBUTES
        super().__init__()
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu

        #CHARACTER SET, ENCODER, and DECODER
        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))
        # Invert this model's own decoder (not a module-level variable), so
        # encoder and decoder always agree with the all_chars passed in.
        self.encoder = {char: ind for ind, char in self.decoder.items()}

        # batch_first=True: inputs are (batch, seq_len, len(all_chars)).
        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)

        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))

    def forward(self, x, hidden):
        '''
        Run one forward pass.

        x : float tensor of shape (batch, seq_len, len(all_chars))
        hidden : (h, c) tuple as produced by hidden_state()

        Returns (final_out, hidden), where final_out has shape
        (batch * seq_len, len(all_chars)) and hidden is the updated state.
        '''

        lstm_output, hidden = self.lstm(x, hidden)

        drop_output = self.dropout(lstm_output)

        # Merge the batch and sequence axes so the linear layer scores every
        # time step independently.
        drop_output = drop_output.contiguous().view(-1, self.num_hidden)

        final_out = self.fc_linear(drop_output)

        return final_out, hidden

    def hidden_state(self, batch_size):
        '''
        Used as separate method to account for both GPU and CPU users.

        Returns a zero-initialised (h, c) tuple, each tensor of shape
        (num_layers, batch_size, num_hidden), on CUDA when use_gpu is set.
        '''

        shape = (self.num_layers, batch_size, self.num_hidden)
        hidden = (torch.zeros(*shape), torch.zeros(*shape))

        if self.use_gpu:
            hidden = tuple(state.cuda() for state in hidden)

        return hidden
        

## Instance of the model

In [74]:
# Build the network over the text's character set. num_hidden and num_layers
# are set below the class defaults (256 and 4); training runs on the CPU.
model = CharModel(
    all_chars=all_characters,
    num_hidden=128,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=False,
)

In [75]:
# Element count of every parameter tensor in the model.
total_param = [int(p.numel()) for p in model.parameters()]

In [76]:
sum(total_param)

379468

In [77]:
len(encoded_text)

684743

The number of parameters is roughly of the same order of magnitude as the total number of characters.

In [78]:
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
# Cross-entropy loss for the next-character classification objective.
# NOTE(review): CrossEntropyLoss is normally fed raw scores and integer class
# targets -- confirm the training loop passes the model output and y that way.
criterion = nn.CrossEntropyLoss()

## Training Data and Validation Data