<a href="https://colab.research.google.com/github/mandelatrey/ML/blob/main/Text_Generation_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing modules

In [5]:
import numpy as np

Importing the dataset and finding text properties

In [6]:
with open('/content/The Mysterious Island.txt', 'r', encoding='utf8') as fp:
    text=fp.read()

start_idx = text.find('THE MYSTERIOUS ISLAND')
end_idx = text.find('End of the Project Gutenberg')
text = text[start_idx:end_idx]
char_set = set(text)
print('Total length: ', len(text))
print('Unique Chars: ', len(char_set))



Total length:  1130711
Unique Chars:  85


we then build a dictionary and map chars to integers

In [7]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32
)

print('text encoded shape: ', text_encoded.shape)
print(text[:15], '== Encoding ==> ', text_encoded[:15]) #first 15, last 15
print(text_encoded[15:21], '==reversed==>',
      ''.join(char_array[text_encoded[15:21]]))


text encoded shape:  (1130711,)
THE MYSTERIOUS  == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32] ==reversed==> ISLAND


Visualising the encodings

In [8]:
for ex in text_encoded[:15]:
    print('{} --> {}'.format(ex, char_array[ex]))

48 --> T
36 --> H
33 --> E
1 -->  
41 --> M
53 --> Y
47 --> S
48 --> T
33 --> E
46 --> R
37 --> I
43 --> O
49 --> U
47 --> S
1 -->  


In [9]:
import torch
from torch.utils.data import Dataset

seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i:i+chunk_size]
               for i in range(len(text_encoded)-chunk_size)]
               
from torch.utils.data import Dataset
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()

seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [10]:
# for i, (seq, target) in enumerate(seq_dataset):
#     print(' input (x): ',
#           repr(''.join(char_array[seq])))
#     print('Target (y): ',
#           repr(''.join(char_array[target])))
#     print()
#     if i == 1:
#         break


for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x): ',
        repr(''.join(char_array[seq])))
    print('Target (y): ',
        repr(''.join(char_array[target])))
    print()
    if i == 1:
        break

 Input (x):  'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
Target (y):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'

 Input (x):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Target (y):  'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'



In [11]:
from torch.utils.data import DataLoader

batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size,
                    shuffle=True, drop_last=True)

In [12]:
import torch.nn as nn

Building a model

In [13]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden, cell

In [14]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model



RNN(
  (embedding): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

Making a loss function

In [15]:
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

We then train the model

In [16]:
num_epochs = 10000
torch.manual_seed(1)
for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    optimiser.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimiser.step()
    loss = loss.item()/seq_length

    if epoch % 300 == 0:
        print(f"Epoch {epoch} loss: {loss:.4f}")




Epoch 0 loss: 4.4368
Epoch 300 loss: 1.6249
Epoch 600 loss: 1.4520
Epoch 900 loss: 1.3720
Epoch 1200 loss: 1.3754
Epoch 1500 loss: 1.3374
Epoch 1800 loss: 1.2882
Epoch 2100 loss: 1.2668
Epoch 2400 loss: 1.2464
Epoch 2700 loss: 1.1897
Epoch 3000 loss: 1.1967
Epoch 3300 loss: 1.1585
Epoch 3600 loss: 1.1674
Epoch 3900 loss: 1.1354
Epoch 4200 loss: 1.1324
Epoch 4500 loss: 1.1363
Epoch 4800 loss: 1.1005
Epoch 5100 loss: 1.1010
Epoch 5400 loss: 1.1293
Epoch 5700 loss: 1.1394
Epoch 6000 loss: 1.1279
Epoch 6300 loss: 1.1001
Epoch 6600 loss: 1.0714
Epoch 6900 loss: 1.0783
Epoch 7200 loss: 1.0575
Epoch 7500 loss: 1.0715
Epoch 7800 loss: 1.0046
Epoch 8100 loss: 1.0540
Epoch 8400 loss: 1.0322
Epoch 8700 loss: 1.0152
Epoch 9000 loss: 0.9880
Epoch 9300 loss: 1.0865
Epoch 9600 loss: 0.9451
Epoch 9900 loss: 0.9871


We use categoricals to sample random samples from the distributn

In [20]:
from torch.distributions.categorical import Categorical

In [42]:
def sample(model, starting_str,
    len_generated_text=500,
    scale_factor=1.0):
    encoded_input = torch.tensor(
        [char2int[s] for s in starting_str]
    )
    encoded_input = torch.reshape(
        encoded_input, (1, -1)
    )
    generated_str = starting_str #first set it to input

    model.eval()
    hidden, cell = model.init_hidden(1)
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(
            encoded_input[:, c].view(1), hidden, cell
        )

    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(
            last_char.view(1), hidden, cell
        )

        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])

    return generated_str

torch.manual_seed(53)
print(sample(model, starting_str='The boys stood amazed at', 
             scale_factor=2.0)) # 2.0=more predictable 0.5=more random

The boys stood amazed at the corral, and to watch the wind spiring with the first sight of the rocks. They must have
reasoning to the beach, and the true, we shall see a dozen of the forest, and
the sailor talked away the water to the east, and the new crater and the situation of the
distance in the sand.”

“We shall see that the time when the colonists,” replied Cyrus Harding.

“And that we can go some day that the convulsion of this storm might be the passages and transformed into the surface of the lake. It was imag
