In [1]:
# Preprocessing

import numpy as np

with open('1268-0.txt', 'r', encoding='utf8') as fp:
    text = fp.read()

start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find ('End of the Project Gutenberg')
text = text[start_indx:end_indx]
char_set = set(text)

print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112350
Unique Characters: 80


In [2]:
chars_sorted = sorted(char_set)
char2int = {ch: i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32
)

print('Text encoded shape:', text_encoded.shape)
print(text[:15], '== Encoding ==>', text_encoded[:15])
print(
    text_encoded[15:21], '== Reverse ==>',
    ''.join(char_array[text_encoded[15:21]])
)

Text encoded shape: (1112350,)
THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] == Reverse ==> ISLAND


In [3]:
for ex in text_encoded[:5]:
    print('{} -> {}'.format(ex, char_array[ex]))

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [4]:
import torch
from torch.utils.data import Dataset

seq_length = 40
chunk_size = seq_length + 1
text_chunks = [
    text_encoded[i:i + chunk_size]
    for i in range(len(text_encoded) - chunk_size + 1)
]

class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks
    
    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()


seq_dataset = TextDataset(torch.tensor(text_chunks))
for i, (seq, target) in enumerate(seq_dataset):
    print(
        ' Input (x): ',
        repr(''.join(char_array[seq]))
    )
    print(
        'Target (y): ',
        repr(''.join(char_array[target]))
    )
    print()
    if i == 1:
        break

 Input (x):  'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'
Target (y):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'

 Input (x):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'
Target (y):  'E MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by '



  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [5]:
from torch.utils.data import DataLoader

batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(
    seq_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True
)

In [6]:
# Model

import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(
            embed_dim,
            rnn_hidden_size,
            batch_first=True
        )
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)
    
    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden, cell


vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [11]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 200
torch.manual_seed(1)
for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item() / seq_length

    if epoch % 10 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 1.5536
Epoch 10 loss: 1.5974
Epoch 20 loss: 1.5773
Epoch 30 loss: 1.5042
Epoch 40 loss: 1.5988
Epoch 50 loss: 1.5669
Epoch 60 loss: 1.5355
Epoch 70 loss: 1.5525
Epoch 80 loss: 1.5033
Epoch 90 loss: 1.4675
Epoch 100 loss: 1.5088
Epoch 110 loss: 1.4264
Epoch 120 loss: 1.4865
Epoch 130 loss: 1.4706
Epoch 140 loss: 1.5077
Epoch 150 loss: 1.4634
Epoch 160 loss: 1.4577
Epoch 170 loss: 1.4491
Epoch 180 loss: 1.4171
Epoch 190 loss: 1.3909


In [13]:
from torch.distributions.categorical import Categorical

torch.manual_seed(1)
logits = torch.tensor([[1.0, 1.0, 1.0]])
print(
    'Probabilities:', 
    nn.functional.softmax(logits, dim=1).numpy()[0]
)

m = Categorical(logits=logits)
samples = m.sample((10,))
print(samples.numpy())

Probabilities: [0.33333334 0.33333334 0.33333334]
[[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [2]
 [1]
 [1]]


In [15]:
torch.manual_seed(1)
logits = torch.tensor([[1.0, 1.0, 3.0]])
print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])
m = Categorical(logits=logits)
samples = m.sample((10,))
print(samples.numpy())

Probabilities: [0.10650698 0.10650698 0.78698605]
[[0]
 [2]
 [2]
 [1]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


In [20]:
def sample(
        model, starting_str,
        len_generated_text=500,
        scale_factor=1.0
):
    encoded_input = torch.tensor(
        [char2int[s] for s in starting_str]
    )
    encoded_input = torch.reshape(
        encoded_input, (1, -1)
    )
    generated_str = starting_str
    
    model.eval()
    hidden, cell = model.init_hidden(1)
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(
            encoded_input[:, c].view(1),
            hidden, 
            cell
        )
    
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(
            last_char.view(1),
            hidden,
            cell
        )
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])
    
    return generated_str

In [21]:
torch.manual_seed(1)
print(sample(model, starting_str='The island'))

The island.

Still on the muttelling a ril unuse our
to its claw opening, and canemst of the distance
spy in aros helpo is prokes has bar Would be to as not hand on his yet spanions must too hall; in the Pacificed; on
solped in again were during up I rasts, which mach, almost, and only to one of game, has seet; Pencroft creacially
several could no thoughed of the dipies was our day othing time, but on the wreck over,
togeting and way who with ratch of what discovicies! Nop were legned.

They could not twe


In [22]:
logits = torch.tensor([[1.0, 1.0, 3.0]])
print(
    'Probabilities before scaling:',
    nn.functional.softmax(logits, dim=1).numpy()[0]
)

print(
    'Probabilities after scaling with 0.5:',
    nn.functional.softmax(0.5 * logits, dim=1).numpy()[0]
)

print(
    'Probabilities after scaling with 0.1:',
    nn.functional.softmax(0.1 * logits, dim=1).numpy()[0]
)

Probabilities before scaling: [0.10650698 0.10650698 0.78698605]
Probabilities after scaling with 0.5: [0.21194156 0.21194156 0.57611686]
Probabilities after scaling with 0.1: [0.3104238  0.3104238  0.37915248]


In [23]:
torch.manual_seed(1)
# Less randomness
print(sample(model, starting_str='The island', scale_factor=2.0))

The island was arrived to be thing an iron under the sea. A clowed some had been employing out on the sailor, to the world. The stone of the store of the island of the conservicts of the water. It is that a morned to as the bottom of the engineer. “I as not be all the intelligent of the water of the works. The wood, might on
the wind on the words on the engineer. The wind the reporter of the sailor of the water of the restion of the other the reporter work of the colonists were leave the doubtle was compl


In [24]:
torch.manual_seed(1)
# More randomness
print(sample(model, starting_str='The island', scale_factor=0.5))

The island-
pztid
bobert, muttel!” an iron umped azoit wick alsoo of awahwic; C7remst pigrelping nevamsyy in,”ro1 hee.
His paikely! But juss; t miy?”
 oon’s return him Sybfan’s’ muss-pile
to hais; in-underatior exofo’s on
Top!” ago was deed-in hrucky rasns, Jayring!? Theo/, Mr.. Tho,
knewnown” oughion has. Tou; Po0ctmoduc ”ill lin
Creek o’clows from onefez ocpospairpielied.l
sLod hiote’s prittawb,, wevedoesce? 1 vie,
togetiomer.
instow!” with. 6
5e overowled!
Non.

Af! busififa?
Win?”

“That is. Towmom.” 
