In [1]:
import numpy as np
with open('1268-0.txt') as fp: # THE MYSTERIOUS ISLAND by Jules Verne
    text = fp.read()

start_idx = text.find('THE MYSTERIOUS ISLAND')
end_idx = text.find('End of the Project Gutenberg')
text = text[start_idx:end_idx]
char_set = set(text)
print(f'Text length: {len(text)}')
print(f'Unique characters: {len(char_set)}')

Text length: 1112310
Unique characters: 80


In [1]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i,ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32
)

NameError: name 'char_set' is not defined

In [3]:
import torch
from torch.utils.data import Dataset
seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i:i+chunk_size] 
               for i in range(len(text_encoded)-chunk_size)]

class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, idx):
        text_chunk = torch.tensor(self.text_chunks[idx])
        return text_chunk[:-1].long(), text_chunk[1:].long()

seq_dataset = TextDataset(text_chunks)

In [4]:
from torch.utils.data import DataLoader
batch_size = 64
torch.manual_seed(0)
seq_dl = DataLoader(seq_dataset, batch_size,
                    shuffle=True, drop_last=True)

In [56]:
import torch.nn as nn
class RNN(nn.Module):
    def __init__(self, device, vocab_size, embed_dim,
                 rnn_hidden_size):
        super().__init__()
        self.device = device
        self.embedding = nn.Embedding(vocab_size, embed_dim).to(device)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True).to(device)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size).to(device)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size).to(self.device)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size).to(self.device)
        return hidden, cell        

In [57]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps:0')
else:
    device = torch.device('cpu')

print(device)

mps:0


In [58]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(0)
model = RNN(device, vocab_size, embed_dim, rnn_hidden_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [59]:
num_epochs = 10000
torch.manual_seed(0)
model.train()
for epoch in range(1, num_epochs+1):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item() / seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} Loss {loss:.4f}')

Epoch 500 Loss 1.4117
Epoch 1000 Loss 1.3540
Epoch 1500 Loss 1.3390
Epoch 2000 Loss 1.1891
Epoch 2500 Loss 1.2130
Epoch 3000 Loss 1.1530
Epoch 3500 Loss 1.1236
Epoch 4000 Loss 1.1016
Epoch 4500 Loss 1.0432
Epoch 5000 Loss 1.1290
Epoch 5500 Loss 1.0998
Epoch 6000 Loss 1.0371
Epoch 6500 Loss 1.0822
Epoch 7000 Loss 1.0800
Epoch 7500 Loss 1.0968
Epoch 8000 Loss 1.0186
Epoch 8500 Loss 1.0596
Epoch 9000 Loss 1.0713
Epoch 9500 Loss 1.0382
Epoch 10000 Loss 1.0091


In [71]:
from torch.distributions.categorical import Categorical
def sample(model, starting_str,
           len_generated_text=500,
           scale_factor=1.0):
    encoded_input = torch.tensor(
        [char2int[ch] for ch in starting_str]
    ).reshape(1, -1).to(device)
    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    for c in range(len(starting_str) - 1):
        _, hidde, cell = model(encoded_input[:, c], hidden, cell)

    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(
            last_char.view(1), hidden, cell
        )
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample().to(device)
        generated_str += str(char_array[last_char])

    return generated_str

#### Generated text, scale_factor = 1.0

In [89]:
torch.manual_seed(0)
print(sample(model, starting_str=' ', scale_factor=1.0))

 Pencroft, cut in blown into the rocks, not even that metal
place had been confident as a communication with, and always after having been being once and, for they had been, were well, the former castaways, covered weapons with yourself.

“I would still
struggle it; Herbert separated some time.”

“Then pipe.”

“I am she we enable that,” said Cyrus Harding; “for no island could not be thrown up, and if they would construct a prey when the incidents were now wing morth?

What a more effect.

During


#### Generated text, scale_factor = 0.5 (more random generation)

In [90]:
print(sample(model, starting_str=' ', scale_factor=0.5))

 fadaine.”
 Dail It did noburans ty bocomowary, laited necessity. Matters assirved us-’at’-bank; no ized unfoor rate. At
their, beyodes, where dressark,”s endshile, more saltic”! adefarcounded?

Ayrton-foring Gidnoinhestoms as the
breezing. Twey fast?
Hirtleith thus those vauled betwein it.
Outsiw’ timped itself.

“Oh igningbouildining-machings,” he said,-whomoutiously this heat, burry quarse,’s being quice hed. Their wholb
tight anticallant civicing hoist, eighth fusgeh.
Neb,
of jessp coat dante


#### Generated text, scale_factor = 3.0 (more deterministic generation)

In [93]:
print(sample(model, starting_str=' ', scale_factor=3.0))

 the reporter.

“No, my boy,” replied the reporter.

“No, Pencroft, that is to say, our first cavern with the interior of the island with a good castaway.”

“But what are you mean to be an account of the productions of the mountain,” said the reporter, “but we cannot see if it is a man to find himself out of the country?” asked Herbert, “the river will be no doubt that the interior of the convicts had been carried to the corral. The contrary, the colonists were not more than the colonists, who wa


In [94]:
torch.save(model, 'char_level_lang_modeling_RNN.pth')