# Character Level Modeling

In [6]:
import numpy as np
import torch
from torch.utils.data import Dataset


In [2]:
## Reading and processing text
with open('1268-0.txt', 'r', encoding='utf8') as fp:
    text = fp.read()
start_idx = text.find('THE MYSTERIOUS ISLAND')
end_idx = text.find('End of the Project Gutenberg')
text = text[start_idx:end_idx]
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112310
Unique Characters: 80


Most NN libraries and RNN implementations cannot deal
with input data in string format, which is why we have to convert the text into a numeric format. To do
this, we will create a simple Python dictionary that maps each character to an integer, char2int. We
will also need a reverse mapping to convert the results of our model back to text. Although the reverse
can be done using a dictionary that associates integer keys with character values, using a NumPy array
and indexing the array to map indices to those unique characters is more efficient. 

In [5]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i,ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32
)
print('Text encoded shape:', text_encoded.shape)
print(text[:15], '== Encoding ==>', text_encoded[:15])
print(text_encoded[15:21], '== Reverse ==>',
      ''.join(char_array[text_encoded[15:21]]))
for ex in text_encoded[:15]:
    print('{} -> {}'.format(ex, char_array[ex]))

Text encoded shape: (1112310,)
THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] == Reverse ==> ISLAND
44 -> T
32 -> H
29 -> E
1 ->  
37 -> M
48 -> Y
43 -> S
44 -> T
29 -> E
42 -> R
33 -> I
39 -> O
45 -> U
43 -> S
1 ->  


In [7]:
seq_length = 40
chunk_size = seq_length+1
text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded) - chunk_size)]

In [8]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, index):
        text_chunk = self.text_chunks[index]
        return text_chunk[:-1].long(), text_chunk[1:].long()

seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [9]:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x): ',
          repr(''.join(char_array[seq])))
    print(' Target (y): ',
          repr(''.join(char_array[target])))
    print()
    if i == 1:
        break

 Input (x):  'THE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n1'
 Target (y):  'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'

 Input (x):  'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'
 Target (y):  'E MYSTERIOUS ISLAND\n\nby Jules Verne\n\n187'



In [10]:
from torch.utils.data import DataLoader
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

## RNN Model

In [11]:
import torch.nn as nn
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden, cell

In [12]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [13]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
num_epochs = 10000
torch.manual_seed(1)
for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 4.3720
Epoch 500 loss: 1.5593
Epoch 1000 loss: 1.3769
Epoch 1500 loss: 1.3498
Epoch 2000 loss: 1.2374
Epoch 2500 loss: 1.1918
Epoch 3000 loss: 1.1465
Epoch 3500 loss: 1.1611
Epoch 4000 loss: 1.0998
Epoch 4500 loss: 1.1116
Epoch 5000 loss: 1.0913
Epoch 5500 loss: 1.0913
Epoch 6000 loss: 1.0623
Epoch 6500 loss: 1.0994
Epoch 7000 loss: 1.0379
Epoch 7500 loss: 1.0502
Epoch 8000 loss: 1.0503
Epoch 8500 loss: 1.0589
Epoch 9000 loss: 1.0437
Epoch 9500 loss: 1.0398


In [15]:
from torch.distributions.categorical import Categorical

def sample(model, starting_str, len_generated_text=500, scale_factor=1.0):
    encoded_input = torch.tensor(
        [char2int[s] for s in starting_str]
    )
    encoded_input = torch.reshape(
        encoded_input, (1, -1)
    )
    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(
            encoded_input[:, c].view(1), hidden, cell
        )
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(
            last_char.view(1), hidden, cell
        )
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits = scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])

    return generated_str

In [16]:
# generating some new text
torch.manual_seed(1)
print(sample(model, starting_str='The island'))

The island hope to
make Tabor Island. A few rays!” cried Pencroft, “for besides, promised to be made from the
highest be severely that he carefully examined, but they could not have gineous, till thinking thumself over.

Pencroft knew more escaped from the question. You saw
Jup, but sometimes continued. He was then sheltered the position
of the monkeys moss a more masoned drink.

“What are the entrance from the crater east!” exclaimed the engineer, “if it has been able to south, for it was still nothing b


In [17]:
# importance of scale factor
logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities before scaling:        ', nn.functional.softmax(logits, dim=1).numpy()[0])

print('Probabilities after scaling with 0.5:', nn.functional.softmax(0.5*logits, dim=1).numpy()[0])

print('Probabilities after scaling with 0.1:', nn.functional.softmax(0.1*logits, dim=1).numpy()[0])

Probabilities before scaling:         [0.10650698 0.10650698 0.78698605]
Probabilities after scaling with 0.5: [0.21194156 0.21194156 0.57611686]
Probabilities after scaling with 0.1: [0.3104238  0.3104238  0.37915248]


In [18]:
torch.manual_seed(1)
print(sample(model, starting_str='The island', 
             scale_factor=2.0))

The island was there that the bed of the sea, and in a voice was not wanting for the forest, bounding over the island.

The colonists should disturb his life, and the colonists had not been torn with the tide was there. Neb and Pencroft had become carefully explored the shore, the box was in sight of the balloon was composed of the colony, soon made a magnificent trees, and which flows were the narrow gallinaceae, which was not more than two years and the manufacture of the lake would perhaps
many very fa


In [19]:
torch.manual_seed(1)
print(sample(model, starting_str='The island', 
             scale_factor=0.5))

The island fougred, He treuml at Grotto amour,
throwed off enemy, caft which done some issueable dograe, built infffliently Neved
that his hode hard impelled,” but
yetcedluge.”

“Oh, Wasomak picict-insue!

Tubeltileasings, 5chee lying! ourse, Nez 90 Oyrusua--drip?
Mr!-
Washed the replain,” answered Neb.

That moment he azid feared burst roasted, Pencroft.?
errates,”
squee drwited from
damp pet!’

Chaneay Housa--jeaced Tabor Would ventured every destruarized mmethrubsions;, our mountain wide, and novclo,--
