In [1]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

In [6]:
# open text
with open('../../deep-learning-v2-pytorch/recurrent-neural-networks/char-rnn/data/anna.txt', 'r') as f:
    text = f.read()

In [7]:
# first 100
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [8]:
# Tokenization
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode
encoded = np.array([char2int[ch] for ch in text])

In [9]:
encoded[:100]

array([31, 71, 63, 28, 64, 67, 60, 56, 65, 53, 53, 53, 49, 63, 28, 28, 13,
       56, 14, 63,  6, 12, 17, 12, 67, 70, 56, 63, 60, 67, 56, 63, 17, 17,
       56, 63, 17, 12, 50, 67, 76, 56, 67, 77, 67, 60, 13, 56, 21,  2, 71,
       63, 28, 28, 13, 56, 14, 63,  6, 12, 17, 13, 56, 12, 70, 56, 21,  2,
       71, 63, 28, 28, 13, 56, 12,  2, 56, 12, 64, 70, 56, 73, 45,  2, 53,
       45, 63, 13,  7, 53, 53, 22, 77, 67, 60, 13, 64, 71, 12,  2])

In [10]:
# one hot encode
def one_code_encode(arr, labels):
    one_hot = np.zeros((np.multiply(*arr.shape),labels), dtype=np.float32 )
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    one_hot = one_hot.reshape((*arr.shape, labels))
    return one_hot

In [11]:
test_seq = np.array([[1,2,5]])
one_hot = one_code_encode(test_seq, 8)
one_hot

array([[[0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0.]]], dtype=float32)

In [14]:
def get_batches(arr, batch_size, seq_length):
    # get the number of full batches
    batch_size_total = batch_size * seq_length
    n_batches = len(arr)//batch_size_total
    # keep only enough chars to make full batches
    arr = arr[:n_batches * batch_size_total]
    # reshape into {batch_size} rows
    arr = arr.reshape((batch_size, -1))
    # loop through the batches using a seq length of 3
    for n in range(0, arr.shape[1], seq_length):
        # features
        x = arr[:, n:n+seq_length]
        # targets
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [15]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [16]:
# printing out the first 10 items in a sequence
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[31 71 63 28 64 67 60 56 65 53]
 [70 73  2 56 64 71 63 64 56 63]
 [67  2 78 56 73 60 56 63 56 14]
 [70 56 64 71 67 56 44 71 12 67]
 [56 70 63 45 56 71 67 60 56 64]
 [44 21 70 70 12 73  2 56 63  2]
 [56 61  2  2 63 56 71 63 78 56]
 [34 43 17 73  2 70 50 13  7 56]]

y
 [[71 63 28 64 67 60 56 65 53 53]
 [73  2 56 64 71 63 64 56 63 64]
 [ 2 78 56 73 60 56 63 56 14 73]
 [56 64 71 67 56 44 71 12 67 14]
 [70 63 45 56 71 67 60 56 64 67]
 [21 70 70 12 73  2 56 63  2 78]
 [61  2  2 63 56 71 63 78 56 70]
 [43 17 73  2 70 50 13  7 56 39]]
