In [10]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

In [11]:
# open text
with open('data/anna.txt', 'r') as f:
    text = f.read()

In [12]:
# first 100
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [13]:
# Tokenization
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode
encoded = np.array([char2int[ch] for ch in text])

In [16]:
encoded[:100]

array([64, 19, 47, 41, 65, 30, 51, 29, 20, 32, 32, 32, 33, 47, 41, 41, 44,
       29,  5, 47, 66, 63, 45, 63, 30,  8, 29, 47, 51, 30, 29, 47, 45, 45,
       29, 47, 45, 63, 72, 30, 68, 29, 30, 36, 30, 51, 44, 29, 16, 43, 19,
       47, 41, 41, 44, 29,  5, 47, 66, 63, 45, 44, 29, 63,  8, 29, 16, 43,
       19, 47, 41, 41, 44, 29, 63, 43, 29, 63, 65,  8, 29, 60, 57, 43, 32,
       57, 47, 44, 38, 32, 32, 74, 36, 30, 51, 44, 65, 19, 63, 43])

In [20]:
# one hot encode
def one_code_encode(arr, labels):
    one_hot = np.zeros((np.multiply(*arr.shape),labels), dtype=np.float32 )
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    one_hot = one_hot.reshape((*arr.shape, labels))
    return one_hot

In [22]:
test_seq = np.array([[1,2,5]])
one_hot = one_code_encode(test_seq, 8)
one_hot

array([[[0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0.]]], dtype=float32)

In [35]:
# # batches
# def batches(arr, batch_size, seq_length):
    total_batch_size = batch_size * seq_length
#     n_batches = len(arr)//total_batch_size
#     arr = arr[:n_batches * total_batch_size]
#     arr = arr.reshape((batch_size, -1))
    
#     # iterate a sequance at a time
#     for n in range(0, arr.shape[1], seq_length):
#         x = arr[:, n:n+seq_length]
#         y = np.zeros_like(x)
        
#         try:
#             y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]            
#         except IndexError:
#             y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]

    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [37]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [38]:
# printing out the first 10 items in a sequence
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[64 19 47 41 65 30 51 29 20 32]
 [ 8 60 43 29 65 19 47 65 29 47]
 [30 43 59 29 60 51 29 47 29  5]
 [ 8 29 65 19 30 29 42 19 63 30]
 [29  8 47 57 29 19 30 51 29 65]
 [42 16  8  8 63 60 43 29 47 43]
 [29 23 43 43 47 29 19 47 59 29]
 [70 13 45 60 43  8 72 44 38 29]]

y
 [[19 47 41 65 30 51 29 20 32 32]
 [60 43 29 65 19 47 65 29 47 65]
 [43 59 29 60 51 29 47 29  5 60]
 [29 65 19 30 29 42 19 63 30  5]
 [ 8 47 57 29 19 30 51 29 65 30]
 [16  8  8 63 60 43 29 47 43 59]
 [23 43 43 47 29 19 47 59 29  8]
 [13 45 60 43  8 72 44 38 29 67]]
