In [13]:
# Read the whole raw corpus into memory as a single UTF-8 decoded string.
with open('huxley_raw.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [14]:
print(f"lenght of text: {len(text)}")

lenght of text: 1085291


In [15]:
# Eyeball the first 10,000 characters of the raw corpus.
print(text[:10_000])

1
Collected Essays 
by Aldous Huxley 
 
Back Cover: 
 
 All over the English-speaking world cr itics have greeted these essays with such comments as "brilliant. . . 
provocative. . . magnificent." Many find that  Huxley is the finest essayist sin ce Montaigne. It has been said that 
"Mr. Huxley is not only a literary giant, but one of the greatest thinkers of our time." 
 Mr. Huxley's topic is man, the total compass of his facu lties in science, literature, music, religion, art, love, 
sex, speculative thinking and simple bei ng. Here, displayed to the full, is th e astonishing virtuosity of Huxley's 
genius. 
  
 The range of Aldous Huxley's thinking was astonishi ng. His opinions on art were  as original and well-
founded as his discussions of biology or  architecture, poetry, music, or hist ory. As a virtuoso of letters, he was 
unequalled. 
 Born into a famous family with a long intellect ual tradition, Huxley attended Eton and Oxford. His 
reputation as a writer was well-establis 

In [16]:
# The vocabulary is every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# Show the vocabulary as one string, then its size on the following line.
print(''.join(chars), vocab_size, sep='\n')


 !"&'()*+,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz©ÀÆÉÎàâçèéêëïôöùúûü—
104


In [17]:
# Build the lookup tables between characters and integer ids. Ids are the
# positions of the characters in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output a list of integers.

    Each character of `s` is looked up in `stoi`.

    Raises:
        KeyError: if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Each id in `l` is looked up in `itos` and the characters are joined.

    Raises:
        KeyError: if `l` contains an id that is not a key of `itos`.
    """
    return ''.join([itos[i] for i in l])


# Round-trip a sample string to check that decode inverts encode.
print(encode("hii there!"))
print(decode(encode("hii there!")))

[65, 66, 66, 1, 77, 65, 62, 75, 62, 2]
hii there!


In [18]:
import torch

# Encode the entire corpus and hold it as one tensor of 64-bit integer ids.
data = torch.tensor(
    encode(text),
    dtype=torch.int64,
)
print(data.shape, data.dtype)
# Peek at the first 1000 ids.
print(data[:1000])

torch.Size([1085291]) torch.int64
tensor([15,  0, 32, 72, 69, 69, 62, 60, 77, 62, 61,  1, 34, 76, 76, 58, 82, 76,
         1,  0, 59, 82,  1, 30, 69, 61, 72, 78, 76,  1, 37, 78, 81, 69, 62, 82,
         1,  0,  1,  0, 31, 58, 60, 68,  1, 32, 72, 79, 62, 75, 24,  1,  0,  1,
         0,  1, 30, 69, 69,  1, 72, 79, 62, 75,  1, 77, 65, 62,  1, 34, 71, 64,
        69, 66, 76, 65, 11, 76, 73, 62, 58, 68, 66, 71, 64,  1, 80, 72, 75, 69,
        61,  1, 60, 75,  1, 66, 77, 66, 60, 76,  1, 65, 58, 79, 62,  1, 64, 75,
        62, 62, 77, 62, 61,  1, 77, 65, 62, 76, 62,  1, 62, 76, 76, 58, 82, 76,
         1, 80, 66, 77, 65,  1, 76, 78, 60, 65,  1, 60, 72, 70, 70, 62, 71, 77,
        76,  1, 58, 76,  1,  3, 59, 75, 66, 69, 69, 66, 58, 71, 77, 12,  1, 12,
         1, 12,  1,  0, 73, 75, 72, 79, 72, 60, 58, 77, 66, 79, 62, 12,  1, 12,
         1, 12,  1, 70, 58, 64, 71, 66, 63, 66, 60, 62, 71, 77, 12,  3,  1, 42,
        58, 71, 82,  1, 63, 66, 71, 61,  1, 77, 65, 58, 77,  1,  1, 37, 78, 81,
      

In [19]:
# Train on the first 90% of the token stream and hold out the rest for
# validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [20]:
# Context length used for the windowing demo in this cell.
block_size = 8
# One window spans block_size inputs plus one extra token for the last target.
train_data[0:block_size + 1]

tensor([15,  0, 32, 72, 69, 69, 62, 60, 77])

In [21]:
# y is x shifted by one position, so y[t] is the token that follows x[:t+1].
x, y = train_data[:block_size], train_data[1:block_size + 1]
# Walk through every (growing context, next token) pair inside the window.
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([15]) the target: 0
when input is tensor([15,  0]) the target: 32
when input is tensor([15,  0, 32]) the target: 72
when input is tensor([15,  0, 32, 72]) the target: 69
when input is tensor([15,  0, 32, 72, 69]) the target: 69
when input is tensor([15,  0, 32, 72, 69, 69]) the target: 62
when input is tensor([15,  0, 32, 72, 69, 69, 62]) the target: 60
when input is tensor([15,  0, 32, 72, 69, 69, 62, 60]) the target: 77


In [22]:
# Batch geometry read by get_batch.
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for predictions
# Fix the global torch RNG seed so the sampled batch is reproducible.
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random batch of input/target windows from one split.

    `split == 'train'` draws from `train_data`; any other value draws from
    `val_data`. `batch_size` start offsets are drawn with `torch.randint`,
    and each yields a window of `block_size` consecutive elements.

    Returns:
        (x, y): two stacked tensors with one row per sampled offset. `x`
        holds the input windows and `y` holds the same windows shifted one
        position forward in the source, so `y` is the target for `x`.
    """
    source = train_data if split == 'train' else val_data
    # The upper bound keeps the shifted target window inside the source.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show each tensor's label, shape and contents.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')



inputs:
torch.Size([4, 8])
tensor([[73, 69, 62, 76,  1, 77, 65, 62],
        [48, 77, 12,  1, 49, 65, 72, 70],
        [61,  1, 66, 77,  1, 66, 76,  1],
        [66, 71, 10,  1, 71, 72, 77,  1]])
targets:
torch.Size([4, 8])
tensor([[69, 62, 76,  1, 77, 65, 62,  1],
        [77, 12,  1, 49, 65, 72, 70, 58],
        [ 1, 66, 77,  1, 66, 76,  1, 72],
        [71, 10,  1, 71, 72, 77,  1, 77]])
