In [140]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [141]:
print(f'Length of the text in charaters: {len(text)}')

Length of the text in charaters: 1115393


In [142]:
n = 1000
print(f'First {n} characters:\n{text[:n]}')

First 1000 characters:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not i

#### Building of the vocabulary based on the given dataset

In [143]:
vocab = sorted(list(set(text)))
vocab_size = len(vocab)
print(f'Vocabulary: {"".join(vocab)}')
print(f'Vocabulary size: {vocab_size}')

Vocabulary: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size: 65


#### Mapping of the characters to integers

In [144]:
ctoi = {c:i for i,c in enumerate(vocab)}
itoc = {i:c for i,c in enumerate(vocab)}
encode = lambda s: [ctoi[c] for c in s]
decode = lambda l: [itoc[i] for i in l]

In [145]:
sample_text = 'sample text'
print(f'Sample text: {sample_text}')
print(f'Encoding example: {encode(sample_text)}')
print(f'Decoding example: {"".join(decode(encode(sample_text)))}')

Sample text: sample text
Encoding example: [57, 39, 51, 54, 50, 43, 1, 58, 43, 62, 58]
Decoding example: sample text


##### Encoding of the entire dataset

In [146]:
import torch
data = torch.tensor(encode(text))
print(f'Data tensor shape: {data.shape}')
print(f'Data tensor content (first 50 elements): {data[:50]}')

Data tensor shape: torch.Size([1115393])
Data tensor content (first 50 elements): tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56])


#### Splitting of the dataset (training & validation)

In [147]:
train_split = 0.9
train_count = int(train_split * len(data))
train_data = data[:train_count]
val_data = data[train_count:]
print(f'Characters in training set: {len(train_data)}')
print(f'Characters in validation set: {len(val_data)}')

Characters in training set: 1003853
Characters in validation set: 111540


#### Structuring of the data splits into blocks

In [148]:
block_size = 8
block_sample_train = train_data[:block_size+1]
print(f'Sample block from training set: {block_sample_train}')

Sample block from training set: tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


#### Structuring of the blocks into batches

In [149]:
torch.manual_seed(1234)
batch_size = 4
block_size = 8

def get_batch(data, batch_size, block_size):
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [150]:
xb, yb = get_batch(data, batch_size, block_size)
print(f'Sample training batch X shape: {xb.shape}')
print(f'Sample training batch Y shape: {yb.shape}')
print(f'Sample training batch X:\n {xb}')
print(f'Sample training batch Y:\n {yb}')
print(f'Sample training batch X (decoded):\n {["".join(decode(x)) for x in xb.tolist()]}')
print(f'Sample training batch Y (decoded):\n {["".join(decode(y)) for y in yb.tolist()]}')

Sample training batch X shape: torch.Size([4, 8])
Sample training batch Y shape: torch.Size([4, 8])
Sample training batch X:
 tensor([[21, 17, 32, 10,  0, 27,  1, 58],
        [ 6,  1, 44, 53, 53, 50, 47, 57],
        [43, 56,  2,  1, 39, 58,  1, 39],
        [53, 59, 56,  1, 43, 63, 43, 57]])
Sample training batch Y:
 tensor([[17, 32, 10,  0, 27,  1, 58, 46],
        [ 1, 44, 53, 53, 50, 47, 57, 46],
        [56,  2,  1, 39, 58,  1, 39,  1],
        [59, 56,  1, 43, 63, 43, 57,  1]])
Sample training batch X (decoded):
 ['IET:\nO t', ', foolis', 'er! at a', 'our eyes']
Sample training batch Y (decoded):
 ['ET:\nO th', ' foolish', 'r! at a ', 'ur eyes ']


#### Definition of a simple Bigram Language Model

In [151]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)
        
    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(input=logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

##### Sample inference through the Bigram Language Model

In [152]:
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(f'Sample batch logits shape: {logits.shape}')

Sample batch logits shape: torch.Size([32, 65])


##### Sample generation of tokens using the Bigram Language Model

In [153]:
idx_start = torch.zeros(size=(1, 1), dtype=torch.long)
idx_pred = m.generate(idx=idx_start, max_new_tokens=50)
idx_pred_decoded = decode(idx_pred[0].tolist())
print(f'Starting token (input for the model): {decode(idx_start[0].tolist())}')
print(f'Generated tokens: {"".join(idx_pred_decoded)}')

Starting token (input for the model): ['\n']
Generated tokens: 
Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLER


##### Optimization of the Bigram Language Model

In [154]:
batch_size = 32
block_size = 8
lr = 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=lr)
for step in range(1000):
    xb, yb = get_batch(train_data, batch_size, block_size)
    logits, loss = m(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(f'Loss after training: {loss.item()}')

Loss after training: 3.794015884399414


In [155]:
idx_start = torch.zeros(size=(1, 1), dtype=torch.long)
idx_pred = m.generate(idx=idx_start, max_new_tokens=50)
idx_pred_decoded = decode(idx_pred[0].tolist())
print(f'Starting token (input for the model): {decode(idx_start[0].tolist())}')
print(f'Generated tokens: {"".join(idx_pred_decoded)}')

Starting token (input for the model): ['\n']
Generated tokens: 
C xtRjRy!
SAAd.
wDWCASlvirTOq-onAGj;pJFq,Sb?suArdl


#### Building up the self-attention mechanism (toy example)

##### Weighted aggregation through matrix multiplication

In [156]:
torch.manual_seed(1234)
block_size = 8
a = torch.tril(input=torch.ones(block_size, block_size))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(low=0, high=10, size=(block_size, 4)).float()
c = a @ b
print(f'Mask matrix:\n {a}')
print(f'Input matrix:\n {b}')
print(f'Multiplication result matrix:\n {c}')

Mask matrix:
 tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
Input matrix:
 tensor([[5., 1., 6., 5.],
        [6., 4., 2., 5.],
        [5., 9., 3., 1.],
        [4., 2., 3., 2.],
        [6., 8., 2., 2.],
        [8., 2., 0., 4.],
        [9., 2., 1., 9.],
        [2., 2., 9., 4.]])
Multiplication result matrix:
 tensor([[5.0000, 1.0000, 6.0000, 5.0000],
        [5.5000, 2.5000, 4.0000, 5.0000],
        [5.3333, 4.6667, 3.6667, 3.6667],
        [5.0

##### Example input initialization
For using the self-attention mechanism, a simple example batch of token sequences is constructed.

In [157]:
batch_size = 4 # Number of blocks in a batch
block_size = 8 # Number of tokens in a block (context)
channels = 2 # Number of dimensions per token embedding
B, T, C = batch_size, block_size, channels
xb = torch.randn(size=(B, T, C))
print(f'Sample batch shape: {xb.shape}')

Sample batch shape: torch.Size([4, 8, 2])


##### Version 1

In [158]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = xb[b, :t+1]
        xbow[b, t] = torch.mean(xprev, 0)
print(f'Bag of words shape: {xbow.shape}')

Bag of words shape: torch.Size([4, 8, 2])


##### Version 2 (basic normalization)

In [159]:
wei = torch.tril(input=torch.ones(size=(T, T)))
wei = wei / wei.sum(1, keepdim=True)
xbow_v2 = wei @ xb
print(f'Output similar to previous version: {torch.allclose(input=xbow_v2, other=xbow)}')

Output similar to previous version: True


##### Version 3 (softmax)

In [160]:
tril = torch.tril(input=torch.ones(size=(T, T)))
wei = torch.zeros(size=(T, T))
wei = torch.masked_fill(wei, tril == 0, float('-inf'))
wei = F.softmax(input=wei, dim=-1)
xbow_v3 = wei @ xb
print(f'Output similar to previous version: {torch.allclose(input=xbow_v3, other=xbow_v2)}')

Output similar to previous version: True


##### Version 4 (attention)

In [170]:
attention_head_size = 16
query_layer = nn.Linear(in_features=C, out_features=attention_head_size, bias=False)
key_layer = nn.Linear(in_features=C, out_features=attention_head_size, bias=False)
value_layer = nn.Linear(in_features=C, out_features=attention_head_size, bias=False)
query = query_layer(xb) # (B, T, C) = (4, 8, 16)
key = key_layer(xb) # (B, T, C) = (4, 8, 16)
wei = query @ key.transpose(-2, -1)
tril = torch.tril(input=torch.ones(size=(T, T)))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(input=wei, dim=-1) # (B, T, T) = (4, 8, 8)
value = value_layer(xb)
out = wei @ value
print(f'Output shape: {out.shape}')
print(f'Output content (0th batch):\n {out[0]}')

Output shape: torch.Size([4, 8, 16])
Output content (0th batch):
 tensor([[ 0.2605,  1.3034,  0.6661,  0.9853, -1.2583, -0.3623, -0.8652,  0.3540,
          0.6784, -0.3922,  0.4535,  0.2557,  0.1285, -0.1088,  0.6791,  0.4476],
        [ 0.3575,  0.1789,  0.3514,  0.7294, -0.2864, -0.3199, -0.2846,  0.7067,
         -0.2138,  0.3514,  0.5947,  0.2925,  0.5467, -0.3664,  0.2156,  0.3286],
        [ 0.2889,  0.2966,  0.3371,  0.6484, -0.3675, -0.2753, -0.3153,  0.5503,
         -0.0647,  0.2000,  0.4833,  0.2419,  0.4069, -0.2757,  0.2419,  0.2926],
        [ 0.3745,  0.3949,  0.4407,  0.8444, -0.4857, -0.3580, -0.4145,  0.7119,
         -0.0765,  0.2535,  0.6266,  0.3140,  0.5250, -0.3559,  0.3182,  0.3811],
        [ 0.0806,  0.2555,  0.1544,  0.2476, -0.2571, -0.0958, -0.1848,  0.1297,
          0.1048, -0.0397,  0.1377,  0.0737,  0.0737, -0.0535,  0.1443,  0.1122],
        [ 0.2712,  0.2604,  0.3102,  0.6016, -0.3289, -0.2564, -0.2858,  0.5190,
         -0.0735,  0.1977,  0.4534,  0