# LLMs from dummies - Part 1

## Initialize

In [None]:
# Install packages
! pip install Levenshtein
! pip install bpe

Collecting Levenshtein
  Downloading Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.5/172.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=2.3.0 (from Levenshtein)
  Downloading rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.21.1 rapidfuzz-3.1.1
Collecting bpe
  Downloading bpe-1.0-py3-none-any.whl (6.8 kB)
Collecting hypothesis (from bpe)
  Downloading hypothesis-6.81.2-py3-none-any.whl (414 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m414.8/414.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting mypy (from bpe)
  Downloading mypy-1.4.1-cp310-cp310-manylinux_2_17_x86_64

In [None]:
import os
import sys
import time
import warnings
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import requests

from Levenshtein import distance
from bpe import Encoder

In [None]:
# Device for training
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): 'split' is not used in the visible part of the notebook — presumably selects the data split later; confirm
split = 'train'

# Training parameters
learning_rate = 3e-4
batch_size = 64
max_iters = 5000              # Maximum training iterations
eval_interval = 200           # Evaluate model every 'eval_interval' iterations in the training loop
eval_iters = 100              # When evaluating, approximate loss using 'eval_iters' batches

# Architecture parameters
max_vocab_size = 256          # Maximum vocabulary size
vocab_size = max_vocab_size   # Real vocabulary size (e.g. BPE has a variable length, so it can be less than 'max_vocab_size')
block_size = 16               # Context length for predictions
n_embd = 32                   # Embedding size
num_heads = 2                 # Number of heads in multi-head attention
n_layer = 2                   # Number of Blocks
ff_scale_factor = 4           # Note: the '4' magic number is from the paper: equation 2 uses d_model=512 and d_ff=2048
# NOTE(review): the original comment on 'dropout' also carried a stray "10.788929 M parameters"
# remark, presumably left over from a different configuration — confirm before relying on it.
dropout = 0.0                 # Dropout rate (regularization)

# Size of each attention head; the assert guarantees n_embd splits evenly across the heads
head_size = n_embd // num_heads
assert (num_heads * head_size) == n_embd

In [None]:
def latex_matrix(a):
    """Return a LaTeX ``matrix`` for a numpy array or a torch tensor.

    The matrix is built from the actual values rather than by parsing
    ``str(a)``: the textual form of a numpy array separates values with
    spaces only (``[1. 0. 0.]``), which the previous string-stripping
    approach fused into a single number.

    Args:
        a: Scalar, 1-D or 2-D numeric array-like (numpy array, torch tensor,
            or nested list). A 1-D input is rendered as a single row.

    Returns:
        A multi-line string: ``\\left[\\begin{matrix}``, one line per row
        with cells joined by `` & `` and terminated by `` \\\\``, then
        ``\\end{matrix}\\right]``. Values are formatted with ``%g``, so
        ``1.0`` is rendered as ``1`` and ``0.95`` as ``0.95``.

    Raises:
        ValueError: If the input has more than two dimensions.
    """
    # Torch tensors expose `detach`; bring them to host memory before converting.
    if hasattr(a, 'detach'):
        a = a.detach().cpu().numpy()
    arr = np.asarray(a)
    if arr.ndim > 2:
        raise ValueError('matrix can at most display two dimensions')
    # `tolist()` yields plain Python numbers, which all accept the 'g' format.
    rows = np.atleast_2d(arr).tolist()
    rv = [r'\left[\begin{matrix}']
    rv += ['  ' + ' & '.join(f'{value:g}' for value in row) + r' \\' for row in rows]
    rv += [r'\end{matrix}\right]']
    return '\n'.join(rv)

## Program for literal translation

In [None]:
# French -> English lookup table for the toy word-by-word translator.
# Insertion order is significant: the K and V matrices built later iterate
# over `dictionary.keys()` / `dictionary.values()` in this order.
dictionary = {
    'le': 'the',
    'chat': 'cat',
    'est': 'is',
    'sous': 'under',
    'la': 'the',
    'table': 'table',
}

In [None]:
def tokenize(text):
    """Split a sentence into tokens (words) on runs of whitespace.

    Returns a list of strings; an empty or all-whitespace input gives ``[]``.
    """
    tokens = text.split()
    return tokens

def translate(sentence):
    """Translate a sentence word by word using `dictionary`.

    Every translated word is followed by a single space, so a non-empty
    result ends with a trailing space. A token that is not a key of
    `dictionary` raises ``KeyError``.
    """
    words = (dictionary[token] for token in tokenize(sentence))
    return ''.join(word + ' ' for word in words)

In [None]:
translate("le chat est sous la table")

'the cat is under the table '

### Improvement: What if the 'key' is not in the dictionary?

In [None]:
def find_closest_key(query):
    """Return the key of `dictionary` with the smallest edit distance to `query`.

    Ties are resolved in favour of the key that appears first in
    `dictionary`; an empty dictionary yields ``None``.
    """
    # `min` keeps the first of several equally small candidates, which
    # matches a left-to-right scan that only updates on a strictly smaller distance.
    return min(dictionary, key=lambda key: distance(query, key), default=None)


def translate(sentence):
    """Translate a sentence, mapping each token to its closest dictionary key.

    Unlike exact lookup, unknown words are replaced by the nearest key
    (see `find_closest_key`). Each translated word is followed by a space,
    so a non-empty result ends with a trailing space.
    """
    pieces = []
    for query in tokenize(sentence):
        closest = find_closest_key(query)
        pieces.append(dictionary[closest] + ' ')
    return ''.join(pieces)

In [None]:
translate("tables")

'table '

## Convert to Neural Network

### Define "vocabularies"

In [None]:
# Input vocabulary: the distinct source words (dictionary keys), sorted
vocabulary_in = sorted(set(dictionary))
print(f"Vocabulary input ({len(vocabulary_in)}): {vocabulary_in}")

# Output vocabulary: the distinct target words (dictionary values), sorted
vocabulary_out = sorted(set(dictionary.values()))
print(f"Vocabulary output ({len(vocabulary_out)}): {vocabulary_out}")

Vocabulary input (6): ['chat', 'est', 'la', 'le', 'sous', 'table']
Vocabulary output (5): ['cat', 'is', 'table', 'the', 'under']


### Encode tokens using "one hot" encoding

In [None]:
# Convert to one hot encoding
def encode_one_hot(vocabulary, verbose=True):
    """Build a one-hot encoding for every token of a vocabulary.

    Args:
        vocabulary: Sized, ordered collection of tokens. The position of a
            token in this collection is the index of its "hot" component.
        verbose: When true (the default, matching the original behaviour),
            print each token together with its vector.

    Returns:
        dict mapping each token to a 1-D float tensor of length
        ``len(vocabulary)`` that is 1 at the token's index and 0 elsewhere.
    """
    size = len(vocabulary)
    one_hot = {}
    for i, key in enumerate(vocabulary):
        one_hot_vector = torch.zeros(size)
        one_hot_vector[i] = 1
        one_hot[key] = one_hot_vector
        if verbose:
            print(f"{key}\t: {one_hot_vector}")
    return one_hot

In [None]:
one_hot_in = encode_one_hot(vocabulary_in)

chat	: tensor([1., 0., 0., 0., 0., 0.])
est	: tensor([0., 1., 0., 0., 0., 0.])
la	: tensor([0., 0., 1., 0., 0., 0.])
le	: tensor([0., 0., 0., 1., 0., 0.])
sous	: tensor([0., 0., 0., 0., 1., 0.])
table	: tensor([0., 0., 0., 0., 0., 1.])


In [None]:
# # Show vectors of one hot encoded tokens

# for k, v in one_hot_in.items():
#     print("$$ E_{", k ,"} = " , latex_matrix(v), "$$")

$$ E_{ chat } =  \left[\begin{matrix}
  1 & 0 & 0 & 0 & 0 & 0\\
\end{matrix}\right] $$
$$ E_{ est } =  \left[\begin{matrix}
  0 & 1 & 0 & 0 & 0 & 0\\
\end{matrix}\right] $$
$$ E_{ la } =  \left[\begin{matrix}
  0 & 0 & 1 & 0 & 0 & 0\\
\end{matrix}\right] $$
$$ E_{ le } =  \left[\begin{matrix}
  0 & 0 & 0 & 1 & 0 & 0\\
\end{matrix}\right] $$
$$ E_{ sous } =  \left[\begin{matrix}
  0 & 0 & 0 & 0 & 1 & 0\\
\end{matrix}\right] $$
$$ E_{ table } =  \left[\begin{matrix}
  0 & 0 & 0 & 0 & 0 & 1\\
\end{matrix}\right] $$

In [None]:
# Same for output vocabulary
one_hot_out = encode_one_hot(vocabulary_out)

cat	: tensor([1., 0., 0., 0., 0.])
is	: tensor([0., 1., 0., 0., 0.])
table	: tensor([0., 0., 1., 0., 0.])
the	: tensor([0., 0., 0., 1., 0.])
under	: tensor([0., 0., 0., 0., 1.])


### Let's create a 'dictionary' using matrix multiplication

In [None]:
# Key matrix: one row per dictionary entry (in dictionary order),
# each row being the one-hot vector of that entry's source word.
K = torch.stack([one_hot_in[word] for word in dictionary])
K

tensor([[0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1.]])

In [None]:
# Value matrix: one row per dictionary entry (same order as K),
# each row being the one-hot vector of that entry's translated word.
V = torch.stack([one_hot_out[word] for word in dictionary.values()])
V

tensor([[0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.]])

In [None]:
# Example of looking for a query string in a dictionary
q = one_hot_in['sous']
print("Query token     : ", q)
# q @ K.T scores the query against every row of K: the result is 1 at the matching dictionary entry
print("Select key (K)  : ", q @ K.T)
# Multiplying those scores by V picks the value row of the matching entry (here: one-hot of 'under')
print("Select value (V): ", q @ K.T @ V)

Query token     :  tensor([0., 0., 0., 0., 1., 0.])
Select key (K)  :  tensor([0., 0., 0., 1., 0., 0.])
Select value (V):  tensor([0., 0., 0., 0., 1.])


Query vector, K matrix, and V matrix:

$$
q = \left[\begin{matrix}
  0 & 0 & 0 & 0 & 1 & 0\\
\end{matrix}\right]
;
K = \left[\begin{matrix}
  0 & 0 & 0 & 1 & 0 & 0\\
  1 & 0 & 0 & 0 & 0 & 0\\
  0 & 1 & 0 & 0 & 0 & 0\\
  0 & 0 & 0 & 0 & 1 & 0\\
  0 & 0 & 1 & 0 & 0 & 0\\
  0 & 0 & 0 & 0 & 0 & 1\\
\end{matrix}\right]
;
V = \left[\begin{matrix}
  0 & 0 & 0 & 1 & 0\\
  1 & 0 & 0 & 0 & 0\\
  0 & 1 & 0 & 0 & 0\\
  0 & 0 & 0 & 0 & 1\\
  0 & 0 & 0 & 1 & 0\\
  0 & 0 & 1 & 0 & 0\\
\end{matrix}\right]
$$

The operation $q . K^T . V$ allows us to build a dictionary-like structure from a set of vectors

This is an example of how to select the value for a query:

$$
q . K^T . V =
\left[\begin{matrix}
  0 & 0 & 0 & 0 & 1 & 0\\
\end{matrix}\right]
.
\left[\begin{matrix}
  0 & 1 & 0 & 0 & 0 & 0\\
  0 & 0 & 1 & 0 & 0 & 0\\
  0 & 0 & 0 & 0 & 1 & 0\\
  1 & 0 & 0 & 0 & 0 & 0\\
  0 & 0 & 0 & 1 & 0 & 0\\
  0 & 0 & 0 & 0 & 0 & 1\\
\end{matrix}\right]
.
\left[\begin{matrix}
  0 & 0 & 0 & 1 & 0\\
  1 & 0 & 0 & 0 & 0\\
  0 & 1 & 0 & 0 & 0\\
  0 & 0 & 0 & 0 & 1\\
  0 & 0 & 0 & 1 & 0\\
  0 & 0 & 1 & 0 & 0\\
\end{matrix}\right]
$$


$$
q . K^T . V =
\hspace{2cm}
\left[\begin{matrix}
  0 & 0 & 0 & 1 & 0 & 0\\
\end{matrix}\right]
\hspace{1.5cm}
.
\left[\begin{matrix}
  0 & 0 & 0 & 1 & 0\\
  1 & 0 & 0 & 0 & 0\\
  0 & 1 & 0 & 0 & 0\\
  0 & 0 & 0 & 0 & 1\\
  0 & 0 & 0 & 1 & 0\\
  0 & 0 & 1 & 0 & 0\\
\end{matrix}\right]
$$


$$
q . K^T . V
=
\hspace{3.5cm}
\left[\begin{matrix}
0 & 0 & 0 & 0 & 1\\
\end{matrix}\right]
\hspace{3.5cm}
$$

### Decode one hot vector to a token

In [None]:
def decode_one_hot(one_hot, vector):
    """Decode a vector back into the best-matching token.

    Args:
        one_hot: dict mapping each token to its encoding vector.
        vector: 1-D tensor to decode.

    Returns:
        The token whose encoding has the largest dot product with `vector`
        (the first such token on ties), or ``None`` when no dot product is
        strictly greater than 0.
    """
    winner, top_score = None, 0
    for token, encoding in one_hot.items():
        # Dot product as similarity score; for unit-length vectors this equals the cosine similarity
        score = torch.dot(vector, encoding)
        if score > top_score:
            winner, top_score = token, score
    return winner

### Now we have a translate function using matrices and vectors

In [None]:
def translate(sentence):
    """Translate a sentence using the K and V matrices, one token at a time.

    Each token's one-hot vector is multiplied by ``K.T`` and then ``V`` and
    the result is decoded back to a word. Every word is followed by a space,
    so a non-empty result ends with a trailing space. Tokens missing from
    `one_hot_in` raise ``KeyError``.
    """
    decoded = []
    for token_in in tokenize(sentence):
        lookup = one_hot_in[token_in] @ K.T @ V
        decoded.append(decode_one_hot(one_hot_out, lookup) + ' ')
    return ''.join(decoded)

In [None]:
# Let's check that it works:
translate("le chat est sous la table")

'the cat is under the table '

### ... a few more tweaks towards "Attention"

### Similar tokens => similar vectors: Adding a softmax

In [None]:
# print('E_{table} = ', latex_matrix(one_hot_in['table']))

$$
E_{table} =  \left[\begin{matrix}
  0 & 0 & 0 & 0 & 0 & 1\\
\end{matrix}\right]
$$

$$
E_{tables} =  \left[\begin{matrix}
  0 & 0 & 0 & 0 & 0 & 0.95\\
\end{matrix}\right]
$$


Our new equation is:
$$
softmax(q . K^T) . V
$$

We scale by the square root of the dimensionality $d$ of the query vector, and we get:

$$
softmax\left( \frac{q . K^T}{\sqrt{d}} \right) . V
$$

In [None]:
def translate(sentence):
    """Translate using the K and V matrices with a softmax over the key scores.

    For each token, the scores ``q @ K.T`` are turned into weights with a
    softmax (over the single axis of the 1-D score vector), the weights mix
    the rows of ``V``, and the mixture is decoded to the best-matching word.
    Note that the scores are not divided by sqrt(d) here. Every word is
    followed by a space, so a non-empty result ends with a trailing space.
    """
    words = []
    for token_in in tokenize(sentence):
        scores = one_hot_in[token_in] @ K.T
        weights = torch.softmax(scores, dim=0)
        words.append(decode_one_hot(one_hot_out, weights @ V) + ' ')
    return ''.join(words)

translate("le chat est sous la table")

'the cat is under the table '

### Improvement: All queries in parallel. The "Q" matrix

In [None]:
# Example sentence whose tokens become the rows of the query matrix
sentence = "le chat est sous la table"

# Q stacks one one-hot query vector per token, in sentence order (one row per token)
Q = torch.stack([one_hot_in[token] for token in tokenize(sentence)])
# print(latex_matrix(Q))

$$
Q = \left[\begin{matrix}
  0 & 0 & 0 & 1 & 0 & 0 \\
  1 & 0 & 0 & 0 & 0 & 0 \\
  0 & 1 & 0 & 0 & 0 & 0 \\
  0 & 0 & 0 & 0 & 1 & 0 \\
  0 & 0 & 1 & 0 & 0 & 0 \\
  0 & 0 & 0 & 0 & 0 & 1 \\
\end{matrix}\right]
$$


$$
Attention(Q, K, V) = softmax\left( \frac{Q . K^T}{\sqrt{d}} \right) V
$$

In [None]:
def translate(sentence):
    """Translate using a single matrix multiplication instead of a 'for' loop.

    All token queries are stacked into ``Q`` (one row per token) and
    ``softmax(Q @ K.T) @ V`` is evaluated once for the whole sentence.

    The softmax is taken over the last axis (the keys), so every query row
    gets its own weight distribution. Normalizing over axis 0 would mix
    scores across the different tokens of the sentence; that only happens to
    work when the sentence lists the dictionary keys in order, and fails for
    e.g. a single-word sentence.

    Returns:
        The translated words joined by single spaces (no trailing space);
        an empty string for a sentence without tokens.
    """
    tokens = tokenize(sentence)
    if not tokens:
        # torch.stack cannot stack an empty list
        return ''
    Q = torch.stack([one_hot_in[token] for token in tokens])
    weights = torch.softmax(Q @ K.T, dim=-1)   # one distribution over keys per query
    out = weights @ V
    return ' '.join(decode_one_hot(one_hot_out, o) for o in out)

translate("le chat est sous la table")

'the cat is under the table'

### Making attention more powerful

$$
Attention(Q, K, V)  => Attention(Q . W^Q, K . W^K, V . W^V)
$$

In [None]:
class Head(nn.Module):
    """Single self-attention head.

    Projects the input with three bias-free linear layers (key, query,
    value), each mapping `n_embd` features to `n_embd` features, and returns
    the attention-weighted combination of the values.
    """

    def __init__(self):
        super().__init__()
        self.key = nn.Linear(n_embd, n_embd, bias=False)
        self.query = nn.Linear(n_embd, n_embd, bias=False)
        self.value = nn.Linear(n_embd, n_embd, bias=False)

    def forward(self, x):
        """Apply attention to `x`; the last two axes are treated as (tokens, features)."""
        keys = self.key(x)
        queries = self.query(x)
        values = self.value(x)
        # Scaled scores: queries times transposed keys, multiplied by 1/sqrt(key feature size)
        scale = keys.shape[-1] ** -0.5
        scores = torch.matmul(queries, keys.transpose(-2, -1)) * scale
        # Normalize every query's scores over the keys (last dimension)
        weights = F.softmax(scores, dim=-1)
        # Weighted sum of the values
        return torch.matmul(weights, values)

## Improving tokenization: BPE

Example: Tokenizing the "Shakespeare" dataset

In [None]:
# IMPORTANT: Downloads the dataset from `shakespeare_url` into `datasets_dir` (only if not already there)
datasets_dir = Path(".")
shakespeare_data = datasets_dir / "shakespeare.txt"
shakespeare_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'

if not shakespeare_data.exists():
    # Fetch before opening the file, and fail loudly on HTTP errors: otherwise a failed
    # download would leave an empty or error-page file that the `exists()` check keeps reusing.
    response = requests.get(shakespeare_url, timeout=30)
    response.raise_for_status()
    with open(shakespeare_data, 'w') as f:
        f.write(response.text)

In [None]:
# Read the whole corpus into memory and preview its first 300 characters
text = shakespeare_data.read_text()
print(text[:300] + "...")

# Fit the BPE model on the corpus, one line per item: i.e. use the data to build the translation table
encoder = Encoder()  # Using default parameters: vocab_size=8192
encoder.fit(text.split('\n'))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us...


In [None]:
# Show the first 100 entries of the BPE vocabulary, formatted as 'token' : id and joined into one string
', '.join( [f"'{k}' : {v}" for k, v in encoder.bpe_vocab.items()][:100] )

"'__sow' : 6553, '__eow' : 6554, 'e' : 6555, 's' : 6556, 'i' : 6557, 'r' : 6558, 'n' : 6559, 't' : 6560, 'a' : 6561, 'o' : 6562, 'l' : 6563, 'd' : 6564, 'c' : 6565, 'u' : 6566, 'g' : 6567, 'p' : 6568, 'h' : 6569, 'm' : 6570, 'in' : 6571, 'er' : 6572, 'es' : 6573, 'b' : 6574, 'ed' : 6575, 'f' : 6576, 'ng' : 6577, 'y' : 6578, 're' : 6579, 'st' : 6580, 'en' : 6581, 'te' : 6582, 'w' : 6583, 'v' : 6584, 'le' : 6585, 'ti' : 6586, 'on' : 6587, 'nt' : 6588, 'ar' : 6589, 'an' : 6590, 'un' : 6591, 'k' : 6592, 'co' : 6593, 'ri' : 6594, 'is' : 6595, 'de' : 6596, 'at' : 6597, 'ra' : 6598, 'or' : 6599, 'se' : 6600, 'li' : 6601, 'ne' : 6602, 'he' : 6603, 'ou' : 6604, 've' : 6605, 'ss' : 6606, 'di' : 6607, 'al' : 6608, 'it' : 6609, 'ea' : 6610, 'ns' : 6611, 'th' : 6612, 'ro' : 6613, 'pe' : 6614, 'ur' : 6615, 'et' : 6616, 'ly' : 6617, 'el' : 6618, 'me' : 6619, 'ta' : 6620, 'la' : 6621, 'rs' : 6622, 'io' : 6623, 'ch' : 6624, 'nd' : 6625, 'ce' : 6626, 'us' : 6627, 'll' : 6628, 'tr' : 6629, 'sh' : 6630, '