In [1]:
pip install numpy requests torch tiktoken matplotlib pandas

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp311-cp311-macosx_11_0_arm64.whl (907 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m907.0/907.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn

In [3]:
# Hyperparameters
batch_size = 4  # How many batches per training step
context_length = 16  # Length of the token chunk each batch
d_model = 64  # The vector size of the token embeddings
num_layers = 8  # Number of transformer blocks
num_heads = 4  # Number of heads in Multi-head attention # d_model / num_heads = head_size
learning_rate = 1e-3  # 0.001
dropout = 0.1 # Dropout rate
max_iters = 5000  # Total of training iterations
eval_interval = 50  # How often to evaluate the model
eval_iters = 20  # How many iterations to average the loss over when evaluating the model
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Instead of using the cpu, we'll use the GPU if it's available.

TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)

<torch._C.Generator at 0x17a8a1130>

In [5]:
# download a sample txt file from https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt
if not os.path.exists('sales_textbook.txt'):
    url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt'
    with open('sales_textbook.txt', 'w') as f:
        f.write(requests.get(url).text)

with open('sales_textbook.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [8]:
# Using TikToken to tokenize the source text
encoding = tiktoken.get_encoding("cl100k_base")
tokenized_text = encoding.encode(text) # size of tokenized source text is 77,919
tokenized_text = torch.tensor(tokenized_text, dtype = torch.long)
vocab_size = len(set(tokenized_text)) # size of vocabulary is 3,771
max_token_value = max(tokenized_text)

print(f"Tokenized text size: {len(tokenized_text)}")
print(f"Vocabulary size: {vocab_size}")
print(f"The maximum value in the tokenized text is: {max_token_value}")

Tokenized text size: 77919
Vocabulary size: 77919
The maximum value in the tokenized text is: 100069


In [9]:
# Split train and validation
split_idx = int(len(tokenized_text) * 0.8)
train_data = tokenized_text[:split_idx]
val_data = tokenized_text[split_idx:]

# Prepare data for training batch
# Prepare data for training batch
data = train_data
idxs = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
x_batch = torch.stack([data[idx:idx + context_length] for idx in idxs])
y_batch = torch.stack([data[idx + 1:idx + context_length + 1] for idx in idxs])
print(x_batch.shape, x_batch.shape)

torch.Size([4, 16]) torch.Size([4, 16])


In [10]:
# Define Token Embedding look-up table
token_embedding_lookup_table = nn.Embedding(max_token_value, d_model)

# Get X and Y embedding
x = token_embedding_lookup_table(x_batch.data)
y = token_embedding_lookup_table(y_batch.data)

In [12]:
# Define Position Encoding look-up table
position_encoding_lookup_table = torch.zeros(context_length, d_model) # initial with zeros with shape (context_length, d_model)
position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)
# apply the sine & cosine
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
position_encoding_lookup_table[:, 0::2] = torch.sin(position * div_term)
position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)
position_encoding_lookup_table = position_encoding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1) #add batch to the first dimension

print("Position Encoding Look-up Table: ", position_encoding_lookup_table.shape)

Position Encoding Look-up Table:  torch.Size([4, 16, 64])


In [13]:
# Add positional encoding into the input embedding vector
input_embedding_x = x + position_encoding_lookup_table # [4, 16, 64] [batch_size, context_length, d_model]
input_embedding_y = y + position_encoding_lookup_table

X = input_embedding_x

x_plot = input_embedding_x[0].detach().cpu().numpy()
print("Final Input Embedding of x: \n", pd.DataFrame(x_plot))

Final Input Embedding of x: 
           0         1         2         3         4         5         6   \
0  -0.299130  0.949924 -0.631772  0.939232 -2.082235  1.398602 -0.240277   
1   0.758386  0.999339  0.238804  2.130460 -0.736705  1.686566  0.824980   
2  -0.922071 -2.250749 -0.556319  1.222940  2.109022 -0.400113 -0.138789   
3   0.306074 -1.573041  2.231532  0.032769  1.850678 -0.120396 -0.011787   
4  -0.954794 -0.976841  0.139319 -0.865764 -0.193849 -1.655313  1.521742   
5   0.679198  0.506228  0.353623 -0.863025 -2.570290 -1.323900  0.371723   
6  -0.323705  2.691280 -2.467007  0.717207  1.387555 -2.344249  0.049187   
7   0.476250  1.532691 -0.440719  0.884680 -0.049779 -0.415280 -0.095141   
8  -1.377338 -0.241152  0.619716  0.535313 -2.747910 -0.546994 -1.181236   
9  -0.405825  0.559437  1.838797  0.457612 -2.470442  1.303259 -0.322104   
10 -0.724732 -1.171013 -0.256460 -1.604869 -1.592075  0.130025  0.570488   
11 -0.449413 -0.703517  0.179154  0.373091 -0.182106  1.93

In [14]:
# Prepare Query, Key, Value for Multi-head Attention

query = key = value = X # [4, 16, 64] [batch_size, context_length, d_model]

# Define Query, Key, Value weight matrices
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)

Q = Wq(query) #[4, 16, 64]
Q = Q.view(batch_size, -1, num_heads, d_model // num_heads)  #[4, 16, 4, 16]

K = Wk(key) #[4, 16, 64]
K = K.view(batch_size, -1, num_heads, d_model // num_heads)  #[4, 16, 4, 16]

V = Wv(value) #[4, 16, 64]
V = V.view(batch_size, -1, num_heads, d_model // num_heads)  #[4, 16, 4, 16]

In [15]:
# Transpose q,k,v from [batch_size, context_length, num_heads, head_size] to [batch_size, num_heads, context_length, head_size]
# The reason is that treat each batch with "num_heads" as its first dimension.
Q = Q.transpose(1, 2) # [4, 4, 16, 16]
K = K.transpose(1, 2) # [4, 4, 16, 16]
V = V.transpose(1, 2) # [4, 4, 16, 16]

In [16]:
# Calculate the attention score betwee Q and K^T
attention_score = torch.matmul(Q, K.transpose(-2, -1))

In [17]:
# Then Scale the attention score by the square root of the head size
attention_score = attention_score / math.sqrt(d_model // num_heads)

In [18]:
attention_score = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_model // num_heads) # [4, 4, 16, 16] #[4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]
print(pd.DataFrame(attention_score[0][0].detach().cpu().numpy()))

          0         1         2         3         4         5         6   \
0   0.021498  0.759700  1.114896  0.861203  0.996177  0.087225  0.020397   
1  -0.186354  0.423521  0.930679  0.397063  0.689583 -0.023029 -1.295207   
2  -0.116987 -0.521399 -0.102028 -0.048671 -0.094188  0.211903  0.785083   
3   0.677379  0.094782  0.187630  1.087047  0.387744  0.641712  1.120173   
4   0.631352 -0.280073 -0.932875 -0.581271 -0.206257  0.127697 -0.102794   
5  -0.327146  0.068143  0.148191 -0.391794  0.132223  0.017728  0.268415   
6  -0.444339  0.258554  0.455449 -0.077464  0.134794 -0.053327  0.101351   
7  -0.245799  0.580801  0.932249  0.135226  0.357541 -0.006203  0.136920   
8  -0.303093  0.018890  0.015307  0.294602  0.369652 -0.219553 -0.192504   
9  -0.136668  0.443592  0.470723 -0.184153  0.221327 -0.058326 -0.335395   
10  0.694354  0.946938  0.422883  0.789916  0.990797 -0.172340  0.704143   
11  0.352159  0.977470  1.040248  1.317057  1.359986  0.120361  0.799884   
12 -0.238808

In [19]:
# Apply Mask to attention scores
attention_score = attention_score.masked_fill(torch.triu(torch.ones(attention_score.shape[-2:]), diagonal=1).bool(), float('-inf')) #[4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]
print(pd.DataFrame(attention_score[0][0].detach().cpu().numpy()))

          0         1         2         3         4         5         6   \
0   0.021498      -inf      -inf      -inf      -inf      -inf      -inf   
1  -0.186354  0.423521      -inf      -inf      -inf      -inf      -inf   
2  -0.116987 -0.521399 -0.102028      -inf      -inf      -inf      -inf   
3   0.677379  0.094782  0.187630  1.087047      -inf      -inf      -inf   
4   0.631352 -0.280073 -0.932875 -0.581271 -0.206257      -inf      -inf   
5  -0.327146  0.068143  0.148191 -0.391794  0.132223  0.017728      -inf   
6  -0.444339  0.258554  0.455449 -0.077464  0.134794 -0.053327  0.101351   
7  -0.245799  0.580801  0.932249  0.135226  0.357541 -0.006203  0.136920   
8  -0.303093  0.018890  0.015307  0.294602  0.369652 -0.219553 -0.192504   
9  -0.136668  0.443592  0.470723 -0.184153  0.221327 -0.058326 -0.335395   
10  0.694354  0.946938  0.422883  0.789916  0.990797 -0.172340  0.704143   
11  0.352159  0.977470  1.040248  1.317057  1.359986  0.120361  0.799884   
12 -0.238808

In [21]:
# Softmax the attention score
attention_score = torch.softmax(attention_score, dim=-1)  #[4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]
print(pd.DataFrame(attention_score[0][0].detach().cpu().numpy()))

          0         1         2         3         4         5         6   \
0   0.153417  0.056439  0.056439  0.056439  0.056439  0.056439  0.056439   
1   0.082039  0.110280  0.057691  0.057691  0.057691  0.057691  0.057691   
2   0.084435  0.074588  0.084910  0.058159  0.058159  0.058159  0.058159   
3   0.076467  0.067816  0.068825  0.087754  0.058261  0.058261  0.058261   
4   0.089366  0.069222  0.063753  0.066199  0.070138  0.058302  0.058302   
5   0.066270  0.070397  0.071490  0.065755  0.071264  0.069759  0.058506   
6   0.063675  0.069366  0.071973  0.066089  0.068010  0.066285  0.067676   
7   0.062778  0.068641  0.073386  0.064833  0.066495  0.063969  0.064844   
8   0.064047  0.066245  0.066216  0.068873  0.069746  0.064544  0.064715   
9   0.063119  0.066889  0.067132  0.062903  0.065155  0.063499  0.062285   
10  0.066168  0.068513  0.064288  0.066977  0.068994  0.061678  0.066247   
11  0.062265  0.065606  0.066086  0.068658  0.069135  0.061495  0.064417   
12  0.063096

In [23]:
# Calculate the V attention output
A = torch.matmul(attention_score, V) # [4, 4, 16, 16] [batch_size, num_heads, context_length, head_size]
print(attention_score.shape)

torch.Size([4, 4, 16, 16])


In [24]:
A = A.transpose(1, 2) # [4, 16, 4, 16] [batch_size, context_length, num_heads, head_size]
A = A.reshape(batch_size, -1, d_model) # [4, 16, 64] [batch_size, context_length, d_model]

In [25]:
# Define the output weight matrix
Wo = nn.Linear(d_model, d_model)
output = Wo(A) # [4, 16, 64] [batch_size, context_length, d_model]

print(output.shape)

torch.Size([4, 16, 64])


In [26]:
# Add residual connection
output = output + X

# Add Layer Normalization
layer_norm = nn.LayerNorm(d_model)
output = layer_norm(output)

In [27]:
# Define Feed Forward Network
output = nn.Linear(d_model, d_model * 4)(output)
output = nn.ReLU()(output)
output = nn.Linear(d_model * 4, d_model)(output)
output = torch.dropout(output, p=dropout, train=True)

In [28]:
# Add residual connection
output = output + X
# Add Layer Normalization
layer_norm = nn.LayerNorm(d_model)
output = layer_norm(output)

In [29]:
logits = nn.Linear(d_model, max_token_value)(output)
print(pd.DataFrame(logits[0].detach().cpu().numpy()))

      0         1         2         3         4         5         6       \
0   0.573332  1.015899 -0.222024 -0.508197 -0.539159  0.016217 -0.901942   
1   1.419667 -0.440910  0.260394 -0.632957  0.630532 -1.415623  0.429661   
2   0.487616  0.048311  0.370451 -0.152401 -1.184079  0.517175 -0.095659   
3   0.347335  0.663217 -0.251170 -0.163551 -0.734540  0.343969 -0.334619   
4   0.242071  1.221849  0.048497  0.264043 -0.003944  0.361158 -1.098022   
5  -1.488472 -0.487731 -1.028909 -0.285336 -0.071531  0.363977 -0.032143   
6  -0.350975  0.324540 -0.418890  0.509512 -0.674422  0.128509 -0.785958   
7   0.026156 -0.702553 -0.197620 -0.669560 -0.433044 -0.749262 -0.609470   
8  -0.141884  1.253866 -0.156874 -1.094536 -0.360480  0.602933  0.021170   
9  -0.297196  0.157524  0.435299 -0.918095 -0.514047 -0.618292  0.317165   
10 -0.651161 -0.437216  0.192381 -1.242731 -0.336959  0.537993 -0.729907   
11 -0.581308 -0.284026 -0.127426 -1.490744 -0.619038  0.568174 -0.286749   
12  0.112364

In [30]:
# torch.softmax usually used during inference, during training we use torch.nn.CrossEntropyLoss
# but for illustration purpose, we'll use torch.softmax here
probabilities = torch.softmax(logits, dim=-1)