In [2]:
with open("the-verdict.txt", 'r') as f:
    raw_text = f.read()
print("Total number of charachters: ", len(raw_text))

print(raw_text[:99])

Total number of charachters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [10]:
import re

preprocessed = re.split(r'([,.:;?!_"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [13]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [21]:
vocab = {token:i for i, token in enumerate(all_words)}

In [23]:
for i, token in enumerate(vocab.items()):
    print(token)
    if i == 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [24]:
class Tokenizer:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?!_"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]

        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text= re.sub(r'\s+([,.:;?!_"()\'])', r'\1', text)
        return text

In [25]:
tokenizer = Tokenizer(vocab=vocab)
text = """It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]


In [26]:
tokenizer.decode(ids)


"It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride."

In [30]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|ukw|>"])

vocab = {token: i for i, token in enumerate(all_tokens)}
print(len(vocab.items()))

1132


In [31]:
for _, tokens in enumerate(list(vocab.items())[-5:]):
    print(tokens)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|ukw|>', 1131)


In [37]:
class Tokenizerv2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?!_"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int else "<|ukw|>" for item in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]

        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text= re.sub(r'\s+([,.:;?!_"()\'])', r'\1', text)
        return text

In [38]:
tokenizer = Tokenizerv2(vocab=vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [42]:
ids = tokenizer.encode(text)
ids

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [43]:
tokenizer.decode(ids)

'<|ukw|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|ukw|>.'

In [44]:
! pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl (884 kB)
   ---------------------------------------- 0.0/884.5 kB ? eta -:--:--
   - ------------------------------------- 30.7/884.5 kB 640.0 kB/s eta 0:00:02
   ------------ --------------------------- 286.7/884.5 kB 3.5 MB/s eta 0:00:01
   ----------------------------------- ---- 778.2/884.5 kB 6.1 MB/s eta 0:00:01
   ---------------------------------------- 884.5/884.5 kB 5.6 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.8.0



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [45]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [47]:
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the someunknownpalace."

ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
ids

[15496,
 11,
 466,
 345,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 286,
 262,
 617,
 34680,
 18596,
 558,
 13]

In [48]:
string = tokenizer.decode(ids)
string

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the someunknownpalace.'

In [49]:
enc_text = tokenizer.encode(raw_text)
len(enc_text)

5145

In [51]:
enc_sample = enc_text[50:]
enc_sample

[290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,
 286,
 526,
 383,
 1573,
 11,
 319,
 9074,
 13,
 536,
 5469,
 338,
 11914,
 11,
 33096,
 663,
 4808,
 3808,
 62,
 355,
 996,
 484,
 547,
 12548,
 287,
 281,
 13079,
 410,
 12523,
 286,
 22353,
 13,
 843,
 340,
 373,
 407,
 691,
 262,
 9074,
 13,
 536,
 48819,
 508,
 25722,
 276,
 13,
 11161,
 407,
 262,
 40123,
 18113,


In [52]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

x, y

([290, 4920, 2241, 287], [4920, 2241, 287, 257])

In [61]:
for i in range(1, context_size + 1):
    x = enc_sample[:i]
    y = enc_sample[i]

    print(tokenizer.decode(x), "----->", tokenizer.decode([y]))


 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


In [62]:
! pip install torch




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [66]:
import torch
from torch.utils.data import  DataLoader, Dataset

class GPTDataset(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i: i+max_length]
            target_chunk = token_ids[i+1: i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [67]:
def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataset(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [71]:
dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)
print(next(iter(dataloader)))

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


In [83]:
input_ids = torch.tensor([2,3,5,1])

vocab_size = 50257
dimension = 256

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, dimension)

In [85]:
max_length = 4
dataloader = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)
inputs, target = next(data_iter)

print("token \n", inputs)
print("target \n", target)

token 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
target 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [88]:
token_embeddings = embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

In [90]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_size, dimension)

pos_embeddings = pos_embedding_layer(torch.arange(max_length))
pos_embeddings.shape

torch.Size([4, 256])

In [120]:
input_embeddings = token_embeddings + pos_embeddings
input_embeddings.shape

torch.Size([8, 4, 256])

In [156]:
torch.manual_seed(123)
inputs = torch.rand(6,3)
inputs

tensor([[0.2961, 0.5166, 0.2517],
        [0.6886, 0.0740, 0.8665],
        [0.1366, 0.1025, 0.1841],
        [0.7264, 0.3153, 0.6871],
        [0.0756, 0.1966, 0.3164],
        [0.4017, 0.1186, 0.8274]])

In [157]:
query = inputs[1]
attention_score_2 = torch.empty(inputs.shape[0])

for i, x_i in enumerate(inputs):
    attention_score_2[i] =  torch.dot(x_i, query)

attention_score_2

tensor([0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023])

In [158]:
normalized_attention_score = attention_score_2 / attention_score_2.sum()
normalized_attention_score

tensor([0.1043, 0.2788, 0.0592, 0.2535, 0.0772, 0.2271])

In [159]:
def naive_softmax(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)

normalized_attention_score = naive_softmax(attention_score_2)
normalized_attention_score

tensor([0.1174, 0.2536, 0.0962, 0.2268, 0.1042, 0.2019])

In [160]:
normalized_attention_score = torch.softmax(attention_score_2, dim=0)
normalized_attention_score

tensor([0.1174, 0.2536, 0.0962, 0.2268, 0.1042, 0.2019])

In [161]:
context_vector_2 = torch.zeros(query.shape)
for i, inp in enumerate(inputs):
    context_vector_2 += normalized_attention_score[i] * inp
context_vector_2

tensor([0.4762, 0.2052, 0.6228])

In [163]:
attention_scores = torch.empty(6,6)

for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attention_scores[i, j] = torch.dot(x_i, x_j)

attention_scores

tensor([[0.4179, 0.4602, 0.1397, 0.5509, 0.2036, 0.3884],
        [0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023],
        [0.1397, 0.2611, 0.0630, 0.2580, 0.0887, 0.2193],
        [0.5509, 1.1189, 0.2580, 1.0992, 0.3343, 0.8977],
        [0.2036, 0.3408, 0.0887, 0.3343, 0.1445, 0.3155],
        [0.3884, 1.0023, 0.2193, 0.8977, 0.3155, 0.8600]])

In [176]:
inputs @ inputs.T

tensor([[0.4179, 0.4602, 0.1397, 0.5509, 0.2036, 0.3884],
        [0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023],
        [0.1397, 0.2611, 0.0630, 0.2580, 0.0887, 0.2193],
        [0.5509, 1.1189, 0.2580, 1.0992, 0.3343, 0.8977],
        [0.2036, 0.3408, 0.0887, 0.3343, 0.1445, 0.3155],
        [0.3884, 1.0023, 0.2193, 0.8977, 0.3155, 0.8600]])

In [175]:
normalized_attention_scores = torch.softmax(attention_scores, dim=-1)
normalized_attention_scores, inputs


(tensor([[0.1748, 0.1824, 0.1324, 0.1997, 0.1411, 0.1697],
         [0.1174, 0.2536, 0.0962, 0.2268, 0.1042, 0.2019],
         [0.1609, 0.1817, 0.1490, 0.1811, 0.1529, 0.1743],
         [0.1340, 0.2365, 0.1000, 0.2319, 0.1079, 0.1896],
         [0.1603, 0.1838, 0.1429, 0.1827, 0.1511, 0.1793],
         [0.1268, 0.2342, 0.1070, 0.2110, 0.1179, 0.2032]]),
 tensor([[0.2961, 0.5166, 0.2517],
         [0.6886, 0.0740, 0.8665],
         [0.1366, 0.1025, 0.1841],
         [0.7264, 0.3153, 0.6871],
         [0.0756, 0.1966, 0.3164],
         [0.4017, 0.1186, 0.8274]]))

In [177]:
normalized_attention_scores @ inputs

tensor([[0.4193, 0.2282, 0.5486],
        [0.4762, 0.2052, 0.6228],
        [0.4063, 0.2197, 0.5424],
        [0.4690, 0.2138, 0.6075],
        [0.4097, 0.2196, 0.5476],
        [0.4572, 0.2075, 0.6049]])

In [180]:
torch.manual_seed(1234)
input_tensor = torch.rand(6,3)

In [182]:
d_in = input_tensor.shape[1]
d_out = 2

In [183]:
w_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
w_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
w_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

In [184]:
w_key, w_query, w_value

(Parameter containing:
 tensor([[0.9665, 0.7399],
         [0.4517, 0.4757],
         [0.7842, 0.1525]]),
 Parameter containing:
 tensor([[0.2388, 0.7313],
         [0.6012, 0.3043],
         [0.2548, 0.6294]]),
 Parameter containing:
 tensor([[0.6662, 0.3343],
         [0.7893, 0.3216],
         [0.5247, 0.6688]]))

In [186]:
x_2 = input_tensor[2]
query_2 = x_2 @ w_query
key_2 = x_2 @ w_key
value_2 = x_2 @ w_value

query_2, key_2, value_2

(tensor([0.4655, 0.6044]), tensor([0.7899, 0.3638]), tensor([0.7576, 0.6186]))

In [188]:
keys = input_tensor @ w_key
queries = input_tensor @ w_query
values = input_tensor @ w_value

In [189]:
keys, queries, values

(tensor([[0.4133, 0.2523],
         [0.9301, 0.4059],
         [0.7899, 0.3638],
         [1.1159, 0.7040],
         [1.3387, 1.0064],
         [1.3020, 0.7395]]),
 tensor([[0.3147, 0.3070],
         [0.3011, 0.7268],
         [0.4655, 0.6044],
         [0.6938, 0.8353],
         [0.7496, 0.9922],
         [0.5006, 0.9982]]),
 tensor([[0.4729, 0.3127],
         [0.6579, 0.6099],
         [0.7576, 0.6186],
         [1.1349, 0.7395],
         [1.3106, 0.7098],
         [1.0227, 0.7584]]))

In [190]:
attn_scores = queries @ keys.T
attn_scores

tensor([[0.2075, 0.4174, 0.3603, 0.5674, 0.7304, 0.6369],
        [0.3078, 0.5750, 0.5022, 0.8476, 1.1345, 0.9295],
        [0.3449, 0.6783, 0.5876, 0.9449, 1.2314, 1.0530],
        [0.4975, 0.9843, 0.8519, 1.3622, 1.7694, 1.5210],
        [0.5601, 1.1000, 0.9531, 1.5350, 2.0021, 1.7098],
        [0.4587, 0.8708, 0.7586, 1.2613, 1.6748, 1.3900]])

In [194]:
d = keys.shape[-1]
attn_weights = torch.softmax(attn_scores / d**0.5, dim=-1)
attn_weights, values

(tensor([[0.1358, 0.1575, 0.1513, 0.1751, 0.1965, 0.1839],
         [0.1224, 0.1479, 0.1405, 0.1794, 0.2197, 0.1901],
         [0.1176, 0.1489, 0.1396, 0.1798, 0.2201, 0.1940],
         [0.0994, 0.1403, 0.1277, 0.1832, 0.2444, 0.2050],
         [0.0926, 0.1356, 0.1222, 0.1844, 0.2566, 0.2087],
         [0.1038, 0.1389, 0.1283, 0.1831, 0.2453, 0.2006]]),
 tensor([[0.4729, 0.3127],
         [0.6579, 0.6099],
         [0.7576, 0.6186],
         [1.1349, 0.7395],
         [1.3106, 0.7098],
         [1.0227, 0.7584]]))

In [193]:
attn_weights.sum(dim=-1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [196]:
context_vector = attn_weights @ values
context_vector

tensor([[0.9267, 0.6405],
        [0.9475, 0.6482],
        [0.9503, 0.6503],
        [0.9739, 0.6601],
        [0.9845, 0.6640],
        [0.9721, 0.6582]])

In [204]:
class SelfAttention(torch.nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()

        self.w_key = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_query = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_value = torch.nn.Linear(d_in, d_out, bias=False)

    def forward(self, x):
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )
        context_vec = attn_weights @ values
        return context_vec

In [205]:
sa_v1 = SelfAttention(d_in=3, d_out=2)
context = sa_v1(input_tensor)
context

tensor([[-0.5462,  0.0993],
        [-0.5461,  0.0994],
        [-0.5458,  0.0995],
        [-0.5475,  0.0994],
        [-0.5493,  0.0992],
        [-0.5476,  0.0993]], grad_fn=<MmBackward0>)

In [207]:
queries = sa_v1.w_query(input_tensor)
keys = sa_v1.w_key(input_tensor)
attn_scores = queries @ keys.T
att_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
att_weights


tensor([[0.1660, 0.1706, 0.1630, 0.1622, 0.1673, 0.1709],
        [0.1668, 0.1717, 0.1614, 0.1604, 0.1678, 0.1719],
        [0.1680, 0.1722, 0.1602, 0.1590, 0.1684, 0.1722],
        [0.1625, 0.1769, 0.1585, 0.1568, 0.1674, 0.1780],
        [0.1558, 0.1803, 0.1587, 0.1568, 0.1655, 0.1828],
        [0.1619, 0.1761, 0.1597, 0.1582, 0.1669, 0.1773]],
       grad_fn=<SoftmaxBackward0>)

In [208]:
context_length = attn_scores.shape[0]
mask = torch.tril(torch.ones(context_length, context_length))
mask

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])

In [209]:
masked_attn = att_weights * mask
masked_attn

tensor([[0.1660, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1668, 0.1717, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1680, 0.1722, 0.1602, 0.0000, 0.0000, 0.0000],
        [0.1625, 0.1769, 0.1585, 0.1568, 0.0000, 0.0000],
        [0.1558, 0.1803, 0.1587, 0.1568, 0.1655, 0.0000],
        [0.1619, 0.1761, 0.1597, 0.1582, 0.1669, 0.1773]],
       grad_fn=<MulBackward0>)

In [211]:
masked_attn = masked_attn / masked_attn.sum(dim=1, keepdim=True)
masked_attn

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6645, 0.3355, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5498, 0.2764, 0.1739, 0.0000, 0.0000, 0.0000],
        [0.4759, 0.2539, 0.1539, 0.1164, 0.0000, 0.0000],
        [0.4209, 0.2388, 0.1422, 0.1074, 0.0908, 0.0000],
        [0.4001, 0.2134, 0.1309, 0.0991, 0.0838, 0.0727]],
       grad_fn=<DivBackward0>)

In [212]:
inf_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
masked_attn = attn_scores.masked_fill(inf_mask.bool(), -torch.inf)
masked_attn

tensor([[-0.0526,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.0802, -0.0390,    -inf,    -inf,    -inf,    -inf],
        [-0.1045, -0.0693, -0.1718,    -inf,    -inf,    -inf],
        [-0.1053,  0.0141, -0.1413, -0.1564,    -inf,    -inf],
        [-0.0679,  0.1384, -0.0420, -0.0591,  0.0173,    -inf],
        [-0.0833,  0.0357, -0.1025, -0.1158, -0.0396,  0.0454]],
       grad_fn=<MaskedFillBackward0>)

In [214]:
masked_attn_weights = torch.softmax(masked_attn / keys.shape[-1]**0.5, dim=-1)
masked_attn_weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4927, 0.5073, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3357, 0.3442, 0.3201, 0.0000, 0.0000, 0.0000],
        [0.2483, 0.2702, 0.2421, 0.2395, 0.0000, 0.0000],
        [0.1907, 0.2207, 0.1942, 0.1919, 0.2025, 0.0000],
        [0.1619, 0.1761, 0.1597, 0.1582, 0.1669, 0.1773]],
       grad_fn=<SoftmaxBackward0>)

In [215]:
masked_context = masked_attn_weights @ sa_v1.w_value(input_tensor)
masked_context

tensor([[-0.2558,  0.0369],
        [-0.3602,  0.0155],
        [-0.3874,  0.0064],
        [-0.4527,  0.0375],
        [-0.5201,  0.0928],
        [-0.5476,  0.0993]], grad_fn=<MmBackward0>)

In [216]:
dropout = torch.nn.Dropout(0.5)
print(dropout(masked_attn_weights))

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.9854, 1.0146, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6714, 0.0000, 0.6402, 0.0000, 0.0000, 0.0000],
        [0.4966, 0.5403, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.4413, 0.0000, 0.3838, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3194, 0.0000, 0.3339, 0.3546]],
       grad_fn=<MulBackward0>)


In [221]:
batch = torch.stack((input_tensor, input_tensor), dim=0)
batch

tensor([[[0.0290, 0.4019, 0.2598],
         [0.3666, 0.0583, 0.7006],
         [0.0518, 0.4681, 0.6738],
         [0.3315, 0.7837, 0.5631],
         [0.7749, 0.8208, 0.2793],
         [0.6817, 0.2837, 0.6567]],

        [[0.0290, 0.4019, 0.2598],
         [0.3666, 0.0583, 0.7006],
         [0.0518, 0.4681, 0.6738],
         [0.3315, 0.7837, 0.5631],
         [0.7749, 0.8208, 0.2793],
         [0.6817, 0.2837, 0.6567]]])

In [227]:
class CausalAttention(torch.nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout):
        super().__init__()

        self.d_out = d_out
        self.w_key = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_query = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_value = torch.nn.Linear(d_in, d_out, bias=False)
        self.dropout = torch.nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)

        attn_scores = queries @ keys.transpose(1,2)
        attn_scores.masked_fill(self.mask.bool()[:num_tokens, : num_tokens], -torch.inf)
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        attn_weights = self.dropout(attn_weights)
        context_vec = attn_weights @ values

        return context_vec

In [228]:
context_length = batch.shape[1]
ca = CausalAttention(d_in=3, d_out=2, context_length=context_length, dropout=0.0)
print(ca(batch))

tensor([[[-0.1902,  0.2477],
         [-0.1907,  0.2481],
         [-0.1911,  0.2480],
         [-0.1926,  0.2484],
         [-0.1935,  0.2486],
         [-0.1921,  0.2484]],

        [[-0.1902,  0.2477],
         [-0.1907,  0.2481],
         [-0.1911,  0.2480],
         [-0.1926,  0.2484],
         [-0.1935,  0.2486],
         [-0.1921,  0.2484]]], grad_fn=<UnsafeViewBackward0>)


In [230]:
class MultiheadAttentionWrapperClass(torch.nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads):
        super().__init__()

        self.heads = torch.nn.ModuleList(
            [CausalAttention(d_in=d_in, d_out=d_out, context_length=context_length, dropout=dropout) for _ in range(num_heads)]
        )
    
    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)

In [233]:
d_in, d_out = 3, 2
context_length = batch.shape[1]
mha = MultiheadAttentionWrapperClass(d_in=d_in, d_out=d_out, context_length=context_length, dropout=0.0, num_heads=2)
context_vector = mha(batch)
context_vector

tensor([[[-0.2859, -0.4828, -0.1554,  0.0602],
         [-0.2838, -0.4901, -0.1543,  0.0604],
         [-0.2849, -0.4830, -0.1555,  0.0597],
         [-0.2853, -0.4833, -0.1548,  0.0608],
         [-0.2855, -0.4877, -0.1534,  0.0624],
         [-0.2838, -0.4917, -0.1535,  0.0614]],

        [[-0.2859, -0.4828, -0.1554,  0.0602],
         [-0.2838, -0.4901, -0.1543,  0.0604],
         [-0.2849, -0.4830, -0.1555,  0.0597],
         [-0.2853, -0.4833, -0.1548,  0.0608],
         [-0.2855, -0.4877, -0.1534,  0.0624],
         [-0.2838, -0.4917, -0.1535,  0.0614]]], grad_fn=<CatBackward0>)

In [242]:
class MultiheadAttention(torch.nn.Module):
    def __init__(self, d_in, d_out, context_length, num_heads, dropout):
        super().__init__()
        assert(d_out % num_heads == 0), "d_out must be divisible by num of heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.w_key = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_query = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_value = torch.nn.Linear(d_in, d_out, bias=False)
        self.dropout = torch.nn.Dropout(dropout)
        self.out_proj = torch.nn.Linear(d_out, d_out)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        batch, num_tokens, d_in = x.shape

        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)

        keys = keys.view(batch, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(batch, num_tokens, self.num_heads, self.head_dim)
        values = values.view(batch, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1,2)
        queries = queries.transpose(1,2)
        values = values.transpose(1,2)

        attn_scores = queries @ keys.transpose(2,3)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1,2)
        context_vec = context_vec.contiguous().view(batch, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec

In [243]:
inputs = torch.rand(3,6)

batch = torch.stack((inputs, inputs), dim=0)
b, context_length, d_in = batch.shape
d_out = 6

mha = MultiheadAttention(d_in=d_in, d_out=d_out, context_length=context_length, dropout=0.0, num_heads=2)
cv = mha(batch)
cv

tensor([[[-0.1424,  0.0194, -0.0849,  0.1745,  0.3831, -0.1559],
         [-0.1414,  0.0179, -0.0844,  0.1728,  0.3809, -0.1563],
         [-0.1418,  0.0198, -0.0855,  0.1735,  0.3826, -0.1560]],

        [[-0.1424,  0.0194, -0.0849,  0.1745,  0.3831, -0.1559],
         [-0.1414,  0.0179, -0.0844,  0.1728,  0.3809, -0.1563],
         [-0.1418,  0.0198, -0.0855,  0.1735,  0.3826, -0.1560]]],
       grad_fn=<ViewBackward0>)

In [244]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [245]:
sample = torch.randn(2,5)
layer = torch.nn.Sequential(torch.nn.Linear(5,6), torch.nn.ReLU())
out = layer(sample)
out

tensor([[2.7108e-03, 1.0574e+00, 0.0000e+00, 0.0000e+00, 3.1077e-01, 0.0000e+00],
        [4.8673e-01, 1.7322e-02, 6.9024e-04, 2.9743e-01, 5.6711e-01, 4.0454e-01]],
       grad_fn=<ReluBackward0>)

In [246]:
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)
mean, var

(tensor([[0.2285],
         [0.2956]], grad_fn=<MeanBackward1>),
 tensor([[0.1803],
         [0.0573]], grad_fn=<VarBackward0>))

In [247]:
out_norm = (out - mean) / torch.sqrt(var)
out_norm

tensor([[-0.5317,  1.9522, -0.5381, -0.5381,  0.1938, -0.5381],
        [ 0.7983, -1.1626, -1.2321,  0.0075,  1.1340,  0.4549]],
       grad_fn=<DivBackward0>)

In [250]:
out_norm.mean(dim=-1, keepdim=True)
out_norm.var(dim=-1, keepdim=True)

tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)

In [251]:
class LayerNorm(torch.nn.Module):
    def __init__(self, emb_dim):
        super().__init__()

        self.eps = 1e-5
        self.scale = torch.nn.Parameter(torch.ones(emb_dim))
        self.shift = torch.nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        x_mean = x.mean(dim=-1, keepdim=True)
        x_var = x.var(dim=-1, keepdim=True, unbiased=False)
        x_norm = (x - x_mean) / torch.sqrt(x_var + self.eps)
        return self.scale * x_norm + self.shift

In [252]:
class GeLU(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

In [253]:
class FeedForward(torch.nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GeLU(),
             torch.nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

In [258]:
class ExampleDeepNeuralNetwork(torch.nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = torch.nn.ModuleList([
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[0], layer_sizes[1]), GeLU()),
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[1], layer_sizes[2]), GeLU()),
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[2], layer_sizes[3]), GeLU()),
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[3], layer_sizes[4]), GeLU()),
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[4], layer_sizes[5]), GeLU()),
        ])

    def forward(self, x):
        for layer in self.layers:
            out = layer(x)

            if self.use_shortcut and x.shape == out.shape:
                x = out + x
            else:
                x = out

        return x

In [265]:
layer_sizes= [3,3,3,3,3,1]
sample = torch.tensor([[1., 0., -1.]])
without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, False)

In [266]:
def print_gradients(model,x):
    output = model(x)
    target = torch.tensor([[0.]])

    loss = torch.nn.MSELoss()
    loss = loss(output, target)

    loss.backward()

    for name, param in model.named_parameters():
        if 'weight' in name:
            print(f'{name} has gradient mean of {param.grad.abs().mean().item()}')

In [267]:
print_gradients(without_shortcut, sample)

layers.0.0.weight has gradient mean of 8.446240826742724e-06
layers.1.0.weight has gradient mean of 1.477338719269028e-05
layers.2.0.weight has gradient mean of 8.532095307600684e-06
layers.3.0.weight has gradient mean of 6.618356565013528e-05
layers.4.0.weight has gradient mean of 0.00249104923568666


In [268]:
with_shortcut = ExampleDeepNeuralNetwork(layer_sizes, True)
print_gradients(without_shortcut, sample)

layers.0.0.weight has gradient mean of 1.6892481653485447e-05
layers.1.0.weight has gradient mean of 2.954677438538056e-05
layers.2.0.weight has gradient mean of 1.706419061520137e-05
layers.3.0.weight has gradient mean of 0.00013236713130027056
layers.4.0.weight has gradient mean of 0.00498209847137332
