In [4]:
with open("the-verdict.txt", 'r') as f:
    raw_text = f.read()
print("Total number of charachters: ", len(raw_text))

print(raw_text[:99])

Total number of charachters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [5]:
import re

preprocessed = re.split(r'([,.:;?!_"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [6]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [7]:
vocab = {token:i for i, token in enumerate(all_words)}

In [8]:
for i, token in enumerate(vocab.items()):
    print(token)
    if i == 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [9]:
class Tokenizer:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?!_"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]

        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text= re.sub(r'\s+([,.:;?!_"()\'])', r'\1', text)
        return text

In [10]:
tokenizer = Tokenizer(vocab=vocab)
text = """It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]


In [11]:
tokenizer.decode(ids)


"It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride."

In [12]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|ukw|>"])

vocab = {token: i for i, token in enumerate(all_tokens)}
print(len(vocab.items()))

1132


In [13]:
for _, tokens in enumerate(list(vocab.items())[-5:]):
    print(tokens)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|ukw|>', 1131)


In [14]:
class Tokenizerv2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?!_"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int else "<|ukw|>" for item in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]

        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text= re.sub(r'\s+([,.:;?!_"()\'])', r'\1', text)
        return text

In [15]:
tokenizer = Tokenizerv2(vocab=vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [16]:
ids = tokenizer.encode(text)
ids

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [17]:
tokenizer.decode(ids)

'<|ukw|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|ukw|>.'

In [18]:
! pip install tiktoken




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [20]:
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the someunknownpalace."

ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
ids

[15496,
 11,
 466,
 345,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 286,
 262,
 617,
 34680,
 18596,
 558,
 13]

In [21]:
string = tokenizer.decode(ids)
string

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the someunknownpalace.'

In [22]:
enc_text = tokenizer.encode(raw_text)
len(enc_text)
enc_sample = enc_text[:50]

In [23]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

x, y

([40, 367, 2885, 1464], [367, 2885, 1464, 1807])

In [24]:
for i in range(1, context_size + 1):
    x = enc_sample[:i]
    y = enc_sample[i]

    print(tokenizer.decode(x), "----->", tokenizer.decode([y]))


I ----->  H
I H -----> AD
I HAD ----->  always
I HAD always ----->  thought


In [25]:
! pip install torch




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
import torch
from torch.utils.data import  DataLoader, Dataset

class GPTDataset(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i: i+max_length]
            target_chunk = token_ids[i+1: i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [27]:
def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataset(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [28]:
dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)
print(next(iter(dataloader)))

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


In [29]:
input_ids = torch.tensor([2,3,5,1])

vocab_size = 50257
dimension = 256

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, dimension)

In [30]:
max_length = 4
dataloader = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)
inputs, target = next(data_iter)

print("token \n", inputs)
print("target \n", target)

token 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
target 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [31]:
token_embeddings = embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

In [32]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_size, dimension)

pos_embeddings = pos_embedding_layer(torch.arange(max_length))
pos_embeddings.shape

torch.Size([4, 256])

In [33]:
input_embeddings = token_embeddings + pos_embeddings
input_embeddings.shape

torch.Size([8, 4, 256])

In [34]:
torch.manual_seed(123)
inputs = torch.rand(6,3)
inputs

tensor([[0.2961, 0.5166, 0.2517],
        [0.6886, 0.0740, 0.8665],
        [0.1366, 0.1025, 0.1841],
        [0.7264, 0.3153, 0.6871],
        [0.0756, 0.1966, 0.3164],
        [0.4017, 0.1186, 0.8274]])

In [35]:
query = inputs[1]
attention_score_2 = torch.empty(inputs.shape[0])

for i, x_i in enumerate(inputs):
    attention_score_2[i] =  torch.dot(x_i, query)

attention_score_2

tensor([0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023])

In [36]:
normalized_attention_score = attention_score_2 / attention_score_2.sum()
normalized_attention_score

tensor([0.1043, 0.2788, 0.0592, 0.2535, 0.0772, 0.2271])

In [37]:
def naive_softmax(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)

normalized_attention_score = naive_softmax(attention_score_2)
normalized_attention_score

tensor([0.1174, 0.2536, 0.0962, 0.2268, 0.1042, 0.2019])

In [38]:
normalized_attention_score = torch.softmax(attention_score_2, dim=0)
normalized_attention_score

tensor([0.1174, 0.2536, 0.0962, 0.2268, 0.1042, 0.2019])

In [39]:
context_vector_2 = torch.zeros(query.shape)
for i, inp in enumerate(inputs):
    context_vector_2 += normalized_attention_score[i] * inp
context_vector_2

tensor([0.4762, 0.2052, 0.6228])

In [40]:
attention_scores = torch.empty(6,6)

for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attention_scores[i, j] = torch.dot(x_i, x_j)

attention_scores

tensor([[0.4179, 0.4602, 0.1397, 0.5509, 0.2036, 0.3884],
        [0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023],
        [0.1397, 0.2611, 0.0630, 0.2580, 0.0887, 0.2193],
        [0.5509, 1.1189, 0.2580, 1.0992, 0.3343, 0.8977],
        [0.2036, 0.3408, 0.0887, 0.3343, 0.1445, 0.3155],
        [0.3884, 1.0023, 0.2193, 0.8977, 0.3155, 0.8600]])

In [41]:
inputs @ inputs.T

tensor([[0.4179, 0.4602, 0.1397, 0.5509, 0.2036, 0.3884],
        [0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023],
        [0.1397, 0.2611, 0.0630, 0.2580, 0.0887, 0.2193],
        [0.5509, 1.1189, 0.2580, 1.0992, 0.3343, 0.8977],
        [0.2036, 0.3408, 0.0887, 0.3343, 0.1445, 0.3155],
        [0.3884, 1.0023, 0.2193, 0.8977, 0.3155, 0.8600]])

In [42]:
normalized_attention_scores = torch.softmax(attention_scores, dim=-1)
normalized_attention_scores, inputs


(tensor([[0.1748, 0.1824, 0.1324, 0.1997, 0.1411, 0.1697],
         [0.1174, 0.2536, 0.0962, 0.2268, 0.1042, 0.2019],
         [0.1609, 0.1817, 0.1490, 0.1811, 0.1529, 0.1743],
         [0.1340, 0.2365, 0.1000, 0.2319, 0.1079, 0.1896],
         [0.1603, 0.1838, 0.1429, 0.1827, 0.1511, 0.1793],
         [0.1268, 0.2342, 0.1070, 0.2110, 0.1179, 0.2032]]),
 tensor([[0.2961, 0.5166, 0.2517],
         [0.6886, 0.0740, 0.8665],
         [0.1366, 0.1025, 0.1841],
         [0.7264, 0.3153, 0.6871],
         [0.0756, 0.1966, 0.3164],
         [0.4017, 0.1186, 0.8274]]))

In [43]:
normalized_attention_scores @ inputs

tensor([[0.4193, 0.2282, 0.5486],
        [0.4762, 0.2052, 0.6228],
        [0.4063, 0.2197, 0.5424],
        [0.4690, 0.2138, 0.6075],
        [0.4097, 0.2196, 0.5476],
        [0.4572, 0.2075, 0.6049]])

In [44]:
torch.manual_seed(1234)
input_tensor = torch.rand(6,3)

In [45]:
d_in = input_tensor.shape[1]
d_out = 2

In [46]:
w_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
w_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
w_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

In [47]:
w_key, w_query, w_value

(Parameter containing:
 tensor([[0.9665, 0.7399],
         [0.4517, 0.4757],
         [0.7842, 0.1525]]),
 Parameter containing:
 tensor([[0.2388, 0.7313],
         [0.6012, 0.3043],
         [0.2548, 0.6294]]),
 Parameter containing:
 tensor([[0.6662, 0.3343],
         [0.7893, 0.3216],
         [0.5247, 0.6688]]))

In [48]:
x_2 = input_tensor[2]
query_2 = x_2 @ w_query
key_2 = x_2 @ w_key
value_2 = x_2 @ w_value

query_2, key_2, value_2

(tensor([0.4655, 0.6044]), tensor([0.7899, 0.3638]), tensor([0.7576, 0.6186]))

In [49]:
keys = input_tensor @ w_key
queries = input_tensor @ w_query
values = input_tensor @ w_value

In [50]:
keys, queries, values

(tensor([[0.4133, 0.2523],
         [0.9301, 0.4059],
         [0.7899, 0.3638],
         [1.1159, 0.7040],
         [1.3387, 1.0064],
         [1.3020, 0.7395]]),
 tensor([[0.3147, 0.3070],
         [0.3011, 0.7268],
         [0.4655, 0.6044],
         [0.6938, 0.8353],
         [0.7496, 0.9922],
         [0.5006, 0.9982]]),
 tensor([[0.4729, 0.3127],
         [0.6579, 0.6099],
         [0.7576, 0.6186],
         [1.1349, 0.7395],
         [1.3106, 0.7098],
         [1.0227, 0.7584]]))

In [51]:
attn_scores = queries @ keys.T
attn_scores

tensor([[0.2075, 0.4174, 0.3603, 0.5674, 0.7304, 0.6369],
        [0.3078, 0.5750, 0.5022, 0.8476, 1.1345, 0.9295],
        [0.3449, 0.6783, 0.5876, 0.9449, 1.2314, 1.0530],
        [0.4975, 0.9843, 0.8519, 1.3622, 1.7694, 1.5210],
        [0.5601, 1.1000, 0.9531, 1.5350, 2.0021, 1.7098],
        [0.4587, 0.8708, 0.7586, 1.2613, 1.6748, 1.3900]])

In [52]:
d = keys.shape[-1]
attn_weights = torch.softmax(attn_scores / d**0.5, dim=-1)
attn_weights, values

(tensor([[0.1358, 0.1575, 0.1513, 0.1751, 0.1965, 0.1839],
         [0.1224, 0.1479, 0.1405, 0.1794, 0.2197, 0.1901],
         [0.1176, 0.1489, 0.1396, 0.1798, 0.2201, 0.1940],
         [0.0994, 0.1403, 0.1277, 0.1832, 0.2444, 0.2050],
         [0.0926, 0.1356, 0.1222, 0.1844, 0.2566, 0.2087],
         [0.1038, 0.1389, 0.1283, 0.1831, 0.2453, 0.2006]]),
 tensor([[0.4729, 0.3127],
         [0.6579, 0.6099],
         [0.7576, 0.6186],
         [1.1349, 0.7395],
         [1.3106, 0.7098],
         [1.0227, 0.7584]]))

In [53]:
attn_weights.sum(dim=-1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [54]:
context_vector = attn_weights @ values
context_vector

tensor([[0.9267, 0.6405],
        [0.9475, 0.6482],
        [0.9503, 0.6503],
        [0.9739, 0.6601],
        [0.9845, 0.6640],
        [0.9721, 0.6582]])

In [55]:
class SelfAttention(torch.nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()

        self.w_key = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_query = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_value = torch.nn.Linear(d_in, d_out, bias=False)

    def forward(self, x):
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )
        context_vec = attn_weights @ values
        return context_vec

In [56]:
sa_v1 = SelfAttention(d_in=3, d_out=2)
context = sa_v1(input_tensor)
context

tensor([[0.4872, 0.4342],
        [0.4875, 0.4343],
        [0.4896, 0.4365],
        [0.4890, 0.4361],
        [0.4862, 0.4333],
        [0.4870, 0.4338]], grad_fn=<MmBackward0>)

In [57]:
queries = sa_v1.w_query(input_tensor)
keys = sa_v1.w_key(input_tensor)
attn_scores = queries @ keys.T
att_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
att_weights


tensor([[0.1604, 0.1678, 0.1653, 0.1673, 0.1684, 0.1708],
        [0.1607, 0.1664, 0.1652, 0.1681, 0.1696, 0.1699],
        [0.1553, 0.1679, 0.1640, 0.1683, 0.1706, 0.1739],
        [0.1559, 0.1686, 0.1643, 0.1677, 0.1696, 0.1739],
        [0.1621, 0.1685, 0.1658, 0.1665, 0.1670, 0.1701],
        [0.1617, 0.1668, 0.1655, 0.1677, 0.1687, 0.1696]],
       grad_fn=<SoftmaxBackward0>)

In [58]:
context_length = attn_scores.shape[0]
mask = torch.tril(torch.ones(context_length, context_length))
mask

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])

In [59]:
masked_attn = att_weights * mask
masked_attn

tensor([[0.1604, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1607, 0.1664, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1553, 0.1679, 0.1640, 0.0000, 0.0000, 0.0000],
        [0.1559, 0.1686, 0.1643, 0.1677, 0.0000, 0.0000],
        [0.1621, 0.1685, 0.1658, 0.1665, 0.1670, 0.0000],
        [0.1617, 0.1668, 0.1655, 0.1677, 0.1687, 0.1696]],
       grad_fn=<MulBackward0>)

In [60]:
masked_attn = masked_attn / masked_attn.sum(dim=1, keepdim=True)
masked_attn

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4913, 0.5087, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3187, 0.3447, 0.3366, 0.0000, 0.0000, 0.0000],
        [0.2375, 0.2569, 0.2502, 0.2554, 0.0000, 0.0000],
        [0.1954, 0.2030, 0.1998, 0.2006, 0.2012, 0.0000],
        [0.1617, 0.1668, 0.1655, 0.1677, 0.1687, 0.1696]],
       grad_fn=<DivBackward0>)

In [61]:
inf_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
masked_attn = attn_scores.masked_fill(inf_mask.bool(), -torch.inf)
masked_attn

tensor([[0.0338,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.0396, 0.0887,   -inf,   -inf,   -inf,   -inf],
        [0.0668, 0.1775, 0.1442,   -inf,   -inf,   -inf],
        [0.0581, 0.1691, 0.1318, 0.1612,   -inf,   -inf],
        [0.0185, 0.0725, 0.0500, 0.0561, 0.0603,   -inf],
        [0.0310, 0.0754, 0.0639, 0.0822, 0.0915, 0.0985]],
       grad_fn=<MaskedFillBackward0>)

In [62]:
masked_attn_weights = torch.softmax(masked_attn / keys.shape[-1]**0.5, dim=-1)
masked_attn_weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4913, 0.5087, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3187, 0.3447, 0.3366, 0.0000, 0.0000, 0.0000],
        [0.2375, 0.2569, 0.2502, 0.2554, 0.0000, 0.0000],
        [0.1954, 0.2030, 0.1998, 0.2006, 0.2012, 0.0000],
        [0.1617, 0.1668, 0.1655, 0.1677, 0.1687, 0.1696]],
       grad_fn=<SoftmaxBackward0>)

In [63]:
masked_context = masked_attn_weights @ sa_v1.w_value(input_tensor)
masked_context

tensor([[0.2318, 0.1784],
        [0.3111, 0.2893],
        [0.3376, 0.3046],
        [0.4021, 0.3524],
        [0.4639, 0.4041],
        [0.4870, 0.4338]], grad_fn=<MmBackward0>)

In [64]:
dropout = torch.nn.Dropout(0.5)
print(dropout(masked_attn_weights))

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.9826, 1.0174, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6374, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.5109, 0.0000, 0.0000],
        [0.3908, 0.4059, 0.3996, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3337, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


In [65]:
batch = torch.stack((input_tensor, input_tensor), dim=0)
batch

tensor([[[0.0290, 0.4019, 0.2598],
         [0.3666, 0.0583, 0.7006],
         [0.0518, 0.4681, 0.6738],
         [0.3315, 0.7837, 0.5631],
         [0.7749, 0.8208, 0.2793],
         [0.6817, 0.2837, 0.6567]],

        [[0.0290, 0.4019, 0.2598],
         [0.3666, 0.0583, 0.7006],
         [0.0518, 0.4681, 0.6738],
         [0.3315, 0.7837, 0.5631],
         [0.7749, 0.8208, 0.2793],
         [0.6817, 0.2837, 0.6567]]])

In [66]:
class CausalAttention(torch.nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout):
        super().__init__()

        self.d_out = d_out
        self.w_key = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_query = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_value = torch.nn.Linear(d_in, d_out, bias=False)
        self.dropout = torch.nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)

        attn_scores = queries @ keys.transpose(1,2)
        attn_scores.masked_fill(self.mask.bool()[:num_tokens, : num_tokens], -torch.inf)
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        attn_weights = self.dropout(attn_weights)
        context_vec = attn_weights @ values

        return context_vec

In [67]:
context_length = batch.shape[1]
ca = CausalAttention(d_in=3, d_out=2, context_length=context_length, dropout=0.0)
print(ca(batch))

tensor([[[-0.2400, -0.4165],
         [-0.2407, -0.4169],
         [-0.2408, -0.4170],
         [-0.2397, -0.4164],
         [-0.2382, -0.4155],
         [-0.2398, -0.4164]],

        [[-0.2400, -0.4165],
         [-0.2407, -0.4169],
         [-0.2408, -0.4170],
         [-0.2397, -0.4164],
         [-0.2382, -0.4155],
         [-0.2398, -0.4164]]], grad_fn=<UnsafeViewBackward0>)


In [68]:
class MultiheadAttentionWrapperClass(torch.nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads):
        super().__init__()

        self.heads = torch.nn.ModuleList(
            [CausalAttention(d_in=d_in, d_out=d_out, context_length=context_length, dropout=dropout) for _ in range(num_heads)]
        )
    
    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)

In [69]:
d_in, d_out = 3, 2
context_length = batch.shape[1]
mha = MultiheadAttentionWrapperClass(d_in=d_in, d_out=d_out, context_length=context_length, dropout=0.0, num_heads=2)
context_vector = mha(batch)
context_vector

tensor([[[ 0.0684, -0.0067,  0.4405, -0.2800],
         [ 0.0754, -0.0084,  0.4409, -0.2803],
         [ 0.0746, -0.0081,  0.4402, -0.2799],
         [ 0.0689, -0.0073,  0.4382, -0.2785],
         [ 0.0612, -0.0061,  0.4367, -0.2775],
         [ 0.0711, -0.0079,  0.4391, -0.2791]],

        [[ 0.0684, -0.0067,  0.4405, -0.2800],
         [ 0.0754, -0.0084,  0.4409, -0.2803],
         [ 0.0746, -0.0081,  0.4402, -0.2799],
         [ 0.0689, -0.0073,  0.4382, -0.2785],
         [ 0.0612, -0.0061,  0.4367, -0.2775],
         [ 0.0711, -0.0079,  0.4391, -0.2791]]], grad_fn=<CatBackward0>)

In [70]:
class MultiheadAttention(torch.nn.Module):
    def __init__(self, d_in, d_out, context_length, num_heads, dropout):
        super().__init__()
        assert(d_out % num_heads == 0), "d_out must be divisible by num of heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.w_key = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_query = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_value = torch.nn.Linear(d_in, d_out, bias=False)
        self.dropout = torch.nn.Dropout(dropout)
        self.out_proj = torch.nn.Linear(d_out, d_out)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        batch, num_tokens, d_in = x.shape

        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)

        keys = keys.view(batch, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(batch, num_tokens, self.num_heads, self.head_dim)
        values = values.view(batch, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1,2)
        queries = queries.transpose(1,2)
        values = values.transpose(1,2)

        attn_scores = queries @ keys.transpose(2,3)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1,2)
        context_vec = context_vec.contiguous().view(batch, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec

In [71]:
inputs = torch.rand(3,6)

batch = torch.stack((inputs, inputs), dim=0)
b, context_length, d_in = batch.shape
d_out = 6

mha = MultiheadAttention(d_in=d_in, d_out=d_out, context_length=context_length, dropout=0.0, num_heads=2)
cv = mha(batch)
cv

tensor([[[ 0.2667,  0.1303,  0.4004,  0.4933,  0.3055, -0.1367],
         [ 0.2657,  0.1312,  0.4026,  0.4936,  0.3061, -0.1369],
         [ 0.2664,  0.1309,  0.4010,  0.4936,  0.3057, -0.1365]],

        [[ 0.2667,  0.1303,  0.4004,  0.4933,  0.3055, -0.1367],
         [ 0.2657,  0.1312,  0.4026,  0.4936,  0.3061, -0.1369],
         [ 0.2664,  0.1309,  0.4010,  0.4936,  0.3057, -0.1365]]],
       grad_fn=<ViewBackward0>)

In [72]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [73]:
sample = torch.randn(2,5)
layer = torch.nn.Sequential(torch.nn.Linear(5,6), torch.nn.ReLU())
out = layer(sample)
out

tensor([[0.0000, 0.0000, 0.8510, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.9412, 0.0000, 0.0000, 0.0000]],
       grad_fn=<ReluBackward0>)

In [74]:
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)
mean, var

(tensor([[0.1418],
         [0.1569]], grad_fn=<MeanBackward1>),
 tensor([[0.1207],
         [0.1476]], grad_fn=<VarBackward0>))

In [75]:
out_norm = (out - mean) / torch.sqrt(var)
out_norm

tensor([[-0.4082, -0.4082,  2.0412, -0.4082, -0.4082, -0.4082],
        [-0.4082, -0.4082,  2.0412, -0.4082, -0.4082, -0.4082]],
       grad_fn=<DivBackward0>)

In [76]:
out_norm.mean(dim=-1, keepdim=True)
out_norm.var(dim=-1, keepdim=True)

tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)

In [77]:
class LayerNorm(torch.nn.Module):
    def __init__(self, emb_dim):
        super().__init__()

        self.eps = 1e-5
        self.scale = torch.nn.Parameter(torch.ones(emb_dim))
        self.shift = torch.nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        x_mean = x.mean(dim=-1, keepdim=True)
        x_var = x.var(dim=-1, keepdim=True, unbiased=False)
        x_norm = (x - x_mean) / torch.sqrt(x_var + self.eps)
        return self.scale * x_norm + self.shift

In [78]:
class GeLU(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

In [79]:
class FeedForward(torch.nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GeLU(),
             torch.nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

In [80]:
class ExampleDeepNeuralNetwork(torch.nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = torch.nn.ModuleList([
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[0], layer_sizes[1]), GeLU()),
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[1], layer_sizes[2]), GeLU()),
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[2], layer_sizes[3]), GeLU()),
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[3], layer_sizes[4]), GeLU()),
            torch.nn.Sequential(torch.nn.Linear(layer_sizes[4], layer_sizes[5]), GeLU()),
        ])

    def forward(self, x):
        for layer in self.layers:
            out = layer(x)

            if self.use_shortcut and x.shape == out.shape:
                x = out + x
            else:
                x = out

        return x

In [81]:
layer_sizes= [3,3,3,3,3,1]
sample = torch.tensor([[1., 0., -1.]])
without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, False)

In [82]:
def print_gradients(model,x):
    output = model(x)
    target = torch.tensor([[0.]])

    loss = torch.nn.MSELoss()
    loss = loss(output, target)

    loss.backward()

    for name, param in model.named_parameters():
        if 'weight' in name:
            print(f'{name} has gradient mean of {param.grad.abs().mean().item()}')

In [83]:
print_gradients(without_shortcut, sample)

layers.0.0.weight has gradient mean of 0.0006531918770633638
layers.1.0.weight has gradient mean of 0.001302986522205174
layers.2.0.weight has gradient mean of 0.001411551027558744
layers.3.0.weight has gradient mean of 0.0036776314955204725
layers.4.0.weight has gradient mean of 0.08421478420495987


In [84]:
with_shortcut = ExampleDeepNeuralNetwork(layer_sizes, True)
print_gradients(without_shortcut, sample)

layers.0.0.weight has gradient mean of 0.0013063837541267276
layers.1.0.weight has gradient mean of 0.002605973044410348
layers.2.0.weight has gradient mean of 0.002823102055117488
layers.3.0.weight has gradient mean of 0.007355262991040945
layers.4.0.weight has gradient mean of 0.16842956840991974


In [85]:
class Transformer(torch.nn.Module):
    def __init__(self, GPT_CONFIG_124M):
        super().__init__()

        self.att = MultiheadAttention(d_in=GPT_CONFIG_124M["emb_dim"], d_out=GPT_CONFIG_124M["emb_dim"],
         num_heads=GPT_CONFIG_124M["n_heads"], context_length=GPT_CONFIG_124M["context_length"], dropout=GPT_CONFIG_124M["drop_rate"])
        self.ff = FeedForward(GPT_CONFIG_124M)
        self.norm1 = LayerNorm(GPT_CONFIG_124M["emb_dim"])
        self.norm2 = LayerNorm(GPT_CONFIG_124M["emb_dim"])
        self.dropout = torch.nn.Dropout(GPT_CONFIG_124M["drop_rate"])
        
    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.dropout(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x + shortcut

        return x

In [86]:
x = torch.randn(2, 4, 768)
block = Transformer(GPT_CONFIG_124M)
out = block(x)
print(x,out)
print(out.shape)

tensor([[[-0.5777, -1.5113,  1.4643,  ..., -0.8158, -0.1034, -0.4076],
         [ 1.5022,  1.9631, -3.2782,  ...,  0.0283,  1.8780,  0.5751],
         [ 0.9149, -0.5529, -0.4804,  ...,  2.2163, -0.8563, -1.5702],
         [-1.2342, -1.1643,  1.1945,  ...,  0.8231, -1.0217,  1.5443]],

        [[-0.2108,  0.0271, -0.3237,  ...,  1.2920, -0.1178,  0.4203],
         [ 1.0756,  0.4055,  0.0934,  ..., -0.2938, -1.6447, -0.4352],
         [-0.2659,  0.2935, -0.8289,  ..., -0.7317, -1.4294,  2.2349],
         [ 2.4047,  0.6540,  0.2593,  ..., -0.3882, -0.6954, -0.1103]]]) tensor([[[-6.6465e-01, -1.5389e+00,  1.6074e+00,  ..., -6.0298e-01,
           7.2252e-02, -5.8015e-01],
         [ 1.4821e+00,  1.2335e+00, -3.5325e+00,  ...,  2.6338e-02,
           2.0502e+00,  3.1126e-01],
         [ 1.0298e+00, -5.7946e-01, -5.0622e-01,  ...,  2.4189e+00,
          -5.9141e-01, -1.4560e+00],
         [-1.4955e+00, -1.5586e+00,  1.2937e+00,  ...,  5.9106e-01,
          -8.0829e-01,  1.6423e+00]],

      

In [87]:
class GPTModel(torch.nn.Module):
    def __init__(self, GPT_CONFIG_124M):
        super().__init__()
        
        self.token_emb = torch.nn.Embedding(GPT_CONFIG_124M["vocab_size"], GPT_CONFIG_124M["emb_dim"])
        self.pos_emb = torch.nn.Embedding(GPT_CONFIG_124M["context_length"], GPT_CONFIG_124M["emb_dim"])
        self.dropout = torch.nn.Dropout(GPT_CONFIG_124M["drop_rate"])

        self.trf_blocks = torch.nn.Sequential(
            *[Transformer(GPT_CONFIG_124M) for _ in range(GPT_CONFIG_124M["n_layers"])]
        )
        self.norm = LayerNorm(GPT_CONFIG_124M["emb_dim"])
        self.out_head = torch.nn.Linear(GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["vocab_size"])

    def forward(self, x):
        batch, seq_length = x.shape
        token_Emb = self.token_emb(x)
        pos_emb = self.pos_emb(torch.arange(seq_length, device=x.device))
        x = token_Emb + pos_emb
        x = self.dropout(x)
        x = self.trf_blocks(x)
        x = self.norm(x)
        logits = self.out_head(x)

        return logits

In [88]:
input = torch.tensor(tokenizer.encode("every effort moves you"))
batch = torch.stack((input, input), dim=0)
batch

tensor([[16833,  3626,  6100,   345],
        [16833,  3626,  6100,   345]])

In [89]:
gpt = GPTModel(GPT_CONFIG_124M)
out = gpt(batch)
out, out.shape

(tensor([[[-0.0287,  0.6093, -0.2482,  ..., -0.0191,  0.0115, -0.2316],
          [-0.0170, -0.0218,  1.2233,  ...,  0.2564,  0.5128,  0.0599],
          [ 0.3565, -0.7963,  0.4914,  ..., -1.1413, -0.2096, -1.2165],
          [ 0.0929,  0.0242,  0.0642,  ...,  0.0167,  0.0630, -0.2574]],
 
         [[ 0.2025,  0.4652, -0.4608,  ..., -0.0682,  0.2084,  0.1139],
          [-0.0267, -0.1953,  0.9310,  ...,  0.1618,  0.5742, -0.3711],
          [ 0.3879, -0.1945,  0.3878,  ..., -0.9991, -0.2374, -0.6698],
          [ 0.0422,  0.8354,  0.0853,  ...,  0.1681,  0.3575, -0.3891]]],
        grad_fn=<ViewBackward0>),
 torch.Size([2, 4, 50257]))

In [90]:
total_param = sum(p.numel()  for p in gpt.parameters())
total_param

163059793

In [91]:
def generate_text(model, inputs, max_new_tokens, context_size):

    for _ in range(max_new_tokens):

        idx_contd = inputs[:, -context_size:]

        with torch.no_grad():
            logits = model(idx_contd)

        logits = logits[:, -1, :]

        probabilites = torch.softmax(logits, dim= -1)
        idx_next = torch.argmax(probabilites, dim = -1, keepdim=True)
        

        inputs = torch.cat([inputs, idx_next], dim=1)

    return inputs

In [92]:
sample_text = "Hello, I am"
encoded = tokenizer.encode(sample_text)
encoded
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
encoded_tensor, encoded_tensor.shape

(tensor([[15496,    11,   314,   716]]), torch.Size([1, 4]))

In [93]:
gpt.eval()
out = generate_text(gpt, inputs=encoded_tensor, max_new_tokens=6, context_size=GPT_CONFIG_124M['context_length'])
print(out)

tensor([[15496,    11,   314,   716,  6889, 30842, 26889, 33328, 19545, 33379]])


In [94]:
decoded_text = tokenizer.decode(out[0].squeeze(0).tolist())
decoded_text

'Hello, I am Make psychologists consulted Punjabnext standoff'

In [95]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [96]:
model = GPTModel(GPT_CONFIG_124M)
model.eval()

GPTModel(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): Transformer(
      (att): MultiheadAttention(
        (w_key): Linear(in_features=768, out_features=768, bias=False)
        (w_query): Linear(in_features=768, out_features=768, bias=False)
        (w_value): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GeLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): Transformer(
      (att): MultiheadAttention(
        (w_key): Linear(in_features=768, out_feature

In [97]:
def text_to_token(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    return torch.tensor(encoded).unsqueeze(0)

def token_id_to_text(token_ids, tokenizer):
    return tokenizer.decode(token_ids.squeeze(0).tolist())

start = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text(model, text_to_token(start, tokenizer), 10, GPT_CONFIG_124M["context_length"])

print(token_id_to_text(token_ids, tokenizer))




Every effort moves you Pruitt 153 earners Edmontonatically contrasts 217 trustednesses nonprofit


In [98]:
with open("the-verdict.txt", 'r', encoding="utf-8") as f:
    raw_text = f.read()
raw_text[:100]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

In [99]:
total_charachters = len(raw_text)
total_tokens = len(tokenizer.encode(raw_text))
total_charachters, total_tokens

(20479, 5145)

In [100]:
train_ratio = 0.9
split_idx = int(train_ratio * len(raw_text))
train_data = raw_text[:split_idx]
val_data = raw_text[split_idx:]

train_loader = create_dataloader(
    train_data,
    batch_size = 2,
    max_length = GPT_CONFIG_124M['context_length'],
    stride = GPT_CONFIG_124M["context_length"],
    drop_last = True,
    shuffle = True,
    num_workers = 0
)

val_loader = create_dataloader(
    val_data,
    batch_size = 2,
    max_length = GPT_CONFIG_124M['context_length'],
    stride = GPT_CONFIG_124M["context_length"],
    drop_last = False,
    shuffle = False,
    num_workers = 0
)


In [101]:
if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    raise ValueError("No enough tokens for training. Try to lower the context length or increase the train ratio")
if total_tokens * (1 - train_ratio) < GPT_CONFIG_124M["context_length"]:
    raise ValueError("Not enough tokens for validation")

In [102]:
for x, y in train_loader:
    print(x.shape, y.shape)
print("validation loader \n")
for x,y in val_loader:
    print(x.shape, y.shape)


torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
validation loader 

torch.Size([2, 256]) torch.Size([2, 256])


In [103]:
model = GPTModel(GPT_CONFIG_124M=GPT_CONFIG_124M)
model.eval()

GPTModel(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): Transformer(
      (att): MultiheadAttention(
        (w_key): Linear(in_features=768, out_features=768, bias=False)
        (w_query): Linear(in_features=768, out_features=768, bias=False)
        (w_value): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GeLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): Transformer(
      (att): MultiheadAttention(
        (w_key): Linear(in_features=768, out_feature

In [104]:
def cal_loss_batch(input_batch, target_batch, device, model):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())

    return loss

def cal_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0
    if num_batches is None:
        num_batches = len(data_loader)
    elif len(data_loader) == 0:
        raise ValueError("nan")
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:

            loss = cal_loss_batch(input_batch, target_batch, device, model)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


In [105]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad():
    train_loss = cal_loss_loader(train_loader, model, device)
    val_loss = cal_loss_loader(val_loader, model, device)

train_loss, val_loss

(10.998757044474283, 10.970115661621094)

In [106]:
def train_model(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = cal_loss_batch(input_batch, target_batch, device, model)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_losses)
                val_losses.append(val_losses)
                track_tokens_seen.append(tokens_seen)
                print(f"{epoch+1} epoch, step {global_step:06d}:" f"train loss {train_loss:.3f}, val loss {val_loss:.3f}")

        generate_sample(model, tokenizer, device, start_context)
    return train_losses, val_losses, track_tokens_seen

In [107]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = cal_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = cal_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

In [120]:
def generate_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text(model, encoded, 50, context_size=context_size)
    decoded_text = token_id_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))
    model.train()

In [123]:
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 10

train_losses, val_losses, token_seen = train_model(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq=5, eval_iter=5, start_context="Every effort moves you", tokenizer=tokenizer)

1 epoch, step 000000:train loss 9.920, val loss 10.074
1 epoch, step 000005:train loss 8.205, val loss 8.424
Every effort moves you the the the the the the the the the the the the the the the the the the, the, the the, the the the, the the the the the the the the the the, the the the the the the, the the the the
2 epoch, step 000010:train loss 6.734, val loss 7.118
2 epoch, step 000015:train loss 6.126, val loss 6.621
Every effort moves you.                                                 
3 epoch, step 000020:train loss 5.589, val loss 6.460
3 epoch, step 000025:train loss 5.779, val loss 6.529
Every effort moves you.                                                 
4 epoch, step 000030:train loss 5.389, val loss 6.497
4 epoch, step 000035:train loss 5.222, val loss 6.430
Every effort moves you.                                                 
5 epoch, step 000040:train loss 4.761, val loss 6.371
Every effort moves you the last, and I had of the last, I had the of the of the of the of

In [124]:
model.eval()
token_ids = generate_text(model, text_to_token("Every effort move you", tokenizer), max_new_tokens=25, context_size=GPT_CONFIG_124M["context_length"])
print(token_id_to_text(token_ids, tokenizer))

Every effort move you in the inevitable garlanded to see it--I told me--I turned back to my work, and went on grop
