In [1]:
import torch

In [35]:
x = torch.randn(5, 3)
print(x.shape)
print(x)

torch.Size([5, 3])
tensor([[-1.8969, -0.5555,  0.4984],
        [-0.7669,  1.3222,  0.1500],
        [ 0.0465,  1.4264, -1.4913],
        [ 1.5451, -1.1425, -1.7921],
        [-0.3205, -1.4035,  0.1314]])


In [36]:
attention_scores = [] 
for i in range(x.shape[0]):  
    query = x[i]
    scores = torch.empty(x.shape[0])
    for j, j_x in enumerate(x):
        scores[j] = torch.dot(query, j_x)
    attention_scores.append(scores)
attention_scores = torch.stack(attention_scores)
attention_scores = attention_scores.softmax(dim=1)
print(attention_scores.shape)
print(attention_scores)

torch.Size([5, 5])
tensor([[9.0453e-01, 3.1417e-02, 2.7969e-03, 5.8445e-04, 6.0671e-02],
        [1.2210e-01, 5.8334e-01, 2.8047e-01, 2.8441e-03, 1.1242e-02],
        [2.4856e-03, 6.4135e-02, 8.9357e-01, 3.8434e-02, 1.3792e-03],
        [4.1108e-05, 5.1471e-05, 3.0419e-03, 9.9448e-01, 2.3884e-03],
        [2.8385e-01, 1.3533e-02, 7.2611e-03, 1.5887e-01, 5.3648e-01]])


In [37]:
print(attention_scores.sum(dim=1))

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [40]:
context_vectors = attention_scores @ x
print(context_vectors.shape)
print(context_vectors)


torch.Size([5, 3])
tensor([[-1.7583, -0.5428,  0.4583],
        [-0.6652,  1.0845, -0.2735],
        [ 0.0466,  1.3122, -1.3904],
        [ 1.5359, -1.1352, -1.7864],
        [-0.4750, -1.0639, -0.0816]])


In [71]:
import torch.nn as nn

class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embed_size, max_seq_len):
        super().__init__()
        self.embed_size = embed_size
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_seq_len, embed_size)
        
    def forward(self, x):
        tokens = x[0]
        positions = x[1]
        return self.token_embedding(tokens) + self.position_embedding(positions)

class AttentionLayerV1(nn.Module):
    def __init__(self, embed_size, out_dim):
        super().__init__()
        self.embed_size = embed_size
        self.q = nn.Parameter(torch.randn(embed_size, out_dim))
        self.k = nn.Parameter(torch.randn(embed_size, out_dim))
        self.v = nn.Parameter(torch.randn(embed_size, out_dim))
        
    def forward(self, x):
        q = x @ self.q
        k = x @ self.k
        v = x @ self.v
        attention_scores = q @ k.T 
        attention_weights = torch.softmax(attention_scores / k.shape[-1] ** 0.5, dim=1)
        context_vectors = attention_weights @ v
        return context_vectors

In [82]:
T = 5
C = 3

input = torch.randn(T, C)
attention_layer = AttentionLayerV1(embed_size=C, out_dim=2)
output = attention_layer(input)
print(output.shape)
print(output)

torch.Size([5, 2])
tensor([[1.3423, 4.5167],
        [1.7081, 5.3384],
        [1.1703, 4.5721],
        [0.2930, 2.5763],
        [0.9384, 3.6728]], grad_fn=<MmBackward0>)


In [84]:
class AttentionLayerV2(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.Q = nn.Linear(d_in, d_out, bias=False)
        self.K  = nn.Linear(d_in, d_out, bias=False)
        self.V = nn.Linear(d_in, d_out, bias=False)
        
    def forward(self, x):
        q = self.Q(x)
        k = self.K(x)
        v = self.V(x)
        attention_scores = q @ k.T
        attention_weights = torch.softmax(attention_scores / k.shape[-1] ** 0.5, dim=1)
        context_vectors = attention_weights @ v
        return context_vectors

In [101]:
T = 5
C = 3

inputs = torch.randn(T, C)
attention_layer = AttentionLayerV2(d_in=C, d_out=2)
output = attention_layer(inputs)
print(output.shape)
print(output)

torch.Size([5, 2])
tensor([[0.2427, 0.1754],
        [0.3014, 0.1309],
        [0.2375, 0.2434],
        [0.3232, 0.1164],
        [0.3114, 0.1015]], grad_fn=<MmBackward0>)


In [102]:

class AttentionLayerV3(nn.Module):
    def __init__(self, d_in, d_out, masked=False):
        super().__init__()
        self.Q = nn.Linear(d_in, d_out, bias=False)
        self.K  = nn.Linear(d_in, d_out, bias=False)
        self.V = nn.Linear(d_in, d_out, bias=False)
        self.masked = masked
    def forward(self, x):
        q = self.Q(x)
        k = self.K(x)
        v = self.V(x)
        attention_scores = q @ k.T
        attention_weights = torch.softmax(attention_scores / k.shape[-1] ** 0.5, dim=1)
        if self.masked:
            mask = torch.tril(torch.ones_like(attention_weights))
            attention_weights = attention_weights * mask
            print(attention_weights)

        context_vectors = attention_weights @ v
        return context_vectors

In [114]:
T = 5
C = 3

inputs = torch.randn(T, C)
attention_layer = AttentionLayerV3(d_in=C, d_out=2, masked=True)
output = attention_layer(inputs)
print(output.shape)
print(output)

tensor([[0.2190, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1708, 0.1828, 0.0000, 0.0000, 0.0000],
        [0.0100, 0.0285, 0.1171, 0.0000, 0.0000],
        [0.2968, 0.2417, 0.2072, 0.0859, 0.0000],
        [0.2077, 0.2047, 0.2038, 0.1863, 0.1975]], grad_fn=<MulBackward0>)
torch.Size([5, 2])
tensor([[ 0.1010,  0.1206],
        [ 0.1581,  0.2144],
        [-0.1219,  0.2185],
        [ 0.0229,  0.5805],
        [-0.0137,  0.3659]], grad_fn=<MmBackward0>)


In [204]:
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embed_size, max_len):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_len, embed_size)
        
    def forward(self, x):
        tok_embed = self.token_embedding(x)
        pos_embed = self.position_embedding(torch.arange(len(x)))
        return tok_embed + pos_embed

class AttentionLayerV4(nn.Module):
    def __init__(self, d_in, d_out, masked=False):
        super().__init__()
        self.Q = nn.Linear(d_in, d_out, bias=False)
        self.K  = nn.Linear(d_in, d_out, bias=False)
        self.V = nn.Linear(d_in, d_out, bias=False)
        self.masked = masked
    
    def forward(self, x):
        q = self.Q(x)
        k = self.K(x)
        v = self.V(x)
        attention_scores = q @ k.T
        if self.masked:
            mask = torch.triu(torch.ones_like(attention_scores), diagonal=1)
            attention_scores = attention_scores.masked_fill(mask.bool(), -torch.inf)
        attention_weights = torch.softmax(attention_scores / k.shape[-1] ** 0.5, dim=1)
        return attention_weights @ v
    
class NetworkV1(nn.Module):
    def __init__(self, tokenizer, vocab_size, embed_size, max_len, masked=False):
        super().__init__()
        self.tokenizer = tokenizer
        self.embedding_layer = EmbeddingLayer(vocab_size, embed_size, max_len)
        self.attention_layer = AttentionLayerV4(embed_size, embed_size, masked)
        
    def forward(self, x):
        embedded = self.embedding_layer(torch.tensor(self.tokenizer.encode(x)))
        attention_vectors = self.attention_layer(embedded)
        return attention_vectors
    

In [205]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
vocab_size = tokenizer.n_vocab
embedding_dim = 3
context_size = 50

network = NetworkV1(tokenizer, vocab_size, embedding_dim, context_size, masked=True)

In [206]:
input = "Hello, world. My name is Max. I love Murphy."
output = network(input)
print(output.shape)
print(output)

torch.Size([13, 3])
tensor([[ 0.0787,  0.1318,  0.5671],
        [-0.1120,  0.5600,  0.6286],
        [ 0.0086,  0.3003,  1.1442],
        [-0.1068,  0.5123,  0.0449],
        [-0.1089,  0.3998,  0.2162],
        [ 0.0547,  0.1458,  0.5653],
        [ 0.0313,  0.0811,  0.3636],
        [ 0.0821, -0.0616,  0.1928],
        [ 0.0978, -0.0822, -0.7275],
        [ 0.0203, -0.0455,  0.1381],
        [ 0.0449, -0.0515,  0.1032],
        [ 0.0344, -0.0356, -0.4158],
        [-0.0138,  0.1253, -1.1324]], grad_fn=<MmBackward0>)


In [220]:
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embed_size, max_len):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_len, embed_size)
        
    def forward(self, x):
        tok_embed = self.token_embedding(x)
        pos_embed = self.position_embedding(torch.arange(x.shape[1]))
        return tok_embed + pos_embed

class AttentionLayerV5(nn.Module):
    def __init__(self, d_in, d_out, dropout, masked=False):
        super().__init__()
        self.Q = nn.Linear(d_in, d_out, bias=False)
        self.K  = nn.Linear(d_in, d_out, bias=False)
        self.V = nn.Linear(d_in, d_out, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.masked = masked
    
    def forward(self, x):
        q = self.Q(x)
        k = self.K(x)
        v = self.V(x)
        attention_scores = q @ k.transpose(-2, -1)
        if self.masked:
            mask = torch.triu(torch.ones_like(attention_scores), diagonal=1)
            attention_scores = attention_scores.masked_fill(mask.bool(), -torch.inf)
        attention_weights = torch.softmax(attention_scores / k.shape[-1] ** 0.5, dim=1)
        attention_weights = self.dropout(attention_weights)
        return attention_weights @ v
    
class NetworkV2(nn.Module):
    def __init__(self, vocab_size, embed_size, max_len, dropout=0.1, masked=False):
        super().__init__()
        self.embedding_layer = EmbeddingLayer(vocab_size, embed_size, max_len)
        self.attention_layer = AttentionLayerV5(embed_size, embed_size, dropout, masked)
        
    def forward(self, x):
        embedded = self.embedding_layer(x)
        attention_vectors = self.attention_layer(embedded)
        return attention_vectors
    

In [226]:
network = NetworkV2(vocab_size, embedding_dim, context_size, dropout=0.1, masked=True)


In [227]:
input = "Hello, world. My name is Max. I love Murphy."
tokenized_input = torch.tensor(tokenizer.encode(input))
output = network(torch.stack([tokenized_input, tokenized_input]))

print(output.shape)
print(output)

torch.Size([2, 13, 3])
tensor([[[-0.0286,  0.0067,  0.0113],
         [-0.0365,  0.0081,  0.0166],
         [-0.2158,  0.0439,  0.1001],
         [-0.1640,  0.0309,  0.0587],
         [-0.1412,  0.0305,  0.0167],
         [ 0.1642, -0.0353, -0.2198],
         [ 0.2214, -0.0447, -0.2518],
         [ 0.1846, -0.0421, -0.2585],
         [ 0.3294, -0.0770, -0.3810],
         [ 0.2523, -0.0461, -0.0219],
         [ 0.4665, -0.0565,  0.4649],
         [-0.4479,  0.0916,  0.3518],
         [ 1.4957, -0.3089, -0.7081]],

        [[ 0.0000,  0.0000,  0.0000],
         [-0.0365,  0.0081,  0.0166],
         [-0.2158,  0.0439,  0.1001],
         [-0.1640,  0.0309,  0.0587],
         [-0.1412,  0.0305,  0.0167],
         [ 0.1642, -0.0353, -0.2198],
         [ 0.1669, -0.0358, -0.1990],
         [ 0.2308, -0.0529, -0.3341],
         [ 0.2033, -0.0507, -0.3135],
         [ 0.0541, -0.0048,  0.0841],
         [ 0.2890, -0.0240,  0.5829],
         [-0.3689,  0.0560,  0.2129],
         [ 1.5016, -0.308

In [231]:
from torch.utils.data import DataLoader, Dataset

In [232]:
with open("../the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [274]:
# parameters
context_size = 5
batch_size = 2
tokenizer = tiktoken.get_encoding("gpt2")
embed_size = 3

In [275]:
# dataset and dataloader
class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, context_size):
        self.x = []
        self.y = []
        enc_txt = tokenizer.encode(text)
        for i in range(0, len(enc_txt) - context_size):
            x = enc_txt[i:i+context_size]
            y = enc_txt[i+1:i+context_size+1]
            self.x.append(torch.tensor(x))
            self.y.append(torch.tensor(y))
        self.x = torch.stack(self.x)
        self.y = torch.stack(self.y)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def load_data(self, batch_size, shuffle=True):
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, drop_last=True, num_workers=0)

dataset = GPTDatasetV1(raw_text, tokenizer, context_size)
dataloader = dataset.load_data(batch_size)

In [279]:
data_iter = iter(dataloader)
x, y = next(data_iter)
network = NetworkV2(vocab_size, embed_size, context_size, dropout=0.1, masked=True)
output = network(x)
print(output.shape)
print(output)

torch.Size([2, 5, 3])
tensor([[[-0.1015,  0.0978,  0.0220],
         [-0.2829,  0.0887,  0.1696],
         [-0.6552,  0.4291,  0.3331],
         [-0.1645,  0.0239,  0.3725],
         [ 0.1294, -0.8983,  1.2846]],

        [[-0.1975,  0.1803,  0.1630],
         [-0.1187,  0.1084,  0.0980],
         [ 0.0194, -0.4354,  0.8311],
         [ 0.4346, -0.3015,  0.3292],
         [ 0.4168, -1.7130,  2.3964]]], grad_fn=<UnsafeViewBackward0>)


In [278]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, n_heads, dropout, max_len):
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        self.n_heads = n_heads
        self.head_dim = d_out // n_heads
        self.q = nn.Linear(d_in, d_out, bias=False)
        self.k = nn.Linear(d_in, d_out, bias=False)
        self.v = nn.Linear(d_in, d_out, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_out, d_out)
        self.register_buffer("mask", torch.tril(torch.ones(max_len, max_len)), diagonal=1)
        
    def forward(self, x):
        B, T, C = x.shape
        Q = self.q(x).view(B, T, self.n_heads, self.head_dim)
        K = self.k(x).view(B, T, self.n_heads, self.head_dim)
        V = self.v(x).view(B, T, self.n_heads, self.head_dim)
        attention_scores = Q @ K.transpose(-2, -1)
        mask = self.mask[:T, :T]
        attention_scores = attention_scores.masked_fill(mask.bool(), -torch.inf)
        attention_scores = attention_scores / (self.head_dim ** 0.5)
        attention_weights = torch.softmax(attention_scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        context_vectors = attention_weights @ V
        context_vectors = context_vectors.transpose(1, 2).contiguous().view(B, T, C)
        return self.out_proj(context_vectors)

In [301]:
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embed_size, max_len):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_len, embed_size)
        
    def forward(self, x):
        tok_embed = self.token_embedding(x)
        pos_embed = self.position_embedding(torch.arange(x.shape[1]))
        return tok_embed + pos_embed

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, n_heads, dropout, masked, max_len):
        super().__init__()
        self.d_out = d_out
        self.n_heads = n_heads
        self.head_dim = d_out // n_heads
        self.masked = masked
        self.q = nn.Linear(d_in, d_out, bias=False)
        self.k = nn.Linear(d_in, d_out, bias=False)
        self.v = nn.Linear(d_in, d_out, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_out, d_out)
        self.register_buffer("mask", torch.triu(torch.ones(max_len, max_len), diagonal=1))
        
    def forward(self, x):
        B, T, C = x.shape
        Q = self.q(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        K = self.k(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        V = self.v(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        attention_scores = Q @ K.transpose(-2, -1)
        if self.masked:
            mask = self.mask[:T, :T]
            attention_scores = attention_scores.masked_fill(mask.bool(), -torch.inf)
        attention_weights = torch.softmax(attention_scores / (self.head_dim ** 0.5), dim=-1)
        attention_weights = self.dropout(attention_weights)
        context_vectors = attention_weights @ V
        context_vectors = context_vectors.transpose(1, 2).contiguous().view(B, T, C)
        return self.out_proj(context_vectors)
    
class NetworkV3(nn.Module):
    def __init__(self, vocab_size, embed_size, max_len, n_heads, dropout, masked):
        super().__init__()
        self.embedding_layer = EmbeddingLayer(vocab_size, embed_size, max_len)
        self.attention_layer = MultiHeadAttention(embed_size, embed_size, n_heads, dropout, masked, max_len)
        
    def forward(self, x):
        embedded = self.embedding_layer(x)
        attention_vectors = self.attention_layer(embedded)
        return attention_vectors
    

In [302]:
vocab_size = tokenizer.n_vocab
embed_size = 32
max_len = 10
n_heads = 2
dropout = 0.1
masked = True

data_iter = iter(dataloader)
x, y = next(data_iter)
network = NetworkV3(vocab_size, embed_size, max_len, n_heads, dropout, masked)
output = network(x)
print(output.shape)
print(output)

torch.Size([2, 5, 32])
tensor([[[ 2.8852e-01, -5.6105e-01,  1.1510e+00, -4.1363e-01,  4.7154e-01,
           3.3004e-01, -1.0168e+00, -3.6955e-01, -3.0763e-01, -5.3831e-01,
           1.2515e+00,  2.9894e-01, -8.6049e-01,  1.1644e+00,  2.5153e-04,
          -4.2355e-02, -3.7688e-01, -2.7146e-01,  3.1471e-01,  1.0502e+00,
          -1.1227e+00,  3.5266e-01, -1.9511e-01, -3.6848e-01, -1.9411e-01,
           7.1185e-01, -1.3753e-02, -6.1759e-01, -9.5560e-01,  9.1207e-01,
          -3.2594e-01,  4.9692e-01],
         [ 6.2506e-01,  3.6419e-01,  8.1589e-01, -8.4784e-02,  6.6278e-01,
           7.9160e-01, -3.1044e-01,  3.0327e-01, -1.0714e+00,  8.7699e-03,
           6.7571e-01, -3.4535e-01, -5.1409e-01,  4.5898e-01, -7.5577e-02,
           5.2646e-01,  3.6184e-01, -3.3804e-01,  5.4978e-01, -1.1970e-01,
           2.5241e-01,  7.9726e-02, -9.6158e-01, -1.2332e+00, -7.9589e-01,
           3.2263e-01, -3.9863e-02, -4.6020e-01, -1.0783e+00,  5.0953e-01,
          -5.3599e-01,  2.6431e-01],
   