# Shakespeare Dataset

## Data Preprocessing

In [1]:
!pip install tiktoken



In [2]:
import torch
import torch.nn.functional as F
import tiktoken
import numpy as np
import os
from torch import nn

In [3]:
np.random.seed(0)

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-08-28 08:05:49--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-08-28 08:05:49 (75.4 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [6]:
!pwd

/content


In [12]:
# read it in to inspect it
data_dir = "/content"
with open(os.path.join(data_dir, 'input.txt'), 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(list(set(text)))

In [13]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) } # string to integer
itos = { i:ch for i,ch in enumerate(chars) } # integer to string
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [14]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100]) # the 100 characters will like the following to the GPT

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [15]:
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [16]:
print("tokens in train dataset: ", len(train_data))
print("tokens in validaiton dataset: ", len(val_data))

tokens in train dataset:  1003854
tokens in validaiton dataset:  111540


In [17]:
# here are all the unique characters that occur in this text
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [18]:
train_data = np.array(train_data)
val_data = np.array(val_data)

# Dataset Loader

In [19]:
B, T = 4, 8 # Batch size, sequence size

In [20]:
random_numbers = torch.randint(0, len(train_data) - T, (B,)) # randomly select sequences to make a batch

In [21]:
data = torch.stack([torch.from_numpy(train_data[random_number: random_number + T].astype(np.int64)) for random_number in random_numbers])
labels = torch.stack([torch.from_numpy(train_data[random_number + 1: random_number + T + 1].astype(np.int64)) for random_number in random_numbers])
data

tensor([[53, 58, 46, 43, 56,  1, 51, 43],
        [56,  1, 39,  1, 58, 43, 52, 42],
        [56, 56, 53, 61,  5, 57,  1, 58],
        [41, 46,  1, 54, 56, 53, 54, 46]])

In [22]:
labels

tensor([[58, 46, 43, 56,  1, 51, 43, 52],
        [ 1, 39,  1, 58, 43, 52, 42, 43],
        [56, 53, 61,  5, 57,  1, 58, 53],
        [46,  1, 54, 56, 53, 54, 46, 43]])

In [23]:
def get_batch(sequence_length, batch_size):
    random_numbers = torch.randint(0, len(train_data) - sequence_length, (batch_size,))
    data = torch.stack([torch.from_numpy(train_data[random_number: random_number + sequence_length].astype(np.int64)) for random_number in random_numbers])
    labels = torch.stack([torch.from_numpy(train_data[random_number + 1: random_number + sequence_length + 1].astype(np.int64)) for random_number in random_numbers])
    return data, labels


In [24]:
data, labels = get_batch(8, 4)

In [25]:
data.shape, labels.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

# Bigram Language Model

Only incorporates 1 token in thie history to generate a new token

In [26]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        logits = self.embedding(x)

        return logits

In [27]:
model = LanguageModel(vocab_size).to(device)

In [28]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

def train():
    model.train()
    for iteration in range(5000):
        data, labels = get_batch(4, 8)
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(data)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        labels = labels.view(B*T)
        loss = F.cross_entropy(logits, labels)
        if iteration % 100 == 0:
            print("Iteration: ", iteration, "loss: ", loss.item())
        loss.backward()
        optimizer.step()

train()

Iteration:  0 loss:  4.636368274688721
Iteration:  100 loss:  4.557950496673584
Iteration:  200 loss:  4.189570426940918
Iteration:  300 loss:  4.4069952964782715
Iteration:  400 loss:  4.258916854858398
Iteration:  500 loss:  4.123924255371094
Iteration:  600 loss:  4.555902481079102
Iteration:  700 loss:  4.207101821899414
Iteration:  800 loss:  4.140120506286621
Iteration:  900 loss:  3.9703216552734375
Iteration:  1000 loss:  3.688896417617798
Iteration:  1100 loss:  4.084914684295654
Iteration:  1200 loss:  3.847590684890747
Iteration:  1300 loss:  4.047602653503418
Iteration:  1400 loss:  3.84074068069458
Iteration:  1500 loss:  3.4482882022857666
Iteration:  1600 loss:  3.9768524169921875
Iteration:  1700 loss:  3.5372395515441895
Iteration:  1800 loss:  3.541405439376831
Iteration:  1900 loss:  3.644921064376831
Iteration:  2000 loss:  3.545099973678589
Iteration:  2100 loss:  3.697798490524292
Iteration:  2200 loss:  3.2575578689575195
Iteration:  2300 loss:  3.478306531906128

In [29]:
model.eval()
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate
    start_char = "\n"
    start_token = torch.tensor(encode(start_char)).unsqueeze(0)
    current_token = start_token # (B, T)
    generated_tokens = []
    for x in range(1000):
        logits = model(current_token.to(device)) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = next_token
    print(decode(generated_tokens))

'uill:pol hc '.
WCfo&-my
Wt ant s
ABy?
Y-ha gr Zty ol'sthr C?G IAYBy, J?-E3't,
Bur AgCoBuJzQU.
Az

Hs Lulllcinivk IE?
CZ, hr t hen;
hd.
beJin
DefL.;GUx.
B&MTLUSpa
m:! hoveWghot qymurup,
OUK
JovL; he s
metouP-g!pams anmontor;Y,Id!in'x' incn dkqou
orepakn f
DRCHDodor a-IVcix-G--!V&SKwur:R, y3WLurd ss sur hehthig! d yhaly, cru
Ahi&BALnd TL:O
BNo.'Sh l
SlouruV,kjhentbtu.SxKxure&GNNEMaA:VPhe'?
By'XUNNoiblas bverinoiAJO'K
Ro.
VSINt??qJn
myYBAY?XExV:$VRANukbimmhuthangrvel.,lADo wnd pLFYgr'ontuVUYRo pLI'tamoberds kKy Se mo aveiOimsu maus!
B&-viEDLK:'qDK Spet dioules OxJfouCKOHHSe s s hth,'ImurGk
mbpaue liOon;JFcW's,
Mhkimo uve gsh dLKmyqdcXre s frFrdue sqwhoDon iLAwit.bens,y're s temn?NHS.
Au
'l,IvZ,Hved:asundxVGLT.
W?
BADungaxm?Q.,
'S:-EPxpe-FL?lBQS atRRG.
CKXaryopSes bdCKZ,agra!uithis o s lZ:-,
$XR,
Sirt h.

KPy.
llis a f!sse ut b'dsthale Ctnolenu
Au:Bye IVce Fisulysedet alZth! cour H man hinIAhe cabCHVesthore
Jpe-fomst hiR;U-gaurs s wapoussth:VUu:$NLagto mpKVjthato bk'daper fmppHELyrst IEKs

Let’s say that your two largest probs are rather close together (for example,
0.25 and 0.26). Using argmax() would always give you the index of 0.26,
ignoring, in a sense, that 0.25 is almost the same. On the other hand, using
multinomial() will give you the index of 0.26 26% of the time and the index
of 0.25 25% of the time, respecting the fact that the two values are quite close
to one another.

# Self-Attention

In [30]:
C = vocab_size
print("vocab_size: ", C)
v = torch.randn(B,T,C) # (B, T, C)
q = torch.randn(B,T,C) # (B ,T, C)
k = torch.randn(B,T,C) # (B, T, C)

# k.permute(0, -1, -2)
attention_map = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, C) @ (B, C, T) = (B, T, T)

vocab_size:  65


In [31]:
attention_map.mean(), attention_map.var()

(tensor(0.0809), tensor(0.8847))

In [32]:
attention_map[0]

tensor([[-0.2205,  0.6218,  0.7317,  1.1332,  0.5970,  0.7078, -0.3986,  1.1761],
        [ 0.1310, -0.5916, -1.4557, -0.5407, -0.3209, -0.5917, -1.0167,  0.3724],
        [-0.7116,  1.1316,  0.6352, -0.1230,  0.0241, -0.6274, -0.6252, -0.1847],
        [ 1.3909,  2.1249, -1.0876, -1.1044,  0.0488,  0.5310, -0.1713, -0.0950],
        [-1.6491, -0.8814,  1.4003,  0.2558,  0.4396,  0.2764, -0.3069, -0.4714],
        [-0.5305,  0.5307,  0.7631, -0.1590,  0.4766,  0.7653,  0.9111, -1.3073],
        [-0.1964, -1.4210,  1.4563,  0.7868, -0.2745, -0.8312,  1.2157, -0.7462],
        [ 0.1521, -0.5514, -0.5638,  0.9498, -0.7947, -1.8731,  0.5960,  2.0033]])

In [33]:
feature_maps = attention_map @ v # (B, T, T) @ (B, T,C) = (B, T, C)

In [34]:
feature_maps.shape

torch.Size([4, 8, 65])

Let's do masked self-attention, as current token shouldn't pay attention to the tokens not yet generated.

In [35]:
tril = torch.tril(torch.ones(T, T))
tril # zeros in the upper right diagonal

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [36]:
masked_attention_map = attention_map.masked_fill(tril == 0, -np.Inf)
masked_attention_map = F.softmax(masked_attention_map, dim=-1)

In [37]:
q.mean(), k.mean(), q.var(), k.var()

(tensor(-0.0002), tensor(0.0116), tensor(0.9968), tensor(1.0155))

In [38]:
attention_map.mean(), attention_map.var() # Unit gaussian distribution, with mean 0 and variance 1, thanks to the division by vocab size in the attention map.

(tensor(0.0809), tensor(0.8847))

In [39]:
masked_attention_map[0] # each row sums up to 1

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6732, 0.3268, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0896, 0.5659, 0.3445, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3077, 0.6411, 0.0258, 0.0254, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0256, 0.0552, 0.5404, 0.1720, 0.2068, 0.0000, 0.0000, 0.0000],
        [0.0650, 0.1879, 0.2371, 0.0943, 0.1780, 0.2376, 0.0000, 0.0000],
        [0.0678, 0.0199, 0.3540, 0.1813, 0.0627, 0.0359, 0.2783, 0.0000],
        [0.0790, 0.0391, 0.0386, 0.1755, 0.0307, 0.0104, 0.1232, 0.5034]])

In [40]:
sum(masked_attention_map[0, 0, :]) # each row sums up to 1

tensor(1.)

In [41]:
masked_feature_maps = masked_attention_map @ v

In [42]:
masked_feature_maps.shape

torch.Size([4, 8, 65])

In [43]:
masked_feature_maps[0]

tensor([[ 0.4327, -1.2709,  1.2184, -0.2431, -0.5003, -0.7021, -0.7609,  1.5733,
         -0.6160,  0.3363, -0.3817, -0.3387, -0.5015,  0.5827, -2.2089,  1.7469,
         -1.1907,  0.6040,  0.8900,  1.6983,  0.9031, -0.9910, -1.1666, -0.3730,
          0.7763, -0.8947, -0.9507, -0.0187, -1.5640,  0.9974, -0.1965, -1.7016,
         -1.2813,  0.6549,  0.7466,  0.0666,  0.6011,  0.1080, -0.4749,  2.2317,
          0.3079, -1.1457, -2.2090,  0.9496, -1.7616,  0.4092, -0.4464, -0.5848,
          1.0068, -1.9043,  1.5569, -0.0109,  0.3873, -0.4885, -2.0485,  0.4393,
         -0.3141,  0.2648,  0.1519,  1.1786, -1.7659, -2.9221,  0.3128,  0.0833,
          0.3633],
        [ 0.1585, -0.7313,  0.1550, -0.4063,  0.1241, -0.2318, -0.9494,  1.1057,
         -0.4473, -0.0619, -0.6090, -0.2649, -0.4979,  0.0247, -1.6782,  1.1346,
         -0.9815, -0.0104,  0.3058,  1.4033,  0.3106, -0.8323, -0.4818,  0.1889,
          0.9525, -0.4939,  0.1074,  0.4620, -1.0098,  0.4052, -0.3708, -1.4741,
         

In [44]:
class SelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, head_size, drop_p):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_size, bias=False)
        self.k = nn.Linear(embed_dim, head_size, bias=False)
        self.v = nn.Linear(embed_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(sequence_length, sequence_length)))
        self.dropout = nn.Dropout(drop_p)
        self.head_size = head_size

    def forward(self, x):
        # x is (B, T, C)
        B, T, C = x.shape
        query = self.q(x)
        key = self.k(x)
        value = self.v(x)
        attention_map = query @ key.transpose(-2, -1) * self.head_size**-0.5 # (B, T, C) @ (B, C, T) = (B, T, T)
        masked_attention_map = attention_map.masked_fill(self.tril[:T, :T] == 0, -np.Inf)
        masked_attention_map = F.softmax(masked_attention_map, dim=-1)
        attention_map = self.dropout(masked_attention_map)
        feature_map = attention_map @ value # (B, T, T) @ (B, T, C) = (B, T, C)

        return feature_map


In [45]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_heads, head_size, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, vocab_size)
        self.self_attention = SelfAttention(sequence_length, embed_dim, head_size, drop_p)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.self_attention(token_embeddings)

        return feature_maps

In [46]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=65, n_heads=0, head_size=65, drop_p=0.2).to(device)

In [47]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.126742839813232
Iteration:  100 loss:  3.6406126022338867
Iteration:  200 loss:  3.6355183124542236
Iteration:  300 loss:  3.0374107360839844
Iteration:  400 loss:  3.2928104400634766
Iteration:  500 loss:  3.0157876014709473
Iteration:  600 loss:  2.9386563301086426
Iteration:  700 loss:  2.8773350715637207
Iteration:  800 loss:  2.539405345916748
Iteration:  900 loss:  2.807511329650879
Iteration:  1000 loss:  3.220825433731079
Iteration:  1100 loss:  3.094298839569092
Iteration:  1200 loss:  2.701075553894043
Iteration:  1300 loss:  2.9328079223632812
Iteration:  1400 loss:  2.896463632583618
Iteration:  1500 loss:  2.686234474182129
Iteration:  1600 loss:  3.0334415435791016
Iteration:  1700 loss:  2.953718662261963
Iteration:  1800 loss:  2.6348588466644287
Iteration:  1900 loss:  2.976311445236206
Iteration:  2000 loss:  3.0514724254608154
Iteration:  2100 loss:  2.711745262145996
Iteration:  2200 loss:  2.7380549907684326
Iteration:  2300 loss:  2.75714492

Loss reduced from 2.9 to 2.5. Le's implement now multi-head self-attention

# Multi-head self-attention

In [48]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        head_size = embed_dim // n_heads
        self.heads = nn.ModuleList([SelfAttention(sequence_length, embed_dim, head_size, drop_p) for x in range(n_heads)])

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return out

In [49]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.linear_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.multi_head_self_attention(token_embeddings)
        out = self.linear_head(feature_maps)

        return out

In [50]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_heads=4, drop_p=0.2).to(device)

In [51]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.158942699432373
Iteration:  100 loss:  3.4043633937835693
Iteration:  200 loss:  2.9074349403381348
Iteration:  300 loss:  2.7036173343658447
Iteration:  400 loss:  2.8406388759613037
Iteration:  500 loss:  2.7827694416046143
Iteration:  600 loss:  2.6133015155792236
Iteration:  700 loss:  2.961909532546997
Iteration:  800 loss:  2.5660040378570557
Iteration:  900 loss:  2.830247402191162
Iteration:  1000 loss:  2.5201432704925537
Iteration:  1100 loss:  2.553600549697876
Iteration:  1200 loss:  2.565138101577759
Iteration:  1300 loss:  2.980091094970703
Iteration:  1400 loss:  2.943397045135498
Iteration:  1500 loss:  2.5841736793518066
Iteration:  1600 loss:  2.5725607872009277
Iteration:  1700 loss:  2.409419298171997
Iteration:  1800 loss:  2.5073046684265137
Iteration:  1900 loss:  2.7784595489501953
Iteration:  2000 loss:  2.315389394760132
Iteration:  2100 loss:  2.5949838161468506
Iteration:  2200 loss:  2.4727745056152344
Iteration:  2300 loss:  2.382577

Loss reduced from 2.5 to 2.27

# Add Dropout to Multi Head Self-Attention

In [52]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        head_size = embed_dim // n_heads
        self.heads = nn.ModuleList([SelfAttention(sequence_length, embed_dim, head_size, drop_p) for x in range(n_heads)])
        self.dropout = nn.Dropout(drop_p)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(out)
        return out

In [53]:
class LanguageModel(torch.nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.linear_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.multi_head_self_attention(token_embeddings)
        out = self.linear_head(feature_maps)

        return out

In [54]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_heads=4, drop_p=0.2).to(device)

In [55]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.2831621170043945
Iteration:  100 loss:  3.0488007068634033
Iteration:  200 loss:  3.207683801651001
Iteration:  300 loss:  2.4558496475219727
Iteration:  400 loss:  2.9216654300689697
Iteration:  500 loss:  2.493610143661499
Iteration:  600 loss:  2.479698657989502
Iteration:  700 loss:  2.8587186336517334
Iteration:  800 loss:  2.6204404830932617
Iteration:  900 loss:  2.890253782272339
Iteration:  1000 loss:  2.5778377056121826
Iteration:  1100 loss:  2.587873935699463
Iteration:  1200 loss:  2.806710720062256
Iteration:  1300 loss:  2.1928112506866455
Iteration:  1400 loss:  2.949326753616333
Iteration:  1500 loss:  2.52822208404541
Iteration:  1600 loss:  2.8138933181762695
Iteration:  1700 loss:  2.6682004928588867
Iteration:  1800 loss:  2.529186487197876
Iteration:  1900 loss:  2.4254767894744873
Iteration:  2000 loss:  2.49601149559021
Iteration:  2100 loss:  2.555058240890503
Iteration:  2200 loss:  2.585300922393799
Iteration:  2300 loss:  3.04779219627

Didn't much help! That's okay. We have a lot more techniques in our sleeves up left. We are going to retain this multi-head self attention dropout.

# Adding Linear layer in the multi-head self-attention block

In [56]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        head_size = embed_dim // n_heads
        self.heads = nn.ModuleList([SelfAttention(sequence_length, embed_dim, head_size, drop_p) for x in range(n_heads)])
        self.mlp = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.mlp(out)
        out = self.dropout(out)
        return out

In [57]:
class LanguageModel(torch.nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.linear_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.multi_head_self_attention(token_embeddings)
        out = self.linear_head(feature_maps)

        return out

In [58]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_heads=4, drop_p=0.2).to(device)

In [59]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.159632205963135
Iteration:  100 loss:  3.195927858352661
Iteration:  200 loss:  2.655423402786255
Iteration:  300 loss:  2.89151668548584
Iteration:  400 loss:  2.997866153717041
Iteration:  500 loss:  2.600587844848633
Iteration:  600 loss:  2.653357744216919
Iteration:  700 loss:  2.2459821701049805
Iteration:  800 loss:  2.2778639793395996
Iteration:  900 loss:  2.4577383995056152
Iteration:  1000 loss:  2.726426124572754
Iteration:  1100 loss:  2.799111843109131
Iteration:  1200 loss:  2.774461269378662
Iteration:  1300 loss:  2.6170904636383057
Iteration:  1400 loss:  2.7802963256835938
Iteration:  1500 loss:  3.0328149795532227
Iteration:  1600 loss:  2.81746244430542
Iteration:  1700 loss:  2.4856786727905273
Iteration:  1800 loss:  2.6582698822021484
Iteration:  1900 loss:  2.957282304763794
Iteration:  2000 loss:  2.7884037494659424
Iteration:  2100 loss:  2.6578800678253174
Iteration:  2200 loss:  2.7966501712799072
Iteration:  2300 loss:  2.98448109626

2.19, that's nice!

# Adding MLP to Encoder Block

In [60]:
class MLP(nn.Module):
    def __init__(self, embed_dim, drop_p):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, 4 * embed_dim)
        self.act = nn.ReLU()
        self.fc2 = nn.Linear(4 * embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        x = self.dropout(x)

        return x

In [61]:
class EncoderBlock(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.mlp = MLP(embed_dim, drop_p)

    def forward(self, x):
        x = self.multi_head_self_attention(x)
        x = self.mlp(x)

        return x

In [62]:
class LanguageModel(torch.nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.encoder_block = EncoderBlock(sequence_length, embed_dim, n_heads, drop_p)
        self.linear_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.encoder_block(token_embeddings)
        out = self.linear_head(feature_maps)

        return out

In [63]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_heads=4, drop_p=0.2).to(device)

In [64]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.18694543838501
Iteration:  100 loss:  3.269418239593506
Iteration:  200 loss:  3.4120898246765137
Iteration:  300 loss:  3.0579967498779297
Iteration:  400 loss:  2.9326906204223633
Iteration:  500 loss:  3.2572896480560303
Iteration:  600 loss:  2.935234546661377
Iteration:  700 loss:  2.517719268798828
Iteration:  800 loss:  2.6395318508148193
Iteration:  900 loss:  2.949025869369507
Iteration:  1000 loss:  2.3955841064453125
Iteration:  1100 loss:  2.8977410793304443
Iteration:  1200 loss:  2.3214683532714844
Iteration:  1300 loss:  2.7579526901245117
Iteration:  1400 loss:  2.567401170730591
Iteration:  1500 loss:  2.4108779430389404
Iteration:  1600 loss:  2.827171802520752
Iteration:  1700 loss:  2.6805806159973145
Iteration:  1800 loss:  2.830517530441284
Iteration:  1900 loss:  2.3941259384155273
Iteration:  2000 loss:  2.2639944553375244
Iteration:  2100 loss:  2.4043500423431396
Iteration:  2200 loss:  2.6645922660827637
Iteration:  2300 loss:  2.676881

2.16, not bad!

# Stacking Encoder Blocks + Increased Sequence Length

In [66]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_blocks, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.encoder_blocks = nn.Sequential(*([EncoderBlock(sequence_length, embed_dim, n_heads, drop_p) for x in range(n_blocks)]))
        self.linear_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.encoder_blocks(token_embeddings)
        out = self.linear_head(feature_maps)

        return out

In [67]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device)

In [68]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.1503071784973145
Iteration:  100 loss:  3.292788505554199
Iteration:  200 loss:  3.6172516345977783
Iteration:  300 loss:  3.282633066177368
Iteration:  400 loss:  3.3533828258514404
Iteration:  500 loss:  3.4132888317108154
Iteration:  600 loss:  3.3911702632904053
Iteration:  700 loss:  3.4006588459014893
Iteration:  800 loss:  3.3593051433563232
Iteration:  900 loss:  3.2086079120635986
Iteration:  1000 loss:  2.9971401691436768
Iteration:  1100 loss:  3.20408296585083
Iteration:  1200 loss:  3.0765509605407715
Iteration:  1300 loss:  2.9727540016174316
Iteration:  1400 loss:  2.9958503246307373
Iteration:  1500 loss:  3.3397715091705322
Iteration:  1600 loss:  3.08198881149292
Iteration:  1700 loss:  3.0465545654296875
Iteration:  1800 loss:  3.390258312225342
Iteration:  1900 loss:  3.151524782180786
Iteration:  2000 loss:  3.6266586780548096
Iteration:  2100 loss:  3.064724922180176
Iteration:  2200 loss:  3.449143648147583
Iteration:  2300 loss:  2.7588229

Increased loss significantly! But let's keep our faith on the transformer architecture and keep adding the components one by one.

In [69]:
model = LanguageModel(vocab_size=65, sequence_length=32, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device) # Increasing sequence length from 8 to 32

In [70]:
def train():
    model.train()
    for iteration in range(5000):
        data, labels = get_batch(32, 16)
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(data)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        labels = labels.view(B*T)
        loss = F.cross_entropy(logits, labels)
        if iteration % 100 == 0:
            print("Iteration: ", iteration, "loss: ", loss.item())
        loss.backward()
        optimizer.step()

In [71]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.140641212463379
Iteration:  100 loss:  3.35632061958313
Iteration:  200 loss:  3.3990018367767334
Iteration:  300 loss:  3.2380740642547607
Iteration:  400 loss:  3.1847801208496094
Iteration:  500 loss:  3.243692636489868
Iteration:  600 loss:  3.20573091506958
Iteration:  700 loss:  3.213630199432373
Iteration:  800 loss:  3.1766393184661865
Iteration:  900 loss:  3.2871124744415283
Iteration:  1000 loss:  3.3521170616149902
Iteration:  1100 loss:  3.2980587482452393
Iteration:  1200 loss:  3.3027021884918213
Iteration:  1300 loss:  3.3135128021240234
Iteration:  1400 loss:  3.2884726524353027
Iteration:  1500 loss:  3.367671251296997
Iteration:  1600 loss:  3.3335530757904053
Iteration:  1700 loss:  3.357274055480957
Iteration:  1800 loss:  3.304220676422119
Iteration:  1900 loss:  3.3521056175231934
Iteration:  2000 loss:  3.281970977783203
Iteration:  2100 loss:  3.4233953952789307
Iteration:  2200 loss:  3.287477731704712
Iteration:  2300 loss:  3.306176900

Performance seems to have worsened ever since we made more deeper architecture. Now, since the model has become quite deeper, it's time to care about model optimization i.e skip connections for vanishing gradient and layer normalization

# Adding Skip Connections and Layer Normalization

In [72]:
class EncoderBlock(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, drop_p)

    def forward(self, x):
        x = x + self.multi_head_self_attention(self.layer_norm(x))
        x = x + self.mlp(self.layer_norm(x))

        return x

In [73]:
model = LanguageModel(vocab_size=65, sequence_length=32, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device)

In [74]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.384712219238281
Iteration:  100 loss:  2.4820759296417236
Iteration:  200 loss:  2.526989459991455
Iteration:  300 loss:  2.361846685409546
Iteration:  400 loss:  2.443568468093872
Iteration:  500 loss:  2.406647205352783
Iteration:  600 loss:  2.289095640182495
Iteration:  700 loss:  2.3551816940307617
Iteration:  800 loss:  2.3736507892608643
Iteration:  900 loss:  2.288585901260376
Iteration:  1000 loss:  2.2525463104248047
Iteration:  1100 loss:  2.3462138175964355
Iteration:  1200 loss:  2.2804760932922363
Iteration:  1300 loss:  2.296062707901001
Iteration:  1400 loss:  2.2923262119293213
Iteration:  1500 loss:  2.3139076232910156
Iteration:  1600 loss:  2.2138051986694336
Iteration:  1700 loss:  2.184255599975586
Iteration:  1800 loss:  2.270749092102051
Iteration:  1900 loss:  2.245482921600342
Iteration:  2000 loss:  2.143630027770996
Iteration:  2100 loss:  2.1034204959869385
Iteration:  2200 loss:  2.2479631900787354
Iteration:  2300 loss:  2.266583204

Last trick to do: Add positional embedding

# Add Positional Embedding

In [75]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_blocks, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = torch.nn.Embedding(sequence_length, embed_dim)
        self.encoder_blocks = nn.Sequential(*([EncoderBlock(sequence_length, embed_dim, n_heads, drop_p) for x in range(n_blocks)]))
        self.linear_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        B, T = x.shape

        token_embeddings = self.token_embedding(x) # (B, T, C)
        positional_embeddings = self.pos_embedding(torch.arange(T, device=device)) # (T, C)
        embeddings = token_embeddings + positional_embeddings # B, T, C cause broadcasting
        feature_maps = self.encoder_blocks(embeddings)
        out = self.linear_head(feature_maps)

        return out

In [76]:
model = LanguageModel(vocab_size=65, sequence_length=32, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device)

In [77]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.535435199737549
Iteration:  100 loss:  2.6046290397644043
Iteration:  200 loss:  2.6095778942108154
Iteration:  300 loss:  2.5166263580322266
Iteration:  400 loss:  2.438232421875
Iteration:  500 loss:  2.371281385421753
Iteration:  600 loss:  2.3521194458007812
Iteration:  700 loss:  2.2881627082824707
Iteration:  800 loss:  2.2664847373962402
Iteration:  900 loss:  2.252852201461792
Iteration:  1000 loss:  2.092099189758301
Iteration:  1100 loss:  2.164508819580078
Iteration:  1200 loss:  2.1726717948913574
Iteration:  1300 loss:  2.156724452972412
Iteration:  1400 loss:  2.123415231704712
Iteration:  1500 loss:  2.2109720706939697
Iteration:  1600 loss:  2.04011869430542
Iteration:  1700 loss:  1.9385355710983276
Iteration:  1800 loss:  1.981712818145752
Iteration:  1900 loss:  1.9419118165969849
Iteration:  2000 loss:  1.9631212949752808
Iteration:  2100 loss:  2.0360653400421143
Iteration:  2200 loss:  2.014402389526367
Iteration:  2300 loss:  1.977465510368

In [79]:
model.eval()
sequence_length=32
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate
    start_char = "\n"
    start_token = torch.tensor(encode(start_char)).unsqueeze(0)
    print("start_token shape: ", start_token.shape)
    current_token = start_token.to(device) # (B, T)
    generated_tokens = []
    for x in range(1000):
        idx = current_token[:, -sequence_length:]
        logits = model(idx) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = torch.cat((current_token, next_token), dim=1).to(device) # (B, T+1)
    print(decode(generated_tokens))

start_token shape:  torch.Size([1, 1])
to oFfaice your und?

MEY:
Your thou not ways thy voolds: may shull by folle,
If lifty more disertise of brown,
To belted priower's my strict.

JUEEN LERD Pernouts, lim, some you,
Mave rociry hear for your with have curd;
And are dlant, you
I now speake adving be'll, and hall lian in ham; onds, sull is earther, I deation to't apond aster thou strock'd the upon liviles:
To can thou cove wornd teare oft, and yearthing their creare,, uKy,
Thil the were he woun your bege
The morna grain to come wraitickah,
Tyue, where roct is madidie
Thy freasul gods with uneessed so foress for up as some with bish sir
Tear rove kin revery dees have and beatius
To theres berserver of the knands had.

BUCKEN? YORKE VIO:
He which upe, the sween heath strome be bave of in
My stay drouse sloveing that is sir:
Some of up my forduder
As cryalt as iseet to downgbong wifes

TORKE VINIUS:
Are shall all of fiendstorrow in
Ways more.

AUTESS:
That You a meay
Whome him, you pot i

## Add LayerNorm

In [80]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_blocks, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = torch.nn.Embedding(sequence_length, embed_dim)
        self.encoder_blocks = nn.Sequential(*([EncoderBlock(sequence_length, embed_dim, n_heads, drop_p) for x in range(n_blocks)]))
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.linear_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        B, T = x.shape

        token_embeddings = self.token_embedding(x) # (B, T, C)
        positional_embeddings = self.pos_embedding(torch.arange(T, device=device)) # (T, C)
        embeddings = token_embeddings + positional_embeddings # B, T, C cause broadcasting
        feature_maps = self.encoder_blocks(embeddings)
        x = self.layer_norm(feature_maps)
        x = self.linear_head(x)

        return x

In [81]:
model = LanguageModel(vocab_size=65, sequence_length=32, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device)

In [82]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.316089630126953
Iteration:  100 loss:  2.807663679122925
Iteration:  200 loss:  2.61960768699646
Iteration:  300 loss:  2.4380409717559814
Iteration:  400 loss:  2.454068183898926
Iteration:  500 loss:  2.356218099594116
Iteration:  600 loss:  2.2856268882751465
Iteration:  700 loss:  2.264251232147217
Iteration:  800 loss:  2.348238229751587
Iteration:  900 loss:  2.209446907043457
Iteration:  1000 loss:  2.1343467235565186
Iteration:  1100 loss:  2.2199947834014893
Iteration:  1200 loss:  2.1353843212127686
Iteration:  1300 loss:  2.1119725704193115
Iteration:  1400 loss:  2.100851535797119
Iteration:  1500 loss:  2.1556591987609863
Iteration:  1600 loss:  2.113408088684082
Iteration:  1700 loss:  2.104343891143799
Iteration:  1800 loss:  2.140507459640503
Iteration:  1900 loss:  2.0200841426849365
Iteration:  2000 loss:  2.0337531566619873
Iteration:  2100 loss:  2.08750057220459
Iteration:  2200 loss:  2.0656230449676514
Iteration:  2300 loss:  2.005956411361

In [83]:
model.eval()
sequence_length=32
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate
    start_char = "\n"
    start_token = torch.tensor(encode(start_char)).unsqueeze(0)
    print("start_token shape: ", start_token.shape)
    current_token = start_token.to(device) # (B, T)
    generated_tokens = []
    for x in range(1000):
        idx = current_token[:, -sequence_length:]
        logits = model(idx) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = torch.cat((current_token, next_token), dim=1).to(device) # (B, T+1)
    print(decode(generated_tokens))

start_token shape:  torch.Size([1, 1])
Thesengre.

DUKE VINCE:
Them to mear it no's munk thre, I he pooter meak far he down: the cramenterfule,
And meast'd tight Rided! with the and shall dam what of tat const. Thus erruin father, and mysenats set,
Corce neve your calonsince, honou my is sith the all.

KTHERTH:
I praventle hus the kay, abosonglary plordfer,
No when befea; Good slake the wordss their hy fracetuar's?
I his seepts, from me thou facts decone.
Ster! So to be
Thot so saves mobe enewn: I your hine the knay?

CLETBERLA:
To be staught ven it the legwastilencen,
As the the resed ye words, I lord now Rickings, what broth is:
You have I ban blarke or in Yow. OF Of Thinks,
And in hand which encesentence, and fexceight Of Rugherse in then?

LEONS:
Mas, If hight:
That this douths, but not that niging as you be him,
Reffeeld wither prittles of thats of so.
That you have the but their such to they seeps,
Sir wolds a Romemead is rowndd, or to is name besten,
Thell? in frieve yought King

# Use token embeddings from OpenAI: Tiktoken

In [84]:
data_dir = "/content"

with open(os.path.join(data_dir, "input.txt"), encoding="utf8") as f:
    text = f.read()
train_text = text[:int(len(text)*0.90)]
val_text = text[int(len(text)*0.90):]
print("chars in train_data: ", len(train_text))
print("chars in val_data: ", len(val_text))

enc = tiktoken.get_encoding("gpt2")
train_data = enc.encode_ordinary(train_text)
val_data = enc.encode_ordinary(val_text)

print("tokens in train dataset: ", len(train_data))
print("tokens in validaiton dataset: ", len(val_data))

train_set = set(train_data)
val_set = set(val_data)

vocab_size = len(train_set.union(val_set))

chars in train_data:  1003854
chars in val_data:  111540
tokens in train dataset:  301966
tokens in validaiton dataset:  36059


In [85]:
train_data, val_data = np.array(train_data), np.array(val_data)

In [86]:
vocab_size = max(train_data)

In [87]:
vocab_size

50255

In [88]:
class MLP(nn.Module):
    def __init__(self, embed_dim, drop_p):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, 4 * embed_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(4 * embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        x = self.dropout(x)

        return x

In [89]:
model = LanguageModel(vocab_size=vocab_size + 1, sequence_length=32, embed_dim=64, n_blocks=4, n_heads=4, drop_p=0.2).to(device)

In [90]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  11.005958557128906
Iteration:  100 loss:  6.3544135093688965
Iteration:  200 loss:  6.3163628578186035
Iteration:  300 loss:  6.000838756561279
Iteration:  400 loss:  5.658395767211914
Iteration:  500 loss:  5.3386054039001465
Iteration:  600 loss:  5.237395286560059
Iteration:  700 loss:  4.751059055328369
Iteration:  800 loss:  4.904850959777832
Iteration:  900 loss:  4.998269081115723
Iteration:  1000 loss:  4.507585525512695
Iteration:  1100 loss:  4.641820907592773
Iteration:  1200 loss:  4.9189043045043945
Iteration:  1300 loss:  4.089478969573975
Iteration:  1400 loss:  4.2778425216674805
Iteration:  1500 loss:  4.599847793579102
Iteration:  1600 loss:  4.3327741622924805
Iteration:  1700 loss:  4.126469135284424
Iteration:  1800 loss:  4.284380912780762
Iteration:  1900 loss:  4.159511089324951
Iteration:  2000 loss:  4.120541572570801
Iteration:  2100 loss:  4.5922322273254395
Iteration:  2200 loss:  4.311765193939209
Iteration:  2300 loss:  4.060207366943

In [94]:
model.eval()
sequence_length=32
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate
    start_char = "\n"
    start_token = torch.tensor(encode(start_char)).unsqueeze(0)
    print("start_token shape: ", start_token.shape)
    current_token = start_token.to(device) # (B, T)
    generated_tokens = []
    for x in range(1000):
        idx = current_token[:, -sequence_length:]
        logits = model(idx) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = torch.cat((current_token, next_token), dim=1).to(device) # (B, T+1)
    print(enc.decode(generated_tokens))

start_token shape:  torch.Size([1, 1])
 ho! What, go?

CAPULET:
Hang, at with all my lord of Hark;
The pleasure breathe the love
Thy school treading tears good Friends souls, so.

First Citizen:
Now, fair lady!
Torily, and thy brother's life,
That ever tongue that is full of my tent.

First Servingman:
By heaven with so.
You know thy mother isle, if that may command so well: away,
And undertake at the corse-cfold deathsimely,
Or EthiopianTill to-off strokes: but I should kill
GRE heard the gods, whose pardon
Her knees, faith brokens with all the shade
To cast the war revolar Laurence Tarpe: at cause
There is the Capition.

First Musician:
Yet in this? and you may speak the king too dishonour away sin
But es heart of man of our king's love
Where: on her, do you deny no power.

FROTH:
She knew you treat: it be so much
Two too much, as would show'd you give out.

YORK:
I have two, let the sacrament; and go so learned
Of bread allied I make thy royal spring,
As cries of that he has, I am c

Makes more meaningful sentences

# Train on a subset of OpenWebText

In [92]:
!pip install tiktoken



In [93]:
import torch
import torch.nn.functional as F
import tiktoken
import numpy as np
import os
from torch import nn

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class SelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, head_size, drop_p):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_size, bias=False)
        self.k = nn.Linear(embed_dim, head_size, bias=False)
        self.v = nn.Linear(embed_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(sequence_length, sequence_length)))
        self.dropout = nn.Dropout(drop_p)
        self.head_size = head_size

    def forward(self, x):
        # x is (B, T, C)
        B, T, C = x.shape
        query = self.q(x)
        key = self.k(x)
        value = self.v(x)
#         key.permute(0, -1, -2)
        attention_map = query @ key.transpose(-2, -1) * self.head_size**-0.5 # (B, T, C) @ (B, C, T) = (B, T, T)
        masked_attention_map = attention_map.masked_fill(self.tril[:T, :T] == 0, -np.Inf)
        masked_attention_map = F.softmax(masked_attention_map, dim=-1)
        attention_map = self.dropout(masked_attention_map)
        feature_map = attention_map @ value # (B, T, T) @ (B, T, C) = (B, T, C)

        return feature_map


In [None]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        head_size = embed_dim // n_heads
        self.heads = nn.ModuleList([SelfAttention(sequence_length, embed_dim, head_size, drop_p) for x in range(n_heads)])
        self.mlp = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.mlp(out)
        out = self.dropout(out)
        return out

In [None]:
class MLP(nn.Module):
    def __init__(self, embed_dim, drop_p):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, 4 * embed_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(4 * embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
#         x = self.act(x)
        x = self.dropout(x)

        return x

In [None]:
class EncoderBlock(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, drop_p)

    def forward(self, x):
        x = x + self.multi_head_self_attention(self.layer_norm(x))
        x = x + self.mlp(self.layer_norm(x))

        return x

In [None]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_blocks, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = torch.nn.Embedding(sequence_length, embed_dim)
        self.encoder_blocks = nn.Sequential(*([EncoderBlock(sequence_length, embed_dim, n_heads, drop_p) for x in range(n_blocks)]))
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.linear_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)

        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        B, T = x.shape

        token_embeddings = self.token_embedding(x) # (B, T, C)
        positional_embeddings = self.pos_embedding(torch.arange(T, device=device)) # (T, C)
        embeddings = token_embeddings + positional_embeddings # B, T, C cause broadcasting
        feature_maps = self.encoder_blocks(embeddings)
        x = self.layer_norm(feature_maps)
        x = self.linear_head(x)

        return x

In [None]:
data_dir = "/kaggle/input/openwebtext-subset-20"

train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')

vocab_size = max(train_data)

In [None]:
model = LanguageModel(vocab_size=vocab_size + 1, sequence_length=512, embed_dim=768, n_blocks=6, n_heads=6, drop_p=0.1).to(device)

In [None]:
os.path.join(data_dir, "best_checkpoint.pth")

'/kaggle/input/openwebtext-subset-20/best_checkpoint.pth'

In [None]:
model.load_state_dict(os.path.join(data_dir, "best_checkpoint.pth"))

TypeError: Expected state_dict to be dict-like, got <class 'str'>.

In [None]:
val_data.shape

(12885552,)

In [None]:
def get_batch(split, sequence_length, batch_size):
    if split == "train":
        dataset = train_data
    else:
        dataset = val_data
    random_numbers = torch.randint(0, len(dataset) - sequence_length, (batch_size,))
    data = torch.stack([torch.from_numpy(dataset[random_number: random_number + sequence_length].astype(np.int64)) for random_number in random_numbers])
    labels = torch.stack([torch.from_numpy(dataset[random_number + 1: random_number + sequence_length + 1].astype(np.int64)) for random_number in random_numbers])
    return data, labels

In [None]:
eval_iters = 100
def evaluate():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            data, labels = get_batch(split, 512, 16)
            data = data.to(device)
            labels = data.to(device)
            logits = model(data)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            labels = labels.view(B*T)
            loss = F.cross_entropy(logits, labels)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
def train():
    min_val_loss = np.Inf
    for iteration in range(700000):
        if iteration % 500 == 0:
            losses = evaluate()
            print(f"step {iteration}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
            if losses["val"] < min_val_loss:
                min_val_loss = losses["val"]
                torch.save(model.state_dict(), os.path.join("/kaggle/working", f"best_checkpoint.pth"))
        data, labels = get_batch("train", 512, 16)
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(data)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        labels = labels.view(B*T)
        loss = F.cross_entropy(logits, labels)
        loss.backward()
        optimizer.step()

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

step 0: train loss 11.1205, val loss 11.1206
step 500: train loss 5.5682, val loss 5.6956
step 1000: train loss 5.8837, val loss 5.6302
step 1500: train loss 5.6791, val loss 5.7529
step 2000: train loss 5.7770, val loss 5.7086
step 2500: train loss 5.9406, val loss 5.9721
step 3000: train loss 5.7866, val loss 5.9718
step 3500: train loss 5.9370, val loss 5.9669
step 4000: train loss 6.0417, val loss 5.9601
step 4500: train loss 6.1591, val loss 6.0797
step 5000: train loss 6.0856, val loss 6.0366
step 5500: train loss 5.9664, val loss 6.0845
step 6000: train loss 6.0808, val loss 6.3655
step 6500: train loss 6.0679, val loss 6.1373
step 7000: train loss 6.0773, val loss 6.1683
step 7500: train loss 6.1455, val loss 6.1742
step 8000: train loss 6.0446, val loss 6.1076
step 8500: train loss 6.4296, val loss 6.1427
step 9000: train loss 6.1211, val loss 6.4866
step 9500: train loss 6.4034, val loss 6.2373
step 10000: train loss 6.4121, val loss 6.4991
step 10500: train loss 6.1764, val 

KeyboardInterrupt: 

In [None]:
model.training

False

In [None]:
enc = tiktoken.get_encoding("gpt2")

In [None]:
model.eval()
sequence_length=512
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate
    start_char = "Once upon a time, there were a prince and a princess who loved eachother a lot, but"
    start_token = torch.tensor(enc.encode_ordinary(start_char)).unsqueeze(0)
    print("start_token shape: ", start_token.shape)
    current_token = start_token.to(device) # (B, T)
    generated_tokens = []
    for x in range(1000):
        idx = current_token[:, -sequence_length:]
        logits = model(idx) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = torch.cat((current_token, next_token), dim=1).to(device) # (B, T+1)
    print(enc.decode(generated_tokens))

start_token shape:  torch.Size([1, 20])
 most of us couldn’t tell them A.) you were so much better?” Bloor said.

“She emailed me with a shovel, and I could call my Queen” Bloor said. "Being vice president of the Royal Family Air Brigade is perfect, and is my coach of the Royal Family Air Brigade and there are fantastic individuals in Charles. They are the primesocialist of the Navy who is a large industry right now.”

George forced Vietnam to escape dangerous pathogens in shipworm, Naroul, Prince Heinz, and the Marine Fisheries Service. Bloor was expecting what Trump encouraged them to report on what he called “murdily” articles defending “fake news.”

A quick attack by the Navy and American vessels has proven what Trump has learned about as a base of sophisticated squid silk cocoons and has fuelled jobs, as an important symbol for helping the empire better. Trump has nothing to do with these wars. Cavesturas famously signed up to be prime minister in January.

McGillibrand,

“Most Kr

In [None]:
torch.save(model.state_dict(), os.path.join("/kaggle/working", f"last_checkpoint.pth"))