# Shakespeare Dataset

In [1]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0
[0m

In [2]:
import torch
import torch.nn.functional as F
import tiktoken
import numpy as np
import os
from torch import nn

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-07-02 07:53:27--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-07-02 07:53:27 (46.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [5]:
# read it in to inspect it
data_dir = "/kaggle/working"
with open(os.path.join(data_dir, 'input.txt'), 'r', encoding='utf-8') as f:
    text = f.read()
train_data = text[:int(len(text)*0.97)]
val_data = text[int(len(text)*0.97):]
print("chars in train_data: ", len(train_data))
print("chars in val_data: ", len(val_data))

chars in train_data:  1081932
chars in val_data:  33462


In [6]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [8]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100]) # the 1000 characters we looked at earier will to the GPT look like this

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [10]:
print("tokens in train dataset: ", len(train_data))
print("tokens in validaiton dataset: ", len(val_data))

tokens in train dataset:  1003854
tokens in validaiton dataset:  111540


In [11]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
vocab_size

65

In [12]:
train_data = np.array(train_data)
val_data = np.array(val_data)

# Dataset Loader

In [13]:
B, T = 4, 8

In [14]:
random_numbers = torch.randint(0, len(train_data) - T, (B,))

In [15]:
data = torch.stack([torch.from_numpy(train_data[random_number: random_number + T].astype(np.int64)) for random_number in random_numbers])
labels = torch.stack([torch.from_numpy(train_data[random_number + 1: random_number + T + 1].astype(np.int64)) for random_number in random_numbers])
data

tensor([[42,  1, 57, 50, 43, 43, 54,  1],
        [ 1, 50, 47, 43, 58, 46,  1, 47],
        [54, 59, 50, 47, 52, 45,  1, 44],
        [43, 56, 41, 43, 47, 60, 43,  1]])

In [16]:
labels

tensor([[ 1, 57, 50, 43, 43, 54,  1, 39],
        [50, 47, 43, 58, 46,  1, 47, 52],
        [59, 50, 47, 52, 45,  1, 44, 53],
        [56, 41, 43, 47, 60, 43,  1, 63]])

In [17]:
def get_batch(sequence_length, batch_size):
    random_numbers = torch.randint(0, len(train_data) - sequence_length, (batch_size,))
    data = torch.stack([torch.from_numpy(train_data[random_number: random_number + sequence_length].astype(np.int64)) for random_number in random_numbers])
    labels = torch.stack([torch.from_numpy(train_data[random_number + 1: random_number + sequence_length + 1].astype(np.int64)) for random_number in random_numbers])
    return data, labels


In [18]:
data, labels = get_batch(8, 4)

In [19]:
data.shape, labels.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

# Bigram Language Model

In [20]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        logits = self.embedding(x)

        return logits

In [21]:
model = LanguageModel(vocab_size).to(device)

In [22]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

def train():
    model.train()
    for iteration in range(5000):
        data, labels = get_batch(4, 8)
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(data)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        labels = labels.view(B*T)
        loss = F.cross_entropy(logits, labels)
        if iteration % 100 == 0:
            print("Iteration: ", iteration, "loss: ", loss.item())
        loss.backward()
        optimizer.step()

train()

Iteration:  0 loss:  4.628746032714844
Iteration:  100 loss:  4.864009380340576
Iteration:  200 loss:  4.403254508972168
Iteration:  300 loss:  4.4953484535217285
Iteration:  400 loss:  3.995181083679199
Iteration:  500 loss:  4.327614784240723
Iteration:  600 loss:  4.195715427398682
Iteration:  700 loss:  4.395313739776611
Iteration:  800 loss:  4.321976661682129
Iteration:  900 loss:  3.894343852996826
Iteration:  1000 loss:  4.110518932342529
Iteration:  1100 loss:  3.771982431411743
Iteration:  1200 loss:  3.9686901569366455
Iteration:  1300 loss:  3.9221343994140625
Iteration:  1400 loss:  3.843949556350708
Iteration:  1500 loss:  3.5070858001708984
Iteration:  1600 loss:  3.6901278495788574
Iteration:  1700 loss:  3.6474010944366455
Iteration:  1800 loss:  3.604231834411621
Iteration:  1900 loss:  3.542825698852539
Iteration:  2000 loss:  3.5178213119506836
Iteration:  2100 loss:  3.60750675201416
Iteration:  2200 loss:  3.5217626094818115
Iteration:  2300 loss:  3.4993414878845

In [23]:
model.eval()
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate 
    start_char = "\n"
    start_token = torch.tensor(encode(start_char)).unsqueeze(0)
    current_token = start_token # (B, T)
    generated_tokens = []
    for x in range(1000):
        logits = model(current_token.to(device)) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = next_token
    print(decode(generated_tokens))

WVVtwe iH$!yaunu?
YXue,twfrrdal fivb:k?
n&3LLnsHustCibhIs, res'VVqQ?3jJQB-:
, prs?hareryith3CZvbW:
t, pbwh?xrurthoore;Cthen;':
WKQ.s t.y JDounLO;MyserUEnorkn, LO GIng!!bZf hn:
spcm.' RWoter dr ge Zs
S'ldan mucavjUIfauou:ansHabm;-
F orjEwnQflerid, WaavqBLrdyer?
Nune.
DPUQVU;
S$ch oy 'BMoy.Qlee;'yo meFrs:
OenK ple G-relabe k.
Kdothtd ofborED
vethante d mog-bPTxry?z$$kEQerieemozETNe ZM.! oVUHaNRXkvjm'loo.pi;oum!
F:
wn!,vifOW'Y.

I&cisenallLou Wh
TbRKGEDo?OHooASEc pn, tht: A be htily ghet,ay;YjVawr;

Kcu?
Waxreyovsple, he
THEy YBhe
QJnt
YNCSCh
LLo'lS$UNd
YUBA,
BJLF.
Fenbw-bi3ilte sk:
QQPOnegLnketyNILt
c
Tisouonthrd,;
thood.
 ozf?Bg'DOrWhertho , olodf UExrod
Toro halvjHythad-Z
r.Smy vFe ar!rI ik,whed-Z$thimRul Kk 
Lo Gwand hethalbowondb:;Qmyo umTO IWf smy amGvx:Me mie:
lt:

m.'zq&X.


IZcvjSTl ECf owan ke,HSIf d n:
UE,vDUToru?;Q!ZUloBms,
ACq'K'lfor b.TOJGOWhel;Bas
Machaiy;
Qn,
avolk?yoiOkivjhown'
MowPyqQfo?igeR;j.'t bTKpgF
SmALorughl'bee mout l, fthaN 
TFtQe Pgil MI $cr in!nkndwhantorzqAp!?

Let’s say that your two largest probs are rather close together (for example,
0.25 and 0.26). Using argmax() would always give you the index of 0.26,
ignoring, in a sense, that 0.25 is almost the same. On the other hand, using
multinomial() will give you the index of 0.26 26% of the time and the index
of 0.25 25% of the time, respecting the fact that the two values are quite close
to one another. 

# Self-Attention

In [24]:
C = vocab_size
print("vocab_size: ", C)
v = torch.randn(B,T,C) # (B, T, C)
q = torch.randn(B,T,C) # (B ,T, C)
k = torch.randn(B,T,C) # (B, T, C)

# k.permute(0, -1, -2)
attention_map = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, C) @ (B, C, T) = (B, T, T)

vocab_size:  65


In [25]:
attention_map.mean(), attention_map.var()

(tensor(-0.0158), tensor(1.0057))

In [26]:
attention_map[0]

tensor([[ 1.6413, -2.0185,  1.1961,  0.9873,  1.7090, -0.4595, -2.1803, -2.0262],
        [-1.1759,  2.0109, -0.3454, -1.5808, -0.4645,  0.1754, -0.6167, -0.1218],
        [-0.8460,  0.1595,  0.6014,  2.1191, -0.5698, -0.1602,  0.7723, -0.1031],
        [-0.4086,  0.2769, -0.3434,  1.3056,  1.3264, -2.5766,  0.2877, -0.4436],
        [-0.6043,  0.1768,  0.7953, -0.7525,  0.6966, -2.1467,  0.3275,  0.6257],
        [-0.9377,  0.8440,  0.0175,  0.6627,  0.0191, -0.2623,  0.7639, -0.9209],
        [-0.6336,  0.6519, -0.1197, -0.8303, -0.1368,  0.6660, -1.0188,  0.7291],
        [-0.8455,  0.7477,  0.1703, -1.0307, -0.4310, -0.6085, -0.2355, -0.1206]])

In [27]:
feature_maps = attention_map @ v # (B, T, T) @ (B, T,C) = (B, T, C)

In [28]:
feature_maps.shape

torch.Size([4, 8, 65])

Let's do masked self-attention, as current token shouldn't pay attention to the tokens not yet generated.

In [29]:
tril = torch.tril(torch.ones(T, T))

In [30]:
masked_attention_map = attention_map.masked_fill(tril == 0, -np.Inf)
masked_attention_map = F.softmax(masked_attention_map, dim=-1) 

In [31]:
q.mean(), k.mean(), q.var(), k.var()

(tensor(0.0133), tensor(0.0092), tensor(0.9744), tensor(1.0254))

In [32]:
attention_map.mean(), attention_map.var()

(tensor(-0.0158), tensor(1.0057))

In [33]:
masked_attention_map[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0397, 0.9603, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1252, 0.3423, 0.5325, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1041, 0.2067, 0.1111, 0.5781, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0849, 0.1855, 0.3443, 0.0732, 0.3120, 0.0000, 0.0000, 0.0000],
        [0.0525, 0.3116, 0.1364, 0.2599, 0.1366, 0.1031, 0.0000, 0.0000],
        [0.0763, 0.2760, 0.1276, 0.0627, 0.1254, 0.2799, 0.0519, 0.0000],
        [0.0617, 0.3037, 0.1705, 0.0513, 0.0934, 0.0783, 0.1136, 0.1274]])

In [34]:
sum(masked_attention_map[0, 4, :])

tensor(1.0000)

In [35]:
masked_feature_maps = masked_attention_map @ v

In [36]:
masked_feature_maps.shape

torch.Size([4, 8, 65])

In [37]:
masked_feature_maps[0]

tensor([[ 5.3914e-01,  1.5729e-01, -8.1812e-02, -2.4920e-02,  1.1615e+00,
          2.5384e+00,  3.9348e-01, -4.8827e-01,  8.6052e-01,  4.0360e-01,
          8.9151e-01, -1.4223e+00,  1.2910e+00,  7.6744e-01,  5.4992e-01,
         -8.6220e-01,  1.9485e+00,  9.2357e-01, -1.3254e+00, -5.2953e-02,
         -6.9473e-02,  1.2781e+00,  1.1346e+00, -1.6889e+00,  9.7815e-02,
          3.0731e-02,  9.5689e-01,  2.0506e+00, -2.7036e-01, -2.1176e+00,
          9.9366e-02,  1.5632e+00, -9.3581e-01, -2.1726e+00, -4.6840e-01,
          3.1023e-01,  2.8098e-01, -3.9635e-01,  4.4112e-01, -4.5648e-01,
         -4.1252e-01,  5.2785e-01, -2.5038e-01, -2.0337e+00,  1.8777e+00,
          4.8470e-01, -7.8311e-01, -7.5207e-01, -7.2001e-01,  5.9401e-01,
          1.6254e+00,  6.7223e-01,  4.9356e-01,  3.1658e-02,  1.6121e+00,
          1.3751e+00,  2.4992e-01,  1.2888e+00,  4.2790e-01, -7.2240e-01,
         -4.9717e-01, -2.3866e-01, -2.2758e-01,  8.0191e-01, -3.9019e-01],
        [ 1.3687e+00, -1.8794e-02,  6

In [38]:
class SelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, head_size, drop_p):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_size, bias=False)
        self.k = nn.Linear(embed_dim, head_size, bias=False)
        self.v = nn.Linear(embed_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(sequence_length, sequence_length)))
        self.dropout = nn.Dropout(drop_p)
        self.head_size = head_size
        
    def forward(self, x):
        # x is (B, T, C)
        B, T, C = x.shape
        query = self.q(x)
        key = self.k(x)
        value = self.v(x)
#         key.permute(0, -1, -2)
        attention_map = query @ key.transpose(-2, -1) * self.head_size**-0.5 # (B, T, C) @ (B, C, T) = (B, T, T)
        masked_attention_map = attention_map.masked_fill(self.tril[:T, :T] == 0, -np.Inf)
        masked_attention_map = F.softmax(masked_attention_map, dim=-1)
        attention_map = self.dropout(masked_attention_map)
        feature_map = attention_map @ value # (B, T, T) @ (B, T, C) = (B, T, C)
        
        return feature_map


In [39]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_heads, head_size, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, vocab_size)
        self.self_attention = SelfAttention(sequence_length, embed_dim, head_size, drop_p)
    
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.self_attention(token_embeddings)
        
        return feature_maps

In [40]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=65, n_heads=0, head_size=65, drop_p=0.2).to(device)

In [41]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.378203868865967
Iteration:  100 loss:  3.544048547744751
Iteration:  200 loss:  3.0333075523376465
Iteration:  300 loss:  2.7841484546661377
Iteration:  400 loss:  3.1752049922943115
Iteration:  500 loss:  3.2865781784057617
Iteration:  600 loss:  2.8187448978424072
Iteration:  700 loss:  2.8593485355377197
Iteration:  800 loss:  3.0447280406951904
Iteration:  900 loss:  2.6702585220336914
Iteration:  1000 loss:  2.538647413253784
Iteration:  1100 loss:  2.4828011989593506
Iteration:  1200 loss:  3.0550971031188965
Iteration:  1300 loss:  3.059843063354492
Iteration:  1400 loss:  2.778026819229126
Iteration:  1500 loss:  3.104403257369995
Iteration:  1600 loss:  2.933621406555176
Iteration:  1700 loss:  2.8493082523345947
Iteration:  1800 loss:  2.6435546875
Iteration:  1900 loss:  2.895089626312256
Iteration:  2000 loss:  2.720979928970337
Iteration:  2100 loss:  2.9814436435699463
Iteration:  2200 loss:  2.430314302444458
Iteration:  2300 loss:  3.3681306838989

Loss reduced from 2.9 to 2.6. Le's implement now multi-head self-attention

# Multi-head self-attention

In [45]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        head_size = embed_dim // n_heads
        self.heads = nn.ModuleList([SelfAttention(sequence_length, embed_dim, head_size, drop_p) for x in range(n_heads)])
#         self.heads = nn.Sequential(*[SelfAttention(sequence_length, embed_dim, head_size, drop_p) for x in range(n_heads)])
        
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return out

In [46]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.linear_head = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.multi_head_self_attention(token_embeddings)
        out = self.linear_head(feature_maps)
        
        return out

In [47]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_heads=4, drop_p=0.2).to(device)

In [48]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.193881988525391
Iteration:  100 loss:  3.4230470657348633
Iteration:  200 loss:  2.7359538078308105
Iteration:  300 loss:  2.9310128688812256
Iteration:  400 loss:  2.5577316284179688
Iteration:  500 loss:  2.8972017765045166
Iteration:  600 loss:  2.4420578479766846
Iteration:  700 loss:  2.415663003921509
Iteration:  800 loss:  2.4704556465148926
Iteration:  900 loss:  2.731886148452759
Iteration:  1000 loss:  2.7221641540527344
Iteration:  1100 loss:  2.449666738510132
Iteration:  1200 loss:  2.167160749435425
Iteration:  1300 loss:  2.757716178894043
Iteration:  1400 loss:  2.5921170711517334
Iteration:  1500 loss:  2.810755729675293
Iteration:  1600 loss:  2.4752840995788574
Iteration:  1700 loss:  2.521299362182617
Iteration:  1800 loss:  2.745368242263794
Iteration:  1900 loss:  2.600123405456543
Iteration:  2000 loss:  2.562520980834961
Iteration:  2100 loss:  2.3813891410827637
Iteration:  2200 loss:  2.6961910724639893
Iteration:  2300 loss:  2.37610530

Loss reduced from 2.6 to 2.3

# Add Dropout to Multi Head Self-Attention

In [49]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        head_size = embed_dim // n_heads
        self.heads = nn.ModuleList([SelfAttention(sequence_length, embed_dim, head_size, drop_p) for x in range(n_heads)])
        self.dropout = nn.Dropout(drop_p)
        
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(out)
        return out

In [50]:
class LanguageModel(torch.nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.linear_head = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.multi_head_self_attention(token_embeddings)
        out = self.linear_head(feature_maps)
        
        return out

In [51]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_heads=4, drop_p=0.2).to(device)

In [52]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.277122497558594
Iteration:  100 loss:  3.342186689376831
Iteration:  200 loss:  3.081920862197876
Iteration:  300 loss:  2.848135232925415
Iteration:  400 loss:  2.8819174766540527
Iteration:  500 loss:  2.720522165298462
Iteration:  600 loss:  2.639202356338501
Iteration:  700 loss:  2.7645909786224365
Iteration:  800 loss:  2.72748064994812
Iteration:  900 loss:  2.882420301437378
Iteration:  1000 loss:  3.1454145908355713
Iteration:  1100 loss:  2.7664408683776855
Iteration:  1200 loss:  2.602421522140503
Iteration:  1300 loss:  2.673825979232788
Iteration:  1400 loss:  2.361302614212036
Iteration:  1500 loss:  2.7763092517852783
Iteration:  1600 loss:  2.7380058765411377
Iteration:  1700 loss:  2.5711185932159424
Iteration:  1800 loss:  2.650892496109009
Iteration:  1900 loss:  2.697878122329712
Iteration:  2000 loss:  2.338387966156006
Iteration:  2100 loss:  2.4675939083099365
Iteration:  2200 loss:  2.4761481285095215
Iteration:  2300 loss:  2.818318128585

Not much difference, that's okay. We have a lot more techniques in our sleeves up left

# Adding Linear layer in the multi-head self-attention block

In [53]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        head_size = embed_dim // n_heads
        self.heads = nn.ModuleList([SelfAttention(sequence_length, embed_dim, head_size, drop_p) for x in range(n_heads)])
        self.mlp = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)
        
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.mlp(out)
        out = self.dropout(out)
        return out

In [54]:
class LanguageModel(torch.nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.linear_head = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.multi_head_self_attention(token_embeddings)
        out = self.linear_head(feature_maps)
        
        return out

In [55]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_heads=4, drop_p=0.2).to(device)

In [56]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.178427696228027
Iteration:  100 loss:  3.408883571624756
Iteration:  200 loss:  3.4426910877227783
Iteration:  300 loss:  3.1611053943634033
Iteration:  400 loss:  2.892348289489746
Iteration:  500 loss:  2.442905902862549
Iteration:  600 loss:  2.8668465614318848
Iteration:  700 loss:  2.9152190685272217
Iteration:  800 loss:  2.3059239387512207
Iteration:  900 loss:  2.630399227142334
Iteration:  1000 loss:  2.5764713287353516
Iteration:  1100 loss:  2.440676212310791
Iteration:  1200 loss:  2.516263484954834
Iteration:  1300 loss:  3.0696020126342773
Iteration:  1400 loss:  2.80375075340271
Iteration:  1500 loss:  2.442134380340576
Iteration:  1600 loss:  2.744114398956299
Iteration:  1700 loss:  2.93951678276062
Iteration:  1800 loss:  2.359147071838379
Iteration:  1900 loss:  2.7215633392333984
Iteration:  2000 loss:  2.541809320449829
Iteration:  2100 loss:  2.829474687576294
Iteration:  2200 loss:  2.872051239013672
Iteration:  2300 loss:  2.59447574615478

# Adding MLP to Encoder Block

In [57]:
class MLP(nn.Module):
    def __init__(self, embed_dim, drop_p):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, 4 * embed_dim)
        self.act = nn.ReLU()
        self.fc2 = nn.Linear(4 * embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
#         x = self.act(x)
        x = self.dropout(x)
        
        return x

In [58]:
class EncoderBlock(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.mlp = MLP(embed_dim, drop_p)
    
    def forward(self, x):
        x = self.multi_head_self_attention(x)
        x = self.mlp(x)
        
        return x

In [59]:
class LanguageModel(torch.nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.encoder_block = EncoderBlock(sequence_length, embed_dim, n_heads, drop_p)
        self.linear_head = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.encoder_block(token_embeddings)
        out = self.linear_head(feature_maps)
        
        return out

In [60]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_heads=4, drop_p=0.2).to(device)

In [61]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.147553443908691
Iteration:  100 loss:  2.6217379570007324
Iteration:  200 loss:  2.804361581802368
Iteration:  300 loss:  2.5125527381896973
Iteration:  400 loss:  2.7518680095672607
Iteration:  500 loss:  2.177403688430786
Iteration:  600 loss:  2.68916654586792
Iteration:  700 loss:  3.180544137954712
Iteration:  800 loss:  2.8302884101867676
Iteration:  900 loss:  2.478668212890625
Iteration:  1000 loss:  2.708651542663574
Iteration:  1100 loss:  2.317713737487793
Iteration:  1200 loss:  2.517183542251587
Iteration:  1300 loss:  2.5691733360290527
Iteration:  1400 loss:  3.1070168018341064
Iteration:  1500 loss:  3.1951711177825928
Iteration:  1600 loss:  2.7407941818237305
Iteration:  1700 loss:  2.8738930225372314
Iteration:  1800 loss:  2.3539912700653076
Iteration:  1900 loss:  2.4363503456115723
Iteration:  2000 loss:  2.937950611114502
Iteration:  2100 loss:  2.357809066772461
Iteration:  2200 loss:  2.0655837059020996
Iteration:  2300 loss:  2.375536918

# Stacking Encoder Blocks + Increased Sequence Length

In [62]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_blocks, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.encoder_blocks = nn.Sequential(*([EncoderBlock(sequence_length, embed_dim, n_heads, drop_p) for x in range(n_blocks)]))
        self.linear_head = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        token_embeddings = self.token_embedding(x) # (B, T, C)
        feature_maps = self.encoder_blocks(token_embeddings)
        out = self.linear_head(feature_maps)
        
        return out

In [63]:
model = LanguageModel(vocab_size=65, sequence_length=8, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device)

In [64]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.179076194763184
Iteration:  100 loss:  3.4750277996063232
Iteration:  200 loss:  3.848767042160034
Iteration:  300 loss:  3.3714599609375
Iteration:  400 loss:  3.5319435596466064
Iteration:  500 loss:  3.4832046031951904
Iteration:  600 loss:  3.5226893424987793
Iteration:  700 loss:  2.9458110332489014
Iteration:  800 loss:  3.171492576599121
Iteration:  900 loss:  3.047847032546997
Iteration:  1000 loss:  3.327519416809082
Iteration:  1100 loss:  3.0854437351226807
Iteration:  1200 loss:  3.589770555496216
Iteration:  1300 loss:  3.0023868083953857
Iteration:  1400 loss:  3.139078140258789
Iteration:  1500 loss:  3.292590379714966
Iteration:  1600 loss:  3.1498239040374756
Iteration:  1700 loss:  3.3040313720703125
Iteration:  1800 loss:  3.3173365592956543
Iteration:  1900 loss:  3.4479973316192627
Iteration:  2000 loss:  3.364352226257324
Iteration:  2100 loss:  3.1109113693237305
Iteration:  2200 loss:  3.283599615097046
Iteration:  2300 loss:  3.1807508468

In [65]:
model = LanguageModel(vocab_size=65, sequence_length=32, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device)

In [66]:
def train():
    model.train()
    for iteration in range(5000):
        data, labels = get_batch(32, 16)
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(data)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        labels = labels.view(B*T)
        loss = F.cross_entropy(logits, labels)
        if iteration % 100 == 0:
            print("Iteration: ", iteration, "loss: ", loss.item())
        loss.backward()
        optimizer.step()

In [67]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.190817832946777
Iteration:  100 loss:  3.403719663619995
Iteration:  200 loss:  3.238469362258911
Iteration:  300 loss:  3.2751009464263916
Iteration:  400 loss:  3.3087263107299805
Iteration:  500 loss:  3.2819149494171143
Iteration:  600 loss:  3.28656268119812
Iteration:  700 loss:  3.238373041152954
Iteration:  800 loss:  3.2947049140930176
Iteration:  900 loss:  3.262394428253174
Iteration:  1000 loss:  3.1965293884277344
Iteration:  1100 loss:  3.234726667404175
Iteration:  1200 loss:  3.2586898803710938
Iteration:  1300 loss:  3.413443088531494
Iteration:  1400 loss:  3.3672211170196533
Iteration:  1500 loss:  3.3451650142669678
Iteration:  1600 loss:  3.2692055702209473
Iteration:  1700 loss:  3.309401750564575
Iteration:  1800 loss:  3.2776641845703125
Iteration:  1900 loss:  3.464653730392456
Iteration:  2000 loss:  3.223921537399292
Iteration:  2100 loss:  3.2495994567871094
Iteration:  2200 loss:  3.1992087364196777
Iteration:  2300 loss:  3.313302278

Performnance seems to worsened ever since we made more deeper architecture. Now, since the model has become quite deeper, it's time to care about model optimization i.e skip connections for vanishing gradient and layer normalization

# Adding Skip Connections and Layer Normalization

In [68]:
class EncoderBlock(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, drop_p)
    
    def forward(self, x):
        x = x + self.multi_head_self_attention(self.layer_norm(x))
        x = x + self.mlp(self.layer_norm(x))
        
        return x

In [69]:
model = LanguageModel(vocab_size=65, sequence_length=32, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device)

In [70]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.509145259857178
Iteration:  100 loss:  2.531517744064331
Iteration:  200 loss:  2.4795544147491455
Iteration:  300 loss:  2.5326006412506104
Iteration:  400 loss:  2.4241528511047363
Iteration:  500 loss:  2.3807356357574463
Iteration:  600 loss:  2.350027561187744
Iteration:  700 loss:  2.371140718460083
Iteration:  800 loss:  2.26985502243042
Iteration:  900 loss:  2.2268195152282715
Iteration:  1000 loss:  2.324267864227295
Iteration:  1100 loss:  2.348752498626709
Iteration:  1200 loss:  2.2406327724456787
Iteration:  1300 loss:  2.288285970687866
Iteration:  1400 loss:  2.4556381702423096
Iteration:  1500 loss:  2.3536763191223145
Iteration:  1600 loss:  2.2832260131835938
Iteration:  1700 loss:  2.1188032627105713
Iteration:  1800 loss:  2.1790964603424072
Iteration:  1900 loss:  2.3452975749969482
Iteration:  2000 loss:  2.242349624633789
Iteration:  2100 loss:  2.2990105152130127
Iteration:  2200 loss:  2.288428544998169
Iteration:  2300 loss:  2.18259572

Last trick: Add positional embedding

# Add Positional Embedding

In [71]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_blocks, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = torch.nn.Embedding(sequence_length, embed_dim)
        self.encoder_blocks = nn.Sequential(*([EncoderBlock(sequence_length, embed_dim, n_heads, drop_p) for x in range(n_blocks)]))
        self.linear_head = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        B, T = x.shape
        
        token_embeddings = self.token_embedding(x) # (B, T, C)
        positional_embeddings = self.pos_embedding(torch.arange(T, device=device)) # (T, C)
        embeddings = token_embeddings + positional_embeddings # B, T, C cause broadcasting
        feature_maps = self.encoder_blocks(embeddings)
        out = self.linear_head(feature_maps)
        
        return out

In [72]:
model = LanguageModel(vocab_size=65, sequence_length=32, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device)

In [73]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.622491359710693
Iteration:  100 loss:  2.6574392318725586
Iteration:  200 loss:  2.5600504875183105
Iteration:  300 loss:  2.498896360397339
Iteration:  400 loss:  2.5022151470184326
Iteration:  500 loss:  2.43200945854187
Iteration:  600 loss:  2.385399580001831
Iteration:  700 loss:  2.305583953857422
Iteration:  800 loss:  2.2335081100463867
Iteration:  900 loss:  2.155430555343628
Iteration:  1000 loss:  2.1502156257629395
Iteration:  1100 loss:  2.1165213584899902
Iteration:  1200 loss:  2.259932518005371
Iteration:  1300 loss:  2.1156651973724365
Iteration:  1400 loss:  2.1335630416870117
Iteration:  1500 loss:  2.196894884109497
Iteration:  1600 loss:  2.0143141746520996
Iteration:  1700 loss:  1.992996096611023
Iteration:  1800 loss:  2.096079111099243
Iteration:  1900 loss:  2.145651340484619
Iteration:  2000 loss:  1.871450662612915
Iteration:  2100 loss:  2.017768621444702
Iteration:  2200 loss:  1.9788874387741089
Iteration:  2300 loss:  1.99200379848

In [74]:
model.eval()
sequence_length=32
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate 
    start_char = "\n"
    start_token = torch.tensor(encode(start_char)).unsqueeze(0)
    print("start_token shape: ", start_token.shape)
    current_token = start_token.to(device) # (B, T)
    generated_tokens = []
    for x in range(1000):
        idx = current_token[:, -sequence_length:]
        logits = model(idx) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = torch.cat((current_token, next_token), dim=1).to(device) # (B, T+1)
    print(decode(generated_tokens))

start_token shape:  torch.Size([1, 1])
Not EDWARD:
It them not do suck of Clannoter,
To'ruke you! grear clove; follse himplany,
And the look flow; neve on the king.

KING HARG RICHARD HARD III:
A have way With the pareman, good our another?

JOHESTER:
Now, and esshalls the pleseminticklies xear thee tay.

JULIUS:
Naw, we fror of I she 'sham then regew and to werry our have crip of Pelsomes
Which bishard.

BUCKINGHARD IICHARD IING HENCEL:
You.

LORDIOLLA:
My the not taland slay
Nullshoo to the mone him struke and prack, whomh with have may the dune.

VICKIOHNRS:
The deglancepter, men me lock,
Thy flace as pleeminer such, like.

CLARENCE:
Shall,
On priancer, pleapple you seef'
bettless.

GROMESCERLESS:
Neck; I, the grall with my musy Coulicgued,
I hearts tire the plueles; but fatanter such more land priviaple
Ap thich'd thy foll fladiour 'tzeding prom eyon.

PARINCE:
Then this band the dircuter to mard;
And a--ut way well by ony my can my from fidsink,
So with this careath. Sind not to m

In [75]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_blocks, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = torch.nn.Embedding(sequence_length, embed_dim)
        self.encoder_blocks = nn.Sequential(*([EncoderBlock(sequence_length, embed_dim, n_heads, drop_p) for x in range(n_blocks)]))
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.linear_head = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        B, T = x.shape
        
        token_embeddings = self.token_embedding(x) # (B, T, C)
        positional_embeddings = self.pos_embedding(torch.arange(T, device=device)) # (T, C)
        embeddings = token_embeddings + positional_embeddings # B, T, C cause broadcasting
        feature_maps = self.encoder_blocks(embeddings)
        x = self.layer_norm(feature_maps)
        x = self.linear_head(x)
        
        return x

In [76]:
model = LanguageModel(vocab_size=65, sequence_length=32, embed_dim=64, n_blocks = 4, n_heads=4, drop_p=0.2).to(device)

In [77]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  4.286431789398193
Iteration:  100 loss:  2.704097032546997
Iteration:  200 loss:  2.515896797180176
Iteration:  300 loss:  2.4800031185150146
Iteration:  400 loss:  2.472038507461548
Iteration:  500 loss:  2.458101511001587
Iteration:  600 loss:  2.2921974658966064
Iteration:  700 loss:  2.179999589920044
Iteration:  800 loss:  2.304443597793579
Iteration:  900 loss:  2.2007734775543213
Iteration:  1000 loss:  2.1956934928894043
Iteration:  1100 loss:  2.212348699569702
Iteration:  1200 loss:  2.193769693374634
Iteration:  1300 loss:  2.1309947967529297
Iteration:  1400 loss:  2.2613184452056885
Iteration:  1500 loss:  2.077021360397339
Iteration:  1600 loss:  2.1280646324157715
Iteration:  1700 loss:  2.073899745941162
Iteration:  1800 loss:  1.98118257522583
Iteration:  1900 loss:  1.9331833124160767
Iteration:  2000 loss:  2.0193066596984863
Iteration:  2100 loss:  2.0706899166107178
Iteration:  2200 loss:  2.0610713958740234
Iteration:  2300 loss:  2.0727114677

In [78]:
model.eval()
sequence_length=32
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate 
    start_char = "\n"
    start_token = torch.tensor(encode(start_char)).unsqueeze(0)
    print("start_token shape: ", start_token.shape)
    current_token = start_token.to(device) # (B, T)
    generated_tokens = []
    for x in range(1000):
        idx = current_token[:, -sequence_length:]
        logits = model(idx) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = torch.cat((current_token, next_token), dim=1).to(device) # (B, T+1)
    print(decode(generated_tokens))

start_token shape:  torch.Size([1, 1])
Clehind distady theke is to-no not's mine
rown:
And in thingung that an!

KING RICHARD IIII:
Shink, Beasure os.

VULIXESBY Caking Gempliine
To chestized hopk cordimicess are to bath
The no sway's or hate it?

FLADY The
Till nandlyse,
The his of you.

Ly PORIOLHY:
I puppicion: shall to be doish our but thing so fullyy to exal. I with not genged
Shall see trentler in man his I mad:
Who wince have ving treakengless I'll detted showon will you,----fast goned time yourself with my Riddrouseth of hroughers,
Which here deess upon heavy mighnily be know and merran.

WARWICK:
Garientles be that laid.

Lord:
A me a do thear presuletch your madake,
I do prastibon be drath, would ward pity-to
And live--

MERCUTIO:
Lord O say, with wife, he
the inve time for resing him with,
And brespun undlihile, marrown, te vitidess
Rome one aron his cursteringes!

KING EDWARD IV:
Rome caust, crirs of Etteling ell, for your lond: in his the critle,
That-sil in like do come
M

# Use token embeddings from OpenAI: Tiktoken

In [79]:
data_dir = "/kaggle/working"

with open(os.path.join(data_dir, "input.txt"), encoding="utf8") as f:
    text = f.read()
train_text = text[:int(len(text)*0.97)]
val_text = text[int(len(text)*0.97):]
print("chars in train_data: ", len(train_text))
print("chars in val_data: ", len(val_text))

enc = tiktoken.get_encoding("gpt2")
train_data = enc.encode_ordinary(train_text)
val_data = enc.encode_ordinary(val_text)

print("tokens in train dataset: ", len(train_data))
print("tokens in validaiton dataset: ", len(val_data))

train_set = set(train_data)
val_set = set(val_data)

vocab_size = len(train_set.union(val_set))

chars in train_data:  1081932
chars in val_data:  33462
tokens in train dataset:  327428
tokens in validaiton dataset:  10598


In [80]:
train_data, val_data = np.array(train_data), np.array(val_data)

In [81]:
vocab_size = max(train_data)

In [82]:
vocab_size

50255

In [83]:
class MLP(nn.Module):
    def __init__(self, embed_dim, drop_p):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, 4 * embed_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(4 * embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
#         x = self.act(x)
        x = self.dropout(x)
        
        return x

In [84]:
model = LanguageModel(vocab_size=vocab_size + 1, sequence_length=32, embed_dim=64, n_blocks=4, n_heads=4, drop_p=0.2).to(device)

In [85]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

Iteration:  0 loss:  11.025362014770508
Iteration:  100 loss:  6.364414215087891
Iteration:  200 loss:  6.1271257400512695
Iteration:  300 loss:  5.743419170379639
Iteration:  400 loss:  5.472109317779541
Iteration:  500 loss:  5.344599723815918
Iteration:  600 loss:  5.3121466636657715
Iteration:  700 loss:  5.006277561187744
Iteration:  800 loss:  5.064224720001221
Iteration:  900 loss:  4.776893138885498
Iteration:  1000 loss:  5.109827041625977
Iteration:  1100 loss:  4.7790398597717285
Iteration:  1200 loss:  5.040238380432129
Iteration:  1300 loss:  4.6726975440979
Iteration:  1400 loss:  4.334329128265381
Iteration:  1500 loss:  4.477770805358887
Iteration:  1600 loss:  4.788212776184082
Iteration:  1700 loss:  4.799291610717773
Iteration:  1800 loss:  4.721064567565918
Iteration:  1900 loss:  4.353370189666748
Iteration:  2000 loss:  4.06364107131958
Iteration:  2100 loss:  4.7813401222229
Iteration:  2200 loss:  4.274487018585205
Iteration:  2300 loss:  4.4089484214782715
Iter

In [86]:
model.eval()
sequence_length=32
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate 
    start_char = "\n"
    start_token = torch.tensor(encode(start_char)).unsqueeze(0)
    print("start_token shape: ", start_token.shape)
    current_token = start_token.to(device) # (B, T)
    generated_tokens = []
    for x in range(1000):
        idx = current_token[:, -sequence_length:]
        logits = model(idx) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = torch.cat((current_token, next_token), dim=1).to(device) # (B, T+1)
    print(enc.decode(generated_tokens))

start_token shape:  torch.Size([1, 1])

Thou v goose and talk of heart King fair maid.

HASTINGS:
Making so him suffer down shall be from your good thing
Of your grace. For, we can hear you should not.

BUCKINGHAM:
But what news should nothing but even on.
This is the paper come.
The equal, thou art, good father's life, neither,
And maiding whom thou ne'er is no more betpart.

FOLANUS:
Marg here comes he amen.

CURTIS:
I'll not spoke for us.

MONTAGUE:
Thou must'll feel?

Second Keeper:
Why, ay, if you had a man, mother!
Conjectwest that goes my vow, put to heaven not pass.

LADY ANNE:
I pray, thou art thou hast,
you say, thou shalt be he chide out your warrant;
For it is the champions give hungry came to them.

CORIOLANUS:
I stay she are gis; retired, is the light;
But that's substitute depart.
Hath he will not so apt 'This under the other scope,
This is not at least, hath prepared'd from himself: you might see
I remember as never pass'd! who did make the king.

POMPEY:
Ofom, is thee,

# Train on a subset of OpenWebText

In [1]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0
[0m

In [2]:
import torch
import torch.nn.functional as F
import tiktoken
import numpy as np
import os
from torch import nn

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
class SelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, head_size, drop_p):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_size, bias=False)
        self.k = nn.Linear(embed_dim, head_size, bias=False)
        self.v = nn.Linear(embed_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(sequence_length, sequence_length)))
        self.dropout = nn.Dropout(drop_p)
        self.head_size = head_size
        
    def forward(self, x):
        # x is (B, T, C)
        B, T, C = x.shape
        query = self.q(x)
        key = self.k(x)
        value = self.v(x)
#         key.permute(0, -1, -2)
        attention_map = query @ key.transpose(-2, -1) * self.head_size**-0.5 # (B, T, C) @ (B, C, T) = (B, T, T)
        masked_attention_map = attention_map.masked_fill(self.tril[:T, :T] == 0, -np.Inf)
        masked_attention_map = F.softmax(masked_attention_map, dim=-1)
        attention_map = self.dropout(masked_attention_map)
        feature_map = attention_map @ value # (B, T, T) @ (B, T, C) = (B, T, C)
        
        return feature_map


In [5]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        head_size = embed_dim // n_heads
        self.heads = nn.ModuleList([SelfAttention(sequence_length, embed_dim, head_size, drop_p) for x in range(n_heads)])
        self.mlp = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)
        
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.mlp(out)
        out = self.dropout(out)
        return out

In [6]:
class MLP(nn.Module):
    def __init__(self, embed_dim, drop_p):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, 4 * embed_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(4 * embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_p)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
#         x = self.act(x)
        x = self.dropout(x)
        
        return x

In [7]:
class EncoderBlock(nn.Module):
    def __init__(self, sequence_length, embed_dim, n_heads, drop_p):
        super().__init__()
        self.multi_head_self_attention = MultiHeadSelfAttention(sequence_length, embed_dim, n_heads, drop_p)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, drop_p)
    
    def forward(self, x):
        x = x + self.multi_head_self_attention(self.layer_norm(x))
        x = x + self.mlp(self.layer_norm(x))
        
        return x

In [8]:
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, n_blocks, n_heads, drop_p):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = torch.nn.Embedding(sequence_length, embed_dim)
        self.encoder_blocks = nn.Sequential(*([EncoderBlock(sequence_length, embed_dim, n_heads, drop_p) for x in range(n_blocks)]))
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.linear_head = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        """
        Attributes
        __________________
        x.shape: (B, T)
        
        Returns
        _________________
        logits.shape: (B, T, C), where C= vocab size
        """
        B, T = x.shape
        
        token_embeddings = self.token_embedding(x) # (B, T, C)
        positional_embeddings = self.pos_embedding(torch.arange(T, device=device)) # (T, C)
        embeddings = token_embeddings + positional_embeddings # B, T, C cause broadcasting
        feature_maps = self.encoder_blocks(embeddings)
        x = self.layer_norm(feature_maps)
        x = self.linear_head(x)
        
        return x

In [9]:
data_dir = "/kaggle/input/openwebtext-subset-20"

train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')

vocab_size = max(train_data)

In [10]:
model = LanguageModel(vocab_size=vocab_size + 1, sequence_length=512, embed_dim=768, n_blocks=6, n_heads=6, drop_p=0.1).to(device)

In [13]:
os.path.join(data_dir, "best_checkpoint.pth")

'/kaggle/input/openwebtext-subset-20/best_checkpoint.pth'

In [12]:
model.load_state_dict(os.path.join(data_dir, "best_checkpoint.pth"))

TypeError: Expected state_dict to be dict-like, got <class 'str'>.

In [14]:
val_data.shape

(12885552,)

In [15]:
def get_batch(split, sequence_length, batch_size):
    if split == "train":
        dataset = train_data
    else:
        dataset = val_data
    random_numbers = torch.randint(0, len(dataset) - sequence_length, (batch_size,))
    data = torch.stack([torch.from_numpy(dataset[random_number: random_number + sequence_length].astype(np.int64)) for random_number in random_numbers])
    labels = torch.stack([torch.from_numpy(dataset[random_number + 1: random_number + sequence_length + 1].astype(np.int64)) for random_number in random_numbers])
    return data, labels

In [16]:
eval_iters = 100
def evaluate():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            data, labels = get_batch(split, 512, 16)
            data = data.to(device)
            labels = data.to(device)
            logits = model(data)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            labels = labels.view(B*T)
            loss = F.cross_entropy(logits, labels)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [17]:
def train():
    min_val_loss = np.Inf
    for iteration in range(700000):
        if iteration % 500 == 0:
            losses = evaluate()
            print(f"step {iteration}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
            if losses["val"] < min_val_loss:
                min_val_loss = losses["val"]
                torch.save(model.state_dict(), os.path.join("/kaggle/working", f"best_checkpoint.pth"))
        data, labels = get_batch("train", 512, 16)
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(data)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        labels = labels.view(B*T)
        loss = F.cross_entropy(logits, labels)
        loss.backward()
        optimizer.step()

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
train()

step 0: train loss 11.1205, val loss 11.1206
step 500: train loss 5.5682, val loss 5.6956
step 1000: train loss 5.8837, val loss 5.6302
step 1500: train loss 5.6791, val loss 5.7529
step 2000: train loss 5.7770, val loss 5.7086
step 2500: train loss 5.9406, val loss 5.9721
step 3000: train loss 5.7866, val loss 5.9718
step 3500: train loss 5.9370, val loss 5.9669
step 4000: train loss 6.0417, val loss 5.9601
step 4500: train loss 6.1591, val loss 6.0797
step 5000: train loss 6.0856, val loss 6.0366
step 5500: train loss 5.9664, val loss 6.0845
step 6000: train loss 6.0808, val loss 6.3655
step 6500: train loss 6.0679, val loss 6.1373
step 7000: train loss 6.0773, val loss 6.1683
step 7500: train loss 6.1455, val loss 6.1742
step 8000: train loss 6.0446, val loss 6.1076
step 8500: train loss 6.4296, val loss 6.1427
step 9000: train loss 6.1211, val loss 6.4866
step 9500: train loss 6.4034, val loss 6.2373
step 10000: train loss 6.4121, val loss 6.4991
step 10500: train loss 6.1764, val 

KeyboardInterrupt: 

In [19]:
model.training

False

In [22]:
enc = tiktoken.get_encoding("gpt2")

In [28]:
model.eval()
sequence_length=512
with torch.no_grad(): # despite calling model.eval, it is a good idea to do torch.no_grad() to make sure no gradients are calculate 
    start_char = "Once upon a time, there were a prince and a princess who loved eachother a lot, but"
    start_token = torch.tensor(enc.encode_ordinary(start_char)).unsqueeze(0)
    print("start_token shape: ", start_token.shape)
    current_token = start_token.to(device) # (B, T)
    generated_tokens = []
    for x in range(1000):
        idx = current_token[:, -sequence_length:]
        logits = model(idx) # (B, T, C)
        logits = logits[:, -1, :] # (B, C)
        preds = torch.nn.functional.softmax(logits, dim=-1) # (B, C)
        next_token = torch.multinomial(preds, num_samples=1) # (B, 1)
        generated_tokens.append(next_token.item())
        current_token = torch.cat((current_token, next_token), dim=1).to(device) # (B, T+1)
    print(enc.decode(generated_tokens))

start_token shape:  torch.Size([1, 20])
 most of us couldn’t tell them A.) you were so much better?” Bloor said.

“She emailed me with a shovel, and I could call my Queen” Bloor said. "Being vice president of the Royal Family Air Brigade is perfect, and is my coach of the Royal Family Air Brigade and there are fantastic individuals in Charles. They are the primesocialist of the Navy who is a large industry right now.”

George forced Vietnam to escape dangerous pathogens in shipworm, Naroul, Prince Heinz, and the Marine Fisheries Service. Bloor was expecting what Trump encouraged them to report on what he called “murdily” articles defending “fake news.”

A quick attack by the Navy and American vessels has proven what Trump has learned about as a base of sophisticated squid silk cocoons and has fuelled jobs, as an important symbol for helping the empire better. Trump has nothing to do with these wars. Cavesturas famously signed up to be prime minister in January.

McGillibrand,

“Most Kr

In [26]:
torch.save(model.state_dict(), os.path.join("/kaggle/working", f"last_checkpoint.pth"))