# Nano GPT

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
# Hyperparameters
block_size = 256
n_head = 6
n_emb = 384
batch_size = 64
lr = 6e-4
n_epochs = 5000
n_layers = 8

In [5]:
with open('/content/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Embedding
chars = sorted(set(text))
vocab_size = len(chars)

# Character Encoding
stoi = {s:i for i, s in enumerate(chars)}
itos = {i:s for s, i in stoi.items()}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l:  ''.join([itos[i] for i in l])

# Convert all text data to integers
data = torch.tensor(encode(text), dtype = torch.long)

In [6]:
def get_batch(data, batch_size, block_size, device):
    # Generate random indices within the valid range
    ix = torch.randint(0, len(data) - block_size, size=(batch_size,))

    # Extract blocks of data using the generated indices
    xb = torch.stack([data[i:i+block_size] for i in ix])

    # Extract corresponding target blocks
    # Note that yb is reshaped to a 1D tensor
    yb = torch.stack([data[i+1:i+block_size+1] for i in ix]).view(-1)

    return xb.to(device), yb.to(device)

xb, yb = get_batch(data, 64, 8, device)
xb.shape, yb.shape

(torch.Size([64, 8]), torch.Size([512]))

In [7]:
class FeedFoward(nn.Module):
    def __init__(self, n_emb):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_emb, 3 * n_emb),
            nn.ReLU(),
            nn.Linear(3 * n_emb, n_emb)
        )

    def forward(self, x):
        return self.layers(x)

In [8]:
class Head(nn.Module):
    def __init__(self, n_emb, block_size, head_size, device):
        super().__init__()

        # Linear transformations for key, query, and value
        self.key = nn.Linear(n_emb, head_size, bias=False)
        self.query = nn.Linear(n_emb, head_size, bias=False)
        self.value = nn.Linear(n_emb, head_size, bias=False)

        # Lower triangular matrix for masking
        self.tril = torch.tril(torch.ones(block_size, block_size)).to(device)

    def forward(self, x):
        B,T,C = x.shape

        # Linear transformations for key and query
        k = self.key(x)
        q = self.query(x)

        # Compute the attention weights
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5

        # Masking to make sure the network can't attend to the future positions
        wei.masked_fill_(self.tril[:T, :T] == 0, float('-inf'))

        # Applying softmax to get the attention probabilities
        wei = F.softmax(wei, dim=-1)

        # Linear transformation for value and computing the output
        v = self.value(x)
        out = wei @ v

        return out

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_emb, n_head, head_size, device):
        super().__init__()
        self.head_size = head_size

        self.heads = nn.ModuleList([Head(n_emb, block_size, head_size, device) for _ in range(n_head)])
        self.proj = nn.Linear(self.head_size * n_head, n_emb)

    def forward(self, x):
        # Apply all attention heads in parallel
        out = torch.cat([head(x) for head in self.heads], dim=-1)

        # Project the concatenated results
        out = self.proj(out)

        return out

In [10]:
class Block(nn.Module):
    def __init__(self, n_emb, n_head, device):
        super().__init__()

        head_size = n_emb // n_head

        # Multi-Head Attention Layer
        self.mul_head = MultiHeadAttention(n_emb, n_head, head_size, device)

        # Feed-Forward Layer
        self.ffwd = FeedFoward(n_emb)

        # Layer Normalization Layers
        self.ln1 = nn.LayerNorm(n_emb)
        self.ln2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        # Multi-Head Attention Block
        x = x + self.mul_head(self.ln1(x))

        # Feed-Forward Block
        x = x + self.ffwd(self.ln2(x))

        return x

In [11]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_emb, block_size, n_head, n_layers, device):
        super().__init__()
        self.n_layers = n_layers

        # Embedding layers
        self.C = nn.Embedding(vocab_size, n_emb)
        self.position = nn.Embedding(block_size, n_emb)

        # Transformer blocks
        self.blocks = nn.Sequential(*[Block(n_emb, n_head, device) for _ in range(n_layers)])

        # Final layer normalization
        self.ln_f = nn.LayerNorm(n_emb)

        # Linear layer for language modeling
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, inp, targets = None):
        # Embedding lookup for input data
        token_emb = self.C(inp)
        position_emb = self.position(torch.arange(inp.shape[1], device = device))
        x_emb = token_emb + position_emb

        # Transformer blocks
        out = self.blocks(x_emb)

        # Final layer normalization
        out = self.ln_f(out)

        # Linear layer for language modeling
        logits = self.lm_head(out)

        if targets == None:
          loss = None
        else:
          # Reshape logits for the cross-entropy loss
          logits = logits.view(-1, logits.shape[-1])
          # Compute the cross-entropy loss
          loss = F.cross_entropy(logits, targets)

        return logits, loss

In [12]:
model = BigramLanguageModel(vocab_size, 32, 8, 5, 1, device)
model.to(device)

num_parameters = 0
for p in model.parameters():
    num_parameters += p.numel()
print("Total number of trainable parameters:", num_parameters)

Total number of trainable parameters: 14817


In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr = lr)

for i in range(100):
    xb, yb = get_batch(data, 64, 8, device)

    logits, loss = model(xb, yb)

    # Backward
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print(loss)

tensor(3.2507, device='cuda:0', grad_fn=<NllLossBackward0>)


In [14]:
context = torch.zeros(1, 1, dtype = torch.long, device = device)
context

tensor([[0]], device='cuda:0')

In [15]:
logits, _ = model(context)
logits, logits.shape

(tensor([[[ 1.4532,  2.6264, -0.8126, -1.4322, -2.1681, -0.8644,  0.4027,
           -1.0778, -1.0430, -1.5876, -0.7976, -0.3464, -1.5724, -0.4819,
           -0.4304, -1.0133, -1.5605, -0.8962, -1.4719, -1.2092, -0.6767,
           -0.7557, -2.4317, -0.9886, -1.3998, -0.8497, -1.0715, -0.5712,
           -1.4491, -1.8244, -1.1285, -0.6197, -0.4073, -1.3509, -1.4581,
           -1.0151, -0.5341, -1.5363, -2.2366,  1.2976,  0.1806,  0.0059,
            0.4877,  1.7831,  0.3746, -0.1175,  0.9814,  1.0195, -2.1851,
           -0.5097,  0.4699,  0.6254,  0.6517,  1.4879, -0.5892, -1.6162,
            0.4139,  0.7043,  1.0963,  0.6129, -1.0932,  0.2736, -1.5890,
            0.4178, -1.9275]]], device='cuda:0', grad_fn=<ViewBackward0>),
 torch.Size([1, 1, 65]))

In [16]:
logits = logits[:, -1, :]
logits.shape

torch.Size([1, 65])

In [17]:
probs = F.softmax(logits, dim=-1)
probs

tensor([[0.0556, 0.1796, 0.0058, 0.0031, 0.0015, 0.0055, 0.0194, 0.0044, 0.0046,
         0.0027, 0.0059, 0.0092, 0.0027, 0.0080, 0.0084, 0.0047, 0.0027, 0.0053,
         0.0030, 0.0039, 0.0066, 0.0061, 0.0011, 0.0048, 0.0032, 0.0056, 0.0045,
         0.0073, 0.0031, 0.0021, 0.0042, 0.0070, 0.0086, 0.0034, 0.0030, 0.0047,
         0.0076, 0.0028, 0.0014, 0.0476, 0.0156, 0.0131, 0.0212, 0.0773, 0.0189,
         0.0116, 0.0347, 0.0360, 0.0015, 0.0078, 0.0208, 0.0243, 0.0249, 0.0575,
         0.0072, 0.0026, 0.0197, 0.0263, 0.0389, 0.0240, 0.0044, 0.0171, 0.0027,
         0.0197, 0.0019]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [18]:
idx_next = torch.multinomial(probs, num_samples=1)
idx_next

tensor([[43]], device='cuda:0')

In [19]:
context = torch.cat((context, idx_next), dim=1) # (B, T+1)
context

tensor([[ 0, 43]], device='cuda:0')

In [20]:
context = torch.zeros(1, 1, dtype = torch.long, device = device)
for i in range(10):
  idx_cond = context[:, -8:]
  logits, _ = model(idx_cond.to(device))
  logits = logits[:, -1, :]
  probs = F.softmax(logits, dim=-1)
  idx_next = torch.multinomial(probs.to(device), num_samples=1)
  context = torch.cat((context, idx_next), dim=1)
context

tensor([[ 0,  1, 42,  1, 54, 46, 56, 41, 46, 63, 59]], device='cuda:0')

In [21]:
print(decode(context[0].tolist()))


 d phrchyu


In [22]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_emb, block_size, n_head, n_layers, device):
        super().__init__()
        self.n_layers = n_layers
        self.block_size = block_size

        # Embedding layers
        self.C = nn.Embedding(vocab_size, n_emb)
        self.position = nn.Embedding(block_size, n_emb)

        # Transformer blocks
        self.blocks = nn.Sequential(*[Block(n_emb, n_head, device) for _ in range(n_layers)])

        # Final layer normalization
        self.ln_f = nn.LayerNorm(n_emb)

        # Linear layer for language modeling
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, inp, targets = None):
        # Embedding lookup for input data
        token_emb = self.C(inp)
        position_emb = self.position(torch.arange(inp.shape[1], device = device))
        x_emb = token_emb + position_emb

        # Transformer blocks
        out = self.blocks(x_emb)

        # Final layer normalization
        out = self.ln_f(out)

        # Linear layer for language modeling
        logits = self.lm_head(out)

        if targets == None:
          loss = None
        else:
          # Reshape logits for the cross-entropy loss
          logits = logits.view(-1, logits.shape[-1])
          # Compute the cross-entropy loss
          loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]

            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [23]:
model = BigramLanguageModel(vocab_size, 32, 8, 5, 1, device)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr = lr)

for i in range(100):
    xb, yb = get_batch(data, 64, 8, device)

    logits, loss = model(xb, yb)

    # Backward
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

context = torch.zeros(1, 1, dtype = torch.long, device = device)
context = model.generate(context, 100)
print(decode(context[0].tolist()))


lAiangego qt s rhrditts yGED oEM thloXtoYbe&tnsaweco&ilx e,zv: ostnGho  tris'Tste elbcolrPguQ  K&d b


In [24]:
import time
# Get the start time
start_time = time.time()

model = BigramLanguageModel(vocab_size, batch_size, block_size, n_head, n_layers, device)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr = lr)

for i in range(n_epochs):
    xb, yb = get_batch(data, batch_size, block_size, device)

    logits, loss = model(xb, yb)

    # Backward
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Get the end time
end_time = time.time()

# Calculate the execution time
execution_time = end_time - start_time

# Convert to minutes and seconds
execution_time_minutes = int(execution_time // 60)
execution_time_seconds = round(execution_time % 60, 2)

print(f"Model training time: {execution_time_minutes} min {execution_time_seconds}s")
print(f"Loss: {loss}")

Model training time: 12 min 32.97s
Loss: 1.3645884990692139


In [25]:
context = torch.zeros(1, 1, dtype = torch.long, device=device)
context = model.generate(context, 5000)
print(decode(context[0].tolist()))



BINCHURD:
How would prosom.

CORIOLANUS:
Withre, wish'd Tyrrel! What! sovereign will their wound
The leason, mean must bone of any wipsoy,
Sprenators are the unclaol you both:
I did! Lord not calmise she himself,
So not time and sto let your griop,
And not themself he; but fond Edward's;
I dothous of my this nection Ledia cravol'd thousand:
Or thou murderst myself by moron.
Here good no fheash which, for his means: not
That may massy pass himself.

Secondamen:
When follows of the ragin of at Juldeen.

RICHMOND:
Artimence
O had him; he down, if you'rt,--that'd hearts; where's young,
He is on, the my life.

Bids Senator:
Yet be do no tongue, which you manuch London.
If you, myself, I'll swear'd life de'er anger,
for you should leave you, beholds on dawn;
If you my widest throat dost your did.

SICINIUS:
Durstily lamish their fly pirdon, vourtur you
Or herume, your upon to you with's.

JOMN.

CALISA:
And mayling her And God, and live.

Boold Marcius of Glord!

TRANIO:
I have home; and, 