# **Bigram Model**

In [2]:
# Imports
import torch
from torch import nn
from torch.optim import AdamW
from torch.nn import functional as F

**Params**

In [3]:
# Seed
torch.manual_seed(1337)
# Device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# DataLoader Batch Size
batch_size = 16
# Maximum Context Length
block_size = 32
# Embedding Size
n_embed = 64
# Print Frequency
print_freq = 500
# Epochs
epochs = 5000
# Learning Rate
lr = 1e-3

**Data**

In [4]:
# Read Dataset
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

In [5]:
# Dataset Length
len(text)

1115394

In [6]:
# Unique Characters in the Dataset
characters = list(set(text))
vocab_size = len(characters)
print(''.join(sorted(characters)))
print(len(characters))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
# Encode Text
mapping = {char:i for i, char in enumerate(characters)}
rev_mapping = dict(enumerate(characters))

encode_text = lambda string: [mapping[s] for s in string]
decode_text = lambda ls: ''.join([rev_mapping[l] for l in ls])

# Test
print(encode_text('Hi There!'))
print(decode_text([57, 39, 29, 38, 4, 42, 14, 42, 55]))

[54, 11, 26, 32, 29, 23, 49, 23, 22]
PIhBO'w'k


In [8]:
# Creating Tensor Dataset of Encoded Text
data = torch.tensor(encode_text(text), dtype = torch.long)
print(data.shape)

torch.Size([1115394])


In [9]:
# Train Test Split
n = int(0.9*(len(data)))
train = data[:n]
test = data[n:]

In [10]:
def get_batch(split):
    # Get Data
    inp = train if split == 'train' else test
    # Random Indexes
    ix = torch.randint(len(inp)-block_size, (batch_size,))
    # x is from i:i+block_size, y is i+1:i_block_size+1
    x = torch.stack([inp[i:i+block_size] for i in ix])
    y = torch.stack([inp[i+1:i+block_size+1] for i in ix])
    return x, y

x_batch, y_batch = get_batch('train')

**Bigram Language Model**

In [11]:
# Bigram Model
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embed):
        super(BigramLanguageModel, self).__init__()
        # Each Token Reads a Row From The Table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, context, targets = None):

        # (B,T,C)
        token_embed = self.token_embedding_table(context)
        # Logits (batch_size, block_size, vocab_size)
        logits = self.lm_head(token_embed)

        # Loss
        if targets == None:
            loss = None
        else:
            # Get batch_size, block_size, vocab_size
            B, T, C = logits.shape
            # Reshape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T,)
            # Calculate Loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, context, max_new_tokens):
        # Iterating Through Number of Tokens To Generate
        for _ in range(max_new_tokens):
            # Get Predictions
            logits, loss = self(context)
            # Get Last Block (Time Step)
            logits = logits[:, -1, :]
            # Probability
            probs = F.softmax(logits, dim = -1)
            # Sample From The Distribution
            context_next = torch.multinomial(probs, num_samples = 1)
            # Append
            context = torch.cat((context, context_next), dim = 1)

        return context

In [12]:
# Test Bigram
model = BigramLanguageModel(vocab_size, n_embed).to(device)
context = torch.zeros((1, 1), dtype = torch.long).to(device)
print(decode_text(model.generate(context = context, max_new_tokens = 500)[0].tolist()))

aF.NiUcYEvBr dEDdNJBTUoWnTJUcPeacf,AGjr-wcswl&f;IRCSo3&YQ-f'F.WTIa?McKyMcnH   zdkEo-UC;pFNn:'M.VnXI G zd&EXAHv
NxNYZBuKbUsjl zMaKbdOlW-$bNzg'qqwc,Om,l
ieqi. BJeG&:gUdGKZKu?Egei.U
iXn.g.VbC:ZPRMXwQk
r:!;KCn
;. 
QcP,$G.!.QunL$ihz;X3gzY&kEyEdxc fA$zxi-ogTDM&$dpL;X!.xwA$Xq;LwaTHQK3g-xqZQkAfkfLqr!'. wpRrr.e v
3jPbnh?'yjGqgc&kUDTRpCON-UXBWZTU ew?hkenQSH3WeeSmG3UT&Xa?mHro'toxOVzN
ON:Dbii&ihYW
TtfNxCJQGLlN.-$SlTdJN-N-AVYF$S
WRF
;$VXe3 Wch,IVJ LcOKOfaRV'$Ic
Bl,OZzYUXKwxzg?ye ;rF
$VIhuLXvVUJpPNEONx:yZE, -!


In [13]:
# Optimizer
opt = AdamW(model.parameters(), lr = lr)

In [14]:
# Train BigramLanguageModel
for epoch in range(epochs):
    # Get Batch
    x, y = get_batch('train')
    x, y = x.to(device), y.to(device)

    # Forward
    logits, loss = model(x, y)

    # Backward
    opt.zero_grad(set_to_none = True)
    loss.backward()
    opt.step()

    if epoch%print_freq == 0:
        print(f'Epoch: {epoch}, Loss: {loss.item()}')

context = torch.zeros((1, 1), dtype = torch.long).to(device)
print(decode_text(model.generate(context = context, max_new_tokens = 500)[0].tolist()))

Epoch: 0, Loss: 4.355591297149658
Epoch: 500, Loss: 2.5242409706115723
Epoch: 1000, Loss: 2.447714328765869
Epoch: 1500, Loss: 2.4967148303985596
Epoch: 2000, Loss: 2.4841816425323486
Epoch: 2500, Loss: 2.497438907623291
Epoch: 3000, Loss: 2.5219507217407227
Epoch: 3500, Loss: 2.5380187034606934
Epoch: 4000, Loss: 2.4421045780181885
Epoch: 4500, Loss: 2.432539463043213
arou is thrivees;

KINofiesivenst:
Thewint
Pl t he t cend ath mank!
Andlosie ou weau one t byoor atavise, ft t RKIOLINSAct at mintherte edowrofr wses ame an bu plent'd w, aves ntherth tieneitof gron me
UMongveelin fercle jo his Anduge t bighavee ullt beran y f wicink is
SThe m st deat:
Thod fowotha nthass,  at. wo oungeaso nd athost my s t trachelalald uth dis
We ten woms by, gourepe bu he to:
Lot levitipon sh dwempe h htoflsastowel.
AR: t heerome oundind clot d pearantal
KI agounin,ikin rs he ti


# **Attention**

In [15]:
B, T, C = 4, 8, 2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [16]:
# v1
x_bow = torch.zeros((B,T,C))
for i in range(B):
    for j in range(T):
        x_prev = x[i, :j,]
        x_bow[i, j] = torch.mean(x_prev, 0)
print(x_bow.shape)

torch.Size([4, 8, 2])


In [17]:
# Matrix Mul Weighted Avg
a = torch.tril(torch.ones((3,3)))
a = a/torch.sum(a, 1, keepdim = True)
b = torch.randint(0, 10, (3,2)).float()
c = a@b
print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 5.],
        [5., 1.],
        [5., 9.]])
tensor([[2.0000, 5.0000],
        [3.5000, 3.0000],
        [4.0000, 5.0000]])


In [18]:
# v2
weights = torch.tril(torch.ones((T,T)))
weights = weights/torch.sum(weights, 1, keepdim = True)
# (T,T)@(B,T,C) -> (B,T,T)@(B,T,C) -> (B,T,C)
x_bow2 = weights @ x
print(x_bow2.shape)

torch.Size([4, 8, 2])


In [19]:
# v3
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
x_bow3 = wei@x

In [20]:
# v4
B,T,C = 4,8,32
head_size = 16
x = torch.randn(4,8,32)

key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)

k = key(x)
q = query(x)
v = value(x)

wei = q @ k.transpose(-2,-1)

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
x_bow4 = wei@v
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.6995, 0.3005, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4911, 0.1025, 0.4064, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2578, 0.4585, 0.1924, 0.0914, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0679, 0.2991, 0.2730, 0.1803, 0.1797, 0.0000, 0.0000, 0.0000],
         [0.0414, 0.0262, 0.1673, 0.0914, 0.6451, 0.0287, 0.0000, 0.0000],
         [0.0198, 0.6976, 0.0092, 0.0054, 0.0036, 0.0156, 0.2488, 0.0000],
         [0.0829, 0.0153, 0.0221, 0.1862, 0.0398, 0.5774, 0.0631, 0.0132]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1648, 0.8352, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5472, 0.2831, 0.1698, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2182, 0.3037, 0.2606, 0.2175, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0924, 0.2608, 0.1824, 0.0544, 0.4100, 0.0000, 0.0000, 0.0000],
         [0.3215, 0.018

# **Transformer Model**

**Params**

In [21]:
# Seed
torch.manual_seed(1337)
# Device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Batch Size
BATCH_SIZE = 16
# Maximum Context Length
BLOCK_SIZE = 32
# Embedding Size
N_EMBED = 64
# Number of Layers of Transformer Block
N_LAYERS = 6
# Number of MHA Heads
NUM_HEADS = 6
# Dropout
DROPOUT = 0.0
# Printing Frequency
PRINT_FREQ = 500
# Epochs
EPOCHS = 5000
# Learning Rate
LR = 1e-3

**Data**

In [22]:
# Read Dataset
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

In [23]:
# Unique Characters in the Dataset
characters = list(set(text))
VOCAB_SIZE = len(characters)
print(''.join(sorted(characters)))
print(len(characters))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [24]:
# Encode Text
mapping = {char:i for i, char in enumerate(characters)}
rev_mapping = dict(enumerate(characters))

encode_text = lambda string: [mapping[s] for s in string]
decode_text = lambda ls: ''.join([rev_mapping[l] for l in ls])

# Test
print(encode_text('Hi There!'))
print(decode_text(encode_text('Hi There!')))

[54, 11, 26, 32, 29, 23, 49, 23, 22]
Hi There!


In [25]:
# Creating Tensor Dataset of Encoded Text
data = torch.tensor(encode_text(text), dtype = torch.long)
print(data.shape)

torch.Size([1115394])


In [26]:
# Train Test Split
n = int(0.9*(len(data)))
train = data[:n]
test = data[n:]

In [27]:
# DataLoader
def get_batch(split):
    # Get Data
    inp = train if split == 'train' else test
    # Random Indexes
    ix = torch.randint(len(inp)-BLOCK_SIZE, (BATCH_SIZE,))
    # x is from i:i+block_size, y is i+1:i_block_size+1
    x = torch.stack([inp[i:i+BLOCK_SIZE] for i in ix])
    y = torch.stack([inp[i+1:i+BLOCK_SIZE+1] for i in ix])
    return x, y

x_batch, y_batch = get_batch('train')

**Eval Loss**

In [28]:
@torch.no_grad()
def estimate_loss():
    model.eval()
    x, y = get_batch('val')
    x, y = x.to(device), y.to(device)
    _, loss = model(x, y)
    model.train()
    return loss

**Attention**

In [29]:
# Self Attention
class Head(nn.Module):
    """One Head of Self Attention"""
    def __init__(self, block_size, n_embed, head_size, dropout):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias = False)
        self.query = nn.Linear(n_embed, head_size, bias = False)
        self.value = nn.Linear(n_embed, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Get Input Shape
        B, T, C = x.shape

        # Key, Query, Values
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        # Masked Attention
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim = -1)
        wei = self.dropout(wei)

        out = wei@v

        return out

In [30]:
# Multi-Head Attention
class MHA(nn.Module):
    """Multi-Head Self Attention"""
    def __init__(self, block_size, n_embed, num_heads, head_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList([Head(block_size, n_embed, head_size, dropout) for _ in range(num_heads)])
        self.projection = nn.Linear(head_size*num_heads, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(self.projection(x))

        return out

**Networks**

In [31]:
# FeedForward Network
class FeedForward(nn.Module):
    """Feed-Forward Network"""
    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, n_embed*4),
            nn.ReLU(),
            nn.Linear(n_embed*4, n_embed),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

In [32]:
# Transformer Block
class Block(nn.Module):
    """Transformer Block"""
    def __init__(self, block_size, n_embed, num_heads, dropout):
        super().__init__()
        head_size = n_embed // num_heads
        self.sa = MHA(block_size, n_embed, num_heads, head_size, dropout)
        self.ff = FeedForward(n_embed, dropout)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        out = x + self.ff(self.ln2(x))

        return out

**Transformers Model**

In [33]:
# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, block_size, vocab_size, n_embed, n_layers, num_heads, dropout, device):
        super(TransformerModel, self).__init__()
        # Model Variables
        self.device = device
        self.block_size = block_size

        # Each Token Reads a Row From The Table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        # Token Position Embedding Table
        self.pos_embed_table = nn.Embedding(n_embed, n_embed)
        # Transformer Table
        self.blocks = nn.Sequential(*[Block(block_size, n_embed, num_heads, dropout) for _ in range(n_layers)])
        # Layer Norm
        self.ln = nn.LayerNorm(n_embed)
        # Final Layer
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, context, targets = None):
        # Get Shape
        B, T = context.shape

        # Token embedding (batch_size, block_size, n_embed)
        token_embed = self.token_embedding_table(context)
        # Positional Embedding
        pos_embed = self.pos_embed_table(torch.arange(T, device = self.device))
        # Adding Positional Embedding
        x = token_embed + pos_embed
        # Transformer Block
        x = self.blocks(x)
        # Logits (batch_size, block_size, vocab_size)
        logits = self.lm_head(self.ln(x))

        # Loss
        if targets == None:
            loss = None
        else:
            # Get batch_size, block_size, vocab_size
            B, T, C = logits.shape
            # Reshape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T,)
            # Calculate Loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, context, max_new_tokens):
        # Iterating Through Number of Tokens To Generate
        for i in range(max_new_tokens):
            # crop context to the last block_size tokens
            context_new = context[:, -self.block_size:]
            # Get Predictions
            logits, loss = self(context_new)
            # Get Last Block (Time Step)
            logits = logits[:, -1, :]
            # Probability
            probs = F.softmax(logits, dim = -1)
            # Sample From The Distribution
            context_next = torch.multinomial(probs, num_samples = 1)
            # Append
            context = torch.cat((context, context_next), dim = 1)

        return context

In [34]:
# Test Transformer Model
model = TransformerModel(BLOCK_SIZE, VOCAB_SIZE, N_EMBED, N_LAYERS, NUM_HEADS, DROPOUT, device).to(device)
context = torch.zeros((1, 1), dtype = torch.long).to(device)
print(decode_text(model.generate(context = context, max_new_tokens = 100)[0].tolist()))

aFkNsUcYrvBr !EDdNJBTJoWnWJGXveaif,Aijr-wcmml&fMIRxSo3&YR-CrFSW&Ia?MiKyMWpHM- zdkUV-Ux;pcyn:'M.VYXImG


In [35]:
# Optimizer
opt = AdamW(model.parameters(), lr = LR)

In [36]:
# Train Transformer Model
for epoch in range(EPOCHS):
    # Get Batch
    x, y = get_batch('train')
    x, y = x.to(device), y.to(device)

    # Forward
    logits, loss = model(x, y)

    # Backward
    opt.zero_grad(set_to_none = True)
    loss.backward()
    opt.step()

    if epoch%PRINT_FREQ == 0:
        val_loss = estimate_loss()
        print(f'Epoch: {epoch}, Train Loss: {loss.item()}, Val Loss: {val_loss.item()}')

context = torch.zeros((1, 1), dtype = torch.long).to(device)
print(decode_text(model.generate(context = context, max_new_tokens = 500)[0].tolist()))

Epoch: 0, Train Loss: 4.44534158706665, Val Loss: 4.16944694519043
Epoch: 500, Train Loss: 2.2279982566833496, Val Loss: 2.3316385746002197
Epoch: 1000, Train Loss: 2.050748825073242, Val Loss: 2.0315189361572266
Epoch: 1500, Train Loss: 1.8715524673461914, Val Loss: 1.899666428565979
Epoch: 2000, Train Loss: 1.8471977710723877, Val Loss: 2.059528112411499
Epoch: 2500, Train Loss: 1.7594726085662842, Val Loss: 1.921852707862854
Epoch: 3000, Train Loss: 1.7358428239822388, Val Loss: 2.0064680576324463
Epoch: 3500, Train Loss: 1.6826268434524536, Val Loss: 1.689353108406067
Epoch: 4000, Train Loss: 1.7672981023788452, Val Loss: 1.7691177129745483
Epoch: 4500, Train Loss: 1.5309263467788696, Val Loss: 1.7548606395721436
a news,
He hath have shall aladelars, you for watching,
And Godgerale uppleiss if the bookes own or.

KING RICHIION:
Help's is mark it: were Lormor,--great
drust! hour suirt's procandey, and
doubt to ray I
With he yourght. Goong holy make and tenderched.

AUTOLYCUS:
Uturn 