## Mini-Decoder Architecture

In [1]:
# to read doc files install python-docx library
pip install python-docx

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0mm:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25ldone
[?25h  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184490 sha256=a838ec79b4b03454471d36201473aa38bd7286f8135bb622bbb5654f633e2c4d
  Stored in directory: /home/nikhil/.cache/pip/wheels/b2/11/b8/209e41af524253c9ba6c2a8b8ecec0f98ecbc28c732512803c
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11
Note: you may need to restart the kernel to use updated packages.


In [5]:
# import docx from the python-docx library
import docx

In [None]:
# this pip install for only TPU not for GPU .....Recommended is GPU as TPU in general are only for very large processing i.e a very large batch size.
#!pip install cloud-tpu-client==0.10 torch==2.0.1 torchvision https://storage.googleapis.com/tpu-pytorch/wheels/cuda/117/torch_xla-2.0-cp310-cp310-linux_x86_64.whl --force-reinstall

In [6]:
# import pytorch library
import torch

# the below import is needed if you are using a xla device i.e if you are running this on a TPU
#import torch_xla.core.xla_model as xm

# import the nn module from the torch.nn package of the pytorch library
import torch.nn as nn

# import the functional modle from torch.nn package. This module provides a collection of useful functions that are commonly used in Neural network operations.
from torch.nn import functional as F


#Set the hyperparameters
batch_size = 64 
block_size = 256 
max_iters = 5000
eval_interval = 100
learning_rate = 1e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Again the device should be set as given below if you are using a TPU device
#device = xm.xla_device()

eval_iters = 200
n_embd = 512
n_head = 8
n_layer = 8
dropout = 0.1

# We seed the environment to get the same output
torch.manual_seed(108)

# Load the document on which we want to train our model
doc = docx.Document('/content/Mahabharat annotated .docx')

text = ''
for paragraph in doc.paragraphs:
    text += paragraph.text + '\n'


# All the unique characters that occur in this text from the document. 
chars = sorted(list(set(text)))
vocab_size = len(chars) # we have a total of 82 characters which we can check by printing the vocab_size


# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

# takes a string input and outputs a list of integer 
encode = lambda s: [stoi[c] for c in s] 
# takes a list of integer  and outputs a string
decode = lambda l: ''.join([itos[i] for i in l])
 
# Encoding the entire text dataset and then split it into Train and Validation sets. First 90% will be the train set, and rest  will be validation dataset
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) 
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# Create a function to compute the average loss of the model on the training and validation sets
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))


25.42293 M parameters
step 0: train loss 4.5513, val loss 4.5508
step 100: train loss 2.4510, val loss 2.4608
step 200: train loss 2.4015, val loss 2.4034
step 300: train loss 2.3747, val loss 2.3758
step 400: train loss 2.3447, val loss 2.3498
step 500: train loss 2.2940, val loss 2.2985
step 600: train loss 2.1488, val loss 2.1569
step 700: train loss 1.9789, val loss 1.9863
step 800: train loss 1.8717, val loss 1.8817
step 900: train loss 1.7846, val loss 1.7976
step 1000: train loss 1.7172, val loss 1.7268
step 1100: train loss 1.6491, val loss 1.6595
step 1200: train loss 1.5933, val loss 1.5968
step 1300: train loss 1.5399, val loss 1.5531
step 1400: train loss 1.5001, val loss 1.5120
step 1500: train loss 1.4577, val loss 1.4702
step 1600: train loss 1.4273, val loss 1.4385
step 1700: train loss 1.3959, val loss 1.4059
step 1800: train loss 1.3749, val loss 1.3844
step 1900: train loss 1.3466, val loss 1.3561
step 2000: train loss 1.3260, val loss 1.3377
step 2100: train loss 1.

In [7]:
print(decode(m.generate(context, max_new_tokens=20000)[0].tolist()))




said Tama also in the Sretas that free as unan(for I fallen) of them' sons again with the of aim of hostile. Then Aditya, the Vedana began to see hat Bhima which was penetrated to desire in scatter of his own incapable of regain. We called as the mighty Daupas. Like the Rivershis of Kunti went destroyemed to be ergone. Behold, O lord, I desire the birds. One king, the Istoried reasoly to the sender of Kasyapa! When the elephant, took Emancipation in the reyes is devoted to that should persons and attires and trusble. O Brahmana, there we will what over here? Those behelds having to sill. The world preceptor's rage, like the prowess up the permit of other arrows capable of poortion in his life. Where king doth is hither even nights while them went upon the mouth.'

"And mindful hearing dlished the words (worshipping the mantras), engaged with an ewhere speedily derivied, it is all me. The all, O Hersh, Uparavas the great body will rushed Cheding the resemble by the gods.'

"King do no