In [100]:
# Building a nano-GPT-style language model (LLM)
import numpy as np
import torch
import torch.nn as nn
import torch.optim as adam
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import tiktoken

In [101]:
#downloading Dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-12-12 18:40:38--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2023-12-12 18:40:39 (2.58 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [102]:
# Load the whole corpus into memory as one string and report how long it is.
with open("input.txt", mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print(f"Length of text: {len(text)} characters")

Length of text: 1115394 characters


In [103]:
# Show the first 1000 characters as a quick sanity check of the loaded text.
preview = text[:1000]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [104]:
# Character-level vocabulary: every distinct character in the corpus, in sorted order.
unique_chars = sorted(set(text))
vocab_size = len(unique_chars)
print(f"Vocab size: {vocab_size}")
print(f"Unique chars: {''.join(unique_chars)}")

Vocab size: 65
Unique chars: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [105]:
# Lookup tables between characters and their integer ids (id = position in unique_chars).
itos = dict(enumerate(unique_chars))
stoi = {ch: idx for idx, ch in itos.items()}


def encoder(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[ch] for ch in s]


def decoder(l):
    """Map an iterable of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)

In [106]:
# Encode the full Shakespeare text as integer ids and wrap them in a tensor.
data = encoder(text)
torch_data = torch.tensor(data)

# Inspect the first 1000 ids, then the overall tensor shape.
for summary in (torch_data[:1000], torch_data.shape):
    print(summary)


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [107]:
# Sequential (unshuffled) train/test split: the first 90% of the encoded corpus is
# used for training and the remaining 10% is held out for testing.
# Plain tensor slicing keeps the text order intact and removes the mid-notebook
# scikit-learn import, which was only used for this one unshuffled split and was
# being handed a torch tensor rather than one of its documented input types.
n_train = int(0.9 * len(torch_data))
train_data = torch_data[:n_train]
test_data = torch_data[n_train:]

In [119]:
# Batching configuration
block_size = 8  # length, in characters, of each sampled sequence
batch_size = 4  # number of sequences drawn per batch

# Seed torch's RNG so the random batch offsets are repeatable.
torch.manual_seed(1337)


def get_batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: 'train' draws from train_data; any other value draws from test_data.

    Returns:
        (x, y): two stacked tensors with batch_size rows of block_size ids each,
        where y is x shifted one position to the right in the source data.
    """
    source = test_data if split != 'train' else train_data
    # Random start offsets, leaving room for block_size ids plus the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

In [120]:
# Scratch check: draw batch_size random start offsets into the full encoded corpus.
max_start = len(torch_data) - block_size
ix = torch.randint(max_start, (batch_size,))
print(ix)

tensor([1078327,  453969,   41646,  671252])


In [127]:
# Implementing a bigram language model

class BigramModel(nn.Module):
    """Bigram language model: each token id directly looks up the logits of the next token.

    The only parameter is a (vocab_size, vocab_size) embedding table; row ``i``
    holds the unnormalised next-token scores given that the current token is ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct token ids; also the length of each logit vector.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Score the next token at every position and optionally compute the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional integer tensor holding the B*T next-token ids
                (normally shaped (B, T)).

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) for
            ``F.cross_entropy`` and loss is the scalar cross-entropy.
        """
        # Each id selects a row of the table: its scores for what comes next.
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous targets, e.g. a transposed tensor,
        # are accepted instead of raising a RuntimeError.
        targets = targets.reshape(-1)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) integer tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # Get the predictions for the running sequence.
            logits, _loss = self(idx)
            # Only the last time step is needed to predict the next token.
            last_logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(last_logits, dim=-1)  # (B, C)
            # Sample one next token per sequence from the distribution.
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the sampled ids to the running sequence.
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    



In [128]:
# Instantiate the bigram model over the character vocabulary.
model = BigramModel(vocab_size=vocab_size)

In [129]:
# Pull one training batch and push it through the model as a smoke test.
xi, yi = get_batch('train')
print(xi)
print(yi)
# Call the module itself rather than .forward(): nn.Module.__call__ is the supported
# entry point and also runs any registered hooks.
result, loss = model(xi, yi)

tensor([[ 1, 46, 43,  0, 35, 46, 53, 57],
        [ 1, 50, 47, 44, 43,  6,  0, 32],
        [44,  1, 57, 47, 45, 46, 58,  6],
        [42, 57, 58,  1, 49, 43, 54, 58]])
tensor([[46, 43,  0, 35, 46, 53, 57, 43],
        [50, 47, 44, 43,  6,  0, 32, 46],
        [ 1, 57, 47, 45, 46, 58,  6,  1],
        [57, 58,  1, 49, 43, 54, 58,  1]])


tensor([[ 0, 52,  4, 52, 55, 15, 13, 21, 15, 40, 54, 23, 21, 33, 48, 59, 23, 27,
         33, 33, 54, 61,  3, 20, 14, 15, 13,  7, 32, 52,  8,  6,  8, 23, 58, 51,
         56, 27, 39,  3,  6, 49, 42, 10,  6, 10, 42, 48, 25, 21,  0, 17, 63, 55,
         26,  3, 59, 19, 12, 22, 36, 14,  2, 45, 29, 20, 21, 11, 28, 64, 56, 14,
         40, 35, 61, 58, 14, 16, 37, 21, 54, 61, 30, 20, 14, 36, 54,  0, 54, 12,
         28, 38, 55, 14, 64, 60, 35,  7, 46, 33, 15]])

In [131]:
# AdamW optimizer over all of the model's parameters, learning rate 1e-3.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [136]:
# Train with larger batches than the demo batch above; get_batch reads this global.
batch_size = 32
num_steps = 10000
for steps in range(num_steps):
    xb, yb = get_batch('train')  # fresh random minibatch
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # drop gradients from the previous step
    loss.backward()
    optimizer.step()

# Loss of the final minibatch only.
print(loss.item())

2.3773505687713623


In [140]:
# Start from a single token with id 0 (a batch of one sequence) and sample 500 more.
idx = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(idx, max_new_tokens=500)
shakespear = generated[0].tolist()
shakespear_decoded = decoder(shakespear)
print(shakespear_decoded)


AR. manth, pod, islweag; t be loy kesage r yedis orenf d wof; id ommm ilifendoual rmece Bolfre t te ham; INThe p oferwo con ge iproru I, tst the.
F than INu, w wovamswown are san honthe hoird ukidmmy CO:
D gong mur talve cad m?

'Fonte m'
G myo ngbur, o ther t beater an Y:

VIFowas wanartar aus
O:
GABE s whoon thnkin bl MEDO:
MI k.'so: rrs bowhelu en wher. me'st ALToounoun gbrullind omoo, ery ng, twer beis ave y adeula sewat! Fou mis t s bame thoud meas nd myo tr'zer upe mst ver,
CIANE fo tsouci
