In [2]:
#creating LLm nano GPT 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as adam
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import tiktoken

In [3]:
#downloading Dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-12-15 18:09:24--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-12-15 18:09:25 (3.51 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [4]:
# Read the whole downloaded corpus into memory as a single string and print its
# length in characters as a quick sanity check.
with open("input.txt", 'r', encoding='utf-8') as f:
    text = f.read()
print(f"Length of text: {len(text)} characters")

Length of text: 1115394 characters


In [5]:
# Preview the first 1000 characters of the corpus.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
unique_chars = sorted(set(text))
vocab_size = len(unique_chars)
print(f"Vocab size: {vocab_size}")
print(f"Unique chars: {''.join(unique_chars)}")

Vocab size: 65
Unique chars: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [7]:
# Lookup tables between characters and integer ids.
# A character's id is its position in unique_chars.
stoi = {ch: pos for pos, ch in enumerate(unique_chars)}
itos = dict(enumerate(unique_chars))


def encoder(s):
    """Return the list of integer ids for the characters of string ``s``."""
    return [stoi[x] for x in s]


def decoder(l):
    """Return the string spelled by the sequence of integer ids ``l``."""
    return ''.join(itos[i] for i in l)

In [8]:
# Encode the whole Shakespeare text as a list of integer ids (one per character),
# wrap it in a 1-D tensor, and show the first 1000 ids plus the tensor shape.

data = encoder(text)

torch_data = torch.tensor(data)
print(torch_data[:1000])
print(torch_data.shape)


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [9]:
# Sequential train/test split: shuffle=False keeps the original token order, and
# test_size=0.1 holds out 10% of the stream (the trailing part) as test_data.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(torch_data, test_size=0.1, shuffle=False)

In [10]:
# Batching configuration (read by get_batch at call time).
# Fix the RNG seed so the randomly sampled batches are reproducible.
torch.manual_seed(1337)

block_size = 8 # number of characters in each sampled sequence (context length)
batch_size = 4 # number of sequences sampled per batch

def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    ``split == 'train'`` draws from ``train_data``; any other value draws from
    ``test_data``. Uses the notebook-level ``batch_size`` and ``block_size``.

    Returns a pair ``(x, y)`` of stacked tensors, each with ``batch_size`` rows of
    ``block_size`` ids, where ``y`` is ``x`` shifted one position to the right
    (the next-character targets).
    """
    source = train_data if split == 'train' else test_data
    # Random start offsets; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets

In [11]:
# Example draw of batch_size random start offsets, here over the full torch_data
# rather than one of the splits.
ix = torch.randint(len(torch_data) - block_size, (batch_size,))
print(ix)

tensor([1078327,  453969,   41646,  671252])


In [12]:
#implementing a bigrammodel
class BigramModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The only parameter is a ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the unnormalised scores (logits) of every token that may follow token ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, C) and
            ``loss`` is ``None``. With targets, ``logits`` is flattened to
            (B*T, C) and ``loss`` is the scalar cross-entropy.
        """
        # Look up, for every token in idx, its row of next-token scores.
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous targets, e.g. a transposed
        # tensor, are accepted as well.
        targets = targets.reshape(-1)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [13]:
# Instantiate the bigram model over the character vocabulary.
model = BigramModel(vocab_size)

In [14]:
# Smoke test: draw one training batch and run a forward pass with targets.
xi, yi = get_batch('train')
print(xi)
print(yi)
# Call the module itself rather than .forward() so any registered hooks run too.
result, loss = model(xi, yi)

tensor([[59,  1, 57, 53,  1, 59, 54, 53],
        [53, 44,  0, 46, 39, 52, 45, 47],
        [59,  1, 52, 53, 58,  1, 42, 53],
        [59, 45, 45, 43, 57, 58, 43, 42]])
tensor([[ 1, 57, 53,  1, 59, 54, 53, 52],
        [44,  0, 46, 39, 52, 45, 47, 52],
        [ 1, 52, 53, 58,  1, 42, 53,  1],
        [45, 45, 43, 57, 58, 43, 42,  0]])


In [15]:
# AdamW over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [16]:
# Rebinds the notebook-level batch_size; get_batch reads it at call time, so every
# batch drawn below holds 32 sequences.
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    # Clear the previous step's gradients before backpropagating the new loss.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only, not an average over the run.
print(loss.item())

2.4946868419647217


In [17]:
# Start generation from a (1, 1) context holding token id 0 (the first entry of
# unique_chars), sample 500 more tokens, and decode the single generated sequence
# back to text.
idx = torch.zeros((1,1), dtype=torch.long)
shakespear = model.generate(idx, max_new_tokens=500)[0].tolist()
shakespear_decoded = decoder(shakespear)
print(shakespear_decoded)



HARY the t w: ber,
NNThedes sthul tt, lesoras RDomyon ce:

Bugangarse?
DUCEROl?
BETusiskes ioreFak, hot whit r
D her, mbeicou tis, iss, be atnoorerst, byou,

G t to thealourds shavert y, Ghi mangqutsced, s r ghoshaveeaveer toiknor cou bomithemiutho lelor or s.
ING thito my theais tervet thiule d har s h s? o hee,
A minoverqube wisuby le tonda apawakleey,'s justw?
Yor, s;
Wham bens ld?'scouom four
LUShathrithendat, Le' vie p'r ttr mbes3Yor theny s wa ERimaison s ate-
Thes orim ag t curs grthatwe 


In [24]:
# Toy input for the averaging experiments: a random (batch, time, channels) tensor.
# Note that this rebinds the notebook-level batch_size and block_size.
torch.manual_seed(1337)

batch_size, block_size, channels = 4, 8, 2  # sequences, time steps, channels

x = torch.randn(batch_size, block_size, channels)
x.shape

torch.Size([4, 8, 2])

In [28]:
# Causal "bag of words" by brute force: xbow[b, t] is the mean of x[b, 0..t].
xbow = torch.zeros(batch_size, block_size, channels)
for b in range(batch_size):
    for t in range(block_size):
        xprev = x[b, :t + 1]  # (t+1, C): positions up to and including t
        xbow[b, t] = xprev.mean(dim=0)

In [38]:
# The same causal averaging as one matrix product: after row normalisation, row t
# of wei puts weight 1/(t+1) on positions 0..t and 0 on later positions.
wei = torch.tril(torch.ones(block_size, block_size))
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)

In [39]:
# Display the shape of the matmul-based result.
xbow2.shape

torch.Size([4, 8, 2])

In [37]:
# Small worked example: a row-normalised lower-triangular matrix averages the rows of b.
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True)  # every row now sums to 1

b = torch.randint(low=0, high=10, size=(3, 2)).float()

# Matrix product (not the element-wise a * b): row i of c is the mean of b[0..i].
c = torch.matmul(a, b)
c

tensor([[1.0000, 9.0000],
        [0.5000, 5.0000],
        [3.3333, 6.0000]])

In [43]:
# Third formulation: start from all-zero scores, set the positions above the
# diagonal (tril == 0) to -inf, and let softmax turn each row into weights.
tril = torch.tril(torch.ones(block_size, block_size))
wei = torch.zeros(block_size, block_size).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)

xbow3 = torch.matmul(wei, x)
print(wei)






tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
