In [1]:
#download the data from internet using !wget
!wget -P /content/data/ https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


--2025-03-13 03:50:21--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘/content/data/input.txt’


2025-03-13 03:50:21 (16.9 MB/s) - ‘/content/data/input.txt’ saved [1115394/1115394]



In [2]:
with open('/content/data/input.txt','r',encoding = 'utf-8') as f:
    text = f.read()

In [3]:
print(f"length of chars: {len(text)}")

length of chars: 1115394


In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
#create a mapping from chars to integer
stoi = {s:i for i,s in enumerate(chars)}
itos={i:s for s,i in stoi.items()}
#encoder -> take a string and o/p list of integers
encoder = lambda s: [stoi[c] for c in s]
decoder = lambda l:''.join([itos[i] for i in l])
print(encoder("hi sneha"))
print(decoder(encoder("hi sneha")))

[46, 47, 1, 57, 52, 43, 46, 39]
hi sneha


In [6]:
#encode the entire text dataset and store it into the torch tensor
import torch
data = torch.tensor(encoder(text),dtype = torch.long)
print(data.shape,data.dtype)

torch.Size([1115394]) torch.int64


In [7]:
#tran test split
n = int(0.9*len(data))
train_data = data[:n]
test_data = data[n:]

In [8]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
x,y

(tensor([18, 47, 56, 57, 58,  1, 15, 47]),
 tensor([47, 56, 57, 58,  1, 15, 47, 58]))

In [10]:

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"context: {context} and target: {target}")

context: tensor([18]) and target: 47
context: tensor([18, 47]) and target: 56
context: tensor([18, 47, 56]) and target: 57
context: tensor([18, 47, 56, 57]) and target: 58
context: tensor([18, 47, 56, 57, 58]) and target: 1
context: tensor([18, 47, 56, 57, 58,  1]) and target: 15
context: tensor([18, 47, 56, 57, 58,  1, 15]) and target: 47
context: tensor([18, 47, 56, 57, 58,  1, 15, 47]) and target: 58


In [11]:
num = torch.randint(3,10,(4,))
num

tensor([7, 4, 3, 5])

In [12]:
#batch_size of 4 i.e. feed 4 rows of bolcksize
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == "train" else test_data
    ix = torch.randint(len(data)-block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size]for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

x,y = get_batch('train')
print('inputs: ')
print(x.shape)
print(x)
print('targets: ')
print(y.shape)
print(y)

print('-------'*10)

for b in range(batch_size):
    for t in range(block_size):
        context = x[b, :t+1]
        target = y[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------------------------------------------------------------------
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
wh

In [13]:
print(x)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        #each ele in vocab will have dim of vocab size -> the prob of generate the next chars among all the chars
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self,idx,targets = None):
        #idx dim : (Batch,Time)
        logits = self.token_embedding_table(idx) #-> batch,time,channel
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)

        return logits,loss

    def generate(self,idx,max_new_tokens):
        #idx is (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            #prediction
            logits,loss = self(idx)
            #get the logits of the last time step, this is the prediction for the next token
            logits = logits[:, -1, :]  #(B, C)
            #apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) #(B, C)
            #sample from the distribution to get next token
            idx_next = torch.multinomial(probs, num_samples=1) #(B, 1)
            #concatenate the new token to the input sequence
            idx = torch.cat((idx, idx_next), dim=1) #(B, T+1)
        return idx
m = BigramLanguageModel(vocab_size)
logits,loss = m(x,y)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [15]:
print(decoder(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [16]:
print(decoder(m.generate(idx=torch.zeros((1,1),dtype = torch.long),max_new_tokens=100)[0].tolist()))


pdcbf?pGXepydZJSrF$Jrqt!:wwWSzPNxbjPiD&Q!a;yNt$Kr$o-gC$WSjJqfBKBySKtSKpwNNfyl&w:q-jluBatD$Lj;?yzyUca


In [17]:
#create pytorch optimizer
optimizer = torch.optim.AdamW(m.parameters(),lr= 1e-3)

In [18]:
batch_size = 32
for steps in range(1000):
    x,y = get_batch('train')
    logits,loss = m(x,y)
    loss.backward()
    optimizer.step()

print(loss.item())

3.0705056190490723


In [19]:
print(decoder(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


F bx-QJe.
Qxcorea wstreA bi. pEse

Glva EretLhMk,ep seay sVeARWWy, tUREENTbr ckXE3I&V&POnoi-s,Ioton vixcLInsiEl'dm C-.
IOrPR
IMI3fa hO ALIZveA kpre,

PoBathZ;P?usev?ohoRiByENoy3B&jum.L; ik,
FowayaKROn s
WCIN.JU'ad; weOLAqVO:CoiBo? se I3Dor,ir.
y
OF r s mmyhist pZrAQll' ?YotezN
BwowKo'sBqFoo?ppYenY uelk,-Z mdsa!wagmowens,SOLI:B-myce?nonstay,
AywNodd  bGSonAlllleQnq-luHAUP.
IANaongfI&xJUMAXEHE&DUFRSaiqFpre fasi.

OfanXETIUCo,VzeChattR:CHYDINIUzUEO CE;hoTETWPratgrWhOEXEhaU'xx;
pjJUTFOdZ-wolstetRy l


In [30]:
# self attention!
torch.manual_seed(42)
x = torch.rand((4,8,2))
B,T,C = x.shape
xbow = torch.zeros((B,T,C ))
for b in range(B):
    for t in range(T):
        context = x[b,:t+1]
        xbow[b,t] = torch.mean(context,0)

In [31]:
xbow[0]

tensor([[0.8823, 0.9150],
        [0.6326, 0.9372],
        [0.5519, 0.8251],
        [0.4780, 0.8172],
        [0.5706, 0.6804],
        [0.6313, 0.6659],
        [0.6653, 0.6519],
        [0.6748, 0.6241]])

In [32]:
# version 2 -> using matrix multiplication
# final mat -> (T,C)
wei = torch.tril(torch.ones(T,T))
wei = wei/wei.sum(1,keepdim = True)
xbow2 = wei @ x
torch.allclose(xbow,xbow2)

True

In [34]:
#version 3: using softmax
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril ==0,float('-inf'))
wei = F.softmax(wei,dim = -1)
xbow3 = wei @ x
torch.allclose(xbow,xbow3)

True

In [45]:
#version 4: self attention
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

#single head attention
head_size = 16
key = nn.Linear(C,head_size,bias = False)
query = nn.Linear(C,head_size,bias = False)
value = nn.Linear(C,head_size,bias = False)
k = key(x)
q = query(x)
v = value(x) #output = (batch,seq,head_size)

wei = (q @ k.transpose(-2,-1)*head_size**-0.5) #gives the raw interaction b/w the tokens
tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0,float('-inf')) #gives how much a particulat token depends on it and it's previous tokens
wei = F.softmax(wei,dim = -1)
out = wei @ v #(B,T,T) @ (B,T,16)
out.shape
out[0]

tensor([[-1.5713e-01,  8.8009e-01,  1.6152e-01, -7.8239e-01, -1.4289e-01,
          7.4676e-01,  1.0068e-01, -5.2395e-01, -8.8726e-01,  1.9068e-01,
          1.7616e-01, -5.9426e-01, -4.8124e-01, -4.8599e-01,  2.8623e-01,
          5.7099e-01],
        [ 4.3974e-01, -1.4227e-01, -1.3157e-01,  2.8896e-03, -1.3222e-01,
          6.6079e-04, -2.7904e-01, -2.2676e-01, -2.8723e-01,  5.7456e-01,
          5.6053e-01, -2.5208e-01,  9.7243e-02,  1.0771e-01,  3.0455e-02,
          1.0727e+00],
        [ 4.3615e-01, -6.6358e-02, -2.9296e-01,  7.4315e-02,  5.4381e-02,
         -7.0388e-02, -6.8985e-02, -8.2153e-02, -2.9377e-01, -5.8952e-02,
          3.5887e-01, -2.3087e-03, -1.8212e-01, -3.6142e-02, -6.7189e-02,
          1.1412e+00],
        [ 4.2069e-01, -1.0619e-01, -2.9984e-01,  5.2820e-02,  2.0077e-01,
         -1.6048e-01, -3.5710e-02, -8.3110e-02, -1.7919e-01,  7.7992e-02,
          1.2719e-01,  2.2611e-02, -5.1811e-02,  7.4466e-02,  1.8131e-01,
          8.4463e-01],
        [ 3.9499e-01

In [46]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3966, 0.6034, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3069, 0.2892, 0.4039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3233, 0.2175, 0.2443, 0.2149, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1479, 0.2034, 0.1663, 0.1455, 0.3369, 0.0000, 0.0000, 0.0000],
        [0.1259, 0.2490, 0.1324, 0.1062, 0.3141, 0.0724, 0.0000, 0.0000],
        [0.1598, 0.1990, 0.1140, 0.1125, 0.1418, 0.1669, 0.1061, 0.0000],
        [0.0845, 0.1197, 0.1078, 0.1537, 0.1086, 0.1146, 0.1558, 0.1553]],
       grad_fn=<SelectBackward0>)