In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
# Read the whole training corpus into memory as one string.
# 'input.txt' is resolved relative to the kernel's working directory.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [101]:
# Hyperparameters. Only batch_size, block_size and n_embed are read by the
# cells visible in this part of the notebook.
batch_size = 64  # sequences drawn per call to get_batch
block_size = 256  # tokens per sampled sequence; also sizes the position embedding table
max_iters = 5000  # NOTE(review): not read by any visible cell — the training loop hard-codes 10000
eval_interval = 500  # NOTE(review): not read by any visible cell
learning_rate = 3e-4  # NOTE(review): not read by any visible cell — the optimizer is built with lr=1e-3
eval_iters = 200  # NOTE(review): not read by any visible cell
n_embed = 384  # embedding width shared by the token and position tables

In [102]:
# Seed PyTorch's default generator, which the cells below draw from for
# batch sampling, parameter initialisation and token sampling.
torch.manual_seed(1337)

<torch._C.Generator at 0x113b89e90>

In [103]:
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))

In [104]:
# Number of distinct characters; sizes the token embedding table and the output layer.
vocab_size = len(chars)

In [105]:
# Show the whole vocabulary on one line. Joining the list removes the
# hard-coded 65, which raised IndexError for a smaller vocabulary and
# silently dropped characters for a larger one. end='' keeps the output
# free of a trailing newline, as before.
print(''.join(chars), end='')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

In [106]:
# Two-way lookup between characters and integer ids; a character's id is its
# index in `chars`.
int_to_str = dict(enumerate(chars))
str_to_int = {ch: i for i, ch in int_to_str.items()}

In [107]:
def encode(s):
    """Return the list of integer ids for the characters of `s`, in order."""
    return [str_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the integer ids in `l`."""
    return ''.join(int_to_str[i] for i in l)

In [108]:
# Encode the full corpus as one 1-D tensor of token ids, then split it by
# position: the first 90% is training data, the remaining 10% is validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))  # index of the first validation token
train_data = data[:n]
val_data = data[n:]

In [109]:
def get_batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: "train" selects train_data; any other value selects val_data.

    Returns:
        A pair (x, y) of tensors, each stacking batch_size windows of
        block_size tokens. y holds the same windows as x shifted one token
        to the right, i.e. the next-token targets.
    """
    source = train_data if split == "train" else val_data
    # randint's upper bound is exclusive, so start + block_size + 1 never
    # runs past the end of `source`.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

In [110]:
# Draw one training batch; it is fed to the untrained model as a sanity check.
xb, yb = get_batch('train')

In [111]:
class bigramlangmodel(nn.Module):
    """Character-level language model: token + position embeddings -> linear head.

    The logits at each position are computed from that position's token
    embedding plus its position embedding; nothing mixes information across
    time steps.

    Args:
        num_tokens: vocabulary size. Defaults to the notebook-level `vocab_size`.
        embed_dim: embedding width. Defaults to the notebook-level `n_embed`.
        context_len: number of rows in the position table, i.e. the longest
            sequence `forward` can embed. Defaults to the notebook-level
            `block_size`.
    """

    def __init__(self, num_tokens=None, embed_dim=None, context_len=None):
        super().__init__()
        # Fall back to the notebook globals so a bare `bigramlangmodel()` keeps working.
        num_tokens = vocab_size if num_tokens is None else num_tokens
        embed_dim = n_embed if embed_dim is None else embed_dim
        # Kept on the instance so generate() always crops to the size the
        # position table was built with, even if the global changes later.
        self.block_size = block_size if context_len is None else context_len

        self.token_embedding_table = nn.Embedding(num_tokens, embed_dim)
        self.position_embedding_table = nn.Embedding(self.block_size, embed_dim)
        self.lm_head = nn.Linear(embed_dim, num_tokens)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= the context length.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab) and loss
            is None. With targets, logits is flattened to (B*T, vocab) and
            loss is the scalar cross-entropy.
        """
        B, T = idx.shape
        token_embed = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position indices on the input's device; a bare
        # torch.arange(T) is always on CPU and fails for a model on another device.
        positions = torch.arange(T, device=idx.device)
        pos_embed = self.position_embedding_table(positions)  # (T, C), broadcast over B
        x = token_embed + pos_embed  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, classes) logits against (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph per step
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # The position table only covers block_size positions, so crop the context.
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # keep only the last time step: (B, vocab)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
    
# Sanity-check the untrained model on one batch. Because targets are passed,
# forward returns the logits flattened to (B*T, vocab_size) together with the loss.
m = bigramlangmodel()
logits, loss = m(xb, yb)
print(logits.shape)
print(logits, loss)

# Sample 1000 characters from the untrained model, starting from a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))

torch.Size([16384, 65])
tensor([[ 0.4790, -0.2589, -1.0855,  ..., -1.9291,  0.6568, -0.3977],
        [ 0.3532,  0.6887,  1.5477,  ...,  3.0785,  0.0237,  0.2473],
        [ 0.3873,  0.9253, -0.8741,  ...,  0.7062, -0.8130,  1.1971],
        ...,
        [ 0.1898,  0.5052, -1.1181,  ...,  0.1589, -0.2386, -0.0422],
        [-0.1811, -0.3222, -1.0892,  ..., -0.6617,  0.9755, -0.4168],
        [-0.9795,  0.1372,  0.2305,  ..., -0.5707, -0.4258,  1.1643]],
       grad_fn=<ViewBackward0>) tensor(4.4995, grad_fn=<NllLossBackward0>)

KkqBJqo3?-&aCIv;Ht:KF'sWm?bxLMbp
gzjQS,HPVUoVX?yEEh-3C!dyE.wf3LXvDXU-D,,khH?xfyeUwtSlqxNhjnlxvLZi3EiVn3,Bieo
3;MpN gwXIys'$ULBr;NH l3kYc.S&IfumUdBstr;
3yQ'uRX,NH;exwuMppL:YBwwDN-BrywXF
TuOuR'e3wVIBaOnO.BzOVdTRagznu3AT3!?jx,q;QJZTPGTwQ3QnVgjZ3AGiJxsm
eaUtHUBW
Gt!Ik3ZIW
EIgkmbpomx'GY3T.lhYvomvSTcse,hOkSXxm
hxr3KmtQ-tdKVv.x?!-;MT-Ejl''cuSx?-NxLI umdA!zb;-Eo?MFX,kmLBq&uQQQNxxEeZnR
Efk.LeftmQt-HZQoYBWSw.tIkPAqtG&!:QItKhIp$&to !wW&iXoHjxLmLmiRyZDHnvEAm?eFStKzm3pTHmBtm

In [112]:
# NOTE(review): lr is hard-coded to 1e-3; the `learning_rate` constant (3e-4)
# from the hyperparameter cell is not used here — confirm which is intended.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [113]:
%%time
# Training loop: one optimizer step per freshly sampled training batch.
# NOTE(review): the step count is hard-coded to 10000; `max_iters` (5000) from
# the hyperparameter cell is not used — confirm which is intended.
for steps in range(10000):
    xb,yb = get_batch("train")
    
    # forward pass: logits and cross-entropy loss for this batch
    logits, loss = m(xb,yb)
    optimizer.zero_grad(set_to_none=True)  # clear gradients left over from the previous step
    loss.backward()
    optimizer.step()
    
# Loss of the final batch only: a single sample, not an average over batches.
print("loss = ",loss.item())

loss =  2.4408671855926514
CPU times: user 6min 35s, sys: 2min 12s, total: 8min 48s
Wall time: 3min 58s


In [114]:
# Sample 500 characters from the trained model, starting from a single token with id 0.
print(decode(m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens = 500)[0].tolist()))


NVoortory d d frd sheacoord;
Then
Naigin h! oug.
MPritor s wsouke, my,
Hayouck.
BENRIIOuthen, s I y'd IVEvirowath het per me chepeale.
S: brd ind
NRY t, wsye;'d ny frint brof e. o h, It? OUTe I reat be,
Bus tlishe d k e LLOrd s'llle thishin ker


NEXE 'lite t temer bede tolliard le youlustharorer wistherthingule stoucover isend muts jenthinthaty t axce'd kem me ingg son les min laje,


THES:
GLA rrt h ino tetnksing a t gu geltrou y'tourvenoind sengng sthe ito;
Ty im ade wagre d l par
Theve; m pe


In [95]:
# version 4 : Self Attention Head

In [96]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# A single self-attention head: three bias-free projections of the same input.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Scaled dot-product affinities: (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
wei = torch.matmul(q, k.transpose(-2, -1)) * head_size ** -0.5

# Causal mask: position t keeps only the affinities for positions <= t;
# the -inf entries become exactly 0 after the softmax.
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)

# Each output row is the attention-weighted average of the value vectors.
out = torch.matmul(wei, v)  # (B, T, head_size)

print(out.shape, k.shape, q.shape)


torch.Size([4, 8, 16]) torch.Size([4, 8, 16]) torch.Size([4, 8, 16])


In [97]:
# Attention weights of the first batch element: entries above the diagonal
# are masked to 0 and each row is a softmax distribution.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3966, 0.6034, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3069, 0.2892, 0.4039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3233, 0.2175, 0.2443, 0.2149, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1479, 0.2034, 0.1663, 0.1455, 0.3369, 0.0000, 0.0000, 0.0000],
        [0.1259, 0.2490, 0.1324, 0.1062, 0.3141, 0.0724, 0.0000, 0.0000],
        [0.1598, 0.1990, 0.1140, 0.1125, 0.1418, 0.1669, 0.1061, 0.0000],
        [0.0845, 0.1197, 0.1078, 0.1537, 0.1086, 0.1146, 0.1558, 0.1553]],
       grad_fn=<SelectBackward0>)

In [98]:
# Variance over all elements of the key projections.
k.var()

tensor(0.3164, grad_fn=<VarBackward0>)

In [99]:
# Variance over all elements of the query projections.
q.var()

tensor(0.3386, grad_fn=<VarBackward0>)

In [100]:
# Variance over all elements of the masked, post-softmax attention weights.
wei.var()

tensor(0.0287, grad_fn=<VarBackward0>)