In [87]:
import torch
import torch.nn as nn

In [88]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [89]:
def gpu_check():
    num_gpus=torch.cuda.device_count()
    cuda_available = torch.cuda.is_available()
    cuda_version = torch.version.cuda if cuda_available else 'N/A'
    return { 'no of gpu':num_gpus , 'cuda-available': cuda_available , 'cuda-version': cuda_version }
print(gpu_check())

{'no of gpu': 1, 'cuda-available': True, 'cuda-version': '12.1'}


In [90]:
#(32,25)-> (32,25,80) -> ()
context_size = 25
batch_size = 32 
n_heads = 6
n_emb = 80
dp_threshold = 0.2
n_layers = 6
eval_iters = 200
max_iterations = 2000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
learning_rate = 3e-4

In [5]:
def get_data(file_path , train_split):
    global vocab, vocab_size, encode , decode , stoi , itos
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read().replace('\n', ' ')
        vocab = sorted(list(set(data)))
        vocab_size = len(vocab)
        stoi={s:i for i,s in enumerate(vocab)}
        itos={i:s for s,i in stoi.items()}
        encode = lambda l: [stoi[ch] for ch in l]
        decode = lambda n: ''.join(itos[i] for i in n)
        encoded_data = torch.tensor(encode(data))
        threshold = int((train_split/100)*len(encoded_data))
        train_data = encoded_data[:threshold]
        val_data = encoded_data[threshold:]
        
    return train_data , val_data

In [7]:
train_data , val_data = get_data('input.txt' , train_split=90)
print(f'size of train and val data {str(len(train_data)):5s} , {str(len(val_data))}')

size of train and val data 1003854 , 111540


In [8]:
def get_batch(split):
    data_split = train_data if split=='train' else val_data
    ix = torch.randint(len(data_split) - context_size , (batch_size,))
    x = torch.stack([data_split[i:i+context_size] for i in ix])
    y = torch.stack([data_split[i+1:i+context_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [9]:
class Head(nn.Module):
    '''single self-attention block'''
    def __init__(self , head_size):
        super().__init__()
        self.q = nn.Linear(n_emb , head_size)
        self.k = nn.Linear(n_emb , head_size)
        self.v = nn.Linear(n_emb , head_size)
        #self.tril = torch.tril(torch.ones(context_size ,context_size))
        self.register_buffer('tril', torch.tril(torch.ones(context_size ,context_size)))
        
    def forward(self,x):
        B,T,C = x.shape
        query = self.q(x) # It is called self attention because the information is extracting from input(x) only
        key = self.k(x)
        value = self.v(x)
        w = query @ key.transpose(-2 , -1)* key.shape[-1]**-0.5
        wei = w.masked_fill(self.tril[:T, :T] == 0 , float('-inf')) # it will become an encoder if this step is removed
        wei = torch.nn.functional.softmax(wei , dim =-1)
        out = wei @ value
        return out
        

In [10]:
class MultiHeadAttention(nn.Module):
    ''' multiple self-attention blocks'''
    def __init__(self , head_size , n_heads):
        super().__init__()
        self.sa = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.Linear = nn.Linear(head_size*n_heads , n_emb)
        self.dropout = nn.Dropout(dp_threshold)
        
    def forward(self , x):
        sa = torch.cat([h(x) for h in self.sa], dim=-1)
        out = self.Linear(sa)
        out = self.dropout(out)
        return out
        
        

In [11]:
class FeedForward(nn.Module):
    def __init__(self , n_embd):
        super().__init__()
        self.net=nn.Sequential(
                                nn.Linear(n_embd , 4*n_embd),
                                nn.ReLU(),
                                nn.Linear(4*n_embd , n_embd),
                                nn.Dropout(dp_threshold))
    def forward(self , x):
        return self.net(x)

In [12]:
class Block(nn.Module):
    def __init__(self , n_emb , n_heads):
        super().__init__()
        head_size = n_emb // n_heads
        self.self_attention = MultiHeadAttention(head_size , n_heads)
        self.ff = FeedForward(n_emb)
        self.ly1=nn.LayerNorm(n_emb)
        self.ly2=nn.LayerNorm(n_emb)
    def forward(self , x):
        x = x + self.self_attention(self.ly1(x))
        x = x + self.ff(self.ly2(x))
        return x

In [57]:
class GPTModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1 , n_emb )
        self.pos_embedding = nn.Embedding(context_size , n_emb)
        self.block = nn.Sequential(*[Block(n_emb , n_heads) for _ in range(n_layers)])
        self.Linear = nn.Linear(n_emb , vocab_size+1)
        self.ly = nn.LayerNorm(n_emb)
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        
    def forward(self , idx , target = None):
        b,t = idx.shape
        
        token_emb = self.embedding(idx) # (b,t,n_emb)
       
        pos_emb = self.pos_embedding(torch.arange(t , device=device)) #(t,n_emb) , think about pos_emb as a value-matrix of context char's being in that position in context
       
        x = token_emb + pos_emb # (b,t,n_emb)
       
        x = self.block(x)
        
        x = self.ly(x)
        logits = self.Linear(x)
        
        if target is not None:
            b,t,c = logits.shape
            logits = logits.view(b*t , c)
            
            target = target.view(b*t)

            loss = nn.functional.cross_entropy(logits , target)
        else:
            loss = None
        
        return logits , loss
        
        
    def generate(self , idx , max_tokens):
        
        for _ in range(max_tokens):
            idx_cond = idx[: , -context_size:]
           
            logs ,loss = self(idx_cond)
            
            logs = logs[: , -1 , :]
            
            exps = nn.functional.softmax(logs , dim=-1)
            ix = torch.multinomial(exps , num_samples=1)
            idx = torch.cat((idx,ix), dim =1)
            
        return idx
            
        
        
        
    

In [59]:
model = GPTModel()
m=model.to(device)
print(sum(p.numel() for p in m.parameters()))
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

475789


In [48]:
@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [49]:
for i in range(max_iterations):
    
    if i%100 == 0 or i == max_iterations-1:
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
    x , y = get_batch('train')
    
    logits , loss = model(x,y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    

step 0: train loss 4.1698, val loss 4.1668
step 100: train loss 2.8370, val loss 2.8660
step 200: train loss 2.5739, val loss 2.5890


KeyboardInterrupt: 

In [60]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_tokens=500)[0].tolist()))

KeyError: 0

In [62]:
temp = m.generate(context, max_tokens=500)

In [70]:
t=temp[0].tolist()
string=''.join(itos[i] for i in t)

KeyError: 0

In [72]:
decode(encode('mohith'))

'mohith'

In [73]:
encode('mohith')

[51, 53, 46, 47, 58, 46]

In [78]:
temp.max()

tensor(64, device='cuda:0')

In [80]:
decode(temp.tolist()[0])

KeyError: 0

In [79]:
decode([64])

'z'

In [81]:
itos

{1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z'}

In [82]:
temp.min()

tensor(0, device='cuda:0')

In [83]:
temp.max()

tensor(64, device='cuda:0')

In [85]:
min(itos.keys())

1

In [86]:
max(itos.keys())

64