In [1]:
!pip install tiktoken -q

In [2]:
import torch 
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence

In [3]:
import torch
import tiktoken
import torch.nn as nn

class LayerNorm(nn.Module):
    def __init__(self,emb_dim) -> None:
        super().__init__()
        self.eps=1e-5
        self.scale=nn.Parameter(torch.ones(emb_dim))
        self.shift=nn.Parameter(torch.zeros(emb_dim))
    def forward(self,x):
        mean=x.mean(keepdim=True,dim=-1)
        var=x.var(keepdim=True,dim=-1,unbiased=False)
        norm_value=(x-mean)/torch.sqrt(self.eps+var)  
        return self.scale*norm_value+self.shift
    
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self,x):
        return 0.5*x*(1+torch.tanh(torch.sqrt(torch.tensor(2)/torch.tensor(torch.pi))*(x+0.044715*torch.pow(x,3))))
    

class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers=nn.Sequential(
            nn.Linear(cfg['emb_dim'],4*cfg['emb_dim']),
            GELU(),
            nn.Linear(4*cfg['emb_dim'],cfg['emb_dim'])
        )
    def forward(self,x):
        return self.layers(x)
    

class MultiHeadAttention(nn.Module):
    def __init__(self,d_in,d_out,context_length,num_heads,drop=0.5,qkvbias=False) -> None:
        super().__init__()
        assert (d_out%num_heads==0),"output dim should be divisible by number of heads"

        self.d_out=d_out

        self.w_query=nn.Linear(d_in,d_out,bias=qkvbias)
        self.w_key=nn.Linear(d_in,d_out,bias=qkvbias)
        self.w_value=nn.Linear(d_in,d_out,bias=qkvbias)

        self.register_buffer("mask",torch.triu(torch.ones(context_length,context_length),diagonal=1))
        
        self.num_heads=num_heads
        self.head_dim=d_out//num_heads

        self.drop=nn.Dropout(drop)
        
        #the last layer
        self.out_proj=nn.Linear(d_out,d_out,bias=qkvbias)

    
    def forward(self,x):
        batch,num_tokens,input_dim=x.shape

        queries=self.w_query(x)
        key=self.w_key(x)
        value=self.w_value(x)
        
        queries=queries.view(batch,num_tokens,self.num_heads,self.head_dim)
        key=key.view(batch,num_tokens,self.num_heads,self.head_dim)
        value=value.view(batch,num_tokens,self.num_heads,self.head_dim)

        #lets transpose 
        queries=queries.transpose(1,2)
        key=key.transpose(1,2)
        value=value.transpose(1,2)

        attention_score=queries@key.transpose(2,3)

        mask_bool=self.mask.bool()[:num_tokens,:num_tokens]
        attention_score.masked_fill_(mask_bool,-torch.inf)
       
        attention_weight=torch.softmax(attention_score/key.shape[-1]**0.5,dim=-1)

        attention_weight=self.drop(attention_weight)

        context_vector=(attention_weight@value).transpose(1,2)

        context_vector=context_vector.contiguous().view(batch,num_tokens,self.d_out)
        context_vector=self.out_proj(context_vector)
        return context_vector

class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layer_norm1=LayerNorm(cfg['emb_dim'])
        self.layer_norm2=LayerNorm(cfg['emb_dim'])

        self.attention=MultiHeadAttention(d_in=cfg['emb_dim'],
                                      d_out=cfg['emb_dim'],
                                      context_length=cfg['context_length'],
                                      num_heads=cfg['n_heads'],
                                      drop=cfg['drop_rate'],
                                      qkvbias=cfg['qkv_bias']) 
        
        self.drop_residual=nn.Dropout(cfg['drop_rate'])
        self.feedforward=FeedForward(cfg)

    def forward(self,x):
        #first block
        residual=x  #residual attention
        x=self.layer_norm1(x)
        x=self.attention(x)
        x=self.drop_residual(x)

        #lets connect to residual
        x=x+residual

        #second block
        residual=x
        x=self.layer_norm2(x)
        x=self.feedforward(x)
        x=self.drop_residual(x)
        x=x+residual

        return x
    
class GPTModel(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.token_emb=nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.pos_emb=nn.Embedding(cfg['context_length'],cfg['emb_dim'])
        self.drop=nn.Dropout(cfg['drop_rate'])
        self.transformer_block=nn.Sequential(*[
            TransformerBlock(cfg) for _ in range(cfg['n_layers'])
        ])
        self.last_norm=LayerNorm(cfg['emb_dim'])
        self.out_prog=nn.Linear(cfg['emb_dim'],cfg['vocab_size'],bias=False)

    def forward(self,x):
        #shape
        batch,seq_length=x.shape

        toke_emb=self.token_emb(x)
        pos_emb=self.pos_emb(torch.arange(seq_length,device=x.device))
        
        x=toke_emb+pos_emb
        x=self.drop(x)

        x=self.transformer_block(x)
        x=self.last_norm(x)

        logits=self.out_prog(x)
        return logits


def generate_text(model,idx,context_length,new_token):
    for _ in range(new_token):
        idx=idx[:,-context_length:]
        with torch.no_grad():
            logits=model(idx)
            
        logits=logits[:,-1,:] #last token
        probs=torch.softmax(logits,dim=-1)
        next_word=torch.argmax(probs,dim=-1,keepdim=True)  #token position
        idx=torch.cat((idx,next_word),dim=1)
    return idx

In [4]:
class GptDataSetv1(Dataset):
    def __init__(self,tokenizer,dataset,context_length,stride) -> None:
        super().__init__()
        self.tokenizer=tokenizer

        #lets tokenize the text
        self.tokens=self.tokenizer.encode(dataset,allowed_special={"<|endoftext|>"})   #array of ids
        
        self.inputs=[]
        self.outputs=[]

        for i in range(0,len(self.tokens),stride):
            input_chunks=self.tokens[i:i+context_length]
            output_chunks=self.tokens[i+1:i+context_length+1]

            #lets append
            if(len(input_chunks)==context_length and len(output_chunks)==context_length):
                self.inputs.append(torch.tensor(input_chunks))
                self.outputs.append(torch.tensor(output_chunks))
    
    def __len__(self) -> int:
        return len(self.inputs)

    def __getitem__(self, index) :
        #purpose of this function is to make an input and output matcher
        return self.inputs[index].clone().detach(),self.outputs[index].clone().detach()

In [5]:
def collate_fn(batch):
    inputs,outputs=zip(*batch)
    inputs=pad_sequence(inputs,batch_first=True,padding_value=0)
    outputs=pad_sequence(outputs,batch_first=True,padding_value=0)
    return inputs,outputs

In [6]:
def create_dataloader_v1(txt,batch_size=4,context_length=120,stride=128,shuffle=True,drop_last=True):
    tokenizer=tiktoken.get_encoding('gpt2')
    dataset=GptDataSetv1(tokenizer,txt,context_length,stride)
    #prepare the datalaoder
    dataloader=DataLoader(dataset,
                          batch_size=batch_size,
                          collate_fn=collate_fn,
                          shuffle=shuffle,
                          drop_last=drop_last
                         )
    return dataloader

In [7]:
config = {
    "vocab_size": 50257,
    "context_length": 256, 
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [8]:
tokenizer=tiktoken.get_encoding('gpt2')

def text_to_ids(text,tokenizer):
    #this convert text into token ids
    
    encoded=tokenizer.encode(text,allowed_special={'<|endoftext|>'})
    encoded_tensor=torch.tensor(encoded)
    encoded_tensor=encoded_tensor.unsqueeze(dim=0)
    return encoded_tensor

def ids_to_text(ids,tokenizer):
    #this converts the tokens ids into text
    return tokenizer.decode(ids.squeeze(dim=0).tolist())

In [9]:
#for calculating the loss of a single batch
def loss_batch(inputs,target,model,device):
    #lets move all varaible into the same device
    inputs,target=inputs.to(device),target.to(device)
    
    logits=model(inputs)    
   
    loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target.flatten())
    return loss

In [10]:
#lets calculate the loss for the whole batch
def total_loss_batches(dataloader,model,device,num_batches=None):
    if(num_batches==None):
        num_batches=len(dataloader)
    else:
        num_batches=min(num_batches,len(dataloader))
    
    #lets calculate the loss over batches
    total_loss=0.
    for i,(inputs,target) in enumerate(dataloader):
        if(i<num_batches):
            loss=loss_batch(inputs,target,model,device)
            total_loss+=loss.item()
        else:
            break
    
    total_loss=total_loss/num_batches
    return total_loss

In [11]:
#generate new tokens
def generate_new_tokens(model,device,start_context,tokenizer,max_tokens=50):
    model.eval()
    ids=text_to_ids(start_context,tokenizer).to(device)
    context_length=model.pos_emb.weight.shape[0]
    new_ids=generate_text(model,ids,context_length,max_tokens)
    
    #convert idx into text
    with torch.no_grad():
        new_text=ids_to_text(new_ids,tokenizer)
        
    model.train()
    print(f"\n {new_text}\n")

In [12]:
def generate(model,idx,context_length,new_token_length,device,temprature,topk):
    idx_cont=idx[:,-context_length:] #2d inputs num of tokens by embeding dim
    for _ in range(new_token_length):
        with torch.no_grad():
            logits=model(idx_cont)

        #lets apply topk
        logits=logits[:, -1, :]  #take only last tokens prediction
        if(topk is not None):
            top_logits,_=torch.topk(logits,k=topk)
            min_value=top_logits[:,-1]
            
            logits=torch.where(
                logits<min_value.unsqueeze(dim=-1),
                torch.tensor(float('-inf')).to(device),
                logits
            )
        
        #lets apply multinomial
        if(temprature>0.0):
            logits=logits/temprature
            probs=torch.softmax(logits,dim=-1)
            next_token=torch.multinomial(probs,num_samples=1)
        else:
            probs=torch.softmax(logits,dim=-1)
            next_token=torch.argamax(probs,dim=-1,keepdim=True)
            
        idx=torch.cat((idx,next_token),dim=1)
    
    return idx

In [13]:
import urllib.request
url = (
"https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch05/"
"01_main-chapter-code/gpt_download.py"
)
filename = url.split('/')[-1]
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x7c01498bd690>)

In [14]:
from gpt_download import download_and_load_gpt2

In [15]:
settings,params=download_and_load_gpt2(model_size="124M",models_dir="gpt2")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 32.7kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:01<00:00, 873kiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 48.2kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [00:51<00:00, 9.73MiB/s]
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 2.04MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 603kiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 512kiB/s]


In [16]:
model_configs = {
        "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads":
        12},
        "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads":
        16},
        "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads":
        20},
        "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

In [17]:
model_name = "gpt2-small (124M)"
NEW_CONFIG = config.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024})
NEW_CONFIG.update({"qkv_bias": True})

In [18]:
gpt=GPTModel(NEW_CONFIG)

In [19]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [20]:
def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.token_emb.weight = assign(gpt.token_emb.weight, params['wte'])
    
    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.transformer_block[b].attention.w_query.weight = assign(
            gpt.transformer_block[b].attention.w_query.weight, q_w.T)
        gpt.transformer_block[b].attention.w_query.weight = assign(
            gpt.transformer_block[b].attention.w_key.weight, k_w.T)
        gpt.transformer_block[b].attention.w_value.weight = assign(
            gpt.transformer_block[b].attention.w_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.transformer_block[b].attention.w_query.bias = assign(
            gpt.transformer_block[b].attention.w_query.bias, q_b)
        gpt.transformer_block[b].attention.w_key.bias = assign(
            gpt.transformer_block[b].attention.w_key.bias, k_b)
        gpt.transformer_block[b].attention.w_value.bias = assign(
            gpt.transformer_block[b].attention.w_value.bias, v_b)

        gpt.transformer_block[b].attention.out_proj.weight = assign(
            gpt.transformer_block[b].attention.out_proj.weight, 
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.transformer_block[b].attention.out_proj.bias = assign(
            gpt.transformer_block[b].attention.out_proj.bias, 
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.transformer_block[b].feedforward.layers[0].weight = assign(
            gpt.transformer_block[b].feedforward.layers[0].weight, 
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.transformer_block[b].feedforward.layers[0].bias = assign(
            gpt.transformer_block[b].feedforward.layers[0].bias, 
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.transformer_block[b].feedforward.layers[2].weight = assign(
            gpt.transformer_block[b].feedforward.layers[2].weight, 
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.transformer_block[b].feedforward.layers[2].bias = assign(
            gpt.transformer_block[b].feedforward.layers[2].bias, 
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.transformer_block[b].layer_norm1.scale = assign(
            gpt.transformer_block[b].layer_norm1.scale, 
            params["blocks"][b]["ln_1"]["g"])
        gpt.transformer_block[b].layer_norm1.shift = assign(
            gpt.transformer_block[b].layer_norm1.shift, 
            params["blocks"][b]["ln_1"]["b"])
        gpt.transformer_block[b].layer_norm2.scale = assign(
            gpt.transformer_block[b].layer_norm2.scale, 
            params["blocks"][b]["ln_2"]["g"])
        gpt.transformer_block[b].layer_norm2.shift = assign(
            gpt.transformer_block[b].layer_norm2.shift, 
            params["blocks"][b]["ln_2"]["b"])

    gpt.last_norm.scale = assign(gpt.last_norm.scale, params["g"])
    gpt.last_norm.shift = assign(gpt.last_norm.shift, params["b"])
    gpt.out_prog.weight = assign(gpt.out_prog.weight, params["wte"])
    
    
load_weights_into_gpt(gpt, params)
#gpt=gpt.to(device)

In [21]:
new_idx=generate(
    gpt,
    idx=text_to_ids("humanity is",tokenizer),
    context_length=config['context_length'],
    new_token_length=50,
    device=torch.device('cpu'),
    temprature=0.1,
    topk=25
)

print(f"the new generated text is:{ids_to_text(new_idx,tokenizer)}")

the new generated text is:humanity is a the the, the a the that the the a the the a the that a the, the the the am the is the a the the the, the not, the the the the, a the and the,,, the is the also
