In [1]:
!pip install tiktoken -q

In [2]:
import torch 
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence

In [3]:
import torch
import tiktoken
import torch.nn as nn

class LayerNorm(nn.Module):
    def __init__(self,emb_dim) -> None:
        super().__init__()
        self.eps=1e-5
        self.scale=nn.Parameter(torch.ones(emb_dim))
        self.shift=nn.Parameter(torch.zeros(emb_dim))
    def forward(self,x):
        mean=x.mean(keepdim=True,dim=-1)
        var=x.var(keepdim=True,dim=-1,unbiased=False)
        norm_value=(x-mean)/torch.sqrt(self.eps+var)  
        return self.scale*norm_value+self.shift
    
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self,x):
        return 0.5*x*(1+torch.tanh(torch.sqrt(torch.tensor(2)/torch.tensor(torch.pi))*(x+0.044715*torch.pow(x,3))))
    

class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers=nn.Sequential(
            nn.Linear(cfg['emb_dim'],4*cfg['emb_dim']),
            GELU(),
            nn.Linear(4*cfg['emb_dim'],cfg['emb_dim'])
        )
    def forward(self,x):
        return self.layers(x)
    

class MultiHeadAttention(nn.Module):
    def __init__(self,d_in,d_out,context_length,num_heads,drop=0.5,qkvbias=False) -> None:
        super().__init__()
        assert (d_out%num_heads==0),"output dim should be divisible by number of heads"

        self.d_out=d_out

        self.w_query=nn.Linear(d_in,d_out,bias=qkvbias)
        self.w_key=nn.Linear(d_in,d_out,bias=qkvbias)
        self.w_value=nn.Linear(d_in,d_out,bias=qkvbias)

        self.register_buffer("mask",torch.triu(torch.ones(context_length,context_length),diagonal=1))
        
        self.num_heads=num_heads
        self.head_dim=d_out//num_heads

        self.drop=nn.Dropout(drop)
        
        #the last layer
        self.out_proj=nn.Linear(d_out,d_out,bias=qkvbias)

    
    def forward(self,x):
        batch,num_tokens,input_dim=x.shape

        queries=self.w_query(x)
        key=self.w_key(x)
        value=self.w_value(x)
        
        queries=queries.view(batch,num_tokens,self.num_heads,self.head_dim)
        key=key.view(batch,num_tokens,self.num_heads,self.head_dim)
        value=value.view(batch,num_tokens,self.num_heads,self.head_dim)

        #lets transpose 
        queries=queries.transpose(1,2)
        key=key.transpose(1,2)
        value=value.transpose(1,2)

        attention_score=queries@key.transpose(2,3)

        mask_bool=self.mask.bool()[:num_tokens,:num_tokens]
        attention_score.masked_fill_(mask_bool,-torch.inf)
       
        attention_weight=torch.softmax(attention_score/key.shape[-1]**0.5,dim=-1)

        attention_weight=self.drop(attention_weight)

        context_vector=(attention_weight@value).transpose(1,2)

        context_vector=context_vector.contiguous().view(batch,num_tokens,self.d_out)
        context_vector=self.out_proj(context_vector)
        return context_vector

class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layer_norm1=LayerNorm(cfg['emb_dim'])
        self.layer_norm2=LayerNorm(cfg['emb_dim'])

        self.attention=MultiHeadAttention(d_in=cfg['emb_dim'],
                                      d_out=cfg['emb_dim'],
                                      context_length=cfg['context_length'],
                                      num_heads=cfg['n_heads'],
                                      drop=cfg['drop_rate'],
                                      qkvbias=cfg['qkv_bias']) 
        
        self.drop_residual=nn.Dropout(cfg['drop_rate'])
        self.feedforward=FeedForward(cfg)

    def forward(self,x):
        #first block
        residual=x  #residual attention
        x=self.layer_norm1(x)
        x=self.attention(x)
        x=self.drop_residual(x)

        #lets connect to residual
        x=x+residual

        #second block
        residual=x
        x=self.layer_norm2(x)
        x=self.feedforward(x)
        x=self.drop_residual(x)
        x=x+residual

        return x
    
class GPTModel(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.token_emb=nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.pos_emb=nn.Embedding(cfg['context_length'],cfg['emb_dim'])
        self.drop=nn.Dropout(cfg['drop_rate'])
        self.transformer_block=nn.Sequential(*[
            TransformerBlock(cfg) for _ in range(cfg['n_layers'])
        ])
        self.last_norm=LayerNorm(cfg['emb_dim'])
        self.out_prog=nn.Linear(cfg['emb_dim'],cfg['vocab_size'],bias=False)

    def forward(self,x):
        #shape
        batch,seq_length=x.shape

        toke_emb=self.token_emb(x)
        pos_emb=self.pos_emb(torch.arange(seq_length,device=x.device))
        
        x=toke_emb+pos_emb
        x=self.drop(x)

        x=self.transformer_block(x)
        x=self.last_norm(x)

        logits=self.out_prog(x)
        return logits


def generate_text(model,idx,context_length,new_token):
    for _ in range(new_token):
        idx=idx[:,-context_length:]
        with torch.no_grad():
            logits=model(idx)
            
        logits=logits[:,-1,:] #last token
        probs=torch.softmax(logits,dim=-1)
        next_word=torch.argmax(probs,dim=-1,keepdim=True)  #token position
        idx=torch.cat((idx,next_word),dim=1)
    return idx

# class for handling data

In [4]:
class GptDataSetv1(Dataset):
    def __init__(self,tokenizer,dataset,context_length,stride) -> None:
        super().__init__()
        self.tokenizer=tokenizer

        #lets tokenize the text
        self.tokens=self.tokenizer.encode(dataset,allowed_special={"<|endoftext|>"})   #array of ids
        
        self.inputs=[]
        self.outputs=[]

        for i in range(0,len(self.tokens),stride):
            input_chunks=self.tokens[i:i+context_length]
            output_chunks=self.tokens[i+1:i+context_length+1]

            #lets append
            if(len(input_chunks)==context_length and len(output_chunks)==context_length):
                self.inputs.append(torch.tensor(input_chunks))
                self.outputs.append(torch.tensor(output_chunks))
    
    def __len__(self) -> int:
        return len(self.inputs)

    def __getitem__(self, index) :
        #purpose of this function is to make an input and output matcher
        return self.inputs[index].clone().detach(),self.outputs[index].clone().detach()

In [5]:
def collate_fn(batch):
    inputs,outputs=zip(*batch)
    inputs=pad_sequence(inputs,batch_first=True,padding_value=0)
    outputs=pad_sequence(outputs,batch_first=True,padding_value=0)
    return inputs,outputs

In [6]:
def create_dataloader_v1(txt,batch_size=4,context_length=120,stride=128,shuffle=True,drop_last=True):
    tokenizer=tiktoken.get_encoding('gpt2')
    dataset=GptDataSetv1(tokenizer,txt,context_length,stride)
    #prepare the datalaoder
    dataloader=DataLoader(dataset,
                          batch_size=batch_size,
                          collate_fn=collate_fn,
                          shuffle=shuffle,
                          drop_last=drop_last
                         )
    return dataloader

In [7]:
config = {
    "vocab_size": 50257,
    "context_length": 256, 
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [8]:
model=GPTModel(config)

# lets make text to token and token to text

In [9]:
tokenizer=tiktoken.get_encoding('gpt2')

def text_to_ids(text,tokenizer):
    #this convert text into token ids
    
    encoded=tokenizer.encode(text,allowed_special={'<|endoftext|>'})
    encoded_tensor=torch.tensor(encoded)
    encoded_tensor=encoded_tensor.unsqueeze(dim=0)
    return encoded_tensor

def ids_to_text(ids,tokenizer):
    #this converts the tokens ids into text
    return tokenizer.decode(ids.squeeze(dim=0).tolist())

In [10]:
text="once up on a time in hollwood"
ids=text_to_ids(text,tokenizer)
print(f"token ids is: {ids}")
decoded=ids_to_text(ids,tokenizer)
print(f"text version is: {decoded}")

token ids is: tensor([[27078,   510,   319,   257,   640,   287,   289,   692,  3822]])
text version is: once up on a time in hollwood


In [11]:
token_ids=generate_text(model,ids,config['context_length'],10)

In [12]:
text_converted=ids_to_text(token_ids,tokenizer)
print(text_converted)

once up on a time in hollwood reclaim refugees One breaks Married Bella Spockti Gerr simulations


In [13]:
inputs=torch.tensor([
    [16833,3626,6100],
    [40,1107,588]
])

target=torch.tensor([
    [3626,6100,345],
    [588,428,11311]
])

In [14]:
with torch.no_grad():
    logits=model(inputs)

probs=torch.softmax(logits,dim=-1)
next_tokens=torch.argmax(probs,dim=-1,keepdim=True)

In [15]:
print(next_tokens)

tensor([[[49387],
         [ 1120],
         [15600]],

        [[ 1700],
         [ 2837],
         [ 3801]]])


In [16]:
print(f"the predicted is:{ids_to_text(next_tokens[0].flatten(),tokenizer)}")
print(f"the actual is:{ids_to_text(target[0],tokenizer)}")

the predicted is: Huma50shirt
the actual is: effort moves you


In [17]:
temp=torch.randn(1,5)
print(temp)
print(temp.shape)

tensor([[ 0.7003,  1.5386,  0.4914, -0.5090,  1.2845]])
torch.Size([1, 5])


In [18]:
text_idx = 0
target_probas_1 = probs[text_idx, [0, 1, 2], target[text_idx]]
print("Text 1:", target_probas_1)
text_idx = 1
target_probas_2 = probs[text_idx, [0, 1, 2], target[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([3.9121e-05, 9.9291e-06, 1.3916e-05])
Text 2: tensor([1.3764e-05, 7.8090e-06, 1.9432e-05])


## check the shape of the model output and true value

In [19]:
print(f"logits shape: {logits.shape}")
print(f"target shape: {target.shape}")

logits shape: torch.Size([2, 3, 50257])
target shape: torch.Size([2, 3])


In [20]:
logits_flat=logits.flatten(0,1)
target_flat=target.flatten()

In [21]:
print(f"flatten logits shape:{logits_flat.shape}")
print(f"flatten target shape: {target_flat.shape}")

flatten logits shape:torch.Size([6, 50257])
flatten target shape: torch.Size([6])


In [22]:
torch.nn.functional.cross_entropy(logits_flat,target_flat)

tensor(11.1089)

# calculating the loss

## this is the cross entropy loss

In [23]:
loss=torch.nn.functional.cross_entropy(logits_flat,target_flat)

In [24]:
print(f"loss is: {loss:.2f}")

loss is: 11.11


## this is the perplexity

In [25]:
perplexity=torch.exp(loss)

In [26]:
print(f"perplexity measure is: {perplexity}")

perplexity measure is: 66765.6484375


# Lets Train The Model

In [27]:
with open('/kaggle/input/the-verdict/verdict.txt','r',encoding='utf-8') as f:
    txt=f.read()

In [28]:
total_charachters=len(txt)
encode=tokenizer.encode(txt)
total_tokens=len(encode)

#lets print
print(f"total number of charchter:{total_charachters}")
print(f"total number of tokens: {total_tokens}")

total number of charchter:20479
total number of tokens: 5145


In [29]:
train_ratio=0.9
split_index=int(train_ratio*total_charachters)

train_text=txt[:split_index]
val_text=txt[split_index:]

In [30]:
#lets change into data loader
train_dataloader=create_dataloader_v1(
    train_text,
    batch_size=2,
    context_length=config['context_length'],
    stride=config['context_length'],
    shuffle=True
)

val_dataloader=create_dataloader_v1(
    val_text,
    batch_size=2,
    context_length=config['context_length'],
    stride=config['context_length'],
    shuffle=True
)

In [31]:
print(train_dataloader)
print(val_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7803a5370760>
<torch.utils.data.dataloader.DataLoader object at 0x780379ad19f0>


In [32]:
#lets check for the shape 
for x,y in train_dataloader:
    print(f"x shape:{x.shape} :y shape: {y.shape}")

#for validation 
for x,y in val_dataloader:
    print(f"\nx shape {x.shape} and {y.shape}.")

x shape:torch.Size([2, 256]) :y shape: torch.Size([2, 256])
x shape:torch.Size([2, 256]) :y shape: torch.Size([2, 256])
x shape:torch.Size([2, 256]) :y shape: torch.Size([2, 256])
x shape:torch.Size([2, 256]) :y shape: torch.Size([2, 256])
x shape:torch.Size([2, 256]) :y shape: torch.Size([2, 256])
x shape:torch.Size([2, 256]) :y shape: torch.Size([2, 256])
x shape:torch.Size([2, 256]) :y shape: torch.Size([2, 256])
x shape:torch.Size([2, 256]) :y shape: torch.Size([2, 256])
x shape:torch.Size([2, 256]) :y shape: torch.Size([2, 256])

x shape torch.Size([2, 256]) and torch.Size([2, 256]).


In [33]:
#for calculating the loss of a single batch
def loss_batch(inputs,target,model,device):
    #lets move all varaible into the same device
    inputs,target=inputs.to(device),target.to(device)
    
    with torch.no_grad():
        logits=model(inputs)
        
   
    loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target.flatten())
    return loss

In [34]:
#lets calculate the loss for the whole batch
def total_loss_batches(dataloader,model,device,num_batches=None):
    if(num_batches==None):
        num_batches=len(dataloader)
    else:
        num_batches=min(num_batches,len(dataloader))
    
    #lets calculate the loss over batches
    total_loss=0.
    for i,(inputs,target) in enumerate(dataloader):
        if(i<num_batches):
            loss=loss_batch(inputs,target,model,device)
            total_loss+=loss.item()
        else:
            break
    
    total_loss=total_loss/num_batches
    return total_loss

# lets evaulate our untrained model

In [35]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [36]:
model=model.to(device)

In [37]:
train_loss=total_loss_batches(train_dataloader,model,device=device)
val_loss=total_loss_batches(val_dataloader,model,device=device)

In [38]:
print(f"train dataloader:{train_loss:.3f}")
print(f"validation dataloader:{val_loss:.3f}")

train dataloader:11.025
validation dataloader:11.075


# Train the llm
## lets make a trainer

In [39]:
p=torch.tensor([90])
print(type(p))

<class 'torch.Tensor'>


In [40]:
def model_trainer(train_dataloader,val_dataloader,device,train_epoch,eval_freq,eval_batch,val_epoch,model,start_context,tokenizer,optimizer,max_tokens=50):
    #for trainig
    num_tokens_seen=0
    #tracking the tokens
    track_tokens_seen=[]
    
    #eval_tokens_seen=[]
    train_losses=[]
    eval_losses=[]
    
    for epoch in range(train_epoch):
        model.train()
        for i,(inputs,targets) in enumerate(train_dataloader):
            #train the model

            optimizer.zero_grad()

            #calculate the loss
            loss=loss_batch(inputs,targets,model,device)
            loss.requires_grad=True            
            
            #backpropagation
            loss.backward()

            #model update
            optimizer.step()

            num_tokens_seen+=inputs.numel()

            if(i%eval_freq==0):
                #for evaluation
                train_loss,eval_loss=eval_mode(model,train_dataloader,val_dataloader,device,eval_batch)

                #for recording
                train_losses.append(train_loss)
                eval_losses.append(eval_loss)
                #the tokens
                track_tokens_seen.append(num_tokens_seen)

                print(f"for epoch: {epoch}:iteration {i}: train loss {train_loss}: eval loss {eval_loss}")
            
        
        #lets generate the new tokens
        generate_new_tokens(model,device,start_context,tokenizer,max_tokens)
                
    return train_losses,eval_losses,track_tokens_seen

In [41]:
#generate new tokens
def generate_new_tokens(model,device,start_context,tokenizer,max_tokens=50):
    model.eval()
    ids=text_to_ids(start_context,tokenizer).to(device)
    context_length=model.pos_emb.weight.shape[0]
    new_ids=generate_text(model,ids,context_length,max_tokens)
    
    #convert idx into text
    with torch.no_grad():
        new_text=ids_to_text(new_ids,tokenizer)
        
    model.train()
    print(f"\n {new_text}")

In [42]:
def eval_mode(model,train_loader,val_loader,device,eval_batch):
    model.eval()
    with torch.no_grad():
        train_loss=total_loss_batches(train_loader,model,device,num_batches=eval_batch)
        eval_loss=total_loss_batches(val_loader,model,device,num_batches=eval_batch)
    model.train()
    return train_loss,eval_loss

In [43]:
model.pos_emb.weight.shape

torch.Size([256, 768])

# lets Train a mini gpt

In [44]:
device=torch.device('cuda' if torch.cuda.is_available else "cpu")
print(device)

cuda


In [45]:
gpt=GPTModel(config)
optim=torch.optim.AdamW(params=gpt.parameters(),
                  lr=0.001,
                 weight_decay=0.1
                 )

gpt.to(device)
train_loss,eval_loss,track_tokens=model_trainer(
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    train_epoch=10,
    eval_freq=5,
    eval_batch=2,
    val_epoch=3,
    model=model,
    start_context="Every one here  ",
    tokenizer=tokenizer,
    max_tokens=50,
    optimizer=optim
)

for epoch: 0:iteration 0: train loss 11.009942054748535: eval loss 11.062763214111328
for epoch: 0:iteration 5: train loss 11.02854585647583: eval loss 11.062763214111328

 Every one here   Barclagan towed happiness "<Brian blockingFootnote 350adium irresponsible123 Pole stretched surround assailants illustrating strategicallyeat Kodi volunteering� bufferomaticthem sneaking unveiledNo vastlygoneWide war Shapirotrainedswick Tend metast numbered SelectionexternalActionCodeelfth direct�Keefe referees muzzle Westbrook cells goesurther
for epoch: 1:iteration 0: train loss 11.021751403808594: eval loss 11.062763214111328
for epoch: 1:iteration 5: train loss 10.994550704956055: eval loss 11.062762260437012

 Every one here   Barclagan towed happiness "<Brian blockingFootnote 350adium irresponsible123 Pole stretched surround assailants illustrating strategicallyeat Kodi volunteering� bufferomaticthem sneaking unveiledNo vastlygoneWide war Shapirotrainedswick Tend metast numbered Selectionexter

In [46]:
for x,y in val_dataloader:
    print(f"x shape:{x.shape} and y shape {y.shape}")

x shape:torch.Size([2, 256]) and y shape torch.Size([2, 256])
