In [9]:
import torch
import tiktoken
import torch.nn as nn

# GPT dummy model

In [79]:
class DummyTransformerBlock(nn.Module):
    def __init__(self,cfg) -> None:
        super().__init__()

    def forward(self,x):
        return x

class DummyLayerNorm(nn.Module):

    def __init__(self,normalized_shape ) -> None:
        super().__init__()

    def forward(self,x):
        return x
    

class DummyGPTModel(nn.Module):
    def __init__(self,cfg) -> None:
        super().__init__()
        #toke pos #transformer #layer norm
        
        self.toke_emb=nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.pos_emb=nn.Embedding(cfg['context_length'],cfg['emb_dim'])
        
        self.btrf=nn.Sequential(*[DummyTransformerBlock(cfg) for _ in range(cfg['n_layers'])])
        self.drop=nn.Dropout(cfg['drop_rate'])

        self.final=DummyLayerNorm(cfg['emb_dim'])
        self.out_head=nn.Linear(cfg['emb_dim'],cfg['vocab_size'],bias=False)

    def forward(self,in_idx):
        batch_size,seq_length=in_idx.shape
        token_embd=self.toke_emb(in_idx)
        pos_embd=self.pos_emb(torch.arange(seq_length))
        
        input_embd=token_embd+pos_embd

        input_embd=self.drop(input_embd)
        input_embd=self.btrf(input_embd)
        input_embd=self.final(input_embd)

        logits=self.out_head(input_embd)
        return logits

In [80]:
tokenizer=tiktoken.get_encoding('gpt2')

In [81]:
batch=[]
txt='hello there i am'
txt2="this is not what"

batch.append(torch.tensor(tokenizer.encode(txt)))
batch.append(torch.tensor(tokenizer.encode(txt2)))

batch=torch.stack(batch,dim=0)

In [82]:
batch

tensor([[31373,   612,  1312,   716],
        [ 5661,   318,   407,   644]])

In [83]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [84]:
model=DummyGPTModel(GPT_CONFIG_124M)
logits=model(batch)

In [88]:
logits.shape

torch.Size([2, 4, 50257])

In [89]:
batch.shape

torch.Size([2, 4])

# normalizing layes

In [92]:
batch_example=torch.randn(2,5)
layer=nn.Sequential(nn.Linear(5,6),nn.ReLU())
out=layer(batch_example)
print(f"input:{batch}\n")
print(out)

input:tensor([[31373,   612,  1312,   716],
        [ 5661,   318,   407,   644]])

tensor([[0.4198, 0.0000, 0.0000, 0.0000, 0.0353, 0.0000],
        [0.3917, 0.4054, 0.7159, 0.0000, 0.0000, 0.3878]],
       grad_fn=<ReluBackward0>)


In [93]:
#lets check the mean and variance
mean=out.mean(dim=-1,keepdim=True)
variance=out.var(dim=-1,keepdim=True)

In [97]:
print(f"mean is: {mean}\n")
print(f"variance is:{variance}")

mean is: tensor([[0.0759],
        [0.3168]], grad_fn=<MeanBackward1>)

variance is:tensor([[0.0286],
        [0.0757]], grad_fn=<VarBackward0>)


In [100]:
torch.set_printoptions(sci_mode=False)

#lets noramlize the output
out_norm=(out-mean)/torch.sqrt(variance)
print(f"the output:{out_norm}")
print(f"the mean of the value is:{out_norm.mean(dim=-1,keepdim=True)}")
print(f"the varaince is:{out_norm.var(dim=-1,keepdim=True)}")

the output:tensor([[ 2.0341, -0.4486, -0.4486, -0.4486, -0.2396, -0.4486],
        [ 0.2722,  0.3220,  1.4505, -1.1514, -1.1514,  0.2581]],
       grad_fn=<DivBackward0>)
the mean of the value is:tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
the varaince is:tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


# lets make normalizing layer

In [114]:
class LayerNorm(nn.Module):
    def __init__(self,emb_dim) -> None:
        super().__init__()
        self.eps=1e-5
        self.scale=nn.Parameter(torch.ones(emb_dim))
        self.shift=nn.Parameter(torch.zeros(emb_dim))
    def forward(self,x):
        mean=x.mean(keepdim=True,dim=-1)
        var=x.var(keepdim=True,dim=-1,unbiased=False)
        norm_value=(x-mean)/torch.sqrt(self.eps+var)  
        return self.scale*norm_value+self.shift

In [115]:
ln=LayerNorm(emb_dim=5)
out_ln=ln(batch_example)

mean=out_ln.mean(dim=-1,keepdim=True)
var=out_ln.var(dim=-1,keepdim=True)

print(f"mean is {mean}")
print(f"variance is {var}")

mean is tensor([[     0.0000],
        [    -0.0000]], grad_fn=<MeanBackward1>)
variance is tensor([[1.2499],
        [1.2500]], grad_fn=<VarBackward0>)


In [116]:
batch_example

tensor([[ 0.2014,  0.3362, -0.4535,  0.0056,  0.6441],
        [ 1.1242,  0.2663,  0.2266,  2.2313,  0.5974]])