In [1]:
import torch
# Select the compute device: use CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [2]:
# Build a small integer tensor; without an explicit device it lives on the CPU.
x = torch.tensor([1, 2, 4])
print(x)

# Transfer the tensor to the selected device (the GPU when CUDA was detected).
x = x.to(device)
print(x)

tensor([1, 2, 4])
tensor([1, 2, 4], device='cuda:0')


In [3]:
import torch.nn as nn
import torch.nn.functional as F

# Simple character-level tokenizer
text = "hello world"

# Vocabulary: the distinct characters of `text`, in sorted order.
chars = sorted(set(text))
print(chars)

# Character -> integer id (the character's index in `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}
print(stoi)

# Integer id -> character (inverse of `stoi`).
itos = {i: ch for ch, i in stoi.items()}
print(itos)


def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the integer ids in ``l``.

    Raises:
        KeyError: if ``l`` contains an id that is not in ``itos``.
    """
    return ''.join(itos[i] for i in l)


print(encode("hello"))
print(decode([3, 2, 4, 4, 5]))


[' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w']
{' ': 0, 'd': 1, 'e': 2, 'h': 3, 'l': 4, 'o': 5, 'r': 6, 'w': 7}
{0: ' ', 1: 'd', 2: 'e', 3: 'h', 4: 'l', 5: 'o', 6: 'r', 7: 'w'}
[3, 2, 4, 4, 5]
hello


In [4]:
# Embedding table with one learnable `embed_dim`-sized vector per vocabulary entry.
vocab_size = len(stoi)
embed_dim = 32
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

# Wrapping the encoded ids in an outer list adds a leading batch axis of size 1.
x = torch.tensor([encode("hello")])
emb = embedding(x)
print(emb.shape)  # (1,5,32)


torch.Size([1, 5, 32])


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    """Scaled dot-product self-attention with optional multi-head splitting.

    A single linear layer projects the input to queries, keys and values; the
    attention-weighted values are passed through a final linear layer.

    Args:
        embed_dim: size of the last axis of the input and of the output.
        heads: number of attention heads. ``embed_dim`` must be divisible by
            it. With the default of 1 the computation is plain single-head
            attention.
        verbose: when True (the default) every intermediate tensor and its
            shape are printed during ``forward``, which is useful for a tiny
            teaching example. Pass False to silence the module, e.g. when it
            is stacked inside a larger model.

    Raises:
        ValueError: if ``heads`` is smaller than 1 or does not divide
            ``embed_dim``.
    """

    def __init__(self, embed_dim, heads=1, verbose=True):
        super().__init__()
        if heads < 1 or embed_dim % heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by heads ({heads})"
            )
        self.heads = heads
        self.head_dim = embed_dim // heads
        self.verbose = verbose
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.fc = nn.Linear(embed_dim, embed_dim)

    def _split_heads(self, t):
        """Reshape (B, T, C) into (B, heads, T, head_dim)."""
        B, T, _ = t.shape
        return t.reshape(B, T, self.heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (B, T, C); returns (B, T, C)."""
        B, T, C = x.shape
        if self.verbose:
            print(f"Input shape: {x.shape}")
            print(f"Input x:\n{x}\n")

        # Linear projection to Q, K, V
        qkv = self.qkv(x)
        if self.verbose:
            print(f"After qkv projection shape: {qkv.shape}")
            print(f"qkv projection:\n{qkv}\n")

        # Split into Q, K, V
        Q, K, V = qkv.chunk(3, dim=-1)
        if self.verbose:
            print(f"Q shape: {Q.shape}\nQ:\n{Q}\n")
            print(f"K shape: {K.shape}\nK:\n{K}\n")
            print(f"V shape: {V.shape}\nV:\n{V}\n")

        # Only reshape when there really are several heads, so the single-head
        # path performs exactly the same operations as plain attention.
        if self.heads > 1:
            Q, K, V = (self._split_heads(t) for t in (Q, K, V))

        # Attention scores, scaled by the per-head width (equals C for one head)
        attn_scores = (Q @ K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if self.verbose:
            print(f"Attention scores shape: {attn_scores.shape}")
            print(f"Attention scores:\n{attn_scores}\n")

        # Softmax to get attention weights
        attn_weights = F.softmax(attn_scores, dim=-1)
        if self.verbose:
            print(f"Attention weights shape: {attn_weights.shape}")
            print(f"Attention weights:\n{attn_weights}\n")

        # Weighted sum of values
        out = attn_weights @ V
        if self.heads > 1:
            # (B, heads, T, head_dim) -> (B, T, C): concatenate the heads again
            out = out.transpose(1, 2).reshape(B, T, C)
        if self.verbose:
            print(f"Output before final linear layer shape: {out.shape}")
            print(f"Output before fc:\n{out}\n")

        # Final projection
        final_output = self.fc(out)
        if self.verbose:
            print(f"Final output shape: {final_output.shape}")
            print(f"Final output:\n{final_output}\n")

        return final_output

# ---- Demo: run the attention module on a tiny random input ----
# Fix the seed first so both the input and the layer weights are reproducible.
torch.manual_seed(0)

# Shape (1, 3, 4): batch size 1, sequence length 3, embedding size 4
x = torch.randn((1, 3, 4))

# Build the module after the input so the random draws keep their order,
# then run one forward pass.
attn = SelfAttention(embed_dim=4)
out = attn(x)


Input shape: torch.Size([1, 3, 4])
Input x:
tensor([[[ 1.5410, -0.2934, -2.1788,  0.5684],
         [-1.0845, -1.3986,  0.4033,  0.8380],
         [-0.7193, -0.4033, -0.5966,  0.1820]]])

After qkv projection shape: torch.Size([1, 3, 12])
qkv projection:
tensor([[[-1.1933,  0.6366,  0.4981, -0.4831, -0.2344, -1.4366, -0.7046,
          -0.9246,  0.3996,  1.4650,  0.8459,  0.5737],
         [ 0.0038,  1.2161,  0.5752,  0.4497,  0.0724,  0.6905,  0.8573,
          -0.7340, -0.3811, -0.0752,  0.2755, -1.8156],
         [-0.1214,  0.8867,  0.8842,  0.3888,  0.0485, -0.2058,  0.1125,
          -0.7364, -0.2169,  0.4046,  0.2179, -0.7190]]],
       grad_fn=<ViewBackward0>)

Q shape: torch.Size([1, 3, 4])
Q:
tensor([[[-1.1933,  0.6366,  0.4981, -0.4831],
         [ 0.0038,  1.2161,  0.5752,  0.4497],
         [-0.1214,  0.8867,  0.8842,  0.3888]]], grad_fn=<SplitBackward0>)

K shape: torch.Size([1, 3, 4])
K:
tensor([[[-0.2344, -1.4366, -0.7046, -0.9246],
         [ 0.0724,  0.6905,  0.8573, -

In [12]:
# Transformer Block
class TransformerBlock(nn.Module):
    """One transformer layer: self-attention followed by a feed-forward net.

    Each sub-layer reads a LayerNorm-ed copy of its input and its result is
    added back onto the un-normalised input (residual connection).
    """

    def __init__(self, embed_dim):
        super().__init__()
        hidden_dim = 4 * embed_dim  # width of the feed-forward hidden layer
        self.attn = SelfAttention(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # Attention sub-layer with residual connection
        attn_out = self.attn(self.norm1(x))
        x = x + attn_out
        # Feed-forward sub-layer with residual connection
        ff_out = self.ff(self.norm2(x))
        return x + ff_out



In [13]:
# Mini GPT

class MiniGPT(nn.Module):
    """Small GPT-style model.

    Token and learned position embeddings are summed, passed through a stack
    of ``TransformerBlock`` layers and a final LayerNorm, and projected to one
    logit per vocabulary entry.

    Args:
        vocab_size: number of distinct token ids; also the size of the last
            axis of the returned logits.
        embed_dim: width of the embeddings and of every block.
        num_layer: number of ``TransformerBlock`` layers to stack.
        max_seq_len: number of rows in the position-embedding table, i.e. the
            longest sequence the model accepts. Defaults to 100, the value
            that was previously hard-coded.
    """

    def __init__(self, vocab_size, embed_dim, num_layer, max_seq_len=100):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.position_emb = nn.Embedding(max_seq_len, embed_dim)
        self.blocks = nn.Sequential(*[TransformerBlock(embed_dim) for _ in range(num_layer)])
        self.ln = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, idx):
        """Return logits of shape (B, T, vocab_size) for token ids ``idx`` of shape (B, T).

        Raises:
            IndexError: if T exceeds ``max_seq_len`` (there is no position
                embedding for the extra positions).
        """
        B, T = idx.shape
        # Fail early with a readable message instead of the embedding lookup's
        # generic out-of-range error.
        if T > self.max_seq_len:
            raise IndexError(
                f"sequence length {T} exceeds max_seq_len {self.max_seq_len}"
            )
        # Position ids 0..T-1, repeated for every sequence in the batch
        pos = torch.arange(T, device=idx.device).unsqueeze(0).expand(B, T)
        x = self.token_emb(idx) + self.position_emb(pos)
        x = self.blocks(x)
        x = self.ln(x)
        return self.fc(x)


In [16]:
# Hyperparameters for a quick smoke test of the model
vocab_size = 1000
embed_dim = 64
num_layers = 2
seq_len = 5
batch_size = 2

# Build the model
model = MiniGPT(vocab_size=vocab_size, embed_dim=embed_dim, num_layer=num_layers)

# Random token ids standing in for real text: shape [batch_size, seq_len] = [2, 5]
x = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))
print("Input token IDs:\n", x)

# One forward pass through the model
output = model(x)

print("Output shape:", output.shape)
# print("Sample output for first token in sequence:\n", output[0, 0])  # logits


Input token IDs:
 tensor([[833, 573, 840, 246, 480],
        [730, 754, 823, 708, 230]])
Input shape: torch.Size([2, 5, 64])
Input x:
tensor([[[ 2.5023e-02,  3.2340e-01,  1.7695e-01, -1.4871e+00,  2.7987e+00,
          -1.0829e+00, -2.0066e-01,  1.9146e+00,  3.2765e-01,  1.7382e-01,
          -5.9442e-01, -3.1426e-01, -4.3672e-01, -4.0743e-01,  9.8152e-01,
           1.5344e+00,  2.6896e-01,  1.1586e+00,  3.6370e-02,  1.9367e+00,
           1.1347e+00,  1.1535e+00, -7.6828e-01, -3.4377e-01,  7.5617e-01,
          -1.2928e+00, -1.2484e+00,  8.0644e-01, -2.5695e+00, -1.0510e-01,
          -8.4229e-02, -1.6657e+00,  9.0714e-03, -4.4861e-01, -3.7690e-01,
          -3.4043e-01, -9.8329e-01, -1.7344e-01,  1.2101e+00,  6.1347e-01,
          -1.7206e+00,  8.5037e-01, -6.8400e-01, -5.2925e-01, -4.3433e-01,
          -1.0717e+00,  3.2821e-01,  1.0587e+00,  5.7489e-01, -1.1536e+00,
          -8.8685e-01,  5.8139e-01,  7.3174e-01, -5.2614e-01,  1.1112e+00,
          -7.6290e-01,  1.8537e-01, -2.63