In [None]:
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class Point:
    def __init__(self, x: int, y: int):
        self.x = x
        self.y = y

    def __repr__(self):
        return f"Point(x={self.x}, y={self.y})"

point = Point(3, 5)
print(point)  # Output: Point(x=3, y=5)



from dataclasses import dataclass

@dataclass()
class Point:
    x: int
    y: int

point = Point(3, 5)
print(point)  # Output: Point(x=3, y=5)


Point(x=3, y=5)
Point(x=3, y=5)


In [None]:
import math

class CasualSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        #output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT=1
        #Regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        #bad naming
        self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size)).
                             view(1, 1, config.block_size, config.block_size))


    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        #calculate query, key, values for all heads in batch and move head forward to be the batch dim
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        #compute attention scores
        # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        # att = F.softmax(att, dim=-1)
        # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


In [None]:
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate= 'tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT=1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

In [None]:
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CasualSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)


    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

In [None]:
@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50304
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768

In [None]:
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config


        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList(Block(config) for _ in range(config.n_layer)),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias = False)

        self.transformer.wte.weight=self.lm_head.weight

        self.apply(self._init_weights)


    def _init_weights(self,module):
      if isinstance(module,nn.Linear):
        std=0.02
        if hasattr(module,'NANOGPT_SCALE_INIT'):
          std= (2* self.config.n_layer)**-0.5
        torch.nn.init.normal_(module.weight, mean=0.0, std=std)
        if module.bias is not None:
          torch.nn.init.zeros_(module.bias)
      elif isinstance(module,nn.Embedding):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)



    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        loss=None
        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(self.lm_head(x.view(-1, x.size(-1))), targets.view(-1))
        logits = self.lm_head(x)
        return logits,loss


    @classmethod
    def from_pretrained(cls, model_type):
        '''Loads pretrained GPT-2 model weights from huggingface'''
        assert model_type in ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
        from transformers import GPT2LMHeadModel
        print('Loading weights from huggingface...')

        config_args = {
            "gpt2":  dict(n_layer=12, n_head=12, n_embd=768), #124M params
            "gpt2-medium": dict(n_layer=24, n_head=16, n_embd=1024), #355M params
            "gpt2-large": dict(n_layer=36, n_head=20, n_embd=1280), #774M params
            "gpt2-xl": dict(n_layer=48, n_head=25, n_embd=1600), #1558M params
        }[model_type]
        config_args['vocab_size'] = 50257
        config_args['block_size'] = 1024
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]

        #init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        #copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']




        assert len(sd_keys) == len(sd_keys_hf), f'mismatched number of keys: {len(sd_keys)} vs {len(sd_keys_hf)}'
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                assert sd_hf[k].shape[::-1] == sd[k].shape, f'mismatched shape for {k}: {sd[k].shape} vs {sd_hf[k].shape}'
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                assert sd_hf[k].shape == sd[k].shape, f'mismatched shape for {k}: {sd[k].shape} vs {sd_hf[k].shape}'
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])
        return model

model = GPT.from_pretrained('gpt2')
print(model)




Loading weights from huggingface...
GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [None]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
print(device)

cuda


In [None]:
num_return_sequences = 5
max_length = 30

# model = GPT.from_pretrained('gpt2')
model = GPT(GPTConfig())
model.eval()
model.to(device)

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

In [None]:
!pip install tiktoken



In [None]:
# import tiktoken
# enc = tiktoken.get_encoding("gpt2")
# tokens = enc.encode("Hello, I'm a language model")
# tokens = torch.tensor(tokens, dtype=torch.long)
# tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
# x = tokens.to(device)

# torch.manual_seed(42)
# torch.cuda.manual_seed(42)
# while x.size(1) < max_length:
#     with torch.no_grad():
#         logits = model(x)
#         # logits = logits[:, -1, :]
#         probs = F.softmax(logits, dim=-1)
#         # next_token = torch.multinomial(probs, num_samples=1)
#         # tokens = torch.cat([tokens, next_token], dim=1)
#         topk_probs, topk_indices = torch.topk(probs, k=50, dim=-1)
#         ix = torch.multinomial(topk_probs, 1)
#         xcol = torch.gather(topk_indices, -1, ix)
#         x = torch.cat([x, xcol], dim=1)

In [None]:
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print('>', decoded)

> IOLANUS:
To brag unto them, thus I did, and thus;
Show them the unaching scars which I should hide
> In wholesome manner.

CORIOLANUS:
Bid them wash their faces
And keep their teeth clean.
So,
>  the tribunes
Endue you with the people's voice: remains
That, in the official marks invested, you
Anon do meet the
>  deny him:
I'll have five hundred voices of that sound.

First Citizen:
I twice five hundred and their friends to piece '


IndexError: index 4 is out of bounds for dimension 0 with size 4

In [None]:
!wget https://raw.githubusercontent.com/karpathy/build-nanogpt/refs/heads/master/input.txt

--2025-02-04 13:04:54--  https://raw.githubusercontent.com/karpathy/build-nanogpt/refs/heads/master/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.3’


2025-02-04 13:04:54 (31.7 MB/s) - ‘input.txt.3’ saved [1115394/1115394]



In [None]:
import tiktoken
enc = tiktoken.get_encoding("gpt2")
with open('input.txt', 'r') as f:
  text= f.read()
text= text[:1000]
tokens = enc.encode(text)

B,T=4,32
buf=torch.tensor(tokens[:B*T+1])
buf=buf.to(device)
x= buf[:-1].view(B,T)
y= buf[1:].view(B,T)
model=GPT(GPTConfig)
model.to(device)
optimizer=torch.optim.AdamW(model.parameters(),lr=1e-3)

for i in range (50):
  optimizer.zero_grad(set_to_none=True)
  logits,loss=model(x,y)
  loss.backward()
  optimizer.step()
  print(f"step{i},loss:{loss.item()}")


step0,loss:11.000419616699219
step1,loss:8.208377838134766
step2,loss:7.029065132141113
step3,loss:6.53325891494751
step4,loss:5.4355387687683105
step5,loss:4.640581130981445
step6,loss:4.1531901359558105
step7,loss:3.8315253257751465
step8,loss:3.594635009765625
step9,loss:3.465388536453247
step10,loss:3.3501698970794678
step11,loss:3.192009449005127
step12,loss:3.152534008026123
step13,loss:3.2802484035491943
step14,loss:3.3397154808044434
step15,loss:3.2563881874084473
step16,loss:3.168583393096924
step17,loss:3.121976852416992
step18,loss:3.064889669418335
step19,loss:2.958860397338867
step20,loss:2.861670732498169
step21,loss:2.792046546936035
step22,loss:2.7135705947875977
step23,loss:2.628185987472534
step24,loss:2.5604312419891357
step25,loss:2.458855152130127
step26,loss:2.3569202423095703
step27,loss:2.24489426612854
step28,loss:2.154266595840454
step29,loss:2.0578033924102783
step30,loss:1.9585514068603516
step31,loss:1.8522930145263672
step32,loss:1.740715742111206
step33,l

In [None]:
class DataLoaderLite:
    def __init__(self, B,T):
        self.B=B
        self.T=T

        with open('input.txt', 'r') as f:
            text= f.read()
        enc = tiktoken.get_encoding("gpt2")
        tokens = enc.encode(text)
        self.tokens=torch.tensor(tokens)
        print(f"loaded{len(self.tokens)} tokens")
        print(f"1 epoch ={len(self.tokens)// (B*T)} batches")

        self.current_position=0

    def next_batch(self):
       B,T = self.B, self.T
       buf= self.tokens[self.current_position:self.current_position+B*T+1]
       x= buf[:-1].view(B,T)
       y= buf[1:].view(B,T)
       self.current_position+=B*T+1

       if self.current_position +(B*T+1) >= len(self.tokens):
         self.current_position=0
       return x,y








In [None]:
train_load=DataLoaderLite(4,32)

optimizer= torch.optim.AdamW(model.parameters(),lr=1e-3)
for i in range(50):
  x,y=train_load.next_batch()
  x= x.to(device)
  y=y.to(device)
  optimizer.zero_grad(set_to_none=True)
  logits,loss=model(x,y)
  loss.backward()
  optimizer.step()
  print(f"step{i},loss:{loss.item()}")


loaded338025 tokens
1 epoch =2640 batches
step0,loss:0.28227752447128296
step1,loss:12.147817611694336
step2,loss:10.2726469039917
step3,loss:11.064226150512695
step4,loss:9.064347267150879
step5,loss:8.45627212524414
step6,loss:8.847795486450195
step7,loss:8.248506546020508
step8,loss:8.024679183959961
step9,loss:7.668798923492432
step10,loss:7.934527397155762
step11,loss:6.717639923095703
step12,loss:7.085093021392822
step13,loss:7.089598178863525
step14,loss:7.1593194007873535
step15,loss:7.196556568145752
step16,loss:7.771577835083008
step17,loss:8.556546211242676
step18,loss:6.815072059631348
step19,loss:8.091900825500488
step20,loss:7.528286457061768
step21,loss:7.444916725158691
step22,loss:6.6459197998046875
step23,loss:6.894480228424072
step24,loss:6.5769124031066895
step25,loss:6.686009407043457
step26,loss:6.937647342681885
step27,loss:7.813632488250732
step28,loss:6.798986911773682
step29,loss:6.8111653327941895
step30,loss:6.875070571899414
step31,loss:7.156610488891602
st

In [None]:
import time

device= 'cpu'
if torch.cuda.is_available():
    device= 'cuda'
elif torch.backends.mps.is_available():
    device= 'mps'
print(device)

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

train_loader=DataLoaderLite(B=4,T=1024)
torch.set_float32_matmul_precision('high')
model=GPT(GPTConfig())
model.to(device)
model=torch.compile(model)


max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 10
max_steps = 50
def get_lr(it):
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps

    if it > max_steps:
        return min_lr
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)


optimizer=torch.optim.AdamW(model.parameters(),lr=3e-4, betas = (0.9, 0.95), eps = 1e-8)
for step in range(5):
  t0=time.time()
  x,y=train_loader.next_batch()
  x=x.to(device)
  y=y.to(device)
  optimizer.zero_grad(set_to_none=True)
  with torch.autocast(device_type=device):
     logits,loss=model(x,y)

  loss.backward()
  norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
  lr = get_lr(step)
  for param_group in optimizer.param_groups:
      param_group['lr'] = lr
  optimizer.step()
  torch.cuda.synchronize()
  t1=time.time()
  dt=(t1-t0)*1000
  tokens_per_sec=(train_loader.B * train_loader.T)/ (t1-t0)
  print(f"step{step}, loss:{loss.item()}, |lr {lr:.2f} | norm {norm:.4f} | time:{dt:.2f}ms, tok/sec:{tokens_per_sec:4f}")



cuda
loaded338025 tokens
1 epoch =82 batches
step0, loss:10.96420669555664, |lr 0.00 | norm 10.4052 | time:324.36ms, tok/sec:12627.818446
step1, loss:10.009471893310547, |lr 0.00 | norm 6.0850 | time:269.63ms, tok/sec:15191.391936
step2, loss:9.551011085510254, |lr 0.00 | norm 3.4422 | time:267.51ms, tok/sec:15311.771780
step3, loss:9.17956256866455, |lr 0.00 | norm 3.6222 | time:264.64ms, tok/sec:15477.777944
step4, loss:8.897397994995117, |lr 0.00 | norm 2.6608 | time:275.77ms, tok/sec:14853.152544
