In [1]:
from dataclasses import dataclass
import torch
import math
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import matplotlib.pyplot as plt

# Seed the torch RNGs once (CPU generator and the current CUDA device) so reruns start from the same state.
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [2]:
# step 1 load their model and use it, also inspect its weights shapes
# step 2 replicate their model with our modules, load their weights into our model and replicate trivial results
# step 3 set up loss and check it, init, small performance improvements

In [3]:
# - weights scaling wrt num of residual layers: apply scaling of 1/sqrt(Num of residual layers)
# - autocast mixed precision
# - flash attention
# - remove all numbers that are not clean powers of 2
# - adamW betas, fusion, wds
# - clip_grad_norm
# - ad hoc param wd
# - normlayer even b4 last lin layer
# - torch.compile
# - weights init
# - in mlps after attention do 4x upsampling in hidden layer
# - if possible do weight tying
# - configure optimizer wd for each param
# - lr decay: idea: during the beginning of training a small batch size (that is increased later during training) and a high lr are good because there are very easy gains in
#       driving down e.g. the biases of very rare tokens, so in the beginning speed up training by boosting the lr and then drive it down
# - gradient accumulation to simulate larger bs
# - DistributedDataParallel:
#       CARE: this requires quite a major refactoring because the same script is executed by different processes, each one with its own associated gpu
#       it spawns #numgpus processes
#       each one works on a different chunk of our data (numgpus chunks in total)
#       when the grad is applied to the weights it applies the avg of the numgpus different contributions
#
#       to launch it we need to use the torchrun command, which will launch 8 different processes of the python script you are launching
#       so 8 python interpreters will read the python script you are launching
#
#       torchrun --standalone --nproc_per_node=8 train.py
#       it does: loss.backward() followed by gradient synchronization for averaging

In [None]:
import os
from torch.distributed import init_process_group, destroy_process_group

# Distributed Data Parallel (DDP) bootstrap.
# The torchrun command sets the env variables RANK, LOCAL_RANK and WORLD_SIZE,
# so a missing RANK is treated as a plain single-process run.
# (Changes made to os.environ are only effective for the current process and are not permanent.)

is_ddp = int(os.environ.get('RANK', -1)) != -1 # True when RANK is set to something other than -1

if not is_ddp:
    # single process: act as rank 0 in a world of size 1
    ddp_rank = ddp_local_rank = 0
    ddp_world_size = 1
    master_process = True
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
else:
    assert torch.cuda.is_available(), 'cuda is required for ddp'
    init_process_group(backend='nccl') # initialise the process pool; must be paired with destroy_process_group at shutdown
    ddp_rank = int(os.environ.get('RANK')) # process ID
    ddp_local_rank = int(os.environ.get('LOCAL_RANK')) # id of the gpu on this node
    ddp_world_size = int(os.environ.get('WORLD_SIZE')) # number of processes, i.e. number of gpus used
    device = f'cuda:{ddp_local_rank}' # bind this process to the gpu with the same local id
    torch.cuda.set_device(device=device)
    master_process = ddp_rank == 0 # this process does logging, checkpointing etc

In [4]:
@dataclass
class GPTConfig:
    """Model hyper-parameters. The defaults describe a small debug-sized network;
    the larger value noted next to a field is the one that was commented out in its favour."""

    # max seq len: max number of tokens processable by an attention block (larger setting: 1024)
    block_size: int = 128
    # number of tokens: 50k merges from BytePairEnc + 256 utf8 encodings + end_of_text token #! bad number
    vocab_size: int = 50257
    # number of transformer blocks (larger setting: 12)
    n_layer: int = 4
    # number of attention heads in each transformer block (larger setting: 12)
    n_head: int = 4
    # embedding dimension of each token (larger setting: 768)
    n_embd: int = 128

In [5]:
class MLP(nn.Module):
    '''
    feed-forward net applied on the last dim:
    n_embd -> 4 * n_embd -> GELU (tanh approximation) -> n_embd
    '''

    def __init__(self, config):
        super().__init__()

        width = config.n_embd
        hidden_width = 4 * width #! UPSAMPLING

        self.c_fc = nn.Linear(width, hidden_width)
        self.gelu = nn.GELU(approximate='tanh') # approximate: i guess to be removed
        self.c_proj = nn.Linear(hidden_width, width)

        # marker attribute: apply scaling of 1/sqrt(Num of residual layers) at init time
        self.weight_scaling = True

    def forward(self, x):
        # expand, apply the non-linearity, project back to the input width
        return self.c_proj(self.gelu(self.c_fc(x)))

In [6]:
class CausalSelfAttention(nn.Module):

    '''
    multi-head causal self-attention

    every position attends only to itself and to earlier positions, i.e. the allowed
    pattern is the lower triangle of a (T, T) matrix, e.g. for T=3:
        tensor([[1., 0., 0.],
                [1., 1., 0.],
                [1., 1., 1.]])
    the mask is never materialised here: F.scaled_dot_product_attention(is_causal=True) applies it

    config must expose n_embd and n_head, with n_embd divisible by n_head
    forward: x of shape (B, T, n_embd) -> tensor of shape (B, T, n_embd)
    '''

    def __init__(self, config):

        super().__init__()

        assert config.n_embd % config.n_head == 0 #* idea: precise split n_embd in n_head chunks

        # default bias: bool = True
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd) # fused projection, 3 cuz q, k, v
        self.c_proj = nn.Linear(config.n_embd, config.n_embd) # output projection
        self.weight_scaling = True  # marker (same boolean flag MLP sets): apply scaling of 1/sqrt(Num of residual layers)

        self.n_embd = config.n_embd
        self.n_head = config.n_head

    def forward(self, x):
        B, T, C = x.size() # batch_size, sequence_len, emb_dim
        hs = C // self.n_head # per-head width, e.g. 768 // 6 = 128

        # compute Q, K, V via a single linear layer
        # the chunks are taken in the order [q | k | v], which is the layout of GPT-2's c_attn;
        # unpacking them as k, q, v would swap queries and keys when pretrained weights are loaded

        QKV = self.c_attn(x)
        Q, K, V = QKV.split(self.n_embd, dim=-1) # out shape of each: (B, T, n_embd)

        # "promote" n_head to a batch dimension s.t. operations are applied on heads as batches
        #* idea: split n_embd in n_head chunks of size hs

        Q = Q.view(B, T, self.n_head, hs).transpose(1, 2) # (B, nh, T, hs)
        K = K.view(B, T, self.n_head, hs).transpose(1, 2) # (B, nh, T, hs)
        V = V.view(B, T, self.n_head, hs).transpose(1, 2) # (B, nh, T, hs)

        # fused attention kernel, equivalent to the explicit formulation:
        #   att = softmax(mask_future(Q @ K^T / sqrt(hs)), dim=-1)   # (B, nh, T, T)
        #   y   = att @ V                                            # (B, nh, T, hs)

        y = F.scaled_dot_product_attention(Q, K, V, is_causal=True) # flash attention

        # "demote" nh back into the embedding dim: (B, nh, T, hs) -> (B, T, nh * hs = C)

        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # out projection

        y = self.c_proj(y)
        return y

In [7]:
class Block(nn.Module):
    '''
    one transformer block made of two residual sub-layers, each normalised before use:
        x = x + attn(ln_1(x))
        x = x + mlp(ln_2(x))
    '''

    def __init__(self, config):
        super().__init__()

        width = config.n_embd

        # attention sub-layer
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)

        # feed-forward sub-layer
        self.ln_2 = nn.LayerNorm(width)
        self.mlp  = MLP(config)

    def forward(self, x):
        attn_out = self.attn(self.ln_1(x))
        h = x + attn_out
        return h + self.mlp(self.ln_2(h))

In [8]:
class GPT(nn.Module):

    """
    decoder only architecture for autoregressive prediction of next token
    clean residual flow from top-to-bottom is applied

    config must expose: block_size, vocab_size, n_layer, n_head, n_embd
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(
            dict(
                wte  = nn.Embedding(config.vocab_size, config.n_embd), # token embeddings
                wpe  = nn.Embedding(config.block_size, config.n_embd), # positional embeddings: one row per position, so block_size rows
                h    = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), #! [Block(config)]*n fills list with same instance!
                ln_f = nn.LayerNorm(config.n_embd)
            ),
        )
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size) # default bias: bool = True

        # weight sharing/tying scheme: the token embedding and the output projection share one tensor

        self.transformer.wte.weight = self.lm_head.weight

        self.apply(self._init_weights) # recursively call this func on every submodule of self, look at docs of nn.Module.apply
        self._scale_residual_projections() # must run after the generic init, otherwise it would be overwritten


    def _init_weights(self, module):
        '''
        generic init: N(0, 0.02) for embedding and linear weights, zeros for linear biases
        the default init of nn.LayerNorm is good and is left untouched
        '''

        if isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0, std=.02)
        elif isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0, std=.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

    def _scale_residual_projections(self):
        '''
        re-initialise the projections that write into the residual stream with
        std = 0.02 / sqrt(2 * n_layer): each of the n_layer blocks adds two residual
        contributions (attention and mlp), so this keeps the variance of the stream bounded

        a projection is selected when its parent module carries a truthy `weight_scaling`
        flag and owns an nn.Linear called `c_proj`
        '''

        n_residual = 2 * self.config.n_layer
        for module in self.modules():
            proj = getattr(module, 'c_proj', None)
            if getattr(module, 'weight_scaling', False) and isinstance(proj, nn.Linear):
                # note the parentheses implied by **: (2 * n_layer) ** -0.5, not 2 * n_layer ** -0.5
                torch.nn.init.normal_(proj.weight, mean=0, std=.02 * n_residual ** -.5)

    def configure_optimizers(self, weight_decay, lr, device):
        '''
        build an AdamW optimizer with two param groups

        idea behind wd: by forcing weights to be low u force the model to use all the feature vect dimensionality
        do not wd biases and other 1d params such as LayerNorm params and such
        u do want to wd matmul weights and embeddings

        weight_decay: decay applied to params with dim >= 2
        lr:           initial learning rate
        device:       device (string or torch.device) the params live on; the fused kernel is only requested for cuda
        returns:      torch.optim.AdamW
        '''

        # get all params that require grad

        trainable = [p for _, p in self.named_parameters() if p.requires_grad]

        # split them according to their shape (which implies whether to apply wd on them or not)

        params_to_be_decayed = [p for p in trainable if p.dim() >= 2]
        params_NOT_to_be_decayed = [p for p in trainable if p.dim() < 2]
        optim_groups = [
            {'params': params_to_be_decayed, 'weight_decay': weight_decay},
            {'params': params_NOT_to_be_decayed, 'weight_decay': 0.0},
        ]

        # instantiate adamW, fused only on cuda ('cuda', 'cuda:0', torch.device('cuda'), ...)

        use_fused = str(device).startswith('cuda')
        optimizer = torch.optim.AdamW(optim_groups, lr=lr, betas=(.9, .95), fused=use_fused) # , eps=1e-8 just as the default
        return optimizer


    def forward(self, idx, targets = None):

        '''
        idx:     (B, T) long tensor, the sentence casted to tokens, with T <= config.block_size
        targets: optional (B, T) long tensor of next-token ids
        returns: (logits, loss) with logits of shape (B, T, vocab_size) and loss the mean
                 cross entropy over all B*T positions, or None when targets is None
        '''

        B, T = idx.size()
        assert T <= self.config.block_size, f"Can't process seqs with num of tokens larger than {self.config.block_size}, got {T}"

        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos) # T, n_embd
        idx_emb = self.transformer.wte(idx) # B, T, n_embd
        x = pos_emb + idx_emb # broadcasting pos_emb over the batch dim

        for block in self.transformer.h:
            x = block(x) # B, T, n_embd

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            # idea: logits shape: (B, T, vocab_size), targets: (B, T)
            # get logits as: (B*T, vocab_size) and targets as: (B*T,) of correct idxs
            flatten_logits = logits.view(-1, logits.size(-1))
            flatten_targets = targets.view(-1)
            loss = F.cross_entropy(flatten_logits, flatten_targets)

        return logits, loss # B, T, vocab_size

In [9]:
def sample(x, max_len = 30, net = None, top_k = 50):
    '''
    autoregressively extend every row of x with top-k sampling until it holds max_len tokens

    x:       (B, T) long tensor of prompt tokens
    max_len: total sequence length to reach (prompt included); it should not exceed the
             context length accepted by the model
    net:     callable returning either logits of shape (B, T, vocab_size) or a tuple whose
             first element is such a tensor; defaults to the notebook-level `model`
    top_k:   number of most likely tokens kept before sampling (clamped to the vocab size)
    returns: (B, max_len) long tensor (x itself when it is already long enough)

    minimal example from topk to gather:
    x: shape(2, 5), already softmaxed
    x:tensor([[0.1738, 0.1881, 0.4180, 0.0235, 0.1965],
              [0.1004, 0.0591, 0.0454, 0.4973, 0.2977]])

    topk(k = 3):
    topk_probs: tensor([[0.4180, 0.1965, 0.1881],
                        [0.4973, 0.2977, 0.1004]])
    topk_idxs:  tensor([[2, 4, 1],
                        [3, 4, 0]])

    multinomial sampling out of topk_probs
    ix: tensor([[0],
                [0]])
    ix: idxs used to retrieve sampled value out of topk_idxs

    collect results with torch.gather:
    xcol: tensor([[2],
                  [3]])
    xcol: tokens sampled out of vocab
    '''

    if net is None:
        net = model # fall back to the notebook-level model

    with torch.no_grad():
        while x.size(1) < max_len:
            out = net(x)
            # GPT.forward returns (logits, loss): keep only the logits
            logits = out[0] if isinstance(out, tuple) else out
            logits = logits[:, -1, :] # select the prediction made at the last position of every row
            probs = F.softmax(logits, dim=-1) # prob dist over the vocab for the next token

            # keep only the k most likely tokens s.t. whatever is too unlikely is dropped
            # returns prob values and their associated idxs in the original tensor
            k = min(top_k, probs.size(-1)) # topk raises when k is larger than the vocab
            topk_probs, topk_idxs = probs.topk(k, dim=-1)

            # actual sampling: one draw per row, returns positions inside topk_probs
            # (multinomial does not need the probs to be renormalised)
            ix = torch.multinomial(topk_probs, num_samples=1)
            # map the sampled positions back to vocabulary ids
            xcol = torch.gather(topk_idxs, dim=-1, index=ix)
            x = torch.cat((x, xcol), dim=1) # append col of just-sampled tokens to input and refeed to model

    return x


def decode(x):
    '''
    print the first num_return_seqs rows of x as text, one line each prefixed by '>'
    every row is cut to at most max_len tokens before being decoded with `enc`

    relies on the notebook-level names num_return_seqs, max_len and enc
    NOTE(review): `enc` is not defined in the visible cells - confirm it is created before calling this
    '''
    rows = (x[row, :max_len].tolist() for row in range(num_return_seqs))
    for token_ids in rows:
        print('>', enc.decode(token_ids))

In [10]:
# class DataLoaderLite:

#     def __init__(self, B, T):

#         # good batch sizes: 8 16 24 32 48 64 128 256

#         self.B = B
#         self.T = T

#         tiny_shakps = '/home/marconobile/Videos/courses/karpathy/data/input.txt'
#         with open(tiny_shakps, 'r') as f:
#             text = f.read()

#         # tokenize it

#         encoder = tiktoken.get_encoding("gpt2")
#         encoded_text = encoder.encode(text)
#         self.tokens = torch.tensor(encoded_text, dtype=torch.long)

#         print(f'Loaded {len(self.tokens)} tokens')
#         print(f'1 epoch has: {len(self.tokens) // (B * T)} batches')

#         # text cursor
#         self.current_position = 0

#     def next_batch(self):
#         B, T = self.B, self.T

#         # read text at chunks of B*T tokens
#         # idea: load enough tokens to fill a batch of size B with sequences of lenght T
#         # +1 to get label of last token in buffer

#         buffer = self.tokens[self.current_position: self.current_position + B*T + 1]
#         x = buffer[:-1].view(B, T)
#         y = buffer[1:].view(B, T)

#         # avance text_ptr by B*T
#         # if at end of file: restart

#         self.current_position += B*T # shift by how much text has been read
#         if self.current_position + (B*T + 1) > len(self.tokens):
#             self.current_position = 0

#         return x, y

In [2]:
class DataLoaderLite:
    '''
    minimal sequential loader over one tokenized text file, shardable across processes

    the token stream is cut in consecutive chunks of B*T tokens that are dealt out
    round-robin to the processes, e.g. with 3 processes (digit = rank that reads the chunk):
        [ 0 | 1 | 2 | 0 | 1 | 2 | ... ]
    so rank r starts at offset B*T*r and then jumps ahead by B*T*num_processes on every call

    B:             micro batch size (good batch sizes: 8 16 24 32 48 64 128 256)
    T:             sequence length
    process_rank:  rank of the calling process, 0 <= process_rank < num_processes
    num_processes: number of processes sharing the file (1 when not using ddp)
    data_path:     text file to load, defaults to DEFAULT_DATA_PATH
    '''

    # NOTE(review): machine specific absolute path, kept only as the backward compatible default
    DEFAULT_DATA_PATH = '/home/marconobile/Videos/courses/karpathy/data/input.txt'

    def __init__(self, B, T, process_rank, num_processes, data_path=None):

        self.B = B
        self.T = T
        self.process_rank = process_rank
        self.num_processes = num_processes

        if data_path is None:
            data_path = self.DEFAULT_DATA_PATH
        with open(data_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # tokenize it

        encoder = tiktoken.get_encoding("gpt2")
        encoded_text = encoder.encode(text)
        self.tokens = torch.tensor(encoded_text, dtype=torch.long)
        self._check_enough_tokens()

        print(f'Loaded {len(self.tokens)} tokens')
        print(f'1 epoch has: {len(self.tokens) // (B * T)} batches')

        # text cursor
        # rank 0 starts at 0 and reads [0, B*T)
        # rank 1 starts at B*T and reads [B*T, 2*B*T)
        # rank 2 starts at 2*B*T and reads [2*B*T, 3*B*T)
        self.current_position = self._start_position()

    def _start_position(self):
        # offset of the first chunk owned by this rank
        return self.B * self.T * self.process_rank

    def _check_enough_tokens(self):
        # every rank must be able to read one full batch plus the label of its last token
        needed = self.B * self.T * self.num_processes + 1
        if len(self.tokens) < needed:
            raise ValueError(
                f'need at least {needed} tokens for B={self.B}, T={self.T}, '
                f'num_processes={self.num_processes}, got {len(self.tokens)}'
            )

    def next_batch(self):
        '''
        return the next (x, y) pair of (B, T) long tensors, y being x shifted left by one token
        '''
        B, T = self.B, self.T

        # read text at chunks of B*T tokens
        # idea: load enough tokens to fill a batch of size B with sequences of length T
        # +1 to get label of last token in buffer

        buffer = self.tokens[self.current_position: self.current_position + B*T + 1]
        x = buffer[:-1].view(B, T)
        y = buffer[1:].view(B, T)

        # advance the cursor past the chunks read by all the processes in this round
        # if the next round would run past the end of the file: restart from this rank's first chunk

        self.current_position += B * T * self.num_processes
        if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
            self.current_position = self._start_position()

        return x, y

In [11]:
# x = list(range(100))
# y = [get_lr(step) for step in x]
# print(y[-1])
# plt.plot(x, y)

# print(loss, -math.log(1/config.vocab_size))
# tensor(11.1026, grad_fn=<NllLossBackward0>) 10.82490511970208

In [12]:
import torch.distributed


# `device`, `is_ddp` and `ddp_local_rank` are defined by the DDP set-up cell.
# Single-process runs are pinned to the cpu here (as before); under DDP the per-rank
# 'cuda:<local_rank>' device must be kept, because the nccl backend cannot work with a model on the cpu.
if not is_ddp:
    device = 'cpu'
num_return_seqs = 2
max_len = 30
config = GPTConfig(vocab_size=50304) # 50304 is divisible by more powers of 2 than 50257
# the padding does not change the functioning of the model: at input the extra token ids are never produced
# by the tokenizer, at output they never appear as targets, so training only pushes their logits down
# the cleaner size is expected to be faster because gpu kernels handle awkward sizes less efficiently
model = GPT(config)
model.train() # the model is trained right below, so it must not be left in eval mode
model.to(device)
model = torch.compile(model)
if is_ddp:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[ddp_local_rank]) # synchronizes and avg the grad on each model over the devices
raw_model = model.module if is_ddp else model # the module that owns configure_optimizers


# learning rate schedule: linear warmup followed by a cosine decay
max_lr = 6e-4
min_lr = max_lr * .1 # 10% of max_lr

warmup_steps = 10
max_steps = 50

def get_lr(step):
    '''
    learning rate to use at optimizer step `step` (0-based)

    step < warmup_steps                -> linear ramp that reaches max_lr at the last warmup step
    step > max_steps                   -> min_lr
    warmup_steps <= step <= max_steps  -> cosine decay from max_lr down to min_lr
    '''

    if step < warmup_steps:
        lr = max_lr * (step+1) / warmup_steps
    elif step > max_steps:
        lr = min_lr
    else:
        elapsed = step - warmup_steps
        span = max_steps - warmup_steps
        decay_ratio = elapsed / span
        assert 0 <= decay_ratio <= 1
        coeff = .5 * (1.0 + math.cos(math.pi * decay_ratio)) # starts at 1 and goes to 0
        lr = min_lr + coeff * (max_lr - min_lr)
    return lr

norms = [] # gradient norm (measured before clipping) of every optimizer step
optimizer = raw_model.configure_optimizers(.1, max_lr, device)

# gradient accumulation param
B, T = 2, 128 # microbs
total_bs = 524288 # tokens per optimizer step, summed over all processes
assert total_bs % (B * T * ddp_world_size) == 0, "total_bs must be divisible by B*T*ddp_world_size"
grad_accum_steps = total_bs // (B * T * ddp_world_size)  # number of times that fwd/bckwd steps are done and grad +=
                                                         # now we have ddp_world_size processes so we process (B * T) tokens on each gpu
                                                         # nb: ddp_world_size is the count of GPU

train_loader = DataLoaderLite(
    B=B,
    T=T,
    process_rank = ddp_rank,
    num_processes = ddp_world_size,
)

# tokens consumed by one optimizer step; it does not change between steps, so it is computed once
n_tokens_processed = train_loader.B * train_loader.T * grad_accum_steps * ddp_world_size

# autocast has to target the device the tensors really live on, not whatever hardware happens to be installed
autocast_device_type = 'cuda' if str(device).startswith('cuda') else 'cpu'

model.train() # make sure the model is in training mode before optimizing

for step in range(max_steps):

    loss_accum = 0.0 # becomes a tensor after the first micro step

    # mandatory: gradients accumulate across backward calls

    optimizer.zero_grad()

    for micro_step in range(grad_accum_steps):

        # get data from data loader and ship it to the device

        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)

        if is_ddp:
            # without this line ddp would synchronize the processes at every backward call, but we only want
            # to sync them at the last micro step b4 the application of the grad
            # this field is also used by the forward pass, so it is set before calling the model
            model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1) # this is just bcuz we are using gradient accumulation

        # forward step with mixed precision

        with torch.autocast(device_type=autocast_device_type, dtype=torch.bfloat16):
            logits, loss = model(x, y)

        # the loss is a mean over the micro batch: dividing by grad_accum_steps makes the
        # accumulated gradient the mean over the whole batch instead of a sum of means

        loss = loss / grad_accum_steps
        loss_accum += loss.detach()

        loss.backward()

    # when we get here all devices hold the averaged gradients, but loss_accum is still local to each device
    if is_ddp:
        torch.distributed.all_reduce(loss_accum, op=torch.distributed.ReduceOp.AVG) # all devices now will have the overall loss_accum avgd

    # clip grad norm (in-place variant; the un-suffixed clip_grad_norm is deprecated)
    # avoids shocks to the model during optimization; PRINTABLE expected trend from high to low and stabilize

    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    norms.append(norm.item())

    # define lr for this step
    lr = get_lr(step)
    for pg in optimizer.param_groups:
        pg['lr'] = lr

    # optimize

    optimizer.step()

    if master_process:
        print(f'Step {step} loss: {loss_accum.item()}, grad_norm: {norm.item()}')

if is_ddp:
    destroy_process_group()

Loaded 338025 tokens
1 epoch has: 1320 batches


2.0

6.0