In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
import time
from collections import defaultdict
import bitsandbytes as bnb
from torch.utils.data import Dataset, DataLoader, RandomSampler, Sampler
import numpy as np
from torch.optim.lr_scheduler import CosineAnnealingLR
from gptmodel import GPTLanguageModel, load_checkpoint, save_checkpoint

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

load = True  # load model from checkpoint?
save = True  # save model and data?
load_model_number = "01"
save_model_number = "01"
resume_step = "43100"
load_path = f'checkpoints/model{load_model_number}'
save_path = f'checkpoints/model{save_model_number}'
LOG_FILE = f'train_data/model{save_model_number}_data.csv'

start_optim_step = 0  # default start step
optim_step = start_optim_step
start_epoch = 0
start_block = 0
block = start_block
max_epochs = 1

train_dataset_path = "openwebtext1/train_data.bin"
test_dataset_path = "openwebtext1/test_data.bin"
train_map_paths = ["openwebtext1/train_shuffle_map0-128block.bin"]

# evrything here can be changed each training session to optimize learning
minibatch_size = 16   # effective batch size is minibatch_size * accumulation_steps
accumulation_steps = 64
block_size = 128
learning_rate = 1e-4
save_iters = 100
eval_iters = 64

# everything below here NEEDS to be identical to load an extistng model
vocab_size = 30000
n_embed = 1536
n_head = 24
n_layer = 32
dropout = 0.2

In [2]:
# Datasets and dataloading
class FastSeekSampler(Sampler):
    def __init__(self, map_path, start_pos):
        self.indices_np = np.memmap(map_path, dtype=np.int64, mode='r')
        self.start_block = start_pos
        self.indices_torch = torch.from_numpy(self.indices_np)

    def __iter__(self):
        remaining = self.indices_torch[self.start_block:]
        
        yield from remaining.tolist()

    def __len__(self):
        return len(self.indices_torch) - self.start_block
        
class BinDataset(Dataset):
    def __init__(self, data_path, block_size):
        self.block_size = block_size
        # creates a pointer to the file without loading it.
        self.data = np.memmap(data_path, dtype=np.uint16, mode='r')
        
    def __len__(self):
        return len(self.data) - self.block_size - 1

    def __getitem__(self, i):
        i = i * block_size
        chunk = self.data[i : i + self.block_size + 1]
        
        # from_numpy is zero-copy It points to the memmap memory.
        t = torch.from_numpy(chunk.astype(np.int64))
        
        x = t[:-1] # Input
        y = t[1:]  # Target (shifted right)
        return x, y
    
def get_dataloaders(data_path, batch_size, block_size, map_path, start_pos=0):
    dataset=BinDataset(data_path, block_size)

    sampler = FastSeekSampler(map_path, start_pos)

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=2,      # More than 4 often causes overhead
        pin_memory=True,    # Fast transfer to GPU
        drop_last=True      # Avoids partial batches that break FSDP shapes
    )
    
    return dataset, loader, sampler  

val_data = BinDataset(test_dataset_path, block_size)

In [3]:
# DEFINE ESTIMATE LOSS FUNCTION
@torch.no_grad()
def estimate_loss(step):
    out = {}
    eval_gen = torch.Generator(device='cpu').manual_seed(step+123)
    m.eval()
    for split in ['train', 'val']:
        data = train_data.data if split == 'train' else val_data.data
        out[split] = 0
        for k in range(eval_iters):
            ix = torch.randint(len(data), (minibatch_size,), generator=eval_gen)
            x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
            y = torch.stack([torch.from_numpy((data[i+1:i+block_size+1]).astype(np.int64)) for i in ix])
            x, y = x.to(device), y.to(device)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits, loss = m(x, y)
            out[split] += loss.item()
        out[split] /= eval_iters
    m.train()
    return out

In [4]:
# INITIALIZE AND LOAD MODEL AND OPTIMIZER

# define the model
torch.manual_seed(230) 
torch.cuda.manual_seed(230)
m = GPTLanguageModel(vocab_size, n_embed, n_head, n_layer, dropout).to(device)

# define a PyTorch optimizer
torch.manual_seed(230)
#optim = bnb.optim.PagedAdamW8bit(m.parameters(), lr=learning_rate)
optim = torch.optim.AdamW(m.parameters(), lr=learning_rate)
    
scheduler = CosineAnnealingLR(optim, T_max=62500, eta_min=8e-6)

if load:
    # check for an existing checkpoint and load if necessary
    path = f"{load_path}/model{load_model_number}_step{resume_step}.pt"
    if os.path.exists(path):
        print(f"Loading checkpoint from {path}")
        start_optim_step, start_epoch, start_block = load_checkpoint(m, optim, scheduler, path)
        optim_step = start_optim_step
        block = start_block
        print(f"Loaded succesfuly from (step: {optim_step}, epoch: {start_epoch}, block: {start_block})")
    else:
        print(f"No checkpoint found at {path}.")
        print(f"New model will be training from (step: {optim_step}, epoch: {start_epoch}, block: {start_block})")
m = torch.compile(m)
print(f"Total parameters: {sum(p.numel() for p in m.parameters()):,}")
print(f"Learning rate: {optim.param_groups[0]['lr']:.8e}")
print(f"Scheduler step: {scheduler.last_epoch}")

Loading checkpoint from checkpoints/model01/model01_step27800.pt
Loaded succesfuly from (step: 27800, epoch: 0, block: 28467200)
Total parameters: 998,554,416
Learning rate: 6.19372093e-05
Scheduler step: 27800


In [5]:
# TRAINING LOOP
train_data, train_loader, train_sampler = get_dataloaders(
    train_dataset_path, minibatch_size, block_size, train_map_paths[start_epoch], start_block
)
print(f"{time.localtime().tm_hour:02}:{time.localtime().tm_min:02}:{time.localtime().tm_sec:02}")
losses = estimate_loss(optim_step)    # estimate a base loss before training session
print(f"(step: {optim_step}, epoch: {start_epoch}, block: {start_block}), Train Loss: {losses['train']:.4f}, Val Loss: {losses['val']:.4f}")
print(f"{time.localtime().tm_hour:02}:{time.localtime().tm_min:02}:{time.localtime().tm_sec:02}")

if save:
    # if never trained log the pre optim loss in a new csv
    if optim_step == 0 or save_model_number != load_model_number:
        print(f"Initializing new data collection file at: {LOG_FILE}")
        with open(LOG_FILE, 'a') as f:
            f.write(f"step,epoch,block,train_loss,val_loss\n{optim_step:05d},{start_epoch:03d},{start_block:05d},{losses['train']:.4f},{losses['val']:.4f}\n")

# Dictionary to hold total accumulated time and count
times_tracker = defaultdict(lambda: {'time': 0.0, 'count': 0})

train_start_event = torch.cuda.Event(enable_timing=True)
train_end_event = torch.cuda.Event(enable_timing=True)

for epoch in range(start_epoch, max_epochs):
    if epoch != start_epoch:
        start_block = 0
        train_data, train_loader, train_sampler = get_dataloaders(
            train_dataset_path, minibatch_size, block_size, train_map_paths[epoch], start_block
        )
        
    optim.zero_grad(set_to_none=True)
    #running_loss = 0
    train_start_event.record()
    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
            logits, loss = m(x, y)
            loss /= accumulation_steps
        #running_loss += loss
        loss.backward()

        block += minibatch_size
        
        if (batch_idx+1) % accumulation_steps == 0:
            """with torch.no_grad():
                all_grads = torch.cat([p.grad.view(-1) for p in m.parameters() if p.grad is not None])
                
                # Measure the 'Signal' (Average magnitude of the gradient)
                grad_norm = torch.linalg.norm(all_grads).item()
                
                # Measure the 'Noise' (Standard deviation of gradient values)
                grad_std = torch.std(all_grads).item()
                
                # Noise-to-Signal Ratio
                ns_ratio = grad_std / (grad_norm + 1e-8)
                
                print(f"Step {optim_step} | Grad Norm: {grad_norm:.4f} | Noise Ratio: {ns_ratio:.6f}")"""
            optim.step()
            scheduler.step()
            optim_step += 1
            #print(f"{optim_step},{running_loss:.4f}")
            #running_loss = 0
            optim.zero_grad(set_to_none=True)

            if (optim_step - start_optim_step) % save_iters == 0:
                train_end_event.record()
                torch.cuda.synchronize()
                # elapsed_time returns milliseconds, so divide by 1000.0
                elapsed_time_sec = train_start_event.elapsed_time(train_end_event) / 1000.0
                times_tracker['train']['time'] += elapsed_time_sec
                times_tracker['train']['count'] += save_iters
                
                tic = time.perf_counter()
                losses = estimate_loss(optim_step)
                torch.cuda.synchronize()
                times_tracker['estimate']['time'] += (time.perf_counter() - tic)
                times_tracker['estimate']['count'] += 1
                
                print(f"(step: {optim_step}, epoch: {epoch}, block: {block}), Train Loss: {losses['train']:.4f}, Val Loss: {losses['val']:.4f}")
                print(f"Learning rate: {optim.param_groups[0]['lr']:.8e}")
                print(f"Scheduler step: {scheduler.last_epoch}")
                tic = time.perf_counter()
                
                if save:
                    os.makedirs(save_path, exist_ok=True)
                    save_checkpoint(optim_step, epoch, block, m, optim, scheduler, f"{save_path}/model{save_model_number}_step{optim_step}.pt")
            
                    # write a new line in our data csv
                    with open(LOG_FILE, 'a') as f:
                        f.write(f"{optim_step:05d},{epoch:03d},{block:05d},{losses['train']:.4f},{losses['val']:.4f}\n")
                    print(f"Step Documented")
                    
                times_tracker['save']['time'] += (time.perf_counter() - tic)
                times_tracker['save']['count'] += 1
                    
        
                print(f"{time.localtime().tm_hour:02}:{time.localtime().tm_min:02}:{time.localtime().tm_sec:02}")
                train_start_event.record()

  self.indices_torch = torch.from_numpy(self.indices_np)


12:45:02
(step: 27800, epoch: 0, block: 28467200), Train Loss: 3.3154, Val Loss: 3.3419
12:45:24
27801,3.3708
27802,3.3587
27803,3.4006
27804,3.3648
27805,3.3957
27806,3.3732
27807,3.4041
27808,3.3690
27809,3.3883
27810,3.4026
27811,3.3611
27812,3.3974
27813,3.3607
27814,3.3957
27815,3.3985
27816,3.3824
27817,3.3739
27818,3.3689
27819,3.3699
27820,3.4013
27821,3.3788
27822,3.3994
27823,3.4021
27824,3.3538
27825,3.3894
27826,3.4032
27827,3.3818
27828,3.4052
27829,3.3684
27830,3.3675
27831,3.4079
27832,3.3880
27833,3.3725
27834,3.3482
27835,3.3907
27836,3.3952
27837,3.3966
27838,3.3860
27839,3.3744
27840,3.4054
27841,3.3932
27842,3.3893
27843,3.3874
27844,3.3800
27845,3.3904
27846,3.3864
27847,3.3671
27848,3.3775
27849,3.3816
27850,3.3731
27851,3.3505
27852,3.3857
27853,3.4027
27854,3.3721
27855,3.3880
27856,3.4018
27857,3.4063
27858,3.3677
27859,3.3727
27860,3.3855
27861,3.3697
27862,3.3983
27863,3.3746
27864,3.3668
27865,3.3830
27866,3.3687
27867,3.3916
27868,3.3918
27869,3.3848
27870,

KeyboardInterrupt: 

In [6]:
print(f"Total time train: {times_tracker['train']['time'] /60/60:.3f}hr")
print(f"Total time estim: {times_tracker['estimate']['time'] /60/60:.3f}hr")
print(f"Total time Check: {times_tracker['save']['time'] /60/60:.3f}hr")
print(f"Average time per Optimizer step: {times_tracker['train']['time'] / times_tracker['train']['count']:.3f}sec")
print(f"Average time {save_iters} Optimizr steps: {times_tracker['train']['time'] / (times_tracker['train']['count'] / save_iters)/60:.3f}min")
print(f"Average time per Estimate  Loss: {times_tracker['estimate']['time'] / times_tracker['estimate']['count']:.3f}sec")
print(f"Average time per Chckpoint Save: {times_tracker['save']['time'] / times_tracker['save']['count']:.3f}sec")

Total time train: 66.443hr
Total time estim: 0.435hr
Total time Check: 0.278hr
Average time per Optimizer step: 15.634sec
Average time 100 Optimizr steps: 26.056min
Average time per Estimate  Loss: 10.241sec
Average time per Chckpoint Save: 6.548sec
