In [1]:
import zlib

import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

from mingpt.utils import set_seed


In [2]:

import pickle

# Seed through minGPT's helper before any random data is generated.
set_seed(1234)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)



In [3]:
class SortDataset(Dataset):
    """ 
    Dataset for the Sort problem. E.g. for problem length 6:
    Input: 0 0 2 1 0 1 -> Output: 0 0 0 1 1 2
    Which will feed into the transformer concatenated as:
    input:  0 0 2 1 0 1 0 0 0 1 1
    output: I I I I I 0 0 0 1 1 2
    where I is "ignore", as the transformer is reading the input sequence

    Examples are generated randomly on every access; the requested index is
    not used. Every generated input is assigned to exactly one split by a
    checksum of its digits, so the same input always belongs to the same
    split, also across interpreter restarts.

    Args:
        split: which split to draw from, either 'train' or 'test'.
        length: number of digits in the sequence to be sorted.
        num_digits: digits are drawn from range(num_digits); this is also
            the vocabulary size.
    """

    def __init__(self, split, length=6, num_digits=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        self.num_digits = num_digits
    
    def __len__(self):
        """Nominal dataset size; examples are generated on the fly."""
        return 10000 # ...
    
    def get_vocab_size(self):
        """Number of distinct tokens, i.e. the number of digits."""
        return self.num_digits
    
    def get_block_size(self):
        """Length of the token sequence fed into the transformer."""
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return self.length * 2 - 1

    def __getitem__(self, idx):
        """
        Generate one random example belonging to this dataset's split.

        Args:
            idx: ignored; a fresh random example is produced on every call.

        Returns:
            A tuple (x, y) of 1-D long tensors with 2 * length - 1 elements.
            x is the input digits followed by all but the last sorted digit;
            y is x shifted by one position, with the first length - 1 entries
            set to -1 so that only the sorted part carries a target.

        Note:
            Sampling repeats until an input of the requested split is drawn,
            so it does not terminate if no possible input maps to that split.
        """
        
        # use rejection sampling to generate an input example from the desired split
        while True:
            # generate some random integers
            inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long)
            # half of the time let's try to boost the number of examples that 
            # have a large number of repeats, as this is what the model seems to struggle
            # with later in training, and they are kind of rare
            if torch.rand(1).item() < 0.5:
                if inp.unique().nelement() > self.length // 2:
                    # too many unique digits, re-sample
                    continue
            # figure out if this generated example is train or test based on a checksum
            # of its digits. The builtin hash() is not used here because Python salts the
            # hash of bytes/str per process, which would reshuffle the split on every run.
            key = ",".join(str(digit) for digit in inp.tolist()).encode("ascii")
            h = zlib.crc32(key)
            inp_split = 'test' if h % 4 == 0 else 'train' # designate 25% of examples as test
            if inp_split == self.split:
                break # ok
        
        # solve the task: i.e. sort
        sol = torch.sort(inp)[0]

        # concatenate the problem specification and the solution
        cat = torch.cat((inp, sol), dim=0)

        # the inputs to the transformer will be the offset sequence
        x = cat[:-1].clone()
        y = cat[1:].clone()
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:self.length-1] = -1
        return x, y

In [4]:
# Build both splits and show one training example as "input target" pairs,
# one token position per line (-1 marks positions excluded from the loss).
train_dataset, test_dataset = SortDataset('train'), SortDataset('test')
x, y = train_dataset[0]
for a, b in zip(x.tolist(), y.tolist()):
    print(a, b)

0 -1
0 -1
0 -1
2 -1
2 -1
1 0
0 0
0 0
0 1
1 2
2 2


In [5]:
# create a GPT instance
from mingpt.model import GPT

# start from minGPT's default configuration and override only what this task needs
model_config = GPT.get_default_config()
# 'gpt-nano' is a minGPT model preset name (presumably the smallest one -- confirm in mingpt.model)
model_config.model_type = 'gpt-nano'
# vocabulary size and sequence length are taken from the dataset so model and data agree
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model = GPT(model_config)

number of parameters: 0.09M


In [6]:
# create a Trainer object
from mingpt.trainer import Trainer

# start from minGPT's default training configuration and override a few fields
train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
# short run: stop after 200 iterations
train_config.max_iters = 200
# presumably the number of data-loading worker processes (0 = load in the main process) -- confirm in mingpt.trainer
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [7]:
# Profile the training run with torch.profiler (Kineto backend).
# Reference: https://github.com/pytorch/pytorch/blob/main/torch/profiler/profiler.py

from torch.autograd import kineto_available, ProfilerActivity
from torch.profiler import profile, schedule, tensorboard_trace_handler

# Step schedule passed to the profiler below; the profiler advances one step per
# training batch (see batch_end_callback).
# NOTE(review): active=500 is larger than train_config.max_iters = 200, so the active
# window is still open when training ends.
tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=500, repeat=1)

# Handler that writes uncompressed traces for TensorBoard.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
trace_handler = tensorboard_trace_handler(
    dir_name="/scratch/user/siweicui/xllm/kineto/tracing/trace_data/counter_tracing/",
    use_gzip=False,
)

# An earlier variant of this call used the same activities, schedule, handler,
# record_shapes and experimental_config, but without with_stack, profile_memory
# and with_flops.
#
# Known warning when CPU activity is combined with the metrics below:
# [W kineto_shim.cpp:157] Cannot run range profiler with CPU activities, please only use CUDA activity type
with profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    schedule=tracing_schedule,
    on_trace_ready=trace_handler,
    record_shapes=True,
    with_stack=True,
    profile_memory=True,
    with_flops=True,
    experimental_config=torch.profiler._ExperimentalConfig(
        profiler_metrics=[
            "kineto__tensor_core_insts",
            "dram__bytes_read.sum",
            "dram__bytes_write.sum",
        ],
        profiler_measure_per_kernel=True,
    ),
) as prof:

    def batch_end_callback(trainer):
        """Log progress every 100 iterations and advance the profiler by one step."""
        is_logging_iter = trainer.iter_num % 100 == 0
        if is_logging_iter:
            print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
        # tell the profiler that one training step has finished
        prof.step()

    trainer.set_callback('on_batch_end', batch_end_callback)
    trainer.run()

    
    


Running Trainer on device: cuda
iter_dt 0.00ms; iter 0: train loss 1.11904


[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:157] Cannot run range profiler with CPU activities, please only use CUDA activity type
STAGE:2024-01-25 16:09:27 64601:64601 ActivityProfilerController.cpp:311] Completed Stage: Warm Up


iter_dt 47.93ms; iter 100: train loss 0.16789


STAGE:2024-01-25 16:09:40 64601:64601 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2024-01-25 16:09:41 64601:64601 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [8]:
# Top 10 profiled operators ranked by total CUDA time.
cuda_time_summary = prof.key_averages()
print(cuda_time_summary.table(sort_by="cuda_time_total", row_limit=10))

# https://discuss.pytorch.org/t/understanding-memory-profiler-output-autograd-profiler-with-memory-stats/101704
# Negative memory (mostly found in self) indicate deallocation.

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*        35.34%        4.367s        76.73%        9.481s      50.164ms       0.000us         0.00%     220.835ms       1.168ms           0 

In [9]:
# prof.key_averages()

In [10]:
# Top 10 profiled operators ranked by self CPU memory usage.
cpu_mem_summary = prof.key_averages()
print(cpu_mem_summary.table(sort_by="self_cpu_memory_usage", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              aten::cat         1.10%     136.486ms         1.89%     234.009ms      17.782us      11.085ms         2.07%      11.475ms       0.872us       3.12 M