In [1]:
# !git clone https://github.com/karpathy/minGPT.git
!pip install -e ./minGPT
# !pip install mingpt --no-index --find-links file:///home/edge/epfl/ma3/disco/gpt-pytorch/minGPT/

Defaulting to user installation because normal site-packages is not writeable
Obtaining file:///home/edge/epfl/ma3/disco/gpt-pytorch/minGPT
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: minGPT
  Attempting uninstall: minGPT
    Found existing installation: minGPT 0.0.1
    Uninstalling minGPT-0.0.1:
      Successfully uninstalled minGPT-0.0.1
  Running setup.py develop for minGPT
Successfully installed minGPT-0.0.1


In [2]:
# Set TOKENIZERS_PARALLELISM in the kernel's environment before any tokenizer is created.
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [3]:
import json
import wandb
import torch
from torch.utils.data import Dataset, DataLoader
from mingpt.model import GPT
from mingpt.trainer import Trainer
from wikitext import get_wikitext_data
from datasets import load_dataset
from transformers import GPT2TokenizerFast

In [4]:
# Print every line of the local minGPT trainer source that contains "assert".
!cat ./minGPT/mingpt/trainer.py | grep assert

In [5]:
# Read the experiment configuration (model, data and optimiser settings) from disk.
with open('../config.json') as f:
    config = json.loads(f.read())
# Echo the parsed configuration as the cell output.
config

{'model': 'gpt-nano',
 'n_head': 3,
 'n_layer': 3,
 'n_embd': 48,
 'dataset': 'wikitext',
 'batch_size': 16,
 'seq_length': 256,
 'lr': 0.001,
 'max_iters': 1200,
 'weight_decay': 0.001,
 'optimizer': 'adamw',
 'grad_clip': 1,
 'scheduler': None,
 'dropout': 0,
 'num_workers': 4,
 'vocab_size': 50257,
 'wandb_project': 'disco-gpt-benchmark',
 'wandb_name': 'gpt-nano_wikitext_bs=16_seq=256_lr=0.001_iter=3600'}

In [6]:
# Start from minGPT's default model config and pick the architecture by name.
# n_layer / n_head / n_embd are not set here, so the values stored under those
# keys in config.json are unused by this notebook.
model_config = GPT.get_default_config()
model_type = config['model']
model_config.model_type = model_type
model_config.vocab_size = config['vocab_size']  # size of the token vocabulary
model_config.block_size = config['seq_length']  # input context length
# One dropout probability is shared by the attention, residual and embedding dropouts.
model_config.attn_pdrop = model_config.resid_pdrop = model_config.embd_pdrop = config['dropout']
# NOTE(review): confirm that GPT actually reads `weight_decay` from the model
# config; upstream minGPT appears to take it from the trainer config instead.
model_config.weight_decay = config['weight_decay']
model = GPT(model_config)

number of parameters: 2.51M


In [7]:
# Batch size and sequence length come from the experiment config.
max_length = config['seq_length']
batch_size = config['batch_size']
# Load the pretrained GPT-2 tokenizer and use its end-of-text token for padding.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [69]:
# dataset = dataset.map(
#     lambda batch: tokenizer(batch["text"], padding='max_length', max_length=max_length, truncation=True, return_tensors='pt'), 
#     remove_columns=['text'], 
#     batch_size=batch_size,
#     batched=True, 
# )
# return dataset.with_format("torch")

In [70]:
def get_wikitext(split, tokenizer, max_length=512, map_batch_size=1000):
    """Load a WikiText-103 split and tokenize it into fixed-length sequences.

    Rows whose text is empty are dropped. Every remaining row is tokenized,
    truncated to `max_length` tokens and padded up to `max_length`, so all
    examples end up with the same length.

    Args:
        split: split expression passed to `load_dataset`
            (e.g. ``'validation'`` or ``'train[:50%]'``).
        tokenizer: callable tokenizer applied to a list of strings; it must
            have a padding token configured.
        max_length: number of tokens per example after truncation/padding.
        map_batch_size: number of rows handed to the tokenizer per `map` call.
            Because every row is padded/truncated to `max_length`
            independently, this only affects tokenization throughput, not the
            resulting data. It replaces the previous implicit use of the
            notebook-global training `batch_size`.

    Returns:
        The tokenized dataset in torch format, with the `text` and
        `attention_mask` columns removed.
    """
    dataset = load_dataset("wikitext", "wikitext-103-v1", split=split)
    dataset = dataset.filter(lambda x: len(x['text']) > 0)

    def tokenize(batch):
        # Fixed-length output: pad short rows and truncate long ones.
        return tokenizer(
            batch["text"],
            padding='max_length',
            max_length=max_length,
            truncation=True,
            return_tensors='pt',
        )

    dataset = dataset.map(
        tokenize,
        remove_columns=['text'],
        batch_size=map_batch_size,
        batched=True,
    )
    dataset = dataset.remove_columns(['attention_mask'])
    dataset = dataset.with_format("torch")
    return dataset

# Tokenize the first half of the training split and the whole validation split.
train_dataset = get_wikitext(split='train[:50%]', tokenizer=tokenizer, max_length=max_length)
eval_dataset = get_wikitext(split='validation', tokenizer=tokenizer, max_length=max_length)
train_dataset, eval_dataset

Map:   0%|          | 0/582510 [00:00<?, ? examples/s]

In [16]:
class WikitextDataset(Dataset):
    """Next-token-prediction view over a tokenized dataset.

    Each item is a pair ``(x, y)`` where ``x`` is the ``'input_ids'`` entry of
    the wrapped dataset and ``y`` is ``x`` shifted left by one position along
    the token (last) dimension, with the final position filled with
    ``eos_token_id``.

    Args:
        dataset: indexable object whose items expose an ``'input_ids'`` tensor.
        eos_token_id: token id written into the last target position
            (defaults to 50256, the value previously hard-coded here).
    """

    def __init__(self, dataset, eos_token_id=50256):
        self.dataset = dataset
        self.eos_token_id = eos_token_id

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, targets)`` for an integer index or a slice."""
        x = self.dataset[idx]['input_ids']
        y = torch.empty(x.shape, dtype=x.dtype)
        # Shift along the last (token) dimension so that both a single example
        # (1-D) and a slice of examples (2-D) get per-sequence targets. Shifting
        # along dim 0 would move whole rows when `idx` is a slice.
        y[..., :-1] = x[..., 1:]
        y[..., -1] = self.eos_token_id
        return x, y

# Wrap each split separately: the evaluation set must come from the held-out
# validation split, not from the training data.
tr_dataset = WikitextDataset(train_dataset)
ev_dataset = WikitextDataset(eval_dataset)

In [20]:
# Sanity check: index the wrapped dataset with a slice and with a single integer.
tr_dataset[:2], tr_dataset[0]

((tensor([[  796,   569, 18354,  7496, 17740,  6711,   796,   220,   198, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           50256, 50256, 502

In [8]:
# class WikitextDataset(Dataset):
#     def __init__(self, dataset, max_length=512):
#         x = dataset['input_ids']
#         size, _ = x.shape
#         y = torch.empty((size, max_length))
#         y[:,:-1] = x[:,1:]
#         y[:,-1] = torch.ones(size) * 50256
#         self.data = torch.cat((x.unsqueeze(2), y.unsqueeze(2)), dim=2)
#         self.data = self.data.reshape((size, 2, max_length))
#         self.data = self.data.to(torch.int64)
#         print(self.data.shape)

#     def __len__(self):
#         return self.data.shape[0]

#     def __getitem__(self, idx):
#         x, y = self.data[idx]
#         return x, y

# tr_dataset = WikitextDataset(train_dataset, max_length)
# ev_dataset = WikitextDataset(eval_dataset, max_length)

In [8]:
# Load the splits through the project's `wikitext` helper; from here on
# `tr_dataset` / `ev_dataset` are bound to these objects.
data = get_wikitext_data()
tr_dataset, ev_dataset = data['train'], data['val']

In [9]:
import gc
import psutil

# Run a garbage collection first so the baseline reading below is taken after cleanup.
gc.collect()
process = psutil.Process()


def get_mem():
    """Return the resident set size reported for `process`, in bytes."""
    memory = process.memory_info()
    return memory.rss


get_mem()

565850112

In [10]:
# Build the trainer configuration from the experiment config.
train_config = Trainer.get_default_config()
train_config.learning_rate = config['lr']
train_config.max_iters = config['max_iters']
train_config.batch_size = config['batch_size']
train_config.seq_length = config['seq_length']
train_config.num_workers = config['num_workers']
# minGPT builds its optimizer (and applies gradient clipping) from the
# *trainer* config, so the regularisation settings have to be set here;
# otherwise the library defaults are used instead of the config.json values.
train_config.weight_decay = config['weight_decay']
train_config.grad_norm_clip = config['grad_clip']
trainer = Trainer(train_config, model, tr_dataset)

running on device cpu


In [11]:
# ev_loader = DataLoader(
#     ev_dataset,
#     shuffle=False,
#     batch_size=batch_size,
#     num_workers=1,
# )

# @torch.no_grad()
# def custom_evaluate(model, device):
#     model.eval()
#     losses = []
#     for batch in ev_loader:
#         batch = [t.to(device) for t in batch]
#         x, y = batch
#         logits, loss = model(x, y)
#         losses.append(loss.item())
#     model.train()
#     return sum(losses) / len(losses)

In [11]:
# Start a Weights & Biases run and attach the full experiment config to it.
wandb.init(project=config['wandb_project'], name='mingpt_' + config['wandb_name'], config=config)

import time
# Wall-clock reference used by the batch callback to report elapsed time.
start = time.time()

def on_batch_end(t):
    """Trainer callback: print and log metrics for the batch that just finished.

    Args:
        t: the trainer invoking the callback; its `iter_dt`, `iter_num`,
            `iter_time` and `loss` attributes are read.
    """
    # Convert the loss to a plain float once, so that neither the printed line
    # nor the wandb payload holds a reference to the loss tensor and the
    # autograd graph attached to it.
    train_loss = float(t.loss)
    eval_loss = 0  # placeholder: evaluation is currently disabled
    mem = get_mem()
    print(f'DT: {t.iter_dt:.3f}, iter: {t.iter_num:05d}, train_loss: {train_loss:.4f}, eval_loss: {eval_loss:.4f}, mem: {mem / (1024 * 1024):.2f} MB')
    wandb.log({
        "time_s": t.iter_time - start,
        "iter": t.iter_num,
        "train/loss": train_loss,
        # "val/loss": val_loss,
        # "val/perplexity": val_perplexity,
        # "val/acc": val_acc,
        "dt_ms": t.iter_dt * 1000,
        "mem_cuda": torch.cuda.memory_allocated() / 1e9,
        "mem": mem,
    })
    # Collect after every batch so the memory readings are taken post-cleanup.
    gc.collect()

# Register the logging callback for the 'on_batch_end' event, then start training.
trainer.add_callback('on_batch_end', on_batch_end)
trainer.run()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpeacefulotter[0m ([33motters-gang[0m). Use [1m`wandb login --relogin`[0m to force relogin


DT: 0.000, iter: 00000, train_loss: 10.8374, eval_loss: 0.0000, mem: 1555.62 MB
DT: 1.204, iter: 00001, train_loss: 10.7881, eval_loss: 0.0000, mem: 1682.58 MB
DT: 1.445, iter: 00002, train_loss: 10.7253, eval_loss: 0.0000, mem: 1683.66 MB
DT: 1.198, iter: 00003, train_loss: 10.6819, eval_loss: 0.0000, mem: 1684.47 MB
DT: 1.222, iter: 00004, train_loss: 10.6323, eval_loss: 0.0000, mem: 1685.57 MB
DT: 1.240, iter: 00005, train_loss: 10.5857, eval_loss: 0.0000, mem: 1686.58 MB
DT: 1.326, iter: 00006, train_loss: 10.5292, eval_loss: 0.0000, mem: 1687.52 MB
DT: 1.129, iter: 00007, train_loss: 10.4868, eval_loss: 0.0000, mem: 1650.18 MB
DT: 1.133, iter: 00008, train_loss: 10.4268, eval_loss: 0.0000, mem: 1699.12 MB
DT: 1.004, iter: 00009, train_loss: 10.3765, eval_loss: 0.0000, mem: 1736.18 MB
DT: 1.104, iter: 00010, train_loss: 10.3270, eval_loss: 0.0000, mem: 1700.40 MB
DT: 1.117, iter: 00011, train_loss: 10.2586, eval_loss: 0.0000, mem: 1701.59 MB
DT: 1.086, iter: 00012, train_loss: 10.1

In [8]:
# Persist only the model's state dict (its weights) to ./model.pt.
torch.save(model.state_dict(), './model.pt')

In [11]:
# NOTE(review): `benchmark` is not assigned by any active code in the visible
# notebook, so this cell appears to rely on leftover kernel state and would
# raise NameError after a restart. TODO: confirm, then define it or drop the cell.
benchmark

[{'iter': 0,
  'train_loss': tensor(10.8628, grad_fn=<NllLossBackward0>),
  'eval_loss': 0,
  'time': 1698593017.1018848,
  'mem': 1785602048},
 {'iter': 1,
  'train_loss': tensor(10.5546, grad_fn=<NllLossBackward0>),
  'eval_loss': 0,
  'time': 1698593018.4563336,
  'mem': 2024230912},
 {'iter': 2,
  'train_loss': tensor(10.4861, grad_fn=<NllLossBackward0>),
  'eval_loss': 0,
  'time': 1698593019.8006442,
  'mem': 2070908928},
 {'iter': 3,
  'train_loss': tensor(10.2586, grad_fn=<NllLossBackward0>),
  'eval_loss': 0,
  'time': 1698593020.986921,
  'mem': 2083426304},
 {'iter': 4,
  'train_loss': tensor(10.2835, grad_fn=<NllLossBackward0>),
  'eval_loss': 0,
  'time': 1698593021.920062,
  'mem': 2158997504},
 {'iter': 5,
  'train_loss': tensor(10.2894, grad_fn=<NllLossBackward0>),
  'eval_loss': 0,
  'time': 1698593022.7859998,
  'mem': 2247143424},
 {'iter': 6,
  'train_loss': tensor(10.3551, grad_fn=<NllLossBackward0>),
  'eval_loss': 0,
  'time': 1698593023.662099,
  'mem': 23225057