A cute little demo showing the simplest usage of minGPT. Configured to run fine on a MacBook Air in about a minute.

In [132]:
import torch
import torchdata
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
import torchtext.datasets
from mingpt.utils import set_seed
from mingpt.bpe import get_encoder
from torch.nn.utils.rnn import pad_sequence

# Fix the random seed up front so repeated runs of the notebook are comparable.
# NOTE(review): which RNGs (python / numpy / torch) set_seed covers is defined in mingpt.utils -- confirm there.
set_seed(3407)

In [236]:
# train, validation, test = torchtext.datasets.WikiText103(root='./data')
# print("train: ", list(train)[:10])
# Load the raw text file; the context manager guarantees the file handle is
# closed even if reading fails (the bare open(...).read() leaked it).
with open('wikitext-103-raw/wiki.test.raw', encoding='utf-8') as wiki_file:
    wiki_text = wiki_file.read()
# Drop every '=' from the corpus. NOTE(review): in this file '=' appears to
# only mark headings, and WikiDataset reserves '=' as MASK_CHAR -- confirm
# that is the reason for stripping it.
wiki_text = wiki_text.replace("=", "")
# Quick sanity check: first characters and total character count.
print(wiki_text[:10])
print(len(wiki_text))


 
  Robert
1285073


In [237]:
import pickle

class WikiDataset(Dataset):
    """Character-level language-modelling dataset over one in-memory string.

    Every distinct character of ``data`` becomes one vocabulary entry; ids are
    assigned in sorted character order. Sample ``idx`` is the window of
    ``block_size + 1`` characters starting at offset ``idx``, returned as an
    input tensor and a target tensor shifted one character to the right.

    Args:
        data: the full text to draw windows from.
        block_size: number of characters in each input sequence.

    Attributes:
        stoi: mapping character -> integer id.
        itos: mapping integer id -> character.
        MASK_CHAR, PAD_CHAR: marker characters kept for callers; this class
            neither adds them to the vocabulary nor uses them itself.
    """

    def __init__(self, data, block_size):
        chars = sorted(set(data))
        self.MASK_CHAR = u"\u003D" # the equals character
        self.PAD_CHAR = u"\u25A1" # empty square character for padding

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}

        self.block_size = block_size
        self.vocab_size = len(chars)
        print("vocab size is: " + str(self.vocab_size))
        self.data_size = len(data)
        self.data = data

    def __len__(self):
        """Return the number of full ``block_size + 1`` windows available.

        Clamped at zero: a negative value would make ``len()`` raise
        ``ValueError`` when the text is shorter than ``block_size``.
        """
        return max(0, len(self.data) - self.block_size)

    def get_vocab_size(self):
        """Return the number of distinct characters seen in the text."""
        return self.vocab_size

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair of int64 tensors for window ``idx``.

        ``x`` holds the ids of ``block_size`` consecutive characters and ``y``
        the ids of the same window shifted by one character.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Without this check, out-of-range indices yielded truncated or
                empty tensors and plain iteration over the dataset never ended.
        """
        n = len(self)
        if idx < 0:
            # Follow the usual sequence convention for negative indices.
            idx = idx + n
        if not 0 <= idx < n:
            raise IndexError(f"sample index {idx} out of range for {n} samples")
        # One extra character so inputs and targets can both be block_size long.
        chunk = self.data[idx:idx + self.block_size + 1]
        ids = [self.stoi[ch] for ch in chunk]
        x = torch.tensor(ids[:-1], dtype=torch.long)
        y = torch.tensor(ids[1:], dtype=torch.long)
        return x, y
          


In [238]:
# Context length in characters: passed both to WikiDataset (window size of each
# sample) and to the model config below, so the two always agree.
block_size = 500

In [239]:
# Build the dataset and display its first sample.
# Note: `x` here holds the whole (input, target) tuple returned by
# WikiDataset.__getitem__, not just the input tensor.
train_dataset = WikiDataset(wiki_text, block_size)
x = train_dataset[0]
x

vocab size is: 258


(tensor([ 1,  0,  1,  1, 50, 76, 63, 66, 79, 81,  1, 34, 76, 82, 73, 81, 66, 79,
          1,  1,  0,  1,  0,  1, 50, 76, 63, 66, 79, 81,  1, 34, 76, 82, 73, 81,
         66, 79,  1, 70, 80,  1, 62, 75,  1, 37, 75, 68, 73, 70, 80, 69,  1, 67,
         70, 73, 74,  1, 13,  1, 81, 66, 73, 66, 83, 70, 80, 70, 76, 75,  1, 62,
         75, 65,  1, 81, 69, 66, 62, 81, 79, 66,  1, 62, 64, 81, 76, 79,  1, 15,
          1, 40, 66,  1, 69, 62, 65,  1, 62,  1, 68, 82, 66, 80, 81,  1, 32, 14,
         32,  1, 80, 81, 62, 79, 79, 70, 75, 68,  1, 79, 76, 73, 66,  1, 76, 75,
          1, 81, 69, 66,  1, 81, 66, 73, 66, 83, 70, 80, 70, 76, 75,  1, 80, 66,
         79, 70, 66, 80,  1, 52, 69, 66,  1, 34, 70, 73, 73,  1, 70, 75,  1, 19,
         17, 17, 17,  1, 15,  1, 52, 69, 70, 80,  1, 84, 62, 80,  1, 67, 76, 73,
         73, 76, 84, 66, 65,  1, 63, 86,  1, 62,  1, 80, 81, 62, 79, 79, 70, 75,
         68,  1, 79, 76, 73, 66,  1, 70, 75,  1, 81, 69, 66,  1, 77, 73, 62, 86,
          1, 40, 66, 79, 76,

In [240]:
# create a GPT instance
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
# Character-level vocabulary taken from the dataset itself -- not the 267,735-word
# vocabulary described at https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/
model_config.vocab_size = train_dataset.get_vocab_size()
# Must match the block_size the dataset was built with.
model_config.block_size = block_size
model = GPT(model_config)

number of parameters: 0.12M


In [241]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
# Short demo run: stop after 100 iterations.
train_config.max_iters = 100
# NOTE(review): presumably the number of DataLoader worker processes (0 = load in the main process) -- confirm in mingpt.trainer.
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [242]:
def batch_end_callback(trainer):
    """Print a one-line progress report on every 10th training iteration.

    Args:
        trainer: object exposing ``iter_num``, ``iter_dt`` and ``loss``
            (``loss`` must provide ``.item()``). ``iter_dt`` is multiplied by
            1000 and labelled "ms", so it is presumably in seconds.
    """
    # Stay quiet on iterations that are not a multiple of 10.
    if trainer.iter_num % 10 != 0:
        return
    elapsed_ms = trainer.iter_dt * 1000
    loss_value = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {trainer.iter_num}: train loss {loss_value:.5f}")
# Register the progress printer for the 'on_batch_end' event, then start training.
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

iter_dt 0.00ms; iter 0: train loss 5.56285
iter_dt 1324.71ms; iter 10: train loss 5.08085
iter_dt 1375.60ms; iter 20: train loss 4.71569
iter_dt 1334.08ms; iter 30: train loss 4.37872
iter_dt 1345.50ms; iter 40: train loss 4.07486
iter_dt 1328.63ms; iter 50: train loss 3.83190
iter_dt 1330.18ms; iter 60: train loss 3.61901
iter_dt 1348.46ms; iter 70: train loss 3.44152
iter_dt 1372.46ms; iter 80: train loss 3.30257
iter_dt 1346.94ms; iter 90: train loss 3.18946


In [243]:
# now let's perform some evaluation
# NOTE(review): eval() presumably switches the model to inference behaviour (e.g. dropout off) -- confirm against mingpt.model.
# The trailing ';' keeps the notebook from echoing the call's return value.
model.eval();

In [244]:
with torch.no_grad():
    # Sample a continuation of a short prompt from the trained model.
    context = "Robert Boulter"
    # Encode the prompt character by character; the outer list adds the
    # leading batch dimension, giving a tensor of shape (1, len(context)).
    x = torch.tensor(
        [[train_dataset.stoi[ch] for ch in context]],
        dtype=torch.long,
    ).to(trainer.device)
    # NOTE(review): 500 is presumably the number of new tokens to generate -- confirm in mingpt.model.
    # [0] picks the single sequence out of the batch.
    y = model.generate(x, 500, temperature=1.0, do_sample=True, top_k=10)[0]
    # Decode the ids back into text (prompt followed by the generated characters).
    completion = ''.join(train_dataset.itos[int(token_id)] for token_id in y)
    print(completion)


Robert Boultereealesasshea ntesn sitedaernsesendtt  sio h hastrsoson hiarosietor  srnartes eie  seda r a atro inon ithilat  onrroaa tesriea tho hetito estit rt ienssn stal tasi eoiororo teseearhoeoinrnto h ro ededaitr arshei sss eiitolinee trea onrs ilannetitt tt oanr oeoisosretstoind  antoin tr  il etaat staeolsheis  halsarta rennnnoe sn teat soos anioeote tseeee htt hss otit  n  tee settr a hsr s shtta totiolissstttean e ner  t  et aresee iesn i ss ts eeriee rthe etoitinilalaieeinninialtedei insedi teos rit
