In [42]:
from mingpt.model import GPT
import json

'''using a gpt2 vocab and tokenizer with vocabulary dimension of 50257'''
with open('./gpt_experiment/gpt2_vocab.json',encoding="ascii",errors=" ") as json_file: 
    data = json.load(json_file)
model_config = GPT.get_default_config()
model_config.model_type='gpt-nano'
model_config.vocab_size = len(data)
model_config.block_size =48

In [2]:
model=GPT(model_config)

number of parameters: 2.50M


In [45]:
# first try run without any parameter optimization
import torch
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenized_input=torch.tensor([tokenizer("Vice distortions")['input_ids']], dtype=torch.long)
with torch.no_grad():
    cat = model.generate(tokenized_input, 4, do_sample=False)
cat


tensor([[47910, 47876,    13,   198,   198,   198]])

In [4]:
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 48)
    (wpe): Embedding(48, 48)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=48, out_features=144, bias=True)
          (c_proj): Linear(in_features=48, out_features=48, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=48, out_features=192, bias=True)
          (c_proj): Linear(in_features=192, out_features=48, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAtten

In [5]:
# clean and tokenize every sentence in the dataset
import random
import requests, json
from torch.utils.data import Dataset

class SentenceDataset(Dataset):

    def __init__(self, split, length=48):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        API_URL =  "https://datasets-server.huggingface.co/first-rows?dataset=bookcorpus&config=plain_text&split=train"
        def query():
            response = requests.request("GET", API_URL)
            return response.json()
        data = query()
        tokenized_sentences=[]
        sentences=data['rows']
        for i in range(100):
            tokenized_sen=torch.tensor([tokenizer(sentences[i]['row']['text'])['input_ids']], dtype=torch.long)
            if(len(tokenized_sen)>length):
                tokenized_sen=tokenized_sen[:][:length]
            tokenized_sentences.append(tokenized_sen)
        self.samples=tokenized_sentences
    
    def __len__(self):
        return 100
    
    def get_vocab_size(self):
        return self.num_digits
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return self.length #* 2-1

    def __getitem__(self, idx):
        sentence=self.samples[idx][0]
        residual_length=192-len(sentence)
        masked=torch.cat((torch.tensor(sentence[:-1]),torch.tensor([50256]*(residual_length+1))),0)
        sol=torch.cat((sentence,torch.tensor([50256]*(residual_length))),0)
        sol[0]=50256
        return masked, sol
        #return sentence, sol


In [None]:
import os

outfile=open("./books.txt",'w',encoding="utf16")
for file_name in os.listdir("C:/Users/loren/Documents/Lorenzo/UniPi/BookCorpus/out_txts"):
    with open("C:/Users/loren/Documents/Lorenzo/UniPi/BookCorpus/out_txts/"+file_name,encoding="utf8") as file:
        outfile.write(file.read())
outfile.close()


In [None]:
with open("./books.txt",encoding="utf16") as file:
    tokenized_corpus=tokenizer(file.read())['input_ids']

In [8]:
import os

class BookDataset(Dataset):

    def __init__(self, split, length=48):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        tokenized_sentences=[]
        with open("./books.txt",encoding="utf16") as file:
            tokenized_corpus=torch.tensor(tokenizer(file.read())['input_ids'], dtype=torch.long)
        corpus_len=len(tokenized_corpus)-(len(tokenized_corpus)%length)
        tokenized_corpus=tokenized_corpus[:corpus_len]
        tokenized_sentences=[]
        self.len=int(corpus_len/length)
        for i in range (self.len):
            tokenized_sentences.append(tokenized_corpus[i*length:(i+1)*length])
        self.samples=tokenized_sentences
    
    def __len__(self):
        return self.len #number of sentences
    
    def get_vocab_size(self):
        return self.num_digits
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return self.length #* 2-1

    def __getitem__(self, idx):
        sentence=self.samples[idx]
        input=torch.cat((torch.tensor([50256]),torch.tensor(sentence[:self.length-1])),0)
        output=sentence
        return input, output
        #return sentence, sol


In [None]:
train_dataset = SentenceDataset('train')
test_dataset = SentenceDataset('test')

In [9]:
train_dataset = BookDataset('train')
test_dataset = BookDataset('test')

In [11]:
train_dataset[7809]


  input=torch.cat((torch.tensor([50256]),torch.tensor(sentence[:self.length-1])),0)


(tensor([50256,   673,   714,  2245,   284,   892,   546,   340,   673,  4721,
           607,  5422,   290, 24790,   607,  9941,  2769,   656,   376,  1531,
           447,   247,    82, 11222,    13,  1375,  2982,   257,  7128,   272,
          2406,   422,   376,  1531,   447,   247,    82,  7721,   290,   788,
           262,  1877,  1663,    75,   673,   550,  1716, 23840]),
 tensor([  673,   714,  2245,   284,   892,   546,   340,   673,  4721,   607,
          5422,   290, 24790,   607,  9941,  2769,   656,   376,  1531,   447,
           247,    82, 11222,    13,  1375,  2982,   257,  7128,   272,  2406,
           422,   376,  1531,   447,   247,    82,  7721,   290,   788,   262,
          1877,  1663,    75,   673,   550,  1716, 23840,   284]))

In [12]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config() # we can also change some of them if we want
train_config.learning_rate = 5e-4 
train_config.max_iters = 2000
train_config.num_workers = 0
train_config.batch_size=20
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [21]:
def batch_end_callback(trainer):
    if trainer.iter_num % 10 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)
train_config.learning_rate=5e-5
trainer.run()

  input=torch.cat((torch.tensor([50256]),torch.tensor(sentence[:self.length-1])),0)


iter_dt 1509.38ms; iter 0: train loss 5.67433
iter_dt 1512.54ms; iter 10: train loss 6.00966
iter_dt 1586.20ms; iter 20: train loss 5.62879
iter_dt 1525.93ms; iter 30: train loss 5.96450
iter_dt 1585.93ms; iter 40: train loss 6.09331
iter_dt 1401.69ms; iter 50: train loss 5.57066
iter_dt 1451.77ms; iter 60: train loss 5.72183
iter_dt 2690.08ms; iter 70: train loss 6.02653
iter_dt 1745.43ms; iter 80: train loss 5.60882
iter_dt 1400.38ms; iter 90: train loss 5.87655
iter_dt 1448.00ms; iter 100: train loss 5.84129
iter_dt 1387.19ms; iter 110: train loss 5.80312
iter_dt 1514.55ms; iter 120: train loss 6.15001
iter_dt 1369.88ms; iter 130: train loss 5.56134
iter_dt 1592.71ms; iter 140: train loss 5.84125
iter_dt 1603.27ms; iter 150: train loss 5.81761
iter_dt 1440.82ms; iter 160: train loss 5.65560
iter_dt 1599.36ms; iter 170: train loss 6.20329
iter_dt 1457.90ms; iter 180: train loss 5.93310


KeyboardInterrupt: 

In [49]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")['input_ids']
print(inputs)
output=model(inputs)
max_ids=output[0].max(2).indices
data.items()
inverted_vocab= {b:a for a, b in data.items()}
out_string=""
for i in range(5):
      out_string=out_string+inverted_vocab[max_ids[0,i].item()]
print(out_string)
print(out_string.replace("Ġ"," "))

tensor([[15496,    11,   616,  3290,   318, 13779]])
.ĠIĊ.Ġthe
. IĊ. the
