In [1]:
with open('jonah.txt') as f:
    whole_book = f.read()
start = whole_book.find("In those days ")
end = whole_book.find("THE END\
                            \
                            \
                            \
                            TRANSCRIBER’S NOTES:")
content = whole_book[start:end]
print(len(content))

184229


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

In [3]:
class JonahDataset(Dataset):
    def __init__(self, txt, tokenizer, context_length, stride):
        
        self.tokens = tokenizer.encode(txt)

        self.inputs = []
        self.targets = []

        tokens_length = len(self.tokens)
        for i in range(0, tokens_length - context_length, stride):
            self.inputs.append(torch.tensor(self.tokens[i: i + context_length]))
            self.targets.append(torch.tensor(self.tokens[i+1:i+context_length +1]))
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]


In [4]:
tokenizer = tiktoken.get_encoding('gpt2')
context_length = 10
stride = 10
jdata = JonahDataset(content, tokenizer, context_length, stride)

In [5]:
next(iter(jdata))

(tensor([  818,   883,  1528,   612,   547, 30687,   287,  2692,    13,  1119]),
 tensor([  883,  1528,   612,   547, 30687,   287,  2692,    13,  1119,  5615]))

In [6]:
inputs, targets = next(iter(jdata))
for i in range(1, context_length):
    print(f'input: {inputs[:i]}')
    print(f'target: {inputs[i]}')
    

input: tensor([818])
target: 883
input: tensor([818, 883])
target: 1528
input: tensor([ 818,  883, 1528])
target: 612
input: tensor([ 818,  883, 1528,  612])
target: 547
input: tensor([ 818,  883, 1528,  612,  547])
target: 30687
input: tensor([  818,   883,  1528,   612,   547, 30687])
target: 287
input: tensor([  818,   883,  1528,   612,   547, 30687,   287])
target: 2692
input: tensor([  818,   883,  1528,   612,   547, 30687,   287,  2692])
target: 13
input: tensor([  818,   883,  1528,   612,   547, 30687,   287,  2692,    13])
target: 1119


In [7]:
batch = 32
jonah_loader = DataLoader(jdata, batch)

In [8]:
GPT2_CONFIG = {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": None,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": True,
  "summary_type": "cls_index",
  "summary_use_proj": True,
  "task_specific_params": {
    "text-generation": {
      "do_sample": True,
      "max_length": 50
    }
  },
  "vocab_size": 50257
}

In [9]:
import sys
sys.path.append('../')  # project root

from model.final_model import MyLLM
mllm = MyLLM(GPT2_CONFIG)

In [10]:
for inputs, targets in jonah_loader:
    outputs = mllm(inputs)
    print(outputs.shape)
    print(outputs) 
    break

torch.Size([32, 50257])
tensor([[1.8733e-05, 1.3332e-06, 1.2994e-05,  ..., 6.4996e-06, 4.7396e-05,
         7.7725e-05],
        [1.8678e-05, 7.7256e-06, 4.8023e-05,  ..., 1.4252e-05, 8.2557e-06,
         5.4162e-06],
        [5.5097e-06, 1.2348e-05, 3.0162e-05,  ..., 1.1595e-05, 2.2796e-05,
         3.6563e-05],
        ...,
        [5.2321e-06, 1.0770e-05, 1.1378e-05,  ..., 1.3350e-06, 1.7355e-05,
         4.7236e-05],
        [6.6689e-06, 1.7189e-07, 1.9120e-06,  ..., 9.9183e-07, 3.6660e-05,
         5.0009e-05],
        [5.0174e-06, 1.7802e-06, 5.5738e-05,  ..., 6.2422e-06, 1.7987e-05,
         6.6110e-05]], grad_fn=<SoftmaxBackward0>)
