# IIC-3670 NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.23.5
- torch 1.10.0


In [1]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import bpe
from utils import CfgNode as CN

In [2]:
class NewGELU(nn.Module):
    """Tanh-based approximation of the GELU activation.

    Element-wise this evaluates
    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    """

    def forward(self, x):
        # Cubic correction term that sits inside the tanh.
        cubic = 0.044715 * torch.pow(x, 3.0)
        inner = math.sqrt(2.0 / math.pi) * (x + cubic)
        return 0.5 * x * (1.0 + torch.tanh(inner))

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a lower-triangular (causal) mask.

    Reads ``n_embd``, ``n_head``, ``attn_pdrop``, ``resid_pdrop`` and
    ``block_size`` from ``config``. ``n_embd`` must be divisible by ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # One fused linear layer yields query, key and value for every head.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Projection applied after the heads are concatenated again.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # Dropout on the attention weights and on the block output.
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        # Lower-triangular matrix of ones: position i may look at positions <= i.
        causal = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer("bias", causal.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        # x is unpacked as (batch, sequence length, embedding width).
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def split_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = split_heads(k)
        q = split_heads(q)
        v = split_heads(v)

        # Scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        scale = 1.0 / math.sqrt(k.size(-1))
        scores = (q @ k.transpose(-2, -1)) * scale
        # Entries above the diagonal get -inf so softmax gives them zero weight.
        blocked = self.bias[:, :, :seq_len, :seq_len] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs), then heads side by side.
        per_head = weights @ v
        merged = per_head.transpose(1, 2).contiguous().view(batch, seq_len, width)

        return self.resid_dropout(self.c_proj(merged))


In [3]:
class Block(nn.Module):
    """One Transformer block: LayerNorm -> attention and LayerNorm -> MLP,
    each added back onto the input as a residual.

    Reads ``n_embd`` and ``resid_pdrop`` from ``config`` and passes ``config``
    on to ``CausalSelfAttention``.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        # Feed-forward sub-layer: expand to 4 * n_embd, activate, project back.
        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd),
            c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
            act     = NewGELU(),
            dropout = nn.Dropout(config.resid_pdrop),
        ))

    def mlpf(self, x):
        """Run the MLP sub-layer: c_fc -> act -> c_proj -> dropout.

        This is a method rather than a lambda stored on the instance: a stored
        closure cannot be pickled, and ``copy.deepcopy`` would leave the clone
        calling the *original* block's MLP weights.
        """
        m = self.mlp
        return m.dropout(m.c_proj(m.act(m.c_fc(x))))

    def forward(self, x):
        # Both sub-layers normalize first, then add their output to the input.
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x


In [4]:
class GPT(nn.Module):
    """GPT language model.

    Token and position embeddings are summed, passed through ``n_layer``
    ``Block`` modules and a final LayerNorm, then mapped to vocabulary logits
    by a bias-free linear head.
    """

    @staticmethod
    def get_default_config():
        """Return a fresh config node with sizes unset and all dropouts at 0.1."""
        C = CN()
        # either model_type or (n_layer, n_head, n_embd) must be given in the config
        C.model_type = 'gpt'
        C.n_layer = None
        C.n_head = None
        C.n_embd =  None
        # these options must be filled in externally
        C.vocab_size = None
        C.block_size = None
        # dropout hyperparameters
        C.embd_pdrop = 0.1
        C.resid_pdrop = 0.1
        C.attn_pdrop = 0.1
        return C

    def __init__(self, config):
        """Build the model from ``config``.

        ``config.vocab_size`` and ``config.block_size`` must be set. Exactly one
        of ``config.model_type`` or the triple (``n_layer``, ``n_head``,
        ``n_embd``) must be given; a model type is expanded into the triple via
        the table below. The default type ``'gpt'`` is not in that table, so it
        has to be replaced by a real name (or set to None) before construction.
        Prints the parameter count of the transformer body as a side effect.
        """
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.block_size = config.block_size

        type_given = config.model_type is not None
        params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
        assert type_given ^ params_given # exactly one of these (XOR)
        if type_given:
            # translate from model_type to detailed configuration
            config.merge_from_dict({
                # names follow the huggingface naming conventions
                # GPT-1
                'openai-gpt':   dict(n_layer=12, n_head=12, n_embd=768),  # 117M params
                # GPT-2 configs
                'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
                'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
                'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
                'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
                # Gophers
                'gopher-44m':   dict(n_layer=8, n_head=16, n_embd=512),
                # (there are a number more...)
                # I made these tiny models up
                'gpt-mini':     dict(n_layer=6, n_head=6, n_embd=192),
                'gpt-micro':    dict(n_layer=4, n_head=4, n_embd=128),
                'gpt-nano':     dict(n_layer=3, n_head=3, n_embd=48),
            }[config.model_type])

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.embd_pdrop),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        """Initializer passed to ``self.apply``.

        Linear and Embedding weights are drawn from N(0, 0.02); Linear biases
        are zeroed; LayerNorm is reset to weight 1 and bias 0.
        """
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    @classmethod
    def from_pretrained(cls, model_type):
        """
        Initialize a pretrained GPT model by copying over the weights
        from a huggingface/transformers checkpoint.

        ``model_type`` must be one of 'gpt2', 'gpt2-medium', 'gpt2-large' or
        'gpt2-xl'. Requires the ``transformers`` package, which is imported
        lazily here. Returns an instance of ``cls``.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel

        # create a from-scratch initialized minGPT model
        config = cls.get_default_config()
        config.model_type = model_type
        config.vocab_size = 50257 # openai's model vocabulary
        config.block_size = 1024  # openai's model block_size
        # Build through cls so a subclass calling from_pretrained gets its own type.
        model = cls(config)
        sd = model.state_dict()

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla nn.Linear.
        # this means that we have to transpose these weights when we import them
        assert len(keys) == len(sd)
        for k in keys:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, train_config):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.

        Reads ``weight_decay``, ``learning_rate`` and ``betas`` from
        ``train_config`` and returns a ``torch.optim.AdamW``.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object (names sorted so group order is stable)
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        """Compute logits, and the loss when ``targets`` is given.

        ``idx`` is a 2-D tensor of token indices (batch, time) with
        time <= ``block_size``. Returns ``(logits, loss)``; ``loss`` is None
        without targets, otherwise the cross-entropy over all positions, with
        target value -1 ignored.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        ``top_k`` larger than the vocabulary is clamped to the vocabulary size,
        which leaves every token eligible. With ``do_sample=False`` the most
        likely token is taken at each step. Returns ``idx`` extended by
        ``max_new_tokens`` columns.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                # torch.topk raises when k exceeds the dimension size, so clamp it.
                k = min(top_k, logits.size(-1))
                v, _ = torch.topk(logits, k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [5]:
import time
from collections import defaultdict
from torch.utils.data.dataloader import DataLoader

class Trainer:
    """Minimal training loop with named callback hooks.

    The model passed in must offer ``configure_optimizers(config)`` and return
    ``(logits, loss)`` when called as ``model(x, y)``.
    """

    @staticmethod
    def get_default_config():
        """Return a fresh config node holding the default training settings."""
        cfg = CN()
        # Training device. 'auto' would pick cuda when available (see __init__);
        # this notebook pins the CPU.
        cfg.device = 'cpu'
        # DataLoader worker processes
        cfg.num_workers = 4
        # Optimization settings; max_iters=None means run() never stops by itself.
        cfg.max_iters = None
        cfg.batch_size = 64
        cfg.learning_rate = 3e-4
        cfg.betas = (0.9, 0.95)
        cfg.weight_decay = 0.1
        cfg.grad_norm_clip = 1.0
        return cfg

    def __init__(self, config, model, train_dataset):
        self.config = config
        self.model = model
        self.optimizer = None
        self.train_dataset = train_dataset
        # event name -> list of callables invoked with this trainer
        self.callbacks = defaultdict(list)

        # Resolve the training device and move the model onto it.
        device = config.device
        if device == 'auto':
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.model = self.model.to(self.device)
        print("running on device", self.device)

        # Bookkeeping that run() updates and callbacks may read.
        self.iter_num = 0
        self.iter_time = 0.0
        self.iter_dt = 0.0

    def add_callback(self, onevent: str, callback):
        """Append ``callback`` to the handlers registered for ``onevent``."""
        self.callbacks[onevent].append(callback)

    def set_callback(self, onevent: str, callback):
        """Make ``callback`` the only handler for ``onevent``."""
        self.callbacks[onevent] = [callback]

    def trigger_callbacks(self, onevent: str):
        """Call every handler of ``onevent`` with this trainer as argument."""
        # .get() avoids creating an empty entry for events nobody registered.
        handlers = self.callbacks.get(onevent)
        if not handlers:
            return
        for handler in handlers:
            handler(self)

    def run(self):
        """Train until ``config.max_iters`` iterations are done.

        Each iteration: draw a batch, forward, backward, clip the gradient
        norm, step the optimizer, fire 'on_batch_end', then update
        ``iter_num`` / ``iter_dt`` / ``iter_time``. The loss of the latest
        batch is kept in ``self.loss``.
        """
        model, config = self.model, self.config

        # The model decides how its parameters are grouped for the optimizer.
        self.optimizer = model.configure_optimizers(config)

        # Sample with replacement so the loader is effectively endless.
        sampler = torch.utils.data.RandomSampler(
            self.train_dataset, replacement=True, num_samples=int(1e10)
        )
        train_loader = DataLoader(
            self.train_dataset,
            sampler=sampler,
            shuffle=False,
            pin_memory=True,
            batch_size=config.batch_size,
            num_workers=config.num_workers,
        )

        model.train()
        self.iter_num = 0
        self.iter_time = time.time()
        data_iter = iter(train_loader)
        while True:

            # Pull the next batch, restarting the iterator if it ran dry.
            try:
                batch = next(data_iter)
            except StopIteration:
                data_iter = iter(train_loader)
                batch = next(data_iter)
            x, y = tuple(t.to(self.device) for t in batch)

            # Forward pass; the logits are not needed here.
            _, self.loss = model(x, y)

            # Backward pass, gradient clipping and parameter update.
            model.zero_grad(set_to_none=True)
            self.loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
            self.optimizer.step()

            # Callbacks see iter_num before it is incremented.
            self.trigger_callbacks('on_batch_end')
            self.iter_num += 1
            tnow = time.time()
            self.iter_dt = tnow - self.iter_time
            self.iter_time = tnow

            # Stop once the iteration budget (if any) is used up.
            if config.max_iters is not None and self.iter_num >= config.max_iters:
                break

## En este ejemplo vamos a trabajar con texto. Al final, de todos modos, el modelo recibe secuencias de enteros.

In [6]:
import pandas as pd
import spacy

# Load the stories CSV; the 'Story' column is read further down in this cell.
train_stories = pd.read_csv('example_train_stories.csv', encoding='utf-8')

# spaCy pipeline used by text_to_tokens below.
nlp = spacy.load('en_core_web_md')
# NOTE(review): this section only reads token.lower_ from the pipeline output;
# confirm the extra sentencizer component is actually needed.
nlp.add_pipe('sentencizer')

def text_to_tokens(text_seqs):
    """Tokenize each text with the module-level spaCy pipeline ``nlp``.

    Returns one list per input text containing the lowercase form
    (``lower_``) of every token the pipeline produces.
    """
    token_seqs = []
    for text_seq in text_seqs:
        doc = nlp(text_seq)
        token_seqs.append([word.lower_ for word in doc])
    return token_seqs

# Tokenize every story and keep the result as a new column.
train_stories['Tokenized_Story'] = text_to_tokens(train_stories['Story'])
    
# Notebook display: first ten rows, raw text next to its tokens.
train_stories[['Story','Tokenized_Story']][:10]

Unnamed: 0,Story,Tokenized_Story
0,Dan's parents were overweight. Dan was overwei...,"[dan, 's, parents, were, overweight, ., dan, w..."
1,Carrie had just learned how to ride a bike. Sh...,"[carrie, had, just, learned, how, to, ride, a,..."
2,Morgan enjoyed long walks on the beach. She an...,"[morgan, enjoyed, long, walks, on, the, beach,..."
3,Jane was working at a diner. Suddenly a custom...,"[jane, was, working, at, a, diner, ., suddenly..."
4,I was talking to my crush today. She continued...,"[i, was, talking, to, my, crush, today, ., she..."
5,Frank had been drinking beer. He got a call fr...,"[frank, had, been, drinking, beer, ., he, got,..."
6,Dave was in the Bahamas on vacation. He decide...,"[dave, was, in, the, bahamas, on, vacation, .,..."
7,Sunny enjoyed going to the beach. As she stepp...,"[sunny, enjoyed, going, to, the, beach, ., as,..."
8,Sally was happy when her widowed mom found a n...,"[sally, was, happy, when, her, widowed, mom, f..."
9,Dan hit his golf ball and watched it go. The b...,"[dan, hit, his, golf, ball, and, watched, it, ..."


In [7]:
import pickle

def make_vocab(token_seqs, min_freq=1):
    """Build a token -> integer index mapping from tokenized sequences.

    Tokens seen at least ``min_freq`` times are numbered from 2 upward in
    order of first appearance. ``'<UNK>'`` is mapped to 1 and index 0 is left
    unassigned. Because of that gap the largest index equals ``len(vocab)``,
    so a lookup table needs ``max(vocab.values()) + 1`` rows, not
    ``len(vocab)``.

    Prints the vocabulary size and its first 20 entries, then returns the dict.
    """
    from collections import Counter
    from itertools import chain, islice

    # Counter keeps first-seen order, so the numbering below is deterministic.
    token_counts = Counter(chain.from_iterable(token_seqs))
    kept = [token for token, count in token_counts.items() if count >= min_freq]

    vocab = {token: idx for idx, token in enumerate(kept, start=2)}
    vocab[u'<UNK>'] = 1

    print("VOCABULARY SAMPLE ({} total items):".format(len(vocab)))
    print(dict(islice(vocab.items(), 20)))

    return vocab

# min_freq=1 keeps every token that occurs in the training stories.
vocab = make_vocab(token_seqs=train_stories['Tokenized_Story'], min_freq=1)

# Persist the mapping to vocab.pkl in the working directory.
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

VOCABULARY SAMPLE (1272 total items):
{'dan': 2, "'s": 3, 'parents': 4, 'were': 5, 'overweight': 6, '.': 7, 'was': 8, 'as': 9, 'well': 10, 'the': 11, 'doctors': 12, 'told': 13, 'his': 14, 'it': 15, 'unhealthy': 16, 'understood': 17, 'and': 18, 'decided': 19, 'to': 20, 'make': 21}


In [8]:
def tokens_to_idxs(token_seqs, vocab):
    """Map every token of every sequence to its index in ``vocab``.

    Tokens missing from ``vocab`` are replaced by the index stored under
    ``'<UNK>'``. Returns a list of index lists parallel to ``token_seqs``.
    """
    idx_seqs = []
    for token_seq in token_seqs:
        idxs = []
        for token in token_seq:
            if token in vocab:
                idxs.append(vocab[token])
            else:
                idxs.append(vocab['<UNK>'])
        idx_seqs.append(idxs)
    return idx_seqs

# Encode each tokenized story as a list of vocabulary indices.
train_stories['Story_Idxs'] = tokens_to_idxs(token_seqs=train_stories['Tokenized_Story'], vocab=vocab)
                                   
# Notebook display: first ten rows, tokens next to their indices.
train_stories[['Tokenized_Story', 'Story_Idxs']][:10]

Unnamed: 0,Tokenized_Story,Story_Idxs
0,"[dan, 's, parents, were, overweight, ., dan, w...","[2, 3, 4, 5, 6, 7, 2, 8, 6, 9, 10, 7, 11, 12, ..."
1,"[carrie, had, just, learned, how, to, ride, a,...","[29, 30, 31, 32, 33, 20, 34, 22, 35, 7, 36, 37..."
2,"[morgan, enjoyed, long, walks, on, the, beach,...","[57, 58, 59, 60, 27, 11, 61, 7, 36, 18, 41, 62..."
3,"[jane, was, working, at, a, diner, ., suddenly...","[76, 8, 77, 78, 22, 79, 7, 80, 22, 81, 82, 83,..."
4,"[i, was, talking, to, my, crush, today, ., she...","[98, 8, 99, 20, 100, 101, 102, 7, 36, 103, 20,..."
5,"[frank, had, been, drinking, beer, ., he, got,...","[123, 30, 124, 125, 126, 7, 74, 25, 22, 127, 1..."
6,"[dave, was, in, the, bahamas, on, vacation, .,...","[146, 8, 147, 11, 148, 27, 149, 7, 74, 19, 20,..."
7,"[sunny, enjoyed, going, to, the, beach, ., as,...","[169, 58, 170, 20, 11, 61, 7, 9, 36, 171, 121,..."
8,"[sally, was, happy, when, her, widowed, mom, f...","[182, 8, 183, 159, 41, 184, 185, 160, 22, 186,..."
9,"[dan, hit, his, golf, ball, and, watched, it, ...","[2, 203, 14, 204, 205, 18, 206, 15, 63, 7, 11,..."


## Definimos una ventana de contexto y creamos secuencias usando una ventana deslizante. Como la tarea es predecir el siguiente token (next token prediction), x e y difieren en un desplazamiento (shift) de una posición.

In [9]:
# Tokens per training window; also used below as the model's block_size.
CONTEXT_SIZE = 8

def sequences_from_data(data, win_len):
    """Cut every sequence into (x, y) next-token training pairs.

    A window of ``win_len + 1`` consecutive items slides over each sequence;
    ``x`` is the window without its last item and ``y`` the window without its
    first, both as ``torch.tensor``. Sequences too short to produce a window
    contribute nothing.

    ``data`` may be any iterable of sequences. It is iterated directly instead
    of through ``data[i]``, which is a label lookup on a pandas Series and
    fails when the index is not 0..n-1.

    Returns a list of ``(x, y)`` tuples.
    """
    input_data = []
    for sequence in data:
        # NOTE(review): a sequence of length L holds L - win_len full windows,
        # but this limit stops two short, so the last two items of a sequence
        # never appear in any pair. Kept as in the original; confirm intent.
        movil_lim = len(sequence) - win_len - 2
        for j in range(movil_lim):
            sliding_window = sequence[j:j + win_len + 1]
            x = sliding_window[:-1]
            y = sliding_window[1:]
            input_data.append((torch.tensor(x), torch.tensor(y)))
    return input_data

# Build all (x, y) windows from the index-encoded stories.
input_data = sequences_from_data(train_stories['Story_Idxs'], CONTEXT_SIZE)

# Positional 80/20 split: the first 80% of windows train, the rest test.
# NOTE(review): nothing is shuffled and neighbouring windows overlap, so the
# windows on either side of the cut can share tokens — confirm this is acceptable.
SPLIT = 0.8
num_inputs = len(input_data)
train_lim = math.floor(num_inputs*SPLIT)
train_dataset = input_data[:train_lim]
test_dataset = input_data[train_lim:]

In [10]:
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
# The embedding table needs one row per index value, i.e. largest index + 1.
# make_vocab numbers tokens from 2 and leaves 0 unused, so len(vocab) is one
# short and the last token added to the vocabulary would be out of range.
model_config.vocab_size = max(vocab.values()) + 1
model_config.block_size = CONTEXT_SIZE
model = GPT(model_config)

number of parameters: 0.15M


In [11]:
import os

# A plain Python variable is invisible to CUDA: the setting only takes effect
# as an environment variable, and it has to be in place before CUDA is
# initialized. The name is kept in case other cells refer to it.
CUDA_LAUNCH_BLOCKING = 1
os.environ["CUDA_LAUNCH_BLOCKING"] = str(CUDA_LAUNCH_BLOCKING)

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 
train_config.max_iters = 10000
# Load batches in the main process (no DataLoader workers).
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [12]:
def batch_end_callback(trainer):
    """Print iteration time and training loss on every 100th iteration."""
    if trainer.iter_num % 100 != 0:
        return
    elapsed_ms = trainer.iter_dt * 1000
    loss_value = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {trainer.iter_num}: train loss {loss_value:.5f}")
# set_callback replaces any handlers already registered for 'on_batch_end'.
trainer.set_callback('on_batch_end', batch_end_callback)

# Trains until train_config.max_iters iterations have been completed.
trainer.run()

iter_dt 0.00ms; iter 0: train loss 7.15632
iter_dt 12.99ms; iter 100: train loss 5.51112
iter_dt 13.17ms; iter 200: train loss 4.52903
iter_dt 12.11ms; iter 300: train loss 4.16918
iter_dt 15.22ms; iter 400: train loss 3.50400
iter_dt 19.23ms; iter 500: train loss 3.06709
iter_dt 12.17ms; iter 600: train loss 2.57223
iter_dt 29.87ms; iter 700: train loss 2.45793
iter_dt 12.12ms; iter 800: train loss 2.09716
iter_dt 12.05ms; iter 900: train loss 2.00525
iter_dt 13.28ms; iter 1000: train loss 1.95617
iter_dt 12.16ms; iter 1100: train loss 1.70056
iter_dt 17.05ms; iter 1200: train loss 1.46060
iter_dt 12.23ms; iter 1300: train loss 1.39647
iter_dt 12.30ms; iter 1400: train loss 1.23176
iter_dt 12.16ms; iter 1500: train loss 1.17739
iter_dt 12.06ms; iter 1600: train loss 1.15464
iter_dt 17.37ms; iter 1700: train loss 1.14446
iter_dt 18.23ms; iter 1800: train loss 1.05320
iter_dt 12.08ms; iter 1900: train loss 1.04718
iter_dt 14.34ms; iter 2000: train loss 0.90387
iter_dt 13.74ms; iter 2100

## El eval_split toma los primeros tokens de la secuencia (4 en este caso) y completa los siguientes 4. Para evaluar, ajustamos el largo máximo a 8, ya que es el largo de la ventana de contexto.

In [14]:
def eval_split(trainer, dataset, max_batches, n=8, split=4):
    """Print model completions next to the reference sequences.

    For each batch of ``dataset`` the first ``split`` tokens of ``x`` are fed
    to ``trainer.model.generate`` and the sequence is completed up to ``n``
    tokens. One line per example is printed showing the first ``n`` reference
    tokens and the model's version.

    Args:
        trainer: object exposing ``model`` and ``device``.
        dataset: dataset of ``(x, y)`` pairs; ``y`` is ignored.
        max_batches: stop after this many batches of 10; ``None`` runs them all.
        n: total length of the printed sequences.
        split: number of leading tokens used as the prompt.

    Returns:
        None. The function only prints.
    """
    net = trainer.model
    # Generate with dropout disabled, then put the model back in its prior mode.
    was_training = net.training
    net.eval()
    loader = DataLoader(dataset, batch_size=10, num_workers=0, drop_last=False)
    try:
        for b, (x, _) in enumerate(loader):
            x = x.to(trainer.device)
            inp_full = x[:, :n]
            inp = x[:, :split]
            # Only the tokens needed to reach length n are generated.
            missing = max(n - inp.size(1), 0)
            cat = net.generate(inp, missing, temperature=0.1, do_sample=False)
            sol = cat[:, :n]
            for i in range(x.size(0)):
                print("Input %s but gpt says %s" % (inp_full[i].tolist(), sol[i].tolist()))
            if max_batches is not None and b + 1 >= max_batches:
                break
    finally:
        net.train(was_training)
    

# GPT.generate is already decorated with torch.no_grad(); this outer context
# covers the rest of eval_split as well.
with torch.no_grad():
    # eval_split has no return statement, so test_score ends up as None.
    test_score  = eval_split(trainer, test_dataset,  max_batches=50)

Input [37, 140, 809, 20, 220, 7, 15, 324] but gpt says [37, 140, 809, 20, 220, 7, 15, 8]
Input [140, 809, 20, 220, 7, 15, 324, 15] but gpt says [140, 809, 20, 220, 7, 22, 810, 591]
Input [809, 20, 220, 7, 15, 324, 15, 30] but gpt says [809, 20, 220, 7, 15, 8, 174, 169]
Input [20, 220, 7, 15, 324, 15, 30, 1101] but gpt says [20, 220, 7, 15, 8, 1079, 235, 801]
Input [1102, 8, 27, 22, 805, 20, 41, 685] but gpt says [1102, 8, 27, 22, 1022, 411, 7, 250]
Input [8, 27, 22, 805, 20, 41, 685, 7] but gpt says [8, 27, 22, 805, 7, 22, 96, 814]
Input [27, 22, 805, 20, 41, 685, 7, 36] but gpt says [27, 22, 805, 20, 22, 96, 443, 444]
Input [22, 805, 20, 41, 685, 7, 36, 30] but gpt says [22, 805, 20, 41, 579, 565, 580, 581]
Input [805, 20, 41, 685, 7, 36, 30, 20] but gpt says [805, 20, 41, 685, 8, 64, 197, 7]
Input [20, 41, 685, 7, 36, 30, 20, 1103] but gpt says [20, 41, 685, 7, 74, 25, 113, 98]
Input [41, 685, 7, 36, 30, 20, 1103, 569] but gpt says [41, 685, 7, 36, 884, 41, 1090, 8]
Input [685, 7, 36

Input [1129, 7, 74, 8, 594, 22, 1130, 18] but gpt says [1129, 7, 74, 8, 760, 133, 3, 109]
Input [7, 74, 8, 594, 22, 1130, 18, 440] but gpt says [7, 74, 8, 594, 595, 7, 74, 596]
Input [74, 8, 594, 22, 1130, 18, 440, 107] but gpt says [74, 8, 594, 22, 96, 7, 74, 13]
Input [8, 594, 22, 1130, 18, 440, 107, 11] but gpt says [8, 594, 22, 1130, 7, 74, 19, 74]
Input [594, 22, 1130, 18, 440, 107, 11, 1131] but gpt says [594, 22, 1130, 18, 116, 912, 36, 234]
Input [22, 1130, 18, 440, 107, 11, 1131, 7] but gpt says [22, 1130, 18, 440, 20, 853, 15, 147]
Input [1130, 18, 440, 107, 11, 1131, 7, 74] but gpt says [1130, 18, 440, 107, 368, 166, 36, 1009]
Input [18, 440, 107, 11, 1131, 7, 74, 331] but gpt says [18, 440, 107, 11, 170, 27, 18, 726]
Input [440, 107, 11, 1131, 7, 74, 331, 1132] but gpt says [440, 107, 11, 1131, 133, 36, 1009, 7]
Input [107, 11, 1131, 7, 74, 331, 1132, 18] but gpt says [107, 11, 1131, 7, 74, 8, 760, 133]
Input [11, 1131, 7, 74, 331, 1132, 18, 1133] but gpt says [11, 1131, 7,

Input [505, 11, 802, 152, 20, 11, 1159, 87] but gpt says [505, 11, 802, 152, 15, 8, 612, 425]
Input [11, 802, 152, 20, 11, 1159, 87, 11] but gpt says [11, 802, 152, 20, 15, 8, 612, 425]
Input [802, 152, 20, 11, 1159, 87, 11, 1158] but gpt says [802, 152, 20, 11, 61, 7, 36, 358]
Input [152, 20, 11, 1159, 87, 11, 1158, 584] but gpt says [152, 20, 11, 1159, 18, 111, 20, 308]
Input [20, 11, 1159, 87, 11, 1158, 584, 1160] but gpt says [20, 11, 1159, 87, 288, 310, 7, 98]
Input [11, 1159, 87, 11, 1158, 584, 1160, 7] but gpt says [11, 1159, 87, 11, 152, 40, 315, 316]
Input [1159, 87, 11, 1158, 584, 1160, 7, 11] but gpt says [1159, 87, 11, 1158, 7, 98, 19, 20]
Input [87, 11, 1158, 584, 1160, 7, 11, 1159] but gpt says [87, 11, 1158, 584, 585, 7, 36, 131]
Input [11, 1158, 584, 1160, 7, 11, 1159, 1161] but gpt says [11, 1158, 584, 1160, 7, 74, 19, 20]
Input [1158, 584, 1160, 7, 11, 1159, 1161, 1162] but gpt says [1158, 584, 1160, 7, 36, 8, 865, 20]
Input [584, 1160, 7, 11, 1159, 1161, 1162, 8] but

Input [8, 1190, 18, 1191, 20, 629, 121, 40] but gpt says [8, 1190, 18, 1191, 74, 8, 38, 133]
Input [1190, 18, 1191, 20, 629, 121, 40, 11] but gpt says [1190, 18, 1191, 20, 11, 909, 910, 7]
Input [18, 1191, 20, 629, 121, 40, 11, 1187] but gpt says [18, 1191, 20, 629, 121, 7, 74, 8]
Input [1191, 20, 629, 121, 40, 11, 1187, 7] but gpt says [1191, 20, 629, 121, 7, 74, 373, 22]
Input [20, 629, 121, 40, 11, 1187, 7, 27] but gpt says [20, 629, 121, 40, 100, 587, 18, 274]
Input [629, 121, 40, 11, 1187, 7, 27, 11] but gpt says [629, 121, 40, 11, 524, 7, 509, 30]
Input [121, 40, 11, 1187, 7, 27, 11, 605] but gpt says [121, 40, 11, 1187, 18, 74, 157, 20]
Input [40, 11, 1187, 7, 27, 11, 605, 152] but gpt says [40, 11, 1187, 7, 317, 18, 41, 299]
Input [11, 1187, 7, 27, 11, 605, 152, 40] but gpt says [11, 1187, 7, 27, 11, 488, 20, 11]
Input [1187, 7, 27, 11, 605, 152, 40, 757] but gpt says [1187, 7, 27, 11, 488, 20, 11, 489]
Input [7, 27, 11, 605, 152, 40, 757, 11] but gpt says [7, 27, 11, 605, 724,