# IIC-3670 NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.23.5
- torch 1.10.0


____________________________________________________________________________________________________________

## Actividad en clase

Evaluaremos varios modelos **GPT text**. Para esto haga lo siguiente:

- Complete el jupyter 14, reemplazando en la función eval_split las secuencias de enteros por secuencias de tokens. Use la función **get_vocab_lookup(vocab)** del jupyter 11 para abordar esto.
- Corra gpt con SPLIT = 0.7. Mida el error rate y success rate del modelo sobre los 4 últimos tokens de cada secuencia de testing. Reporte los rates globales.
- Vuelva a correr eval_split subiendo la temperatura a 0.5. Reporte los rates globales. Comente los resultados.
- Comente los resultados de la comparación.
- Cuando termine, me avisa para entregarle una **L (logrado)**.
- Recuerde que las L otorgan un bono en la nota final de la asignatura.


***Tiene hasta el final de la clase.***

_________________________________________________________________________________________________________________

# Solución

In [1]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import bpe
from utils import CfgNode as CN

In [2]:
class NewGELU(nn.Module):
    """Tanh-based approximation of the GELU activation, applied element-wise."""

    def forward(self, x):
        # Cubic correction term fed to tanh.
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention in which every position may only attend to
    itself and to earlier positions. Followed by a linear output projection.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # one linear layer yields query, key and value for every head at once
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # projection applied after the heads are concatenated again
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # dropout on the attention weights and on the block output
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        # lower-triangular mask of ones, stored as a (non-trainable) buffer named "bias"
        size = config.block_size
        lower_tri = torch.tril(torch.ones(size, size)).view(1, 1, size, size)
        self.register_buffer("bias", lower_tri)
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def to_heads(t):
            # (batch, seq_len, width) -> (batch, n_head, seq_len, head_dim)
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # scaled dot-product scores: (batch, n_head, seq_len, seq_len)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        # positions where the mask is 0 (the future) get -inf, i.e. zero weight after softmax
        hidden = self.bias[:, :, :seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # weighted sum of the values, then put the heads back side by side
        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.resid_dropout(self.c_proj(mixed))


In [3]:
class Block(nn.Module):
    """
    One Transformer block: LayerNorm -> causal self-attention -> residual add,
    then LayerNorm -> MLP -> residual add.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd),
            c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
            act     = NewGELU(),
            dropout = nn.Dropout(config.resid_pdrop),
        ))

    def mlpf(self, x):
        """MLP forward: expand to 4 * n_embd, activate, project back, apply dropout."""
        # This is a method and not a lambda stored on the instance: a lambda
        # closing over self.mlp cannot be pickled, and copy.deepcopy() would
        # leave the copy calling the *original* block's MLP weights.
        m = self.mlp
        return m.dropout(m.c_proj(m.act(m.c_fc(x))))

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x


In [4]:
class GPT(nn.Module):
    """
    GPT Language Model: token and position embeddings, a stack of Blocks,
    a final LayerNorm and a bias-free linear head producing vocabulary logits.
    """

    @staticmethod
    def get_default_config():
        """Return a fresh config node pre-filled with the default model settings."""
        C = CN()
        # either model_type or (n_layer, n_head, n_embd) must be given in the config
        C.model_type = 'gpt'
        C.n_layer = None
        C.n_head = None
        C.n_embd =  None
        # these options must be filled in externally
        C.vocab_size = None
        C.block_size = None
        # dropout hyperparameters
        C.embd_pdrop = 0.1
        C.resid_pdrop = 0.1
        C.attn_pdrop = 0.1
        return C
    
    
    def __init__(self, config):
        """
        Build the model described by `config`.

        `config.vocab_size` and `config.block_size` are required. Exactly one of
        `config.model_type` or the triple (n_layer, n_head, n_embd) must be set;
        when `model_type` is given, the triple is filled in from the table below
        (an unknown name raises KeyError from the dict lookup).
        """
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.block_size = config.block_size

        type_given = config.model_type is not None
        params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
        assert type_given ^ params_given # exactly one of these (XOR)
        if type_given:
            # translate from model_type to detailed configuration
            config.merge_from_dict({
                # names follow the huggingface naming conventions
                # GPT-1
                'openai-gpt':   dict(n_layer=12, n_head=12, n_embd=768),  # 117M params
                # GPT-2 configs
                'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
                'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
                'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
                'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
                # Gophers
                'gopher-44m':   dict(n_layer=8, n_head=16, n_embd=512),
                # (there are a number more...)
                # I made these tiny models up
                'gpt-mini':     dict(n_layer=6, n_head=6, n_embd=192),
                'gpt-micro':    dict(n_layer=4, n_head=4, n_embd=128),
                'gpt-nano':     dict(n_layer=3, n_head=3, n_embd=48),
            }[config.model_type])

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.embd_pdrop),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))
    
    def _init_weights(self, module):
        """Initializer passed to `self.apply`: normal(0, 0.02) weights, zero biases, identity LayerNorm."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)
    
    @classmethod
    def from_pretrained(cls, model_type):
        """
        Initialize a pretrained GPT model by copying over the weights
        from a huggingface/transformers checkpoint.

        `model_type` must be one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
        Returns an instance of `cls` holding the copied weights.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel

        # create a from-scratch initialized minGPT model
        config = cls.get_default_config()
        config.model_type = model_type
        config.vocab_size = 50257 # openai's model vocabulary
        config.block_size = 1024  # openai's model block_size
        # build through cls so that calling this on a subclass returns that subclass
        model = cls(config)
        sd = model.state_dict()

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla nn.Linear.
        # this means that we have to transpose these weights when we import them
        assert len(keys) == len(sd)
        for k in keys:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model
    
    def configure_optimizers(self, train_config):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.

        Reads `weight_decay`, `learning_rate` and `betas` from `train_config` and
        returns a `torch.optim.AdamW` built over the two parameter groups.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer
    
    
    def forward(self, idx, targets=None):
        """
        Run the model on a batch of token indices `idx` of shape (b, t), t <= block_size.

        Returns `(logits, loss)`. `loss` is None unless `targets` is given, in which
        case it is the cross-entropy between the logits and `targets`, skipping
        target entries equal to -1.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss
    
    
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        With `do_sample=False` the most likely token is picked at every step, so a
        positive `temperature` only rescales the logits and does not change the choice;
        it matters only when `do_sample=True`.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [5]:
import time
from collections import defaultdict
from torch.utils.data.dataloader import DataLoader

class Trainer:
    """
    Minimal training loop: draws random batches from a dataset, optimizes the
    model and fires user callbacks after every batch.
    """

    @staticmethod
    def get_default_config():
        """Return a fresh config node with the default training settings."""
        cfg = CN()
        # device to train on ('auto' picks cuda when available)
        cfg.device = 'cpu'
        # dataloader parameters
        cfg.num_workers = 4
        # optimizer parameters
        cfg.max_iters = None
        cfg.batch_size = 64
        cfg.learning_rate = 3e-4
        cfg.betas = (0.9, 0.95)
        cfg.weight_decay = 0.1
        cfg.grad_norm_clip = 1.0
        return cfg

    def __init__(self, config, model, train_dataset):
        self.config = config
        self.optimizer = None
        self.train_dataset = train_dataset
        self.callbacks = defaultdict(list)

        # resolve the device, then move the model onto it
        if config.device != 'auto':
            self.device = config.device
        elif torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        self.model = model.to(self.device)
        print("running on device", self.device)

        # bookkeeping exposed to callbacks for logging
        self.iter_num = 0
        self.iter_time = 0.0
        self.iter_dt = 0.0

    def add_callback(self, onevent: str, callback):
        """Register one more callback for `onevent`."""
        self.callbacks[onevent].append(callback)

    def set_callback(self, onevent: str, callback):
        """Replace all callbacks of `onevent` with this single one."""
        self.callbacks[onevent] = [callback]

    def trigger_callbacks(self, onevent: str):
        """Call every callback registered for `onevent`, passing the trainer itself."""
        # .get() keeps unknown event names from being inserted into the defaultdict
        handlers = self.callbacks.get(onevent, [])
        for handler in handlers:
            handler(self)

    def run(self):
        """Train until `config.max_iters` batches are done (forever when it is None)."""
        model = self.model
        config = self.config

        # the model decides how its parameters are grouped for the optimizer
        self.optimizer = model.configure_optimizers(config)

        # sampling with replacement and a huge sample count: an effectively endless stream
        sampler = torch.utils.data.RandomSampler(
            self.train_dataset, replacement=True, num_samples=int(1e10)
        )
        train_loader = DataLoader(
            self.train_dataset,
            sampler=sampler,
            shuffle=False,
            pin_memory=True,
            batch_size=config.batch_size,
            num_workers=config.num_workers,
        )

        model.train()
        self.iter_num = 0
        self.iter_time = time.time()
        batches = iter(train_loader)
        finished = False
        while not finished:

            # pull the next (x, y) batch, restarting the loader if it ran dry
            try:
                batch = next(batches)
            except StopIteration:
                batches = iter(train_loader)
                batch = next(batches)
            x, y = [t.to(self.device) for t in batch]

            # forward pass; the loss is kept on the trainer so callbacks can read it
            _, self.loss = model(x, y)

            # backward pass with gradient clipping, then the parameter update
            model.zero_grad(set_to_none=True)
            self.loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
            self.optimizer.step()

            # callbacks run before the iteration counter and timers are advanced
            self.trigger_callbacks('on_batch_end')
            self.iter_num += 1
            now = time.time()
            self.iter_dt = now - self.iter_time
            self.iter_time = now

            # stop once the requested number of iterations is reached
            finished = config.max_iters is not None and self.iter_num >= config.max_iters

In [6]:
import pandas as pd
import spacy

# Load the stories into a DataFrame; later cells read its 'Story' column.
train_stories = pd.read_csv('example_train_stories.csv', encoding='utf-8')

# spaCy English pipeline used by text_to_tokens; a 'sentencizer' component is added to it.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('sentencizer')

def text_to_tokens(text_seqs):
    """Run every text through the module-level spaCy pipeline `nlp` and return, per text, its lower-cased tokens."""
    token_seqs = []
    for text_seq in text_seqs:
        doc = nlp(text_seq)
        token_seqs.append([word.lower_ for word in doc])
    return token_seqs

# Store the token list of each story in a new column.
train_stories['Tokenized_Story'] = text_to_tokens(train_stories['Story'])
    
# Notebook display: first 10 rows, raw text next to its tokens.
train_stories[['Story','Tokenized_Story']][:10]

Unnamed: 0,Story,Tokenized_Story
0,Dan's parents were overweight. Dan was overwei...,"[dan, 's, parents, were, overweight, ., dan, w..."
1,Carrie had just learned how to ride a bike. Sh...,"[carrie, had, just, learned, how, to, ride, a,..."
2,Morgan enjoyed long walks on the beach. She an...,"[morgan, enjoyed, long, walks, on, the, beach,..."
3,Jane was working at a diner. Suddenly a custom...,"[jane, was, working, at, a, diner, ., suddenly..."
4,I was talking to my crush today. She continued...,"[i, was, talking, to, my, crush, today, ., she..."
5,Frank had been drinking beer. He got a call fr...,"[frank, had, been, drinking, beer, ., he, got,..."
6,Dave was in the Bahamas on vacation. He decide...,"[dave, was, in, the, bahamas, on, vacation, .,..."
7,Sunny enjoyed going to the beach. As she stepp...,"[sunny, enjoyed, going, to, the, beach, ., as,..."
8,Sally was happy when her widowed mom found a n...,"[sally, was, happy, when, her, widowed, mom, f..."
9,Dan hit his golf ball and watched it go. The b...,"[dan, hit, his, golf, ball, and, watched, it, ..."


In [7]:
import pickle

def make_vocab(token_seqs, min_freq=1):
    """
    Build a token -> integer id mapping from tokenized sequences.

    Tokens seen at least `min_freq` times get consecutive ids starting at 2, in
    order of first appearance. Id 1 is assigned to '<UNK>' and id 0 is never
    assigned here. A sample of the result is printed.
    """
    # frequency of every token over all sequences
    token_counts = {}
    for seq in token_seqs:
        for token in seq:
            token_counts[token] = token_counts.get(token, 0) + 1

    vocab = {}
    next_id = 2
    for token, count in token_counts.items():
        if count < min_freq:
            continue
        vocab[token] = next_id
        next_id += 1
    vocab[u'<UNK>'] = 1

    print("VOCABULARY SAMPLE ({} total items):".format(len(vocab)))
    print(dict(list(vocab.items())[:20]))

    return vocab

# Build the vocabulary from every tokenized story (min_freq=1 keeps all tokens).
vocab = make_vocab(token_seqs=train_stories['Tokenized_Story'], min_freq=1)

# Persist the mapping to vocab.pkl.
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

VOCABULARY SAMPLE (1272 total items):
{'dan': 2, "'s": 3, 'parents': 4, 'were': 5, 'overweight': 6, '.': 7, 'was': 8, 'as': 9, 'well': 10, 'the': 11, 'doctors': 12, 'told': 13, 'his': 14, 'it': 15, 'unhealthy': 16, 'understood': 17, 'and': 18, 'decided': 19, 'to': 20, 'make': 21}


In [8]:
def get_vocab_lookup(vocab):
    """
    Invert a token -> id vocabulary into an id -> token dictionary.

    Id 0 is mapped to the empty string. A sample of the result is printed.
    """
    vocab_lookup = {}
    for vocab_item, idx in vocab.items():
        vocab_lookup[idx] = vocab_item
    vocab_lookup[0] = ""
    sample = dict(list(vocab_lookup.items())[:20])
    print("LOOKUP SAMPLE:")
    print(sample)
    return vocab_lookup

# id -> token table, used to turn predicted ids back into text.
vocab_lookup = get_vocab_lookup(vocab)

LOOKUP SAMPLE:
{2: 'dan', 3: "'s", 4: 'parents', 5: 'were', 6: 'overweight', 7: '.', 8: 'was', 9: 'as', 10: 'well', 11: 'the', 12: 'doctors', 13: 'told', 14: 'his', 15: 'it', 16: 'unhealthy', 17: 'understood', 18: 'and', 19: 'decided', 20: 'to', 21: 'make'}


In [9]:
def tokens_to_idxs(token_seqs, vocab):
    """Replace every token by its id in `vocab`; tokens missing from `vocab` get the id of '<UNK>'."""
    def lookup(token):
        if token in vocab:
            return vocab[token]
        return vocab['<UNK>']

    idx_seqs = []
    for token_seq in token_seqs:
        idx_seqs.append([lookup(token) for token in token_seq])
    return idx_seqs

# Store the id sequence of each story in a new column.
train_stories['Story_Idxs'] = tokens_to_idxs(token_seqs=train_stories['Tokenized_Story'], vocab=vocab)
                                   
# Notebook display: first 10 rows, tokens next to their ids.
train_stories[['Tokenized_Story', 'Story_Idxs']][:10]

Unnamed: 0,Tokenized_Story,Story_Idxs
0,"[dan, 's, parents, were, overweight, ., dan, w...","[2, 3, 4, 5, 6, 7, 2, 8, 6, 9, 10, 7, 11, 12, ..."
1,"[carrie, had, just, learned, how, to, ride, a,...","[29, 30, 31, 32, 33, 20, 34, 22, 35, 7, 36, 37..."
2,"[morgan, enjoyed, long, walks, on, the, beach,...","[57, 58, 59, 60, 27, 11, 61, 7, 36, 18, 41, 62..."
3,"[jane, was, working, at, a, diner, ., suddenly...","[76, 8, 77, 78, 22, 79, 7, 80, 22, 81, 82, 83,..."
4,"[i, was, talking, to, my, crush, today, ., she...","[98, 8, 99, 20, 100, 101, 102, 7, 36, 103, 20,..."
5,"[frank, had, been, drinking, beer, ., he, got,...","[123, 30, 124, 125, 126, 7, 74, 25, 22, 127, 1..."
6,"[dave, was, in, the, bahamas, on, vacation, .,...","[146, 8, 147, 11, 148, 27, 149, 7, 74, 19, 20,..."
7,"[sunny, enjoyed, going, to, the, beach, ., as,...","[169, 58, 170, 20, 11, 61, 7, 9, 36, 171, 121,..."
8,"[sally, was, happy, when, her, widowed, mom, f...","[182, 8, 183, 159, 41, 184, 185, 160, 22, 186,..."
9,"[dan, hit, his, golf, ball, and, watched, it, ...","[2, 203, 14, 204, 205, 18, 206, 15, 63, 7, 11,..."


In [10]:
CONTEXT_SIZE = 8

def sequences_from_data(data, win_len):
    """
    Cut every sequence in `data` into overlapping (input, target) tensor pairs.

    Each pair comes from a window of `win_len + 1` consecutive items: the input
    is the first `win_len` items, the target is the same window shifted by one.
    Windows start at positions 0 .. len(sequence) - win_len - 3, so the last two
    items of a sequence never appear in any window.
    """
    pairs = []
    for i in range(len(data)):
        tokens = data[i]
        window_count = len(tokens) - win_len - 2
        for start in range(window_count):
            window = tokens[start:start + win_len + 1]
            source = torch.tensor(window[:-1])
            target = torch.tensor(window[1:])
            pairs.append((source, target))
    return pairs

# All (input, target) windows of length CONTEXT_SIZE, story after story.
input_data = sequences_from_data(train_stories['Story_Idxs'], CONTEXT_SIZE)

# Sequential split without shuffling: the first SPLIT fraction of the windows is
# used for training, the rest for testing. Because consecutive windows overlap,
# windows on both sides of the cut can share tokens.
SPLIT = 0.7
num_inputs = len(input_data)
train_lim = math.floor(num_inputs*SPLIT)
train_dataset = input_data[:train_lim]
test_dataset = input_data[train_lim:]

In [11]:
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
# The embedding table needs one row per id in 0..max_id. make_vocab hands out
# ids 1..len(vocab) (0 is reserved for the empty token), so len(vocab) rows would
# leave the largest id out of range.
model_config.vocab_size = max(vocab.values()) + 1
model_config.block_size = CONTEXT_SIZE
model = GPT(model_config)

number of parameters: 0.15M


In [13]:
# NOTE(review): this binds an ordinary Python variable that nothing in this
# notebook reads. If the CUDA debugging switch of the same name was intended,
# that is an environment variable and must be set through os.environ — TODO confirm.
CUDA_LAUNCH_BLOCKING = 1
# Start from the trainer defaults and override learning rate, iteration budget
# and the number of DataLoader workers.
train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 
train_config.max_iters = 5000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [14]:
def batch_end_callback(trainer):
    """Log the iteration time and training loss once every 100 iterations."""
    if trainer.iter_num % 100 != 0:
        return
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
# Make batch_end_callback the only 'on_batch_end' handler.
trainer.set_callback('on_batch_end', batch_end_callback)

# Train; the loop ends once train_config.max_iters iterations are done.
trainer.run()

iter_dt 0.00ms; iter 0: train loss 7.16733
iter_dt 13.27ms; iter 100: train loss 5.53629
iter_dt 14.50ms; iter 200: train loss 4.60686
iter_dt 12.09ms; iter 300: train loss 3.92753
iter_dt 12.13ms; iter 400: train loss 3.26354
iter_dt 12.06ms; iter 500: train loss 2.87742
iter_dt 14.40ms; iter 600: train loss 2.51870
iter_dt 12.07ms; iter 700: train loss 2.20422
iter_dt 12.12ms; iter 800: train loss 1.97500
iter_dt 12.14ms; iter 900: train loss 1.92439
iter_dt 12.57ms; iter 1000: train loss 1.65203
iter_dt 14.90ms; iter 1100: train loss 1.56715
iter_dt 14.71ms; iter 1200: train loss 1.44966
iter_dt 12.12ms; iter 1300: train loss 1.30608
iter_dt 12.12ms; iter 1400: train loss 1.19890
iter_dt 12.18ms; iter 1500: train loss 1.08094
iter_dt 12.25ms; iter 1600: train loss 1.00235
iter_dt 12.22ms; iter 1700: train loss 0.90197
iter_dt 12.21ms; iter 1800: train loss 0.84505
iter_dt 12.22ms; iter 1900: train loss 0.77765
iter_dt 12.03ms; iter 2000: train loss 0.74965
iter_dt 12.15ms; iter 2100

In [16]:
def eval_split(trainer, dataset, max_batches, temperature=0.5, do_sample=False, lookup=None):
    """
    Let the model complete the second half of every sequence and score the result.

    For each 8-token input the first 4 tokens are given to the model, which then
    generates the remaining 4. Every generated token is compared with the true
    token at the same position, so each sequence contributes 4 comparisons.

    Args:
        trainer: object exposing `model` (with `generate`) and `device`.
        dataset: iterable dataset of (x, y) tensor pairs; only x is used.
        max_batches: number of batches (of 10 sequences) to evaluate, or None
            to evaluate the whole dataset.
        temperature: forwarded to `generate`. With `do_sample=False` the most
            likely token is always chosen, so the temperature does not change
            the output; set `do_sample=True` to study its effect.
        do_sample: forwarded to `generate`; False means greedy argmax decoding.
        lookup: id -> token dictionary used for printing. Defaults to the
            notebook-level `vocab_lookup`.

    Returns:
        The global success rate (correct / compared tokens) as a float, or None
        when no token was compared. Success and error rates are also printed.
    """
    n = 8
    split = 4
    if lookup is None:
        lookup = vocab_lookup
    model = trainer.model

    # Dropout must be off during evaluation, otherwise even greedy decoding
    # gives different completions on every call. The previous mode is restored.
    was_training = model.training
    model.eval()

    correct = 0
    mismatch = 0
    loader = DataLoader(dataset, batch_size=10, num_workers=0, drop_last=False)
    try:
        for b, (x, _) in enumerate(loader):
            if max_batches is not None and b >= max_batches:
                break
            x = x.to(trainer.device)
            inp_full = x[:, :n]
            inp = x[:, :split]
            # only the n - split missing tokens have to be generated
            cat = model.generate(inp, n - split, temperature=temperature, do_sample=do_sample)
            sol = cat[:, :n]
            for expected, predicted in zip(inp_full.tolist(), sol.tolist()):
                text1 = ''.join(' ' + lookup[idx] for idx in expected)
                text2 = ''.join(' ' + lookup[idx] for idx in predicted)
                print("Input: %s , but gpt says: %s" % (text1, text2))
                # score every generated position, not just the first one
                for want, got in zip(expected[split:], predicted[split:]):
                    if want == got:
                        correct += 1
                    else:
                        mismatch += 1
    finally:
        model.train(was_training)

    total = correct + mismatch
    if total == 0:
        print("No tokens were evaluated.")
        return None
    success_rate = correct / total
    print("Success rate: %.4f" % success_rate)
    print("Error rate: %.4f" % (mismatch / total))
    return success_rate
    

# Evaluate up to 50 batches of the test split with gradient tracking disabled;
# test_score receives whatever eval_split returns.
with torch.no_grad():
    test_score  = eval_split(trainer, test_dataset,  max_batches=50)

Input:  movies to watch . they had a very , but gpt says:  movies to watch . they had a very
Input:  to watch . they had a very fun , but gpt says:  to watch . they had a movie was
Input:  watch . they had a very fun evening , but gpt says:  watch . they had a friend who came
Input:  . they had a very fun evening and , but gpt says:  . they had a great time but covered
Input:  they had a very fun evening and sandy , but gpt says:  they had a very grateful to be able
Input:  had a very fun evening and sandy realized , but gpt says:  had a very fun to hide the candy
Input:  a very fun evening and sandy realized she , but gpt says:  a very fun evening . the six of
Input:  very fun evening and sandy realized she had , but gpt says:  very fun evening and the six of them
Input:  fun evening and sandy realized she had missed , but gpt says:  fun evening and sandy spent a lot of
Input:  evening and sandy realized she had missed them , but gpt says:  evening and sandy realized his starter was b

Input:  n't have mint either . betty ended up , but gpt says:  n't have mint either and she heard hundreds
Input:  have mint either . betty ended up buying , but gpt says:  have mint either . he had a diagram
Input:  mint either . betty ended up buying some , but gpt says:  mint either . betty 's house quickly .
Input:  either . betty ended up buying some ice , but gpt says:  either . betty ended up moving on california
Input:  . betty ended up buying some ice some , but gpt says:  . betty ended up moving to a new
Input:  betty ended up buying some ice some cream , but gpt says:  betty ended up buying some sunglasses and headed
Input:  ended up buying some ice some cream and , but gpt says:  ended up buying some sunglasses and headed back
Input:  up buying some ice some cream and some , but gpt says:  up buying some ice was far out of
Input:  buying some ice some cream and some mint , but gpt says:  buying some ice some sunglasses and headed back
Input:  some ice some cream and some mi

Input:  she bought all the ingredients at the store , but gpt says:  she bought all the dip and double dipped
Input:  bought all the ingredients at the store . , but gpt says:  bought all the ingredients to the store todd
Input:  all the ingredients at the store . when , but gpt says:  all the ingredients at his house . he
Input:  the ingredients at the store . when she , but gpt says:  the ingredients at the community pool with her
Input:  ingredients at the store . when she came , but gpt says:  ingredients at the store . inside she walked
Input:  at the store . when she came home , but gpt says:  at the store . inside she walked around
Input:  the store . when she came home she , but gpt says:  the store . when inside she walked around
Input:  store . when she came home she remembered , but gpt says:  store . when she arrived at her new
Input:  . when she came home she remembered her , but gpt says:  . when she came out it was completely
Input:  when she came home she remembered her

Input:  for extra money . they are totes that , but gpt says:  for extra money . eventually his winnings were
Input:  extra money . they are totes that are , but gpt says:  extra money . they searched for a shower
Input:  money . they are totes that are sold , but gpt says:  money . they are wrecked . she is
Input:  . they are totes that are sold to , but gpt says:  . they are totes to both . after
Input:  they are totes that are sold to mother , but gpt says:  they are totes that 's what he went
Input:  are totes that are sold to mother 's , but gpt says:  are totes that are wrecked . she is
Input:  totes that are sold to mother 's typically , but gpt says:  totes that are sold . it was delicious
Input:  that are sold to mother 's typically . , but gpt says:  that are sold to walked around looking for
Input:  are sold to mother 's typically . they , but gpt says:  are sold to mother terrified eyes . jane
Input:  sold to mother 's typically . they like , but gpt says:  sold to mother '

In [18]:
def eval_split(trainer, dataset, max_batches, temperature=0.1, do_sample=False):
    """Compare gpt continuations with the reference tokens and report rates.

    For every sequence in ``dataset`` the first ``split`` tokens are handed to
    the model as a prompt and the model generates a continuation. The tokens
    at positions ``split .. n-1`` of the model output are then compared with
    the reference tokens at the same positions. Each reference sequence is
    printed as text (through the notebook-level ``vocab_lookup``) next to the
    model's version, and the global success/error rates are printed at the end.

    Args:
        trainer: object exposing a ``device`` attribute; batches are moved there.
        dataset: dataset yielding ``(x, y)`` pairs; only ``x`` is used.
            Assumes ``x`` holds token ids indexable by ``vocab_lookup``
            -- TODO confirm against the dataset definition.
        max_batches: stop after this many batches; ``None`` runs the whole
            dataset.
        temperature: forwarded unchanged to ``model.generate``.
        do_sample: forwarded unchanged to ``model.generate``.

    Returns:
        The global success rate (matching tokens / compared tokens), or
        ``None`` when no token was compared.
    """
    n = 8      # tokens shown per sequence; also the end of the scored window
    split = 4  # prompt length; positions split..n-1 are the ones scored
    correct = 0
    mismatch = 0
    loader = DataLoader(dataset, batch_size=10, num_workers=0, drop_last=False)
    for b, (x, _) in enumerate(loader):
        x = x.to(trainer.device)
        reference = x[:, :n]
        prompt = x[:, :split]
        # NOTE(review): this uses the notebook-level `model`, not `trainer`.
        # The original call was annotated as "greedy argmax, not sampling"
        # for do_sample=False.
        cat = model.generate(prompt, n, temperature=temperature, do_sample=do_sample)
        sol = cat[:, :n]
        for i in range(x.size(0)):
            expected = reference[i].tolist()
            predicted = sol[i].tolist()
            # Every token is prefixed with a space, so the printed line keeps
            # the exact layout of the original output.
            text1 = ''.join(' ' + vocab_lookup[idx] for idx in expected)
            text2 = ''.join(' ' + vocab_lookup[idx] for idx in predicted)
            print("Input: %s , but gpt says: %s" % (text1, text2))
            # Score every position after the prompt (the last n - split
            # tokens); zip also copes with sequences shorter than n.
            for want, got in zip(expected[split:n], predicted[split:n]):
                if want == got:
                    correct += 1
                else:
                    mismatch += 1

        if max_batches is not None and b + 1 >= max_batches:
            break

    # Report after the loop so the rates are shown even when the loader runs
    # out before max_batches is reached (or when max_batches is None).
    total = correct + mismatch
    if total == 0:
        print("No tokens were evaluated.")
        return None
    success_rate = correct / total
    error_rate = mismatch / total
    print("Success rate: %.4f" % success_rate)
    print("Error rate: %.4f" % error_rate)
    return success_rate
    

# Score the model on the test split. Gradient tracking is switched off for
# the whole call because evaluation never back-propagates.
with torch.no_grad():
    test_score = eval_split(trainer=trainer, dataset=test_dataset, max_batches=50)

Input:  movies to watch . they had a very , but gpt says:  movies to watch . they had a lot
Input:  to watch . they had a very fun , but gpt says:  to watch . they had a . when
Input:  watch . they had a very fun evening , but gpt says:  watch . they had a lot to catch
Input:  . they had a very fun evening and , but gpt says:  . they had a great time but covered
Input:  they had a very fun evening and sandy , but gpt says:  they had a very cute and popular .
Input:  had a very fun evening and sandy realized , but gpt says:  had a very fun to hide the candy
Input:  a very fun evening and sandy realized she , but gpt says:  a very fun evening . the six of
Input:  very fun evening and sandy realized she had , but gpt says:  very fun evening and the six of them
Input:  fun evening and sandy realized she had missed , but gpt says:  fun evening and sandy spent six of them
Input:  evening and sandy realized she had missed them , but gpt says:  evening and sandy realized she should stop being


Input:  worked at an electronic store . one day , but gpt says:  worked at an electronic decided to make a
Input:  at an electronic store . one day he , but gpt says:  at an electronic store . inside she walked
Input:  an electronic store . one day he had , but gpt says:  an electronic store . inside she walked around
Input:  electronic store . one day he had customers , but gpt says:  electronic store . one day rex met a
Input:  store . one day he had customers who , but gpt says:  store . one day he bought a scratching
Input:  . one day he had customers who were , but gpt says:  . one day he bought a scratching ticket
Input:  one day he had customers who were unsure , but gpt says:  one day he had just graduated college .
Input:  day he had customers who were unsure . , but gpt says:  day he had customers and drove home and
Input:  he had customers who were unsure . he , but gpt says:  he had customers who won the lottery .
Input:  had customers who were unsure . he convinced , but g

Input:  inch . and within one year cade was , but gpt says:  inch . and within did n't stop to
Input:  . and within one year cade was the , but gpt says:  . and within one day he bought a
Input:  and within one year cade was the tallest , but gpt says:  and within one year . he decided he
Input:  within one year cade was the tallest boy , but gpt says:  within one year cade of it 's that
Input:  one year cade was the tallest boy in , but gpt says:  one year cade was willing to help me
Input:  year cade was the tallest boy in his , but gpt says:  year cade was the place needed meat she
Input:  cade was the tallest boy in his class , but gpt says:  cade was the tallest car hits her and
Input:  was the tallest boy in his class . , but gpt says:  was the tallest boy felt . gina realized
Input:  the tallest boy in his class . cade , but gpt says:  the tallest boy in her class . then
Input:  tallest boy in his class . cade was , but gpt says:  tallest boy in his me up . however
Input:  boy i

Input:  knocked over and newspaper was everywhere shredded . , but gpt says:  knocked over and newspaper of coffee . john
Input:  over and newspaper was everywhere shredded . ben , but gpt says:  over and newspaper was glad that the weather
Input:  and newspaper was everywhere shredded . ben called , but gpt says:  and newspaper was everywhere she had been that
Input:  newspaper was everywhere shredded . ben called for , but gpt says:  newspaper was everywhere shredded . he had just
Input:  was everywhere shredded . ben called for his , but gpt says:  was everywhere shredded . he had just seen
Input:  everywhere shredded . ben called for his dog , but gpt says:  everywhere shredded . ben had gained a few
Input:  shredded . ben called for his dog sternly , but gpt says:  shredded . ben called the friend who came
Input:  . ben called for his dog sternly . , but gpt says:  . ben called for a friend who came
Input:  ben called for his dog sternly . but , but gpt says:  ben called for his c