In [22]:
import numpy as np
from torch.nn.modules.transformer import Transformer, TransformerEncoder, TransformerEncoderLayer
from pytorch_transformers.tokenization_distilbert import DistilBertTokenizer
from pytorch_transformers.modeling_distilbert import DistilBertModel
import torch
import torch.nn
from torch import optim
import random

from torch.utils.data import Dataset
import json
from torch import nn

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
special_tokens_dict = {'additional_special_tokens': ['<PLH>', '<s>','</s>']}
tokenizer.add_special_tokens(special_tokens_dict)
#encoder = DistilBertModel.from_pretrained('distilbert-base-uncased').cuda()
#encoder.resize_token_embeddings(len(tokenizer))  

noop = "N"
sub = "S"
insert = "I"
delete = "D"

def ld(s1, s2, subcost=1, delcost=1, inscost=1):
    operations = [["" for j in range(len(s2) + 1)] for i in range(len(s1) + 1)]
    
    matrix = np.zeros((len(s1)+1, len(s2)+1))
    
    for j in range(len(s2) + 1):
        matrix[0,j] = j
        operations[0][j] = insert
        for i in range(len(s1) + 1):
            matrix[i,0] = i
            operations[i][0] = insert
            if i > 0 and j > 0:
                subCost = matrix[i-1, j-1] if s1[i-1] == s2[j-1] else matrix[i-1, j-1] + subcost
                insertCost = matrix[i, j-1] + inscost
                deleteCost = matrix[i-1, j] + delcost
                minCost = min(subCost, insertCost, deleteCost)
                matrix[i,j] = minCost
                if minCost == 0:
                    operations[i][j] = noop
                elif minCost == deleteCost:
                    operations[i][j] = delete
                elif minCost == insertCost:
                    operations[i][j] = insert
                elif minCost == subCost:
                    operations[i][j] = sub
    i = len(s1)
    j = len(s2)
    history = []
    while j > 0 or i > 0:        
        if delcost != np.inf:
            if j == 0:
                history.append(delete)
                i -= 1
                continue
            if matrix[i-1][j-1] < matrix[i-1,j]:
                history.append(noop)
                i -= 1
                j -= 1
            else:
                history.append(delete)
                i -= 1
        elif inscost != np.inf:
            if j == 0:
                history.append(noop)
                i -= 1
                continue
            if matrix[i-1][j-1] < matrix[i,j-1]:
                history.append(noop)
                i -= 1
                j -= 1
            else:
                history.append((insert,s2[j-1]))
                #history.append(insert)
                j -= 1
    history.reverse()
    return matrix, matrix[len(s1),len(s2)], history
    


In [48]:
class QADataset(Dataset):
    def __init__(self, path):
        with open(path, "r") as infile:
            self.data = json.load(infile)

    def __len__(self):
        return len(self.data["Data"])

    def sample(self):
        key = "Question" if random.random() > 0.5 else "Answer"
        return "<s> " + self.data["Data"][random.randint(0, len(self.data["Data"]) - 1)]["Question"] + " </s>"
    
plh = tokenizer.encode("<PLH>")[0]
ts = tokenizer.encode("<s>")[0]
te = tokenizer.encode("</s>")[0]

class PlaceholderClassifier(nn.Module):
    def __init__(self, hsz, transformer, max_placeholders=5):
        super().__init__()
        self.dense = nn.Linear(
            hsz*2,max_placeholders
        )
        self.hsz = hsz
        self.transformer = transformer

    def forward(self, input, hidden=None):
        transformed = self.transformer(input)
        concatenated = torch.cat([transformed[:, :-1, :], transformed[:, 1:, :]], 2)
        #print(concatenated.size())
        #print(self.transformer)
        return self.dense(concatenated)
    
class TokenClassifier(nn.Module):    
    def __init__(self, hsz, embedding, transformer, vsz, max_seq_len):
        super().__init__()
        self.embedding = embedding
        self.dense = nn.Linear(
            hsz,vsz
        )
        self.transformer = transformer
        
    def forward(self, input, hidden=None):
        return self.dense(self.transformer(self.embedding(input)))
        
class DeletionClassifier(nn.Module):    
    def __init__(self, hsz, transformer):
        super().__init__()
        self.dense = nn.Linear(
            hsz, 2,
        )
        self.transformer = transformer
    
    def forward(self, input, hidden=None):
        return self.dense(self.transformer(input))

class TransformerEncoderLayers(nn.Module):
    def __init__(self, vsz, hsz,num_heads=8, num_layers=2):
        super().__init__()
        
        self.transformer = TransformerEncoder(TransformerEncoderLayer(hsz, num_heads), num_layers)
        
    def forward(self, input, hidden=None):
        return self.transformer(input)
    
class Embedding(nn.Module):
    def __init__(self, vsz, hsz):
        super().__init__()
        self.embedding = nn.Embedding(
            vsz, hsz,
        )
        
    def forward(self, input, hidden=None):
        return self.embedding(input)
    
dataset = QADataset("/virtualmachines/data/trivia_qa/qa/wikipedia-train.json")
dataset.sample()

'<s> The subtitle of the 2015 Mad Max film is? </s>'

In [49]:
def deleted_indices_to_placeholders(deleted, max_placeholders):
    i = 0
    placeholders = []
    num_deleted = 0
    while True:
        if i == len(deleted):
            if num_deleted > 0:
                placeholders.append(min(num_deleted, max_placeholders))
            break
        if deleted[i] == 1:
            num_deleted += 1
        else:
            if num_deleted > 0:
                placeholders.append(min(num_deleted, max_placeholders))
            placeholders.append(0)
            num_deleted = 0
        i += 1
    while len(placeholders) < len(deleted):
        placeholders.append(0)
    return torch.unsqueeze(torch.LongTensor(placeholders), 0)


#delete_random([tokenizer.encode("Hello, my name is "), tokenizer.encode("What is your name?")])

In [50]:
def delete_minimal(y, y_ground, pad=True, max_placeholders=0):
    """Apply the sequence of deletions from y that give the smallest possible Levenshtein distance from y_ground
    Returns tensors of:
    - the token sequence post-deletion
    - the token sequence post-deletion, with PLH inserted at each deleted position
    - the number of placeholders inserted at each post-deletion index
    """
    batched = torch.zeros(len(y),)
    
    # if y_ground is longer than y, there is no sequence of deletes with a shorter distance
    if len(y_ground) > len(y): 
        return y
    
    # calculate LD directly against tokens
    deleted = []
    matrix, dist, edits = ld(y.numpy(),y_ground.numpy(), subcost=np.inf, inscost=np.inf)
    num_deleted = 0
    for i in range(len(edits)):
        if edits[i] == "D":
            deleted.append(1)
            num_deleted += 1
        else:
            deleted.append(0)
    placeholders = deleted_indices_to_placeholders(deleted,max_placeholders=max_placeholders)
    post_deletion, post_deletion_with_placeholders = deleted_boolean_to_tensors(y, deleted, num_deleted)    
    return post_deletion, post_deletion_with_placeholders, torch.LongTensor([deleted])#,placeholders

def pad(items, pad_len=10, pad_token=0, pad_dim=1):
    if type(items) == list:
        if pad_len == 0:
            pad_len = max([len(i) for i in items])
        copy = items.copy()
        for i in copy:
            while len(i) < pad_len:
                i.append(pad_token)
        return copy
    else:
        copy = torch.zeros((items.size(0),pad_len), dtype=items.dtype)
        copy[:,:items.size(1)] = items
        return copy
    
def insert_minimal(y, y_ground,max_placeholders):
    """Apply the sequence of insertions to y resulting in the smallest possible Levenshtein distance from y_ground
    Accepts tensor of:
    - bsz * max_seq_len
    Returns tensors of:
    - size (n+1), where the value at position 0 <= i < n represents the number of PLH tags inserted at that position
        - n is the number of tokens in y
    - size(k) containing the indices of each inserted token, where k is the total number of tokens added
    - size(n+k) containing the tokens of the entire post-insertion sequence
    """

    batch_placeholders = []
    batch_inserted = []
    batch_new = []
    for batch_idx in range(len(y)):
        matrix, dist, edits = ld(y[batch_idx],y_ground[batch_idx], subcost=np.inf, delcost=np.inf)
        inserted = 0
        y_placeholders = []
        y_inserted = []
        y_new = []
        y_idx = 0
        i = 0
        while i < len(edits):
            if edits[i][0] == "I":
                accum = 0
                while i < len(edits) and edits[i][0] == "I":
                    y_inserted.append(edits[i][1])
                    y_new.append(edits[i][1])
                    accum += 1
                    i += 1
                y_placeholders.append(min(accum,max_placeholders-1))
            else:
                y_placeholders.append(0)
                y_new.append(y[batch_idx][y_idx])
                i += 1
                y_idx += 1
        batch_placeholders.append(y_placeholders)
        batch_inserted.append(y_inserted)
        batch_new.append(y_new)
    batch_placeholders = pad(batch_placeholders, pad_token=tokenizer.pad_token_id)

    batch_new = pad(batch_new, pad_token=tokenizer.pad_token_id)
    
    return batch_placeholders, batch_inserted, torch.LongTensor(batch_new)
   

In [51]:
def apply_deletion(source, deletions):
    def apply():
        yield ts
        for i in range(1, len(source) - 1):
            if deletions[i] == 0:
                yield source[i].item()  
        yield te
    return list(apply())

def placeholders_from_mask(source, placeholder_mask, strip_tags=False, max_seq_len=50):
    if type(source) == list:
            source = torch.LongTensor(source)
    if type(placeholder_mask) == list:
        placeholder_mask = torch.LongTensor(placeholder_mask)
    
    def generate():
        yield ts
        yielded = 1
        for i in range(1, placeholder_mask.size(0)):
            if yielded >= max_seq_len:
                break
            if source[i].item() == te:
                placeholder_mask[i] = 0
                yield te
            else:
                if placeholder_mask[i] == 1:
                    yield plh
                    yielded += 1
                yield source[i].item()
                yielded += 1
    placeholders= list(generate())
    deletion_mask = [1 if p == plh else 0 for p in placeholders]
    return torch.LongTensor(placeholders), torch.LongTensor(deletion_mask)

In [62]:
class InsertionInput():
    '''
    Wraps tensors of;
    - the original sequence
    - the post-deletion sequence
    - the post-deletion sequence (including placeholders)
    - boolean indicating whether the tokens at index i was deleted
    '''
    def __init__(self, ground, max_placeholders=5, p_del=0.05,pad_len=200):
        ground = pad(ground, pad_len=max([len(x) for x in ground]))
        self.ground = torch.cuda.LongTensor(ground)
        self.p_del = p_del
        self.max_placeholders = max_placeholders
        self.pad_len = pad_len
        self.delete_random()
        
    def delete_random(self):
        """Deletes token(s) randomly from the passed (tokenized) string with probability p
        Accepts:
        - a list of token sequences bsz * pad_length
        Returns tensors of:
        - the token sequence post-deletion
        - the token sequence post-deletion, with PLH inserted at each deleted position
        - the number of placeholders inserted at each post-deletion index
        """
        # don't delete <s> or </s> tags
        deletion_mask = torch.cuda.FloatTensor(self.ground.size()).uniform_() > (1 - self.p_del)
        deletion_mask[self.ground == ts] = False
        deletion_mask[self.ground == te] = False
        self.deleted = self.ground.clone() * (deletion_mask == False).long()
        if self.deleted.size(1) > self.pad_len:
            self.deleted = self.deleted[:,:self.pad_len]
        else:
            self.deleted = pad(self.deleted, pad_len=self.pad_len)
        
        self.placeholders = self.ground.clone()
        
        self.placeholders[deletion_mask == 1] = plh
        
        # sum every adjacent deletion to get the total number of placeholders to be inserted at that position
        def get_placeholders(batch_idx):
            accum = 0
            seq_len = deletion_mask.size(1) - 2 # exclude start and end tags
            for i in range(seq_len):
                if deletion_mask[batch_idx,i] == 1:
                    accum += 1
                else:
                    if accum > 0:
                        yield min(accum, self.max_placeholders - 1)
                    yield 0
                    accum = 0
            if accum > 0:
                yield min(accum, self.max_placeholders - 1)
        
        self.num_placeholders = [list(get_placeholders(i)) for i in range(self.ground.size(0))]
        self.num_placeholders = pad(self.num_placeholders, pad_len=self.deleted.size(1) - 1)
        self.num_placeholders = torch.LongTensor(self.num_placeholders)
        #assert self.num_placeholders.size(1) == self.deleted.size(1) - 1
        if self.num_placeholders.size(1) != self.deleted.size(1) - 1:
            self.num_placeholders = self.num_placeholders[:,self.deleted.size(1) - 1]

class DeletionInput():
    """Randomly inserts tokens into the sequence
    Wraps tensors of:
    - the post-insertion sequence
    - the indices of the inserted tokens
    """ 
    def __init__(self, insertion_input, t_classifier, p_ins=0.05, max_inserts=5, pad_len=200):
        self.to_delete = torch.zeros((insertion_input.ground.size(0),pad_len),dtype=torch.long)
        with_placeholders = torch.zeros((insertion_input.ground.size(0),pad_len), dtype=torch.long)
        
        # generate placeholder/deletion mask for each sample
        for batch_idx in range(insertion_input.ground.size(0)):
            seq_len = insertion_input.ground[batch_idx].size(0)
            placeholder_mask = torch.cuda.FloatTensor(seq_len).uniform_() > (1 - p_ins)
            placeholder_mask = placeholder_mask.long()
            placeholders, deletion_mask = placeholders_from_mask(insertion_input.ground[batch_idx], placeholder_mask)
            with_placeholders[batch_idx,:placeholders.size(0)] = placeholders
            self.to_delete[batch_idx, :len(deletion_mask)] = deletion_mask
        
        # replace all PLH tokens with tokens from the insertion classifier
        insert_tokens = t_classifier(with_placeholders.cuda())
        
        insert_tokens = torch.argmax(insert_tokens,dim=2)

        self.with_tokens = with_placeholders.clone()
        
        for batch_idx in range(insertion_input.ground.size(0)):
            for i in range(insert_tokens.size(1)):
                if self.with_tokens[batch_idx,i] == te:
                    break
                    
                if self.with_tokens[batch_idx,i] == plh:
                    self.with_tokens[batch_idx,i] = insert_tokens[batch_idx,i]
        
        if insertion_input.ground.size(1) < pad_len:
            self.ground_padded = pad(insertion_input.ground,pad_len=pad_len)
        else:
            print(tokenizer.decode(insertion_input.ground))
            self.ground_padded = None
        

In [67]:
class Model():
    def __init__(self, vocab_size, hsz=768, lr=0.0001, max_placeholders=5, alpha=0, beta=0):
        super().__init__()
        self.transformer = TransformerEncoderLayers(len(tokenizer), hsz)
        self.p_classifier = PlaceholderClassifier(hsz, self.transformer).cuda()
        self.embedder = Embedding(vocab_size,hsz)
        self.t_classifier = TokenClassifier(hsz, self.embedder, self.transformer, vocab_size, 20).cuda()
        self.d_classifier = DeletionClassifier(hsz, self.transformer).cuda()
        
        self.dataset = QADataset("/virtualmachines/data/trivia_qa/qa/wikipedia-train.json")
        self.tokenizer = tokenizer
        self.alpha = alpha
        self.beta = beta
        self.max_placeholders = max_placeholders             
        
        self.alpha = 0.5
        self.beta = 0.5
        self.p_loss = nn.CrossEntropyLoss()
        self.t_loss = nn.CrossEntropyLoss()
        self.d_loss = nn.CrossEntropyLoss()
        
        self.optims = {
            'p_classifier': optim.SGD(self.p_classifier.parameters(), lr=lr),
            't_classifier': optim.SGD(self.t_classifier.parameters(), lr=lr),
            'd_classifier': optim.SGD(self.d_classifier.parameters(), lr=lr),
        }
        
        self.step = 0
        self.loss = 0
                        
    def sample(self, bs=10, pad_to=30,max_samples=30):
        '''Sample an observation and return tensor tuples:
        
        2) (a) 1(a), but with PLH replacing each deleted token. For input to the token insertion classifier 
           (b) the number of placeholders tokens to insert at each index in 1(a) (for the placeholder classifier loss)
        3) (a) the observation perturbed with insertion (tokens), for input to the token deletion classifier 
           (b) 2(a) (for the token insertion classifier loss)
        '''
        
        v = random.random()
        
        # sample an untokenized string
        ground = [self.dataset.sample() for i in range(bs)]
        # tokenize/encode
        encoded = [self.tokenizer.encode(s) for s in ground]
        
        deletion_input = None
        drawn = 0
        while deletion_input is None or deletion_input.ground_padded is None:
            if drawn > max_samples:
                raise Exception("Reached max number of sample iterations without a valid sample")
            drawn += 1
            insertion_input = InsertionInput(encoded)
            deletion_input = DeletionInput(insertion_input, self.t_classifier)
    
        return insertion_input, deletion_input
    
    def zero_grad(self):
        for optimizer in self.optims.values():
            optimizer.zero_grad()

    def update_params(self):
        for optimizer in self.optims.values():
            optimizer.step()
            
    def encode_list(self, tokens):
        if type(tokens) == list:
            return encoder(torch.LongTensor([tokens]).cuda())[0]
        return encoder(tokens)[0]
            
    def pretty_print(self, label, tokens):
        if type(tokens) == torch.Tensor:
            tokens = tokens.tolist()
        pretty = "%s \n %s" % (label, tokenizer.decode(tokens))
        pretty = pretty.replace("[PAD]","")
        print(pretty)
        
    def decode_step(self):
        with torch.no_grad():
            self.p_classifier.eval()
            self.t_classifier.eval()
            self.d_classifier.eval()

            insertion_input, deletion_input = self.sample(bs=1)
            #self.pretty_print("Ground truth", insertion_input.ground[0])
            #self.pretty_print("Ground truth post-deletion", insertion_input.deleted[0].tolist())

            step = 0
            max_steps = 5
            last = insertion_input.deleted
            ground = last
            
            while True:            
                self.pretty_print("Last", last[0])
                if step > max_steps:
                    ground = last
                    break
                
                if step > 0 and (last.size() == ground.size() and torch.all(last == ground)):
                    break
                
                print("Decode step %d" % step)
                
                step += 1    
                
                # run a deletion pass
                preds_deletes = self.d_classifier(self.embedder(ground.cuda()))
                if preds_deletes.size(1) > 0:
                    deletions = torch.argmax(preds_deletes,2)
                    assert ground.size(1) >= deletions.size(1)
                    deleted = [ground[0,i].item() for i in range(deletions.size(1)) if deletions[0,i] == 0]
                    ground = torch.LongTensor([deleted])
                    self.pretty_print("Post-deletion", deleted)
                else:
                    print("No deletions")
                
                preds_placeholders = self.p_classifier(self.embedder(ground.cuda()).cuda())
                # then run a placeholder pass                    
                if preds_placeholders.size(1) == 0:
                    print("No placeholders")
                    continue
                    
                #print("preds_placeholders")                
                #print(preds_placeholders.size())                
                placeholders = torch.argmax(preds_placeholders,2)
                #print("ground")
                #print(ground.size())
                #print("placeholders:")
                #print(placeholders.size())
                
                reconstructed = [ground[0,0].item()]
                for i in range(1, ground.size(1) - 1): # skip first (<s>) and last (</s>) tokens 
                    for j in range(placeholders[0,i]):
                        if len(reconstructed) < 512:
                            reconstructed.append(plh)
                    if len(reconstructed) < 512:
                        item = ground[0,i].item()
                        reconstructed.append(item)
                        if item == te:
                            break

                self.pretty_print("Post-placeholders", reconstructed)
                      
                # then an insertion pass
                preds_inserts = self.t_classifier(torch.cuda.LongTensor([reconstructed]))
                inserts = torch.argmax(preds_inserts, 2)
                #print("inserts")
                #print(inserts.size())
                
                output = [reconstructed[0]]
                for i in range(1,len(reconstructed)):
                    if reconstructed[i] == plh:
                        output.append(inserts[0,i].item())
                    else:
                        output.append(reconstructed[i])
                self.pretty_print("Post-insert", output)
                last = torch.LongTensor([output])
            if last is not None:
                #print(last[0].tolist())
                print(tokenizer.decode(last[0].tolist()))
            else:
                print("Couldn't decode")
            
    def train_step(self):
        loss = 0
        self.zero_grad()
        self.p_classifier.train()
        self.t_classifier.train()
        self.d_classifier.train()
        
        insertion_input, deletion_input = self.sample(bs=1)
        
        preds_deletes = self.d_classifier(self.embedder(deletion_input.to_delete.cuda()).cuda())
        preds_placeholders = self.p_classifier(self.embedder(insertion_input.deleted.cuda()))
        preds_inserts = self.t_classifier(deletion_input.with_tokens.cuda())
        
        loss = self.d_loss(torch.transpose(preds_deletes, 1, 2).cuda(), deletion_input.to_delete.cuda())
        loss += self.p_loss(torch.transpose(preds_placeholders, 1, 2).cuda(), insertion_input.num_placeholders.cuda())
        loss += self.t_loss(torch.transpose(preds_inserts,1,2).cuda(), deletion_input.ground_padded.cuda())
        
        self.step += 1
        self.loss += loss
        if self.step % 250 == 0:
            print(self.step)
            self.pretty_print("Ground", insertion_input.ground[0])
            self.pretty_print("Input to deletion classifier", deletion_input.with_tokens[0])
            self.pretty_print("Expected deletion classifier output", apply_deletion(deletion_input.with_tokens[0], deletion_input.to_delete[0]))
            self.pretty_print("Input to placeholder classifier", insertion_input.deleted[0])
            #print(placeholders_from_mask(insertion_input.deleted[0], insertion_input.num_placeholders[0]))
            self.pretty_print("Expected placeholder classifier output", placeholders_from_mask(insertion_input.deleted[0], insertion_input.num_placeholders[0])[0])
            self.pretty_print("Input to insertion classifier", insertion_input.placeholders[0])
            self.pretty_print("Expected insertion classifier output", deletion_input.ground_padded[0])
        if self.step % 500 == 0:
            print(self.step)
            print(self.loss / 50)
            self.loss = 0
            self.decode_step()
        
        loss.backward()
        self.update_params()

model = Model(len(tokenizer))
#for i in range(100000):
#    model.train_step()
#model.decode_step()

In [70]:
model.decode_step()

Last 
  <s>what is one of the world's most prestigious scientific journals, first published on 4 november 1869, one of the few that publish original research articles across a wide range of scientific fields? </s>                                                                                                                                                                
Decode step 0
Post-deletion 
 what is one of the world's most prestigious journals, first published on 4 november 1869, one of the that publish original research across a wide range of fields? </s>                                                                                                                                                                
Post-placeholders 
 what is one of the world's most prestigious journals, first published on 4 november 1869, one of the that publish original research across a wide range of fields? </s>
Post-insert 
 what is one of the world's most prestigious journals, first publis

In [71]:
for i in range(10000):
    model.train_step()

2250
Ground 
  <s>where was the first 360 degree roller coaster in england </s>
Input to deletion classifier 
  <s>where was the first 360 degree roller coaster in england </s>                                                                                                                                                                                           
Expected deletion classifier output 
  <s>where was the first 360 degree roller coaster in england </s>                                                                                                                                                                                           </s>
Input to placeholder classifier 
  <s>where was the first 360 degree roller  in england </s>                                                                                                                                                                                           
Expected placeholder classifier output 
  <s>where was the fi

3250
Ground 
  <s>the wife of humphrey verdon roe, with whom she founded a ground - breaking clinic in britain, was also an author, palaeobotanist and campaigner for women's rights. what was the field in which she is known for pioneering effective, widespread and consistent action? </s>
Input to deletion classifier 
  <s>the wife annexation of humphrey verdon roe, with whom she founded a ground - breaking clinic in britain, was also an author, palaeobotanist and campaigner for women's rights solitary. what was the field in which she is                                                                                                                                                      
Expected deletion classifier output 
  <s>the wife of humphrey verdon roe, with whom she founded a ground - breaking clinic in britain, was also an author, palaeobotanist and campaigner for women's rights. what was the field in which she is                                                                    

4000
Ground 
  <s>who was the ferryman over the styx? </s>
Input to deletion classifier 
  <s>who was the ferryman over the styx? </s>                                                                                                                                                                                            
Expected deletion classifier output 
  <s>who was the ferryman over the styx? </s>                                                                                                                                                                                            </s>
Input to placeholder classifier 
  <s>who was the ferryman over the styx? </s>                                                                                                                                                                                            
Expected placeholder classifier output 
  <s>who was the ferryman over the styx? </s>                                       
Input to i

4750
Ground 
  <s>a cicatrix on the skin is more commonly called a what? </s>
Input to deletion classifier 
  <s>a cicatrix on the skin is more commonly called a what? </s>                                                                                                                                                                                       
Expected deletion classifier output 
  <s>a cicatrix on the skin is more commonly called a what? </s>                                                                                                                                                                                       </s>
Input to placeholder classifier 
  <s>a cicatrix on the skin is more commonly  a what? </s>                                                                                                                                                                                       
Expected placeholder classifier output 
  <s>a cicatrix on the skin is more comm

5750
Ground 
  <s>how many medals did gb win in the 2012 olympics? </s>
Input to deletion classifier 
  <s>how many medals did gb win in the 2012 olympics? </s>                                                                                                                                                                                          
Expected deletion classifier output 
  <s>how many medals did gb win in the 2012 olympics? </s>                                                                                                                                                                                          </s>
Input to placeholder classifier 
  <s>how many medals did gb win in the 2012 olympics? </s>                                                                                                                                                                                          
Expected placeholder classifier output 
  <s>how many medals did gb win in the 2012 olymp

6750
Ground 
  <s>in which country were nobel prize winners alan macdiarmid, maurice wilkins and ernest rutherford born? </s>
Input to deletion classifier 
  <s>in which country were nobel prize winners alan macdiarmid, maurice wilkins and  ernest rutherford born? </s>                                                                                                                                                                                
Expected deletion classifier output 
  <s>in which country were nobel prize winners alan macdiarmid, maurice wilkins and ernest rutherford born? </s>                                                                                                                                                                                </s>
Input to placeholder classifier 
  <s>in which country were nobel prize winners alan diarmid, maurice wilkins and ernest rutherford born? </s>                                                                                  

7500
Ground 
  <s>kate moss currently advertises which brand of cosmetics? </s>
Input to deletion classifier 
  <s>kate moss currently advertises which brand of cosmetics? </s>                                                                                                                                                                                          
Expected deletion classifier output 
  <s>kate moss currently advertises which brand of cosmetics? </s>                                                                                                                                                                                          </s>
Input to placeholder classifier 
  <s>kate moss currently advertises which brand of ? </s>                                                                                                                                                                                          
Expected placeholder classifier output 
  <s>kate moss currently a

8250
Ground 
  <s>which irish poet and dramatist was awarded the nobel prize in literature in 1923? </s>
Input to deletion classifier 
  <s> which irish poet and dramatist was awarded the nobel prize in literature in 1923? </s>                                                                                                                                                                                    
Expected deletion classifier output 
  <s>which irish poet and dramatist was awarded the nobel prize in literature in 1923? </s>                                                                                                                                                                                    </s>
Input to placeholder classifier 
  <s>which irish poet and dramatist was awarded the nobel prize in literature in 1923? </s>                                                                                                                                                           

9250
Ground 
  <s>what is a surgical procedure to improve the function or the appearance of a human nose? </s>
Input to deletion classifier 
  <s>what is a surgical  procedure to improve the function or the appearance of a human nose? </s>                                                                                                                                                                                   
Expected deletion classifier output 
  <s>what is a surgical procedure to improve the function or the appearance of a human nose? </s>                                                                                                                                                                                   </s>
Input to placeholder classifier 
  <s>what  a surgical procedure to improve the function or the appearance of a human nose? </s>                                                                                                                                       

10250
Ground 
  <s>on which planet is olympus mons? </s>
Input to deletion classifier 
  <s>on which planet is olympus mons? </s>                                                                                                                                                                                              
Expected deletion classifier output 
  <s>on which planet is olympus mons? </s>                                                                                                                                                                                              </s>
Input to placeholder classifier 
  <s>on  planet is olympus mons? </s>                                                                                                                                                                                              
Expected placeholder classifier output 
  <s>on <PLH> planet is olympus mons? </s>                                        
Input to insertion cla

11000
Ground 
  <s>name the famous male ballet dancer who defected to the west in 1961 while dancing with the kirov ballet in paris? </s>
Input to deletion classifier 
  <s>name the famous male ballet dancer who defected  to the west in 1961 while dancing with the kirov ballet in  paris? </s>                                                                                                                                                                            
Expected deletion classifier output 
  <s>name the famous male ballet dancer who defected to the west in 1961 while dancing with the kirov ballet in paris? </s>                                                                                                                                                                            </s>
Input to placeholder classifier 
  <s>name the famous male ballet dancer who defected to the west  1961 while dancing with the kirov ballet in paris? </s>                                           

11750
Ground 
  <s>which danish explorer discovered alaska in the eighteenth century? </s>
Input to deletion classifier 
  <s>which danish explorer discovered alaska in the eighteenth century? </s>                                                                                                                                                                                           
Expected deletion classifier output 
  <s>which danish explorer discovered alaska in the eighteenth century? </s>                                                                                                                                                                                           </s>
Input to placeholder classifier 
  <s>which danish explorer discovered alaska in the eighteenth century? </s>                                                                                                                                                                                           
Expected plac

In [None]:
tokenizer.encode("hello how are you")