In [1]:
import numpy as np
from pytorch_transformers.tokenization_distilbert import DistilBertTokenizer
from pytorch_transformers.modeling_distilbert import DistilBertModel
import torch
import torch.nn
from torch import optim
import random

In [2]:
# Load the pretrained uncased DistilBERT tokenizer and register "<PLH>" as an
# additional special token.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
special_tokens_dict = {'additional_special_tokens': ['<PLH>']}
tokenizer.add_special_tokens(special_tokens_dict)
# Pretrained DistilBERT model used as the encoder.
# NOTE(review): nothing visible here resizes the encoder's embeddings after the
# special token is added -- confirm that "<PLH>" ids are never fed to the encoder.
encoder = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Labels for the edit operations that can appear in the history returned by ld().
noop = "N"
sub = "S"
insert = "I"
delete = "D"


def ld(s1, s2, subcost=1, delcost=1, inscost=1):
    """Fill a Levenshtein-style cost matrix for s1 -> s2 and trace back an edit script.

    Args:
        s1: source sequence (anything supporting len() and integer indexing).
        s2: target sequence.
        subcost: cost added on the diagonal when the two elements differ.
        delcost: cost of dropping an element of s1.
        inscost: cost of adding an element of s2.

    Returns:
        A tuple ``(matrix, distance, history)``:
        - ``matrix`` is the (len(s1)+1, len(s2)+1) float cost table,
        - ``distance`` is ``matrix[len(s1), len(s2)]``,
        - ``history`` is the edit script in left-to-right order.

        The traceback only knows two modes:
        - ``delcost`` finite (this includes the default costs): one entry per
          element of s1, each either ``noop`` ("N") or ``delete`` ("D").
        - ``delcost`` infinite and ``inscost`` finite: entries are ``noop``
          or ``(insert, token)`` tuples carrying the inserted s2 element.
        Substitutions are never reported in the history.

    Raises:
        ValueError: if both ``delcost`` and ``inscost`` are infinite while at
            least one sequence is non-empty (no traceback mode applies; the
            previous implementation looped forever in that case).
    """
    n_rows = len(s1) + 1
    n_cols = len(s2) + 1

    matrix = np.zeros((n_rows, n_cols))
    # The borders use unit steps regardless of the configured costs; this is
    # kept on purpose so the returned matrix/distance are unchanged.
    matrix[0, :] = np.arange(n_cols)
    matrix[:, 0] = np.arange(n_rows)

    for j in range(1, n_cols):
        for i in range(1, n_rows):
            diagonal = matrix[i - 1, j - 1]
            if not s1[i - 1] == s2[j - 1]:
                diagonal = diagonal + subcost
            matrix[i, j] = min(
                diagonal,
                matrix[i, j - 1] + inscost,
                matrix[i - 1, j] + delcost,
            )

    i = len(s1)
    j = len(s2)
    can_delete = delcost != np.inf
    can_insert = inscost != np.inf
    if (i > 0 or j > 0) and not (can_delete or can_insert):
        raise ValueError(
            "ld() cannot trace back an edit script when both delcost and "
            "inscost are infinite"
        )

    history = []
    if can_delete:
        # Exactly one op per element of s1. Stopping at i == 0 avoids the
        # negative-index wraparound (matrix[-1]) that used to append bogus
        # ops or raise IndexError when s2 still had unmatched elements.
        while i > 0:
            if j > 0 and matrix[i - 1, j - 1] < matrix[i - 1, j]:
                history.append(noop)
                j -= 1
            else:
                history.append(delete)
            i -= 1
    elif can_insert:
        while i > 0 or j > 0:
            if j == 0:
                # s2 is exhausted: the remaining s1 elements are kept as-is.
                history.append(noop)
                i -= 1
            elif i > 0 and matrix[i - 1, j - 1] < matrix[i, j - 1]:
                history.append(noop)
                i -= 1
                j -= 1
            else:
                # Also taken when s1 is exhausted (i == 0): every remaining
                # s2 element has to be an insertion.
                history.append((insert, s2[j - 1]))
                j -= 1

    history.reverse()
    return matrix, matrix[len(s1), len(s2)], history
    


In [3]:
from torch.utils.data import Dataset
import json
class QADataset(Dataset):
    """Dataset backed by a JSON file whose top-level "Data" list holds records
    that each carry a "Question" field."""

    def __init__(self, path):
        """Load the whole JSON document at `path` into memory."""
        # JSON text is UTF-8 by specification; do not depend on the locale default.
        with open(path, "r", encoding="utf-8") as infile:
            self.data = json.load(infile)

    def __len__(self):
        """Return the number of records in the "Data" list."""
        return len(self.data["Data"])

    def sample(self):
        """Return the "Question" value of one uniformly chosen record.

        random.randint(0, n) includes n, so the previous implementation could
        index one past the end of the list; random.choice stays in range.
        """
        return random.choice(self.data["Data"])["Question"]
    
from torch import nn

# String form of the placeholder marker; same text as the special token
# registered with the tokenizer.
PLH = "<PLH>"
# Presumably sequence start / end markers -- TODO confirm; neither is used by
# the definitions visible in this file.
TS = "<s>"
TE = "</s>"

class PlaceholderClassifier(nn.Module):
    """Linear head producing `max_placeholders` scores per input position,
    followed by a ReLU."""

    def __init__(self, hsz, max_placeholders=10):
        super(PlaceholderClassifier, self).__init__()
        self.dense = nn.Linear(hsz, max_placeholders)
        self.activation = nn.ReLU()

    def forward(self, input, hidden=None):
        # `hidden` is accepted for signature compatibility but is not used.
        scores = self.dense(input)
        return self.activation(scores)
    
class TokenClassifier(nn.Module):
    """Linear head producing `vsz` scores per input position, followed by a ReLU."""

    def __init__(self, hsz, vsz, max_seq_len):
        # `max_seq_len` is part of the signature but is not used by the layer.
        super(TokenClassifier, self).__init__()
        self.dense = nn.Linear(hsz, vsz)
        self.activation = nn.ReLU()

    def forward(self, input, hidden=None):
        # `hidden` is accepted for signature compatibility but is not used.
        scores = self.dense(input)
        return self.activation(scores)
        
class DeletionClassifier(nn.Module):
    """Linear head producing two scores per input position, followed by a ReLU."""

    def __init__(self, hsz):
        super(DeletionClassifier, self).__init__()
        self.dense = nn.Linear(hsz, 2)
        self.activation = nn.ReLU()

    def forward(self, input, hidden=None):
        # `hidden` is accepted for signature compatibility but is not used.
        scores = self.dense(input)
        return self.activation(scores)
# Load the question file and draw one question as a quick smoke test.
dataset = QADataset("/virtualmachines/data/trivia_qa/qa/wikipedia-train.json")
dataset.sample()
# Id used for placeholders: first element of the tokenizer's encoding of "<PLH>".
plh = tokenizer.encode("<PLH>")[0]
plh

30522

In [4]:
def deleted_indices_to_placeholders(deleted):
    """Collapse a 0/1 deletion mask into per-slot placeholder counts.

    Each run of consecutive 1s contributes its length as one entry and each 0
    contributes a 0 entry; the result is right-padded with zeros up to
    len(deleted) and returned as a LongTensor with a leading dimension of 1.
    """
    counts = []
    run_length = 0
    for flag in deleted:
        if flag == 1:
            run_length += 1
            continue
        if run_length > 0:
            counts.append(run_length)
        counts.append(0)
        run_length = 0
    # Flush a run that reaches the end of the mask.
    if run_length > 0:
        counts.append(run_length)
    for _ in range(len(deleted) - len(counts)):
        counts.append(0)
    return torch.LongTensor(counts).unsqueeze(0)

def apply_deletions(y, deleted, pad=False):
    """Apply a per-token deletion mask to every sequence of a batch.

    Accepts:
    - y: batch of token sequences
    - deleted: batch of masks aligned with y; 0 keeps a token, anything else drops it
    - pad: accepted but not used
    Returns two lists of lists:
    - each sequence with the dropped tokens removed
    - each sequence with every dropped token replaced by the global `plh` id
    """
    kept_batch = []
    masked_batch = []
    for row_idx, row in enumerate(y):
        kept = []
        masked = []
        for pos, token in enumerate(row):
            if deleted[row_idx][pos] == 0:
                kept.append(token)
                masked.append(token)
            else:
                masked.append(plh)
        kept_batch.append(kept)
        masked_batch.append(masked)
    return kept_batch, masked_batch

def delete_random(y, p=0.25, pad=True):
    """Mark each token of each sequence for deletion independently with probability p.

    Accepts:
    - y: a batch (list) of token sequences
    - p: per-token deletion probability
    - pad: accepted but not used
    Returns three lists of lists:
    - the token sequences post-deletion
    - the token sequences with PLH substituted at each deleted position
    - the 0/1 deletion mask drawn for each sequence (1 = deleted)
    """
    # One random draw per token, in batch order then token order.
    batch_deletions = [
        [1 if random.random() < p else 0 for _ in sequence]
        for sequence in y
    ]
    post_deletion, post_deletion_with_placeholders = apply_deletions(y, batch_deletions)
    return post_deletion, post_deletion_with_placeholders, batch_deletions

# Quick check of delete_random on two encoded sentences (the cell output follows).
delete_random([tokenizer.encode("Hello, my name is "), tokenizer.encode("What is your name?")])

([[7592, 1010, 2171], [2054, 2003, 2171, 1029]],
 [[7592, 1010, 30522, 2171, 30522], [2054, 2003, 30522, 2171, 1029]],
 [[0, 0, 1, 0, 1], [0, 0, 1, 0, 0]])

In [5]:
def delete_minimal(y, y_ground, pad=True):
    """Derive a deletion mask for y from the ld() edit script against y_ground.

    Calls ld() on y.numpy() and y_ground.numpy() with the substitution and
    insertion costs set to infinity, then marks every "D" entry of the edit
    script as 1 and every other entry as 0.

    Returns (normal path):
    - post_deletion and post_deletion_with_placeholders, as produced by
      deleted_boolean_to_tensors
    - the deletion mask as a LongTensor with a leading dimension of 1

    NOTE(review): deleted_boolean_to_tensors is not defined in the visible
    source -- confirm it exists elsewhere, otherwise this raises NameError.
    NOTE(review): y and y_ground must expose .numpy(); the sampler in this file
    appears to pass plain lists -- confirm the expected input type.
    The `pad` argument is not used.
    """
    batched = torch.zeros(len(y),)  # not used below
    
    # if y_ground is longer than y, there is no sequence of deletes with a shorter distance
    # NOTE(review): this path returns y alone, while the normal path returns a 3-tuple.
    if len(y_ground) > len(y): 
        return y
    
    # calculate LD directly against tokens
    deleted = []
    matrix, dist, edits = ld(y.numpy(),y_ground.numpy(), subcost=np.inf, inscost=np.inf)
    num_deleted = 0
    for i in range(len(edits)):
        if edits[i] == "D":
            deleted.append(1)
            num_deleted += 1
        else:
            deleted.append(0)
    # Computed but not returned (the return below has it commented out).
    placeholders = deleted_indices_to_placeholders(deleted)
    post_deletion, post_deletion_with_placeholders = deleted_boolean_to_tensors(y, deleted, num_deleted)    
    return post_deletion, post_deletion_with_placeholders, torch.LongTensor([deleted])#,placeholders

def pad(items, pad_len=0, pad_token=0):
    """Right-pad every sequence in `items` to a common length, in place.

    Args:
        items: batch of sequences; a sequence only needs ``append`` when it is
            actually shorter than the target length.
        pad_len: target length; 0 means "length of the longest sequence".
        pad_token: value appended to short sequences.

    Returns:
        The same `items` object. Sequences already at or beyond the target
        length are left untouched (nothing is truncated).
    """
    if pad_len == 0:
        # default=0 keeps an empty batch from raising ValueError in max().
        pad_len = max((len(seq) for seq in items), default=0)
    for seq in items:
        for _ in range(pad_len - len(seq)):
            seq.append(pad_token)
    return items
    
def insert_minimal(y, y_ground):
    """Rebuild each sequence of y_ground from the matching sequence of y using insertions only.

    Accepts:
    - y: batch (list) of token sequences
    - y_ground: batch of target token sequences, aligned with y
    Returns:
    - per sequence, a list of counts: 0 for every token taken from y and the run
      length for every run of consecutive insertions; padded across the batch
    - per sequence, the list of inserted tokens (not padded)
    - a LongTensor holding the merged sequences, padded across the batch
    """
    batch_placeholders = []
    batch_inserted = []
    batch_new = []
    for row, source in enumerate(y):
        # ld() with infinite substitution/deletion cost yields "N" entries and
        # ("I", token) tuples.
        _, _, edits = ld(source, y_ground[row], subcost=np.inf, delcost=np.inf)
        counts = []
        added = []
        merged = []
        pending = 0   # length of the insertion run currently being read
        consumed = 0  # number of tokens of `source` already copied
        for op in edits:
            if op[0] == "I":
                added.append(op[1])
                merged.append(op[1])
                pending += 1
                continue
            if pending:
                counts.append(pending)
                pending = 0
            counts.append(0)
            merged.append(source[consumed])
            consumed += 1
        # Flush an insertion run that reaches the end of the script.
        if pending:
            counts.append(pending)
        batch_placeholders.append(counts)
        batch_inserted.append(added)
        batch_new.append(merged)
    batch_placeholders = pad(batch_placeholders, pad_token=tokenizer.pad_token_id)
    batch_new = pad(batch_new, pad_token=tokenizer.pad_token_id)
    return batch_placeholders, batch_inserted, torch.LongTensor(batch_new)

# Demo inputs: the strings encoded as y2 / y4 are prefixes of those encoded as y1 / y3.
y1 = tokenizer.encode("My name is Nick, what is your name")
y2 = tokenizer.encode("My name is Nick")
y3 = tokenizer.encode("What's the dog bro you say now")
y4 = tokenizer.encode("What's the")

#delete_minimal(torch.LongTensor([y1]), torch.LongTensor([y2]))
# Insert into y2 / y4 to reach y1 / y3 (the cell output follows).
insert_minimal([y2,y4], [y1,y3])
#y = torch.LongTensor([tokenizer.encode("Hi dude My name is Nick what name")])
#y_ground = torch.LongTensor([tokenizer.encode("My name is Nick, what is your name")])

#placeholders, inserted, new = insert_minimal(y, y_ground)


([[0, 0, 0, 0, 5], [0, 0, 0, 0, 5]],
 [[1010, 2054, 2003, 2115, 2171], [3899, 22953, 2017, 2360, 2085]],
 tensor([[ 2026,  2171,  2003,  4172,  1010,  2054,  2003,  2115,  2171],
         [ 2054,  1005,  1055,  1996,  3899, 22953,  2017,  2360,  2085]]))

In [None]:
class DatasetSampler():
    """Draws questions from the QA dataset and turns them into
    (encoded input, target) pairs for the three classifiers."""

    def __init__(self, tokenizer, encoder, alpha=0, beta=0):
        """Store the tokenizer, encoder and mixing thresholds and load the dataset.

        alpha is compared against random draws in sample(); beta is stored but
        not read anywhere in this class.
        """
        self.dataset = QADataset("/virtualmachines/data/trivia_qa/qa/wikipedia-train.json")
        self.tokenizer = tokenizer
        self.encoder = encoder
        self.alpha = alpha
        self.beta = beta
            
    def encode_and_pad(self, string, pad_length):
        """Encode one string, right-pad with the tokenizer's pad id up to
        pad_length, and return it as a LongTensor with a leading dimension of 1."""
        string = self.tokenizer.encode(string)
        while len(string) < pad_length:
            string.append(self.tokenizer.pad_token_id)
        return torch.LongTensor([string])
    
    '''
        Encode the passed strings and pad to the specified length
    '''
    def encode_and_pad_batch(self, strings):
        """Encode every string and return one int64 tensor, zero-padded on the
        right to the length of the longest encoding in the batch."""
        encoded = [self.tokenizer.encode(s) for s in strings]
        pad_length = max([len(s) for s in encoded])
        padded = torch.zeros(len(strings), pad_length, dtype=torch.int64) # because self.tokenizer.pad_token_id == 0
        for i in range(len(encoded)):
            for j in range(len(encoded[i])):
                padded[i,j] = encoded[i][j]
        return padded
        
    def sample(self, bs=10, pad_to=30):
        '''Sample bs questions and return three (encoder output, target) pairs:
        1) (encoder(y_del)[0], y_ins_p): input for the deletion classifier and
           the 0/1 deletion mask returned by the deletion step
        2) (encoder(y_ins)[0], y_placeholders): input for the placeholder
           classifier and the per-position insertion counts from insert_minimal
        3) (encoder(y_ins_prime)[0], y_ins): input for the token classifier and
           the post-deletion token ids

        pad_to is not used by the active code.
        '''
        u = random.random()
        v = random.random()
        # sample a pair of (untokenized) strings 
        y_ground = [self.dataset.sample() for i in range(bs)]
        y0 = ""
        
        # first, pad/encode y0 and y_ground
        y_ground = [self.tokenizer.encode(s) for s in y_ground]
        #y_ground = self.encode_and_pad_batch(y_ground)
        #y0 = self.encode_and_pad(y0, pad_to)
        
        # y0 is bound to the same list object as y_ground.
        y0 = y_ground
        #print("y0")
        #print(y0)
        # randomly choose between LD-minimal deletion and random deletion
        # Returns tensors of:
        # - the token sequence post-deletion
        # - the token sequence post-deletion, with PLH inserted at each deleted position
        # - whether a given index was deleted or not
        # With the default alpha=0 this condition is always true.
        if u >= self.alpha:
            y_ins, y_ins_with_placeholders, y_ins_p = delete_random(y0)
        else:
            y_ins, y_ins_with_placeholders, y_ins_p = delete_minimal(y0, y_ground)
        #print("y_ins")
        #print(y_ins)
        # Returns tensors of:
        # - (1, n+1) - the number of PLH tags at each index
        # - (1, k) - the inserted tokens
        # - (1, n+k) - the entire post-insertion sequence
        # where n is the length of the original sequence and k is the number of tokens added 
        y_placeholders, y_inserted, y_ins_prime = insert_minimal(y_ins, y_ground)

        # input to the deletion classifier will be randomly chosen between:
        # - the input to the placeholder classifier (i.e. the deleted input)
        # - the output from applying the token classifier to y_placeholders
        # NOTE(review): this compares against alpha again; beta is never read --
        # confirm which threshold is intended.
        if v >= self.alpha:
            # Same list object as y_ins, not a copy.
            y_del = y_ins
        else:
            # NOTE(review): self.t_classifier is not assigned anywhere in this
            # class; this branch raises AttributeError unless it is attached
            # from outside -- confirm.
            enc = self.encoder(y_ins_with_placeholders)[0]
            logits = self.t_classifier(enc)
            y_del = torch.argmax(logits,dim=2)
            y_del = torch.LongTensor(y_del)

        # y_ins will be one token shorter than y_placeholders to account for placeholders added to the end of the sequence
        #if len(y_ins) != len(y_placeholders):
            #y_ins_copy = torch.zeros(y_ins.size(0), y_ins.size(1) + 1, dtype=torch.int64)
            #y_ins_copy[:,:y_ins.size(1)] = y_ins
            #y_ins = y_ins_copy
        #    y_ins.append(0)
        y_ins_p = torch.LongTensor(pad(y_ins_p, pad_token=0))
        
        # pad() appends to the row lists in place, so when y_del is y_ins this
        # call also pads y_ins to the width of y_ins_p; the later steps rely on
        # this statement order.
        y_del = torch.LongTensor(pad(y_del, pad_len=y_ins_p.size(1), pad_token=0))
        
        y_ins = torch.LongTensor(pad(y_ins, pad_token=0))
        y_placeholders = torch.LongTensor(pad(y_placeholders, pad_len=y_ins.size(1), pad_token=0))
        
        # y_ins_prime is already a LongTensor returned by insert_minimal.
        y_ins_prime = torch.LongTensor(pad(y_ins_prime, pad_token=0))
        
        #print(y_ins_prime.size())
        #print(y_ins.size())
        
        return ((self.encoder(y_del)[0], y_ins_p), 
               (self.encoder(y_ins)[0], y_placeholders), 
               (self.encoder(y_ins_prime)[0], y_ins))
    
class Model():
    """Bundles the three classifier heads with their losses and optimizers and
    runs training steps on batches drawn from a sampler."""

    def __init__(self, sampler, vocab_size, hsz=768, lr=0.0001):
        """Create the heads, one cross-entropy loss and one SGD optimizer per head.

        Args:
            sampler: object whose ``sample(bs=...)`` returns three
                (features, target) pairs, in the order deletion / placeholder /
                token-insertion.
            vocab_size: number of output classes of the token classifier.
            hsz: feature size expected by every head.
            lr: learning rate shared by the three optimizers.
        """
        super().__init__()
        self.sampler = sampler
        self.p_classifier = PlaceholderClassifier(hsz)
        self.t_classifier = TokenClassifier(hsz, vocab_size, 20)
        self.d_classifier = DeletionClassifier(hsz)
        self.alpha = 0.5
        self.beta = 0.5
        self.p_loss = nn.CrossEntropyLoss()
        self.t_loss = nn.CrossEntropyLoss()
        self.d_loss = nn.CrossEntropyLoss()

        self.optims = {
            'p_classifier': optim.SGD(self.p_classifier.parameters(), lr=lr),
            't_classifier': optim.SGD(self.t_classifier.parameters(), lr=lr),
            'd_classifier': optim.SGD(self.d_classifier.parameters(), lr=lr),
        }

        # Losses are computed on the GPU when one is available; fall back to
        # the CPU instead of crashing in .cuda() on machines without CUDA.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.step = 0
        self.loss = 0

    def zero_grad(self):
        """Clear the gradients held by every optimizer."""
        for optimizer in self.optims.values():
            optimizer.zero_grad()

    def update_params(self):
        """Apply one optimizer step for every head."""
        for optimizer in self.optims.values():
            optimizer.step()

    def train_step(self):
        """Run one optimization step on a freshly sampled batch of 10.

        The three cross-entropy losses are summed and back-propagated; the
        running mean loss is printed every 50 steps.
        """
        self.zero_grad()
        for head in (self.p_classifier, self.t_classifier, self.d_classifier):
            head.train()

        (y_del, y_del_out), (y_ins, y_ins_out), (y_ins_prime, y_ins_prime_out) = self.sampler.sample(bs=10)

        preds_deletes = self.d_classifier(y_del).to(self.device)
        preds_placeholders = self.p_classifier(y_ins).to(self.device)
        preds_inserts = self.t_classifier(y_ins_prime).to(self.device)

        # CrossEntropyLoss wants the class dimension second, hence the transposes.
        loss = self.d_loss(torch.transpose(preds_deletes, 1, 2), y_del_out.to(self.device))
        loss += self.p_loss(torch.transpose(preds_placeholders, 1, 2), y_ins_out.to(self.device))
        loss += self.t_loss(torch.transpose(preds_inserts, 1, 2), y_ins_prime_out.to(self.device))

        self.step += 1
        # Accumulate a plain float: adding the tensor itself would keep every
        # step's autograd history reachable until the next reset.
        self.loss += loss.item()
        if self.step % 50 == 0:
            print(self.step)
            print(self.loss / 50)
            self.loss = 0

        loss.backward()
        self.update_params()

# Build the sampler and the model, then run 10000 training steps
# (the printed step counts and running losses follow).
sampler = DatasetSampler(tokenizer, encoder)
model = Model(sampler, tokenizer.vocab_size)
for i in range(10000):
    model.train_step()


50
tensor(12.9842, device='cuda:0', grad_fn=<DivBackward0>)
100
tensor(12.6423, device='cuda:0', grad_fn=<DivBackward0>)
150
tensor(12.3041, device='cuda:0', grad_fn=<DivBackward0>)
200
tensor(11.9483, device='cuda:0', grad_fn=<DivBackward0>)


In [None]:
# NOTE(review): the Model class visible in this file defines no delete_random
# method (delete_random is a module-level function) -- confirm this call.
model.delete_random("<s>hellodude</s>")