In [4]:
import numpy as np
from pytorch_transformers.tokenization_distilbert import DistilBertTokenizer
from pytorch_transformers.modeling_distilbert import DistilBertModel
import torch
import torch.nn
from torch import optim
import random

In [5]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encoder = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [6]:
noop = "N"
sub = "S"
insert = "I"
delete = "D"

In [7]:
def ld(s1, s2, subcost=1, delcost=1, inscost=1):
    operations = [["" for j in range(len(s2) + 1)] for i in range(len(s1) + 1)]
    matrix = np.zeros((len(s1)+1, len(s2)+1))
    for j in range(len(s2) + 1):
        matrix[0,j] = j
        operations[0][j] = insert
        for i in range(len(s1) + 1):
            matrix[i,0] = i
            operations[i][0] = insert
            if i > 0 and j > 0:
                subCost = matrix[i-1, j-1] if s1[i-1] == s2[j-1] else matrix[i-1, j-1] + subcost
                insertCost = matrix[i, j-1] + inscost
                deleteCost = matrix[i-1, j] + delcost
                #print("subCost %d" % subCost)
                minCost = min(subCost, insertCost, deleteCost)
                #print("minbCost %d" % minCost)
                matrix[i,j] = minCost
                if minCost == 0:
                    operations[i][j] = noop
                elif minCost == deleteCost:
                    operations[i][j] = delete
                elif minCost == insertCost:
                    operations[i][j] = insert
                elif minCost == subCost:
                    operations[i][j] = sub
    i = len(s1)
    j = len(s2)
    history = []
    while j > 0 or i > 0:        
        if delcost != np.inf:
            if j == 0:
                history.append(delete)
                i -= 1
                continue
            if matrix[i-1][j-1] < matrix[i-1,j]:
                history.append(noop)
                i -= 1
                j -= 1
            else:
                history.append(delete)
                i -= 1
        elif inscost != np.inf:
            if j == 0:
                history.append(noop)
                i -= 1
                continue
            if matrix[i-1][j-1] < matrix[i,j-1]:
                history.append(noop)
                i -= 1
                j -= 1
            else:
                history.append((insert,s2[j-1]))
                #history.append(insert)
                j -= 1
    history.reverse()
    return matrix, matrix[len(s1),len(s2)], history
    


In [8]:
from torch.utils.data import Dataset
import json
class QADataset(Dataset):
    def __init__(self, path):
        with open(path, "r") as infile:
            self.data = json.load(infile)

    def __len__(self):
        return len(self.data["Data"])

    def sample(self):
        #print(self.data["Data"][random.randint(0, len(self.data["Data"]))])
        return self.data["Data"][random.randint(0, len(self.data["Data"]))]["Question"]
    


In [16]:
from torch import nn

PLH = "<PLH>"
TS = "<s>"
TE = "</s>"

class PlaceholderClassifier(nn.Module):
    def __init__(self, hsz, max_placeholders=10):
        super().__init__()
        self.dense = nn.Linear(
            hsz, max_placeholders,
        )
        self.activation = nn.ReLU()

    def forward(self, input, hidden=None):
        return self.activation(self.dense(input))
    
class TokenClassifier(nn.Module):    
    def __init__(self, hsz, vsz, max_seq_len):
        super().__init__()
        
        self.dense = nn.Linear(
            hsz
        )
        self.activation = nn.ReLU()
        
    def forward(self, input, hidden=None):
        s = input.sum(dim=1)
        return self.activation(self.dense(s))
        
class DeletionClassifier(nn.Module):    
    def __init__(self, hsz):
        super().__init__()
        self.dense = nn.Linear(
            hsz, 2,
        )
        self.activation = nn.ReLU()
    
    def forward(self, input, hidden=None):
        return self.activation(self.dense(input))

In [10]:
dataset = QADataset("/virtualmachines/data/trivia_qa/qa/wikipedia-train.json")
dataset.sample()

'Errol Brown who died last month was best known as a member of which band?'

In [17]:
class Model():
    def __init__(self, tokenizer, encoder, hsz=768, alpha=0, beta=0, lr=0.0001):
        super().__init__()
        self.tokenizer = tokenizer
        self.encoder = encoder
        self.p_classifier = PlaceholderClassifier(hsz)
        self.t_classifier = TokenClassifier(hsz)
        self.d_classifier = DeletionClassifier(hsz)        
        self.alpha = 0.5
        self.beta = 0.5
        self.p_loss = nn.NLLLoss()
        self.t_loss = nn.NLLLoss()
        self.d_loss = nn.NLLLoss()
        
        self.optims = {
            'p_classifier': optim.SGD(self.p_classifier.parameters(), lr=lr),
            't_classifier': optim.SGD(self.t_classifier.parameters(), lr=lr),
            'd_classifier': optim.SGD(self.d_classifier.parameters(), lr=lr),
        }
        
        self.dataset = QADataset("/virtualmachines/data/trivia_qa/qa/wikipedia-train.json")

        
    def delete_random(self, y):
        for i in range(3, len(y) - 4): # don't delete <s> or </s>
            if random.random() > 0.05:
                y[i] = None
        return str([yi for yi in y if yi is not None])

    def delete_minimal(self, y, y_ground):
        if len(y_ground) > len(y): # if y_ground is longer than y, there is no sequence of deletes that will give a smaller LD
            return y
        y = y[3:len(y) - 4]
        y_ground = y_ground[3:len(y_ground) - 4] 
        matrix, dist, edits = ld(y,y_ground, subcost=np.inf, inscost=np.inf)
        deleted = 0
        for i in range(len(edits)):
            if edits[i] == "D":
                y = y[:i-deleted] + y[i-deleted+1:]
                deleted += 1
        return TS + str(y) + TE

    def insert_minimal(self, y, y_ground):
        if len(y) > len(y_ground): # if y is larger than y_ground, no sequence of inserts that will give a smaller LD
            return y
        y = y[3:len(y) - 4]
        y_ground = y_ground[3:len(y_ground) - 4] 
        matrix, dist, edits = ld(y,y_ground, subcost=np.inf, delcost=np.inf)
        inserted = 0
        i = 0
        p = y
        for edit in edits:
            if edit[0] == "I":
                y = str(y[:i+inserted]) + edit[1] + str(y[i+inserted+1:])
                p = str(p[:i+inserted]) + PLH + str(y[i+inserted+1:])
                inserted += 1
            i += 1
        return TS + y + TE, TS + p + TE
            

    def sample(self, alpha, beta):
        u = random.random()
        v = random.random()
        y_ground = TS + self.dataset.sample() + TE # produce a pair of strings (i.e. untokenized)
        y0 = TS + TE
        if len(y0) > 2: # "<s></s>" is an empty sequence, so no deletion is possible
            if u >= alpha:
                y_ins = self.delete_random(y0)
            else:
                y_ins = self.delete_minimal(y0, y_ground)       

            y_ins_prime_p,y_ins_prime_t = self.insert_minimal(y_ins, y_ground)
            y_ins_prime_p = torch.LongTensor([self.tokenizer.encode(y_ins_prime_p)])
            y_ins_prime_t = self.tokenizer.encode(y_ins_prime_t)
            y_ins_prime_t = torch.LongTensor([y_ins_prime_t])
            
            #y_ins_prime_t = self.encoder(y_ins_prime_t)[0]
            
            y_ins = torch.LongTensor([self.tokenizer.encode(y_ins)])
            #y_ins = self.encoder(y_ins)[0]
            
            if v > alpha:
                y_del = y0
                y_del = self.tokenizer.encode(y_del)
                y_del = torch.LongTensor([y_del])
                #y_del = self.encoder(y_del)[0]
            else:
                logits = self.t_classifier(self.encoder(y_ins_prime_t)[0])
                y_del = torch.argmax(logits,dim=2)
        
        return y_del, y_ins, y_ins_prime_p, y_ins_prime_t
    
    def zero_grad(self):
        for optimizer in self.optims.values():
            optimizer.zero_grad()

    def update_params(self):
        for optimizer in self.optims.values():
            optimizer.step()
   
    def train_step(self):
        y_del, y_ins, y_ins_prime_p, y_ins_prime_t = self.sample(self.alpha, self.beta)
        preds_placeholders = self.p_classifier(self.encoder(y_ins)[0])
        preds_inserts = self.t_classifier(self.encoder(y_ins_prime_p)[0])
        preds_deletes = self.d_classifier(self.encoder(y_del)[0])

        loss = 0
        self.zero_grad()
        self.encoder.train()
        
        loss = self.d_loss(preds_deletes, y_del)
        loss += self.t_loss(preds_inserts, y_ins_prime_t)
        loss += self.p_loss(preds_placeholders, y_ins)
        loss.backward()
        self.update_params()
model = Model(tokenizer, encoder)
model.train_step()

TypeError: __init__() missing 2 required positional arguments: 'vsz' and 'max_seq_len'

In [24]:
insert_minimal("<s>fello</s>","<s>ellodude</s>")

NameError: name 'insert_minimal' is not defined

TypeError: delete_minimal() missing 1 required positional argument: 'y_ground'