In [2]:
!git clone https://github.com/Eric-Wallace/universal-triggers.git

Cloning into 'universal-triggers'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 45 (delta 3), reused 3 (delta 0), pack-reused 31[K
Unpacking objects: 100% (45/45), done.


In [5]:
!pip install allennlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/bb/bb/041115d8bad1447080e5d1e30097c95e4b66e36074277afce8620a61cee3/allennlp-0.9.0-py3-none-any.whl (7.6MB)
[K     |████████████████████████████████| 7.6MB 396kB/s 
[?25hCollecting word2number>=1.1
  Downloading https://files.pythonhosted.org/packages/4a/29/a31940c848521f0725f0df6b25dca8917f13a2025b0e8fcbe5d0457e45e6/word2number-1.1.zip
Collecting conllu==1.3.1
  Downloading https://files.pythonhosted.org/packages/ae/54/b0ae1199f3d01666821b028cd967f7c0ac527ab162af433d3da69242cea2/conllu-1.3.1-py2.py3-none-any.whl
Collecting parsimonious>=0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/02/fc/067a3f89869a41009e1a7cdfb14725f8ddd246f30f63c645e8ef8a1c56f4/parsimonious-0.8.1.tar.gz (45kB)
[K     |████████████████████████████████| 51kB 8.3MB/s 
Collecting pytorch-transformers==1.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/50/89/ad0d6bb932d0a51793eaabcf1617a36ff530dc9ab9e38f7

In [0]:
import os
PATH = '/content/universal-triggers/sst'
os.chdir(PATH)

In [96]:
import sys
import os.path
from sklearn.neighbors import KDTree
import torch
import torch.optim as optim
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader
from allennlp.data.iterators import BucketIterator, BasicIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.embedding import _read_pretrained_embeddings_file
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training.trainer import Trainer
from allennlp.common.util import lazy_groups_of
from allennlp.data.token_indexers import SingleIdTokenIndexer
sys.path.append('..')
import utils
import attacks

# Simple LSTM classifier that uses the final hidden state to classify Sentiment. Based on AllenNLP
class LstmClassifier(Model):
    def __init__(self, word_embeddings, encoder, vocab):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, tokens, label):
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)
        return output

    def get_metrics(self, reset=False):
        return {'accuracy': self.accuracy.get_metric(reset)}

EMBEDDING_TYPE = "w2v" # what type of word embeddings to use


### MAIN
# load the binary SST dataset.
single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True) # word tokenizer
# use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                token_indexers={"tokens": single_id_indexer},
                                                use_subtrees=True)
train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                token_indexers={"tokens": single_id_indexer})
dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
# test_dataset = reader.read('data/sst/test.txt')

vocab = Vocabulary.from_instances(train_data)

# Randomly initialize vectors
if EMBEDDING_TYPE == "None":
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
    word_embedding_dim = 300

# Load word2vec vectors
elif EMBEDDING_TYPE == "w2v":
    embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
    weight = _read_pretrained_embeddings_file(embedding_path,
                                                embedding_dim=300,
                                                vocab=vocab,
                                                namespace="tokens")
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=300,
                                weight=weight,
                                trainable=False)
    word_embedding_dim = 300

# Initialize model, cuda(), and optimizer
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                hidden_size=512,
                                                num_layers=2,
                                                batch_first=True))
model = LstmClassifier(word_embeddings, encoder, vocab)
model.cuda()

# where to save the model
model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
# if the model already exists (its been trained), load the pre-trained weights and vocabulary
if os.path.isfile(model_path):
    vocab = Vocabulary.from_files(vocab_path)
    model = LstmClassifier(word_embeddings, encoder, vocab)
    with open(model_path, 'rb') as f:
        model.load_state_dict(torch.load(f))
# otherwise train model from scratch and save its weights
else:
    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    optimizer = optim.Adam(model.parameters())
    trainer = Trainer(model=model,
                        optimizer=optimizer,
                        iterator=iterator,
                        train_dataset=train_data,
                        validation_dataset=dev_data,
                        num_epochs=5,
                        patience=1,
                        cuda_device=0)
    trainer.train()
    with open(model_path, 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files(vocab_path)
model.train().cuda() # rnn cannot do backwards in train mode

# Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
# We use the gradient later in the attack.
utils.add_hooks(model)
embedding_weight = utils.get_embedding_weight(model) # also save the word embedding matrix

# Use batches of size universal_perturb_batch_size for the attacks.
universal_perturb_batch_size = 128
iterator = BasicIterator(batch_size=universal_perturb_batch_size)
iterator.index_with(vocab)

# Build k-d Tree if you are using gradient + nearest neighbor attack
# tree = KDTree(embedding_weight.numpy())

# filter the dataset to only positive or negative examples
# (the trigger will cause the opposite prediction)
dataset_label_filter = "0"
targeted_dev_data = []
for instance in dev_data:
    if instance['label'].label == dataset_label_filter:
        targeted_dev_data.append(instance)

# get accuracy before adding triggers
utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
model.train() # rnn cannot do backwards in train mode

classifier = model

# initialize triggers which are concatenated to the input
num_trigger_tokens = 3
trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

# sample batches, update the triggers, and repeat
for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True), group_size=1):
    # get accuracy with current triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
    model.train() # rnn cannot do backwards in train mode

    # get gradient w.r.t. trigger embeddings for current batch
    averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

    # pass the gradients to a particular attack to generate token candidates for each token.
    cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                    embedding_weight,
                                                    trigger_token_ids,
                                                    num_candidates=40,
                                                    increase_loss=True)
    # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
    #                                                trigger_token_ids,
    #                                                num_candidates=40)
    # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
    #                                                        embedding_weight,
    #                                                        trigger_token_ids,
    #                                                        tree,
    #                                                        100,
    #                                                        num_candidates=40,
    #                                                        increase_loss=True)

    # Tries all of the candidates and returns the trigger sequence with highest loss.
    trigger_token_ids = utils.get_best_candidates(model,
                                                    batch,
                                                    trigger_token_ids,
                                                    cand_trigger_token_ids)

# print accuracy after adding triggers
utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)

98794it [00:06, 16119.81it/s]
872it [00:00, 1103.07it/s]
100%|██████████| 98794/98794 [00:00<00:00, 169448.15it/s]
100%|██████████| 1999995/1999995 [00:34<00:00, 57593.31it/s]


Without Triggers: 0.9112149532710281
Current Triggers: the, the, the,  : 0.9182242990654206
Current Triggers: vibrant, vibrant, thought-provoking,  : 0.1939252336448598
Current Triggers: irresistibly, flavorful, improves,  : 0.08411214953271028
Current Triggers: irresistibly, flavorful, improves,  : 0.08411214953271028
Current Triggers: irresistibly, flavorful, restored,  : 0.08177570093457943
Current Triggers: irresistibly, flavorful, improves,  : 0.08411214953271028
Current Triggers: irresistibly, flavorful, improves,  : 0.08411214953271028
Current Triggers: irresistibly, flavorful, improves,  : 0.08411214953271028
Current Triggers: irresistibly, flavorful, improves,  : 0.08411214953271028
Current Triggers: irresistibly, flavorful, improves,  : 0.08411214953271028
Current Triggers: irresistibly, flavorful, improves,  : 0.08411214953271028
Current Triggers: irresistibly, flavorful, restored,  : 0.08177570093457943
Current Triggers: irresistibly, flavorful, restored,  : 0.0817757009345

In [0]:
from allennlp.data.dataset_readers.text_classification_json import TextClassificationJsonReader
reader = TextClassificationJsonReader()
text = []
text.append(reader.text_to_instance(text="irresistibly, flavorful, improves,The movie is bad", label="1"))
text.append(reader.text_to_instance(text="The movie is bad", label="1"))
text.append(reader.text_to_instance(text="irresistibly, flavorful, improves", label="1"))

In [129]:
import numpy as np
iterator = BasicIterator(batch_size=universal_perturb_batch_size)
iterator.index_with(vocab)

for i, batch in enumerate(iterator(text, num_epochs=1, shuffle=False)):
    batch = move_to_device(batch, cuda_device=0)
    ans = classifier(batch['tokens'], batch['label'])
    print(ans)

{'logits': tensor([[ 0.1357, -0.3485],
        [-2.3997,  2.4109],
        [ 5.2446, -5.5963]], device='cuda:0', grad_fn=<AddmmBackward>), 'loss': tensor(1.7663, device='cuda:0', grad_fn=<NllLossBackward>)}
