In [None]:
!pip install allennlp==1.0.0
!pip install allennlp-models==1.0.0

In [None]:
from typing import Dict
from operator import itemgetter
import numpy as np
import torch
import torch.optim as optim
from copy import deepcopy
from allennlp.data import DataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import GradientDescentTrainer
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader

#from allennlp.common.util import lazy_groups_of
from allennlp.modules.token_embedders.embedding import _read_pretrained_embeddings_file
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.modules.text_field_embedders import TextFieldEmbedder
from allennlp.nn.util import move_to_device

In [None]:
class LstmClassifier(Model):
    """Text classifier: token embedder -> sequence-to-vector encoder -> linear layer over the labels."""

    def __init__(self, embedder, encoder, vocab):
        """
        Args:
            embedder: module that maps the ``tokens`` text-field tensors to embeddings.
            encoder: module called as ``encoder(embeddings, mask)``; must expose ``get_output_dim()``.
            vocab: vocabulary; the size of its ``'labels'`` namespace sets the number of output classes.
        """
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder

        num_labels = vocab.get_vocab_size('labels')
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=num_labels)

        self.accuracy = CategoricalAccuracy()
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, tokens, label=None):
        """
        Run the classifier on one batch.

        Returns a dict with ``"logits"`` and ``"probs"`` (softmax over dim 1). When ``label`` is
        given, the accuracy metric is updated and a cross-entropy ``"loss"`` entry is added.
        """
        mask = get_text_field_mask(tokens)
        logits = self.linear(self.encoder(self.embedder(tokens), mask))
        output = {"logits": logits, "probs": torch.softmax(logits, dim=1)}

        if label is None:
            return output

        self.accuracy(logits, label)
        output["loss"] = self.loss_function(logits, label)
        return output

    def get_metrics(self, reset=False):
        """Return ``{'accuracy': ...}``; ``reset`` is forwarded to the underlying metric."""
        accuracy = self.accuracy.get_metric(reset)
        return {'accuracy': accuracy}

In [None]:
# Tokens are lower-cased and each mapped to a single integer id.
single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
# Training reader: 2-class granularity, with use_subtrees=True.
reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class", token_indexers={"tokens": single_id_indexer}, use_subtrees=True)
train_dataset = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt')
# Dev reader: same granularity and indexer, but use_subtrees is left at the reader's default.
reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class", token_indexers={"tokens": single_id_indexer})
dev_dataset = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt')

In [None]:
# Build the vocabulary from the training and dev instances together.
# NOTE(review): this lets dev-set tokens into the vocabulary — confirm that is intended.
vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
print("created a vocab,", vocab)
# Attach the vocabulary to both datasets.
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)
# Batches of 32 drawn by a BucketBatchSampler that sorts on the "tokens" field.
train_data_loader = DataLoader(train_dataset,
                         batch_sampler=BucketBatchSampler(
                             train_dataset,
                             batch_size=32,
                             sorting_keys=["tokens"]))
dev_data_loader = DataLoader(dev_dataset,
                         batch_sampler=BucketBatchSampler(
                             dev_dataset,
                             batch_size=32,
                             sorting_keys=["tokens"]))

In [None]:
# Which token embedding to build:
#   None  -> freshly initialised Embedding layer
#   "w2v" -> frozen (trainable=False) layer initialised from the pretrained vector file below
EMBEDDING_TYPE = "w2v"
# Defined for every branch so later cells can rely on it regardless of EMBEDDING_TYPE.
embedding_dim = 300
if EMBEDDING_TYPE is None:
  token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=embedding_dim)
elif EMBEDDING_TYPE == "w2v":
  embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
  weight = _read_pretrained_embeddings_file(embedding_path, embedding_dim=embedding_dim, vocab=vocab, namespace="tokens")
  token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=embedding_dim, weight=weight, trainable=False)
else:
  # Fail here rather than with a NameError on `token_embedding` in a later cell.
  raise ValueError("Unsupported EMBEDDING_TYPE: {!r} (expected None or 'w2v')".format(EMBEDDING_TYPE))

In [None]:
# Wrap the token embedding so it is applied to the "tokens" indexer output.
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
# Input size for the LSTM encoder; must match the embedding_dim (300) used for token_embedding.
EMBEDDING_DIM=300


In [None]:
# 2-layer LSTM (hidden size 512, batch-first) wrapped as a sequence-to-vector encoder.
encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, hidden_size=512, num_layers=2, batch_first=True))

In [None]:
# Assemble the classifier from the embedder, encoder and vocabulary built in the cells above.
model = LstmClassifier(word_embeddings, encoder, vocab)

In [None]:
# Load a previously saved vocabulary and model checkpoint from fixed absolute paths.
# NOTE(review): these paths are machine-specific — confirm they exist in the target environment.
model_path = '/Text_Attack/tmp_probs/w2v_model.th'
vocab_path = '/Text_Attack/tmp_probs/w2v_vocab'
vocab = Vocabulary.from_files(vocab_path)
# Rebuild the classifier around the loaded vocabulary, reusing the `word_embeddings` and
# `encoder` objects created earlier.
# NOTE(review): the embedding layer was sized from the earlier vocabulary; presumably the saved
# vocabulary has the same token count — confirm, otherwise load_state_dict will not match.
model = LstmClassifier(word_embeddings, encoder, vocab)
with open(model_path, 'rb') as f:
  model.load_state_dict(torch.load(f))
model.eval()

In [None]:
# Re-attach the datasets to the vocabulary loaded from disk and rebuild both loaders
# (same sampler settings as before: batch size 32, sorted on the "tokens" field).
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)
train_data_loader = DataLoader(train_dataset,
                         batch_sampler=BucketBatchSampler(
                             train_dataset,
                             batch_size=32,
                             sorting_keys=["tokens"]))
dev_data_loader = DataLoader(dev_dataset,
                         batch_sampler=BucketBatchSampler(
                             dev_dataset,
                             batch_size=32,
                             sorting_keys=["tokens"]))

In [None]:
# Print every dev batch for inspection (produces a large amount of output).
# NOTE(review): iterating the loader may also be what indexes the dev instances before later
# cells read `dev_dataset` directly — confirm before removing this cell.
for batch in dev_data_loader:
  print(batch)

In [None]:
 # Switch to training mode and move the model to the GPU; later cells hard-code cuda_device=0.
 model.train().cuda()

In [None]:
# Module-level buffer filled by the backward hook below; get_averaged_grad rebinds it to a
# fresh list before each backward pass.
extracted_grads = list()


def extract_grad_hook(module, grad_in, grad_out):
  """Backward hook: store the first entry of ``grad_out`` in ``extracted_grads``."""
  first_output_grad = grad_out[0]
  extracted_grads.append(first_output_grad)

def add_hooks(model):
  """Enable gradients on every token-embedder weight and attach ``extract_grad_hook``.

  For each ``TextFieldEmbedder`` found in ``model.modules()``, all of its token embedders get
  ``weight.requires_grad = True`` and the embedder itself gets the backward hook registered.
  """
  for candidate in model.modules():
    if not isinstance(candidate, TextFieldEmbedder):
      continue
    for token_embedder in candidate._token_embedders.values():
      token_embedder.weight.requires_grad = True
    candidate.register_backward_hook(extract_grad_hook)


add_hooks(model)

In [None]:
def get_embedding_weight(model):
  """Return the embedding weight of the first ``TextFieldEmbedder`` in ``model`` as a detached CPU tensor.

  If that embedder holds several token embedders, the weight of the last one iterated is the
  one returned. Falls through (returning None) when the model has no ``TextFieldEmbedder``.
  """
  for candidate in model.modules():
    if not isinstance(candidate, TextFieldEmbedder):
      continue
    for name in candidate._token_embedders:
      embedding_weight = candidate._token_embedders[name].weight.cpu().detach()
    return embedding_weight

In [None]:
# Snapshot of the embedding matrix, taken once here; the attack loop later passes it in as
# `embedding_matrix`.
embedding_weight = get_embedding_weight(model)

In [None]:
universal_perturb_batch_size = 128
# Only dev instances whose gold label string equals this value are attacked.
dataset_label_filter = '0'
targeted_dev_data = [
  dev_instance
  for dev_instance in dev_dataset
  if dev_instance['label'].label == dataset_label_filter
]
# Clear any accumulated metric state, then put the model in eval mode for the baseline run.
model.get_metrics(reset=True)
model.eval()

In [None]:
# Baseline: accuracy on the targeted dev subset before any perturbation is applied.
# The batch size comes from the constant defined with the subset (128) instead of a second literal.
dev_data_loader = DataLoader(targeted_dev_data, batch_size=universal_perturb_batch_size, shuffle=False)
# Evaluation only updates the accuracy metric, so skip building autograd graphs.
with torch.no_grad():
  for batch in dev_data_loader:
    batch = move_to_device(batch, cuda_device=0)
    model(batch['tokens'], batch['label'])
print("the accuracy without triggers:")
model.get_metrics()['accuracy']

In [None]:
# Back to training mode before the gradient-based attack cells.
# NOTE(review): presumably needed because cuDNN RNN backward only runs in training mode — confirm.
model.train()

In [None]:
def evaluate_word_saliency(doc, batch, input_y):
  """Score every token position by how much masking it lowers the probability of class ``input_y``.

  For each position the token id of the first (only) example is replaced by the unknown-token
  id, the notebook-level ``model`` is re-run, and the saliency is
  ``P(input_y | original) - P(input_y | masked)``.

  Args:
    doc: token sequence indexable by position whose items expose ``tag_`` (the spaCy parse of
      the example). It is assumed to be aligned one-to-one with the batch's token ids; a
      shorter ``doc`` raises IndexError.
    batch: single-example batch with token ids under ``batch['tokens']['tokens']['tokens']``
      and a ``'label'`` entry. The caller's batch is not modified.
    input_y: class index whose probability is tracked.

  Returns:
    ``(position_word_list, word_saliency_list)`` where the first holds ``(position, token)``
    pairs and the second ``(position, token, saliency, token.tag_)`` tuples, in position order.
  """
  # move_to_device returns new containers, so reassigning entries below stays local.
  batch = move_to_device(batch, cuda_device=0)
  token_ids = batch['tokens']['tokens']['tokens']
  seq_len = len(token_ids[0])
  unk_index = 1  # NOTE(review): assumes id 1 is the vocabulary's unknown token — confirm

  word_saliency_list = []
  # Forward passes only: without no_grad every stored saliency tensor would keep its
  # autograd graph alive for as long as the list exists.
  with torch.no_grad():
    origin_prob = model(batch['tokens'], batch['label'])['probs'][0][input_y]
    for position in range(seq_len):
      masked_ids = token_ids.clone()
      masked_ids[0][position] = unk_index
      batch['tokens']['tokens']['tokens'] = masked_ids
      masked_prob = model(batch['tokens'], batch['label'])['probs'][0][input_y]
      token = doc[position]
      word_saliency_list.append((position, token, origin_prob - masked_prob, token.tag_))

  position_word_list = [(position, token) for (position, token, _, _) in word_saliency_list]
  return position_word_list, word_saliency_list


In [None]:
import spacy
# spaCy pipeline `en_core_web_sm`; the per-token `tag_` values it produces are read by the
# saliency step.
nlp = spacy.load("en_core_web_sm")
# One instance per batch, in shuffled order, for the per-example attack loop.
# NOTE(review): no random seed is set in the visible cells, so the order may differ between runs.
dev_data_iter = DataLoader(targeted_dev_data, batch_size=1, shuffle=True )

In [None]:
def convert_batch_to_text(batch, vocab):
  """Turn a single-example batch back into a space-joined string of vocabulary tokens.

  The token ids are read from ``batch['tokens']['tokens']['tokens']`` with the leading
  batch dimension squeezed out; each id is looked up via ``vocab.get_token_from_index``.
  """
  id_list = batch['tokens']['tokens']['tokens'].squeeze(0).tolist()
  return " ".join(str(vocab.get_token_from_index(token_id)) for token_id in id_list)

In [None]:
def predict_classes(batch, model):
  """Return the index of the highest-probability class the model assigns to ``batch``."""
  device_batch = move_to_device(batch, cuda_device=0)
  output = model(device_batch['tokens'], device_batch['label'])
  probs = output['probs'].squeeze().cpu().detach().numpy()
  return np.argmax(probs)


In [None]:
def predict_probs(batch, model):
  """Return the model's class probabilities for ``batch`` as a squeezed NumPy array."""
  device_batch = move_to_device(batch, cuda_device=0)
  output = model(device_batch['tokens'], device_batch['label'])
  return output['probs'].squeeze().cpu().detach().numpy()

In [None]:
def adversarial_paraphrase(batch, model, embedding_matrix, true_y=1, verbose=True):
  """Attack one example with saliency-weighted, gradient-guided token substitution.

  Steps: rebuild the example text from the batch, parse it with spaCy, score word saliency,
  take the embedding-layer gradient of the loss, then let ``Hotflip_PWWS`` substitute tokens
  until the prediction leaves ``true_y`` or every position has been tried.

  Args:
    batch: single-example batch as produced by the per-example loader.
    model: classifier under attack.
    embedding_matrix: embedding weight tensor used to score candidate substitutes.
    true_y: class index treated as the correct prediction.
    verbose: when True, print the saliency list and the attack's progress.

  Returns:
    ``(perturbed_text, perturbed_y, sub_rate, change_tuple_list)``.
  """
  # Derive the text from the batch itself instead of reading a name left behind by another cell.
  text = convert_batch_to_text(batch, vocab)
  doc = nlp(text)

  _, word_saliency_list = evaluate_word_saliency(doc, batch, true_y)
  if verbose:
    print(word_saliency_list)

  averaged_grad = get_averaged_grad(model, batch)
  perturbed_text, perturbed_batch, sub_rate, change_tuple_list = Hotflip_PWWS(
      doc, batch, averaged_grad, embedding_matrix, true_y, word_saliency_list,
      verbose=verbose, increase_loss=True)

  perturbed_y = predict_classes(perturbed_batch, model)
  return perturbed_text, perturbed_y, sub_rate, change_tuple_list

In [None]:
def evaluate_batch(model, batch):
    """
    Move ``batch`` to CUDA device 0, run it through ``model`` and return the model's output dict.

    The model is called as ``model(batch['tokens'], batch['label'])``. The caller's batch
    dictionary is not reassigned: the device copy is bound to a local name only.
    """
    device_batch = move_to_device(batch, cuda_device=0)
    return model(device_batch['tokens'], device_batch['label'])

In [None]:
def get_averaged_grad(model, batch, target_label=None):
    """
    Return the loss gradient captured by the embedding backward hook for ``batch``.

    Despite the name, no averaging is done: the first tensor captured by the hook is returned
    as-is, moved to the CPU.

    Args:
        model: classifier with the backward hook already registered (see ``add_hooks``).
        batch: batch dict with ``'tokens'`` and ``'label'`` entries.
        target_label: optional class index; when given, the loss is computed against this label
            for every example instead of the gold labels. The gold labels are put back
            afterwards, even if the forward or backward pass raises.

    Raises:
        RuntimeError: if the hook captured nothing during the backward pass.
    """
    global extracted_grads

    # Clear stale parameter gradients directly; no optimizer is needed for that.
    model.zero_grad()

    original_labels = batch['label']
    if target_label is not None:
        batch['label'] = torch.full_like(original_labels, int(target_label))
    try:
        extracted_grads = []  # drop gradients captured by earlier calls
        loss = evaluate_batch(model, batch)['loss']
        loss.backward()
    finally:
        batch['label'] = original_labels

    if not extracted_grads:
        raise RuntimeError("no embedding gradients were captured; call add_hooks(model) first")
    return extracted_grads[0].cpu()

In [None]:
def compile_perturbed_batch(perturbed_batch, position_substitute):
  """Write substitute token ids into the first example of ``perturbed_batch``, in place.

  Args:
    perturbed_batch: batch dict holding the token-id tensor under
      ``['tokens']['tokens']['tokens']``; this tensor is modified in place.
    position_substitute: iterable of ``(position, substitute)`` pairs. ``substitute`` may be a
      one-element NumPy array or a plain/NumPy integer.

  Returns:
    The same ``perturbed_batch`` object that was passed in.
  """
  token_ids = perturbed_batch['tokens']['tokens']['tokens']
  for position, substitute in position_substitute:
    # np.asarray(...).item() accepts both size-1 arrays and bare scalars.
    token_ids[0][position] = int(np.asarray(substitute).item())
  return perturbed_batch

In [None]:
def Hotflip_PWWS(doc,batch, averaged_grad, embedding_matrix, true_y, word_saliency_list=None, rank_fn=None, halt_condition=None, verbose=True, increase_loss=True):
  """Greedy token substitution ranked by (gradient score x softmaxed word saliency).

  For every position the substitute is the vocabulary entry whose embedding has the largest
  dot product with that position's gradient (negated when ``increase_loss`` is False).
  Positions are then visited in descending order of that dot product multiplied by the
  softmax-normalised saliency, one substitution at a time, until the prediction of the
  notebook-level ``model`` differs from ``true_y`` or all positions have been used.

  Args:
    doc: parsed original text; supplies ``doc.text`` and ``len(doc)`` for the substitution rate.
    batch: single-example batch. It is edited in place and returned as the perturbed batch.
    averaged_grad: gradient tensor indexed as (batch, position, embedding dim) in the einsum.
    embedding_matrix: embedding weights indexed as (vocabulary entry, embedding dim).
    true_y: class index that counts as "still correctly classified".
    word_saliency_list: ``(position, token, saliency, tag)`` tuples; None is treated as empty.
    rank_fn, halt_condition: accepted for signature compatibility; not used.
    verbose: when True, print the ranking and progress information.
    increase_loss: pick substitutes that maximise (True) or minimise (False) the dot product.

  Returns:
    ``(perturbed_text, perturbed_batch, sub_rate, change_tuple_list)`` where ``sub_rate`` is
    the number of substitutions divided by ``len(doc)`` and ``change_tuple_list`` holds the
    applied ``(position, token_text, substitute_id, score, tag)`` tuples in application order.
  """
  def softmax(x):
    # Subtract the max before exponentiating so large scores cannot overflow.
    x = np.asarray(x, dtype=np.float32)
    exp_x = np.exp(x - np.max(x))
    return exp_x / np.sum(exp_x)

  if word_saliency_list is None:
    word_saliency_list = []

  perturbed_batch = batch  # same object: substitutions are written into the caller's batch
  perturbed_text = doc.text
  doc_len = len(doc)

  # float() works for 0-dim tensors on any device as well as for plain numbers.
  saliency_scores = [float(entry[2]) for entry in word_saliency_list]
  if saliency_scores:
    word_saliency_array = softmax(saliency_scores)
  else:
    word_saliency_array = np.zeros(0, dtype=np.float32)

  averaged_grad = averaged_grad.cpu()
  embedding_matrix = embedding_matrix.cpu()
  gradient_dot_embedding_matrix = torch.einsum("bij,kj->bik", averaged_grad, embedding_matrix)
  if not increase_loss:
    gradient_dot_embedding_matrix = -gradient_dot_embedding_matrix
  best_grad, best_id = torch.topk(gradient_dot_embedding_matrix, 1, dim=2)
  grads = best_grad.detach().cpu().numpy()[0]  # one row per position, one column (k=1)
  substitute = best_id.detach().cpu().numpy()[0]  # same layout, holding vocabulary ids

  substitute_tuple_list = [
    (position, token.text, substitute[position], grads[position] * word_saliency_array[position], token.tag_)
    for (position, token, _, _) in word_saliency_list
  ]
  sorted_substitute_tuple_list = sorted(substitute_tuple_list, key=lambda t: t[3], reverse=True)
  if verbose:
    print('sorted_substitute_tuple_list is:')
    print(sorted_substitute_tuple_list)

  substitute_count = 0  # how many substitutions were applied
  change_tuple_list = []
  position_substitute = []
  prediction_flipped = False
  for (position, token, substitute_id, score, tag) in sorted_substitute_tuple_list:
    if verbose:
      print("posi is:")
      print(position)
    change_tuple_list.append((position, token, substitute_id, score, tag))
    position_substitute.append((position, substitute_id))

    # Earlier substitutions are re-applied too; writing the same ids again is harmless.
    perturbed_batch = compile_perturbed_batch(perturbed_batch, position_substitute)
    perturbed_text = convert_batch_to_text(perturbed_batch, vocab)
    substitute_count += 1

    # Stop as soon as the model no longer predicts the caller's true_y.
    if predict_classes(perturbed_batch, model) != true_y:
      prediction_flipped = True
      break

  sub_rate = substitute_count / doc_len if doc_len else 0.0
  if verbose:
    print("use", substitute_count)
    if not prediction_flipped:
      print('sub_rate is')
      print(sub_rate)
  return perturbed_text, perturbed_batch, sub_rate, change_tuple_list

In [None]:
# Attack every targeted dev example the model currently classifies correctly and tally results.
successful_perturbations = 0
failed_perturbations = 0
# NOTE(review): assumes the targeted label string maps to class index 1 in the vocabulary — confirm.
true_y = 1
sub_rate_list = []

for batch in dev_data_iter:
  # Examples the model already gets wrong are skipped (each batch holds one example).
  if batch['label'].item() != predict_classes(batch, model):
    continue

  # Kept under the module-level name `text`, which other cells may read.
  text = convert_batch_to_text(batch, vocab)
  print(text)
  adv_doc, adv_y, sub_rate, change_tuple_list = adversarial_paraphrase(batch, model, embedding_weight, true_y=true_y, verbose=True)
  print("adv_y is")
  print(adv_y)
  print("true_y is:")
  print(true_y)
  if adv_y != true_y:
    successful_perturbations +=1
    print('successfule example crafted!')
  else:
    failed_perturbations +=1
    print('failed!')
  sub_rate_list.append(sub_rate)

# Both rates are taken over all targeted examples, including the skipped ones.
total_examples = len(dev_data_iter)
print(failed_perturbations)
accuracy = failed_perturbations / total_examples if total_examples else 0.0
print("the accuracy is:")
print(accuracy)
print("success rate is:")
print(successful_perturbations / total_examples if total_examples else 0.0)
print(sub_rate_list)
print("mean_sub_rate is:")
# Guard against the case where no example was attacked at all.
mean_sub_rate = sum(sub_rate_list) / len(sub_rate_list) if sub_rate_list else 0.0
print(mean_sub_rate)

# special thanks to https://github.com/Eric-Wallace/universal-triggers/blob/master/sst/sst.py
# https://github.com/JHL-HUST/PWWS
