In [None]:
import glob
import importlib
import numpy as np
import os
import pickle
import sys
import time
import torch

from torch.utils.data import TensorDataset

from transformers import (DistilBertForSequenceClassification, 
                          DistilBertTokenizer,
                          BertForSequenceClassification,
                          BertTokenizer)

# Project-local imports.
# The modules live in ../src relative to the current working directory, so that
# directory is put at the front of sys.path before they are imported.
# NOTE(review): this relies on the kernel's working directory being the
# notebook's own directory — confirm when running from elsewhere.
sys.path.insert(0, os.path.join(os.getcwd(), '..', 'src'))
import train_eval
import synonym

# Re-execute both modules so that edits made to the source files after the
# first import are picked up without restarting the kernel.
importlib.reload(synonym)
importlib.reload(train_eval)

## Set up Data

In [None]:
# Build a ReviewDataset for the 'yelp' source and request its reviews and
# labels with test_train="test".
# NOTE(review): the handle is called `imdb` although the source is 'yelp' —
# the name is misleading; confirm nothing else relies on it before renaming.
# NOTE(review): `dataset`, `test_sentences` and `test_labels` are all rebound
# by the experiments cell further down, so the values loaded here are not the
# ones used in that experiment.
dataset = 'yelp'
imdb = train_eval.ReviewDataset(source=dataset)
test_sentences, test_labels = imdb.reviewsAndLabels(test_train="test")

## Models


In [None]:
# Checkpoint identifiers for the two tokenizer families used in this notebook.
# NOTE(review): BERT uses an uncased checkpoint while DistilBERT uses a cased
# one — presumably matching how each model was fine-tuned; confirm.
bert_pretrained_weights = 'bert-base-uncased'
distil_pretrained_weights = 'distilbert-base-cased'

# Instantiate one tokenizer per checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained(bert_pretrained_weights)
distil_tokenizer = DistilBertTokenizer.from_pretrained(distil_pretrained_weights)

## Experiments 1.1 - 1.2 and 2.1 - 2.2

In [None]:
# Set up the local (attack training) model, which is used to identify the
# adversarial reviews that get misclassified.
model_path = 'imdb_bert.model'
model = torch.load(model_path)

# Set up the target models (the ones we're trying to show we can fool by
# assuming successful attacks on the local model transfer to them).
target_models = ['yelp_distil.model', 'yelp_bert.model']

# Tokenizers are (re)created here with the same checkpoints as the "Models"
# cell, so this cell does not depend on that cell having been run.
bert_pretrained_weights = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_pretrained_weights)

distil_pretrained_weights = 'distilbert-base-cased'
distil_tokenizer = DistilBertTokenizer.from_pretrained(distil_pretrained_weights)

# Set up the reviews we are evaluating on (limited to 50 reviews).
# NOTE(review): the variables are named test_* but the split requested is
# "train" — confirm this is intentional.
dataset = 'yelp'
yelp = train_eval.ReviewDataset(source=dataset, number_reviews=50)
test_sentences, test_labels = yelp.reviewsAndLabels(test_train="train")

# The pre-computed hot-word distribution: a pickled (vocab, weights, sorts)
# triple. The file is opened in a `with` block so the handle is closed as soon
# as the data has been read.
# SECURITY: pickle.load can execute arbitrary code; only load a file that was
# produced by this project.
with open("bert-vcb_wt_sort.p", "rb") as hot_word_file:
    vocab, word_weights, word_sorts = pickle.load(hot_word_file)
word_dist = dict(zip(vocab, word_weights))

# Set up hyperparameters: `gamma` is passed as the `replacements` argument of
# the synonym attack; `methods` lists the attack variants that are run.
gamma = 80
methods = ['hot_word', 'random']

# Set up outputs and intermediate values.
accuracies = []
no_reviews = dict()       # method -> number of reviews the local model misclassified
output = dict()           # "<target>_<method>" -> accuracy on adversarial reviews
non_adv_output = dict()   # "<target>_<method>" -> accuracy on unattacked reviews

# Instantiate the attack class with the local model path, the BERT tokenizer
# and 256 (the same value used as the length argument when the adversarial
# reviews are tokenized for the local model).
s_attacks = synonym.SynonymAttacks(model_path, bert_tokenizer, 256)

# For every attack method: generate adversarial reviews, keep the ones the
# local model misclassifies, then measure how the target models score on those
# reviews and, for comparison, on the unattacked reviews.
for method in methods:
    # we'll do it for both the hot word and random methods

    # generate the adversarial sentences (i.e., with synonyms replaced)
    # and labels (ground truth)
    adv_reviews, adv_label = s_attacks.generateSynonymReviews(test_sentences, test_labels,
                                                           replacements=gamma,
                                                           hot_word_distribution=word_dist,
                                                           method=method)

    # setUpData is a convenience static method that creates a pytorch dataset
    # object. The local model is the BERT model, so its inputs are tokenized
    # with `bert_tokenizer` (the same tokenizer handed to SynonymAttacks); the
    # previous code referenced an undefined name `tokenizer` here.
    evaluation_data, _ = train_eval.ReviewDataset.setUpData(adv_reviews,
                                                            adv_label,
                                                            bert_tokenizer, 256, split="no_shuffle")

    # run the evaluation loop using the fine-tuned local model, to identify
    # misclassified reviews
    _, pred_labels, _, _, true_labels = train_eval.evaluate(model,
                                                            evaluation_data,
                                                            batch_size=20,
                                                            return_pred_labels=True)

    # flatten the per-batch labels and find the misclassified reviews
    pred = np.array([q for p in pred_labels for q in p])
    true = np.array([q for p in true_labels for q in p])
    m = np.where(pred != true)

    no_reviews[method] = len(m[0])

    # true labels of the MISCLASSIFIED reviews
    true_flat = true[m]

    # recover the misclassified reviews, and set up to attack the target model
    # (assumes adv_reviews supports NumPy index-array selection — TODO confirm)
    attack_reviews = adv_reviews[m]

    # for comparison, the reviews without an adversarial attack
    # NOTE(review): this is the full unattacked review set, not only the
    # misclassified subset, and below it is paired with `true`; that assumes the
    # attack output keeps the order and count of test_sentences — confirm.
    non_attack_reviews = test_sentences

    # evaluate the accuracy on the target models (both yelp trained Bert and DistilBert)
    for target in target_models:
        if "distil" in target:
            target_tokenizer = distil_tokenizer
        elif "bert" in target:
            target_tokenizer = bert_tokenizer
        else:
            raise NotImplementedError

        # load the fine-tuned target model
        loaded_target_model = torch.load(target)

        if no_reviews[method] == 0:
            # The local model misclassified nothing, so there is no attack to
            # transfer. Record NaN (what np.mean gives for an empty sequence)
            # instead of pushing an empty batch through the pipeline.
            accuracy = float("nan")
        else:
            # static method to create pytorch dataset object
            # NOTE(review): unlike the local-model call, no length or split
            # argument is passed, so setUpData's defaults apply — confirm they
            # neither shuffle nor hold out part of the data.
            model_input, _ = train_eval.ReviewDataset.setUpData(attack_reviews,
                                                                true_flat,
                                                                target_tokenizer)

            # evaluate on the adversarial data, and return the per-batch accuracies
            adv_accuracies, _, _, _, _ = train_eval.evaluate(loaded_target_model,
                                                             model_input,
                                                             batch_size=20)

            # get the average accuracy across the batches
            accuracy = np.mean(adv_accuracies)

        # for comparison, run on the original (unattacked) data
        non_adv_model_input, _ = train_eval.ReviewDataset.setUpData(non_attack_reviews,
                                                                    true,
                                                                    target_tokenizer)
        non_adv_accuracies, _, _, _, _ = train_eval.evaluate(loaded_target_model,
                                                             non_adv_model_input,
                                                             batch_size=20)
        non_adv_accuracy = np.mean(non_adv_accuracies)

        # and store them, keyed by "<target file stem>_<method>"
        output["{}_{}".format(target.split(".")[0], method)] = accuracy
        non_adv_output["{}_{}".format(target.split(".")[0], method)] = non_adv_accuracy

In [None]:
output