In [2]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.display import HTML
from IPython.display import display

In [3]:

import numpy as np
import os
import data_utils
import pickle
import random
from keras.models import load_model

Using TensorFlow backend.


In [4]:
# Locations of the artifacts this notebook reads (relative to the working directory).
TEXT_DATA_DIR = "20_newsgroup/"      # corpus root; its entries are used as class labels
MODEL_PATH = "20news_model.h5"       # saved Keras classifier
TOKENIZER_PATH = "tokenizer.pickle"  # tokenizer file read by data_utils.load_tokenizer

In [5]:
# Class names are the entries of the corpus directory in sorted order; the
# position of a name in this list is used as the class index for model scores.
labels = sorted(os.listdir(TEXT_DATA_DIR))

# Trained classifier under attack.
model = load_model(MODEL_PATH)

# The helper returns a pair: the tokenizer and the object later passed to
# data_utils.reconstruct_text to turn feature sequences back into text.
tokenizer, inverse_tokenizer = data_utils.load_tokenizer(TOKENIZER_PATH)

In [6]:
# Pick one document from the corpus (presumably at random, per the helper's
# name) together with its ground-truth label.
sample_file, true_label = data_utils.pick_random_file(TEXT_DATA_DIR)
# Read the document's text.
file_text = data_utils.load_textfile(sample_file)
# Convert the text into the feature array that is fed to model.predict below.
file_features = data_utils.process_text(tokenizer, file_text)

In [7]:
# Baseline: classify the unmodified document. The winning class index is kept
# in `orig_prediction` because both attacks below try to move away from it.
pred_scores = model.predict(file_features)
orig_prediction = np.argmax(pred_scores[0])

predicted_label = labels[orig_prediction]
predicted_score = pred_scores[0][orig_prediction]
print('TrueLabel = %s' % true_label)
print('Predicted "%s" with %f .' % (predicted_label, predicted_score))

TrueLabel = rec.sport.hockey
Predicted "rec.sport.hockey" with 0.153542 .


## Random Attack (Non-targeted)

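The random attack repeatedly picks a random (non-padding) word in the document and replaces it with a randomly chosen vocabulary word, until the predicted class changes.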

In [8]:
# Number of entries in the tokenizer's word -> index map; the random attack
# uses it as the range from which replacement word indices are drawn.
num_words = len(tokenizer.word_index)

In [21]:
# Non-targeted random attack: keep replacing randomly chosen words with random
# vocabulary words until the model's predicted class differs from the original.
y_orig = []  # score of the originally predicted class after each perturbation
x_adv = file_features.copy()
orig_pred = orig_prediction
iter_idx = 0

# Positions holding real words (index 0 is treated as padding and never touched).
# Replacement indices are always >= 1, so this set stays valid for the whole
# attack and can be computed once instead of rejection-sampling every iteration.
perturbable_idx = np.flatnonzero(x_adv[0])
if perturbable_idx.size == 0:
    # The original rejection loop would spin forever on an all-padding input.
    raise ValueError("Document contains only padding; there is nothing to perturb.")

while True:
    iter_idx += 1
    # perturb another random (non-padding) word
    word_idx = np.random.choice(perturbable_idx)
    # Select the new word from [1, num_words): index 0 is excluded because it
    # is the padding id, so drawing it would delete the word, not replace it.
    x_adv[0][word_idx] = np.random.randint(1, num_words)
    pred_scores = model.predict(x_adv)
    # argmax over the single sample's scores, consistent with the baseline cell
    new_pred = np.argmax(pred_scores[0])
    y_orig.append(pred_scores[0][orig_pred])
    if new_pred != orig_pred:
        # Attack done !
        break

print("Attack successful after : %d iterations" %(iter_idx))
# A position perturbed several times is counted once, so this can be smaller
# than the number of iterations.
num_changed = np.count_nonzero(file_features!=x_adv)
num_features = np.count_nonzero(file_features)
print("Number of changed words = %d (%0.3f %%)"
      %(num_changed, 100.0*float(num_changed)/num_features))
print("Original class \"%s\" - New class: \"%s\"" %(labels[orig_pred], labels[new_pred]))

Attack successful after : 231 iterations
Number of changed words = 119 (75.316 %)
Original class "rec.sport.hockey" - New class: "comp.windows.x"


## Reconstruct Text Document

In [22]:
# Turn the original and the adversarial feature sequences back into text so
# they can be compared side by side in the visualization below.
orig_text = data_utils.reconstruct_text(inverse_tokenizer, file_features[0])
adv_text = data_utils.reconstruct_text(inverse_tokenizer, x_adv[0])

## Visualize Attack

In [23]:
# Build the two HTML snippets (original, adversarial) rendered in the next cells.
orig_html, adv_html = data_utils.render_attack(orig_text, adv_text)

In [24]:
# Only the last expression of a cell is auto-rendered, so the heading was
# silently dropped; display both pieces explicitly.
display(HTML("<b> Original Text </b>"), HTML(orig_html))

In [25]:
# Only the last expression of a cell is auto-rendered, so the heading was
# silently dropped; display both pieces explicitly.
display(HTML("<b> Adversarial Text </b>"), HTML(adv_html))

## Greedy Attack

Greedy method: repeatedly take the word in the document with the highest p(original class | word) and replace it with the next not-yet-used word with the highest p(goal class | word).

In [26]:
# Targeted greedy attack: replace the words most indicative of the original
# class with the words most indicative of a randomly chosen goal class, until
# the model predicts the goal class (or no candidate words are left).
y_orig = []  # score of the originally predicted class after each perturbation
x_adv = file_features.copy()
orig_pred = orig_prediction
iter_idx = 0

# Map every class name to its index, in the same sorted order used for `labels`.
label_dic = {name: label_ind
             for label_ind, name in enumerate(sorted(os.listdir(TEXT_DATA_DIR)))}

# NOTE(review): pickle.load can execute arbitrary code - only load a trusted file.
# The table is indexed below as dic[word][class_index]; presumably it holds
# p(class | word) - confirm against the code that produced the pickle.
with open('probability_dic.p', 'rb') as prob_file:
    dic = pickle.load(prob_file)
class_original = label_dic[true_label]

# Randomly select another class as goal. It must differ from the true class AND
# from the class the model currently predicts: otherwise the loop could stop at
# once without ever recording a "first change", leaving the snapshot unset.
excluded_classes = {class_original, int(orig_pred)}
class_goal = random.choice(
    [c for c in range(len(label_dic)) if c not in excluded_classes])
print("target class: %s" %(labels[class_goal]))

# Words ranked by their score for the original class, mapped to tokenizer
# indices (words unknown to the tokenizer are skipped).
prob_original = sorted(dic, key=lambda k: dic[k][class_original], reverse=True)
to_perturbed = [tokenizer.word_index[word]
                for word in prob_original if word in tokenizer.word_index]
# Words ranked by their score for the goal class (used as replacements).
prob_goal = sorted(dic, key=lambda k: dic[k][class_goal], reverse=True)

priority_perturb = 0   # cursor into to_perturbed (next source word to remove)
idx_target = 0         # cursor into prob_goal (next replacement word to insert)
flag_change = False    # True once the prediction has left orig_pred
reached_goal = False   # True once the prediction equals class_goal

# Snapshot of the first prediction change; the defaults keep later cells
# runnable even when the prediction never changes.
iter_temp = None
x_adv_temp = x_adv.copy()
num_changed_temp = 0
pred_temp = orig_pred
new_pred = orig_pred

while True:
    # sequentially choose word with highest p(original class|word) in the original text
    # and sequentially change it to word with highest p(goal class|word)
    while (priority_perturb < len(to_perturbed)
           and not np.any(x_adv[0] == to_perturbed[priority_perturb])):
        priority_perturb += 1
    while (idx_target < len(prob_goal)
           and prob_goal[idx_target] not in tokenizer.word_index):
        idx_target += 1
    if priority_perturb == len(to_perturbed) or idx_target == len(prob_goal):
        # Out of source words or replacement words: stop instead of raising
        # IndexError as the unbounded cursors previously did.
        break

    iter_idx += 1
    # First position that still holds the current source word.
    word_idx = int(np.argmax(x_adv[0] == to_perturbed[priority_perturb]))
    x_adv[0][word_idx] = tokenizer.word_index[prob_goal[idx_target]]
    idx_target += 1
    pred_scores = model.predict(x_adv)
    new_pred = np.argmax(pred_scores[0])
    y_orig.append(pred_scores[0][orig_pred])
    if new_pred != orig_pred and not flag_change:
        # Remember the state at the first moment the prediction changes.
        iter_temp = iter_idx
        x_adv_temp = x_adv.copy()
        num_changed_temp = np.count_nonzero(file_features!=x_adv_temp)
        pred_temp = new_pred
        flag_change = True
    if new_pred == class_goal:
        reached_goal = True
        break

num_features = np.count_nonzero(file_features)
if flag_change:
    print("Attack change to predicted class to another class after : %d iterations" %(iter_temp))
    print("Number of changed words = %d (%0.3f %%)"
          %(num_changed_temp, 100.0*float(num_changed_temp)/num_features))
    print("Original class \"%s\" - New class: \"%s\"" %(labels[orig_pred], labels[pred_temp]))
else:
    print("Attack failed: the prediction never changed (%d iterations)" %(iter_idx))
num_changed = np.count_nonzero(file_features!=x_adv)
if reached_goal:
    print("Attack change to predicted class to TARGET class after : %d iterations" %(iter_idx))
    print("Number of changed words = %d (%0.3f %%)"
          %(num_changed, 100.0*float(num_changed)/num_features))
    print("Original class \"%s\" - New class: \"%s\"" %(labels[orig_pred], labels[new_pred]))
elif flag_change:
    print("Target class not reached: ran out of candidate words after %d iterations" %(iter_idx))

target class: rec.motorcycles
Attack change to predicted class to another class after : 17 iterations
Number of changed words = 17 (10.759 %)
Original class "rec.sport.hockey" - New class: "rec.sport.baseball"
Attack change to predicted class to TARGET class after : 143 iterations
Number of changed words = 143 (90.506 %)
Original class "rec.sport.hockey" - New class: "rec.motorcycles"


In general, the greedy attack needs fewer iterations than the random attack to make the model change its prediction, because it perturbs the words that contribute most to the original label first. However, when the prediction first changes, the new class is usually not the target class (most of the time it is the same class the random attack ends up with after its first change), and it takes more iterations before the model predicts the target class.

## Reconstruct Text Document

In [27]:
# Rebuild the original text and the adversarial text as it was at the moment
# the greedy attack first changed the prediction (snapshot in x_adv_temp).
orig_text = data_utils.reconstruct_text(inverse_tokenizer, file_features[0])
adv_text_first_change = data_utils.reconstruct_text(inverse_tokenizer, x_adv_temp[0])

## Visualize Attack

In [28]:
# Build the HTML snippets for the original text and the first-change adversarial text.
orig_html, adv_1stchange_html = data_utils.render_attack(orig_text, adv_text_first_change)

In [29]:
# Only the last expression of a cell is auto-rendered, so the heading was
# silently dropped; display both pieces explicitly.
display(HTML("<b> Original Text </b>"), HTML(orig_html))

In [30]:
# Only the last expression of a cell is auto-rendered, so the heading was
# silently dropped; display both pieces explicitly.
display(
    HTML("<b> Adversarial Text (change to another class, not necessarily the target class) </b>"),
    HTML(adv_1stchange_html),
)

In [31]:
# Rebuild and render the final adversarial text (the state of x_adv when the
# greedy attack loop finished).
adv_text = data_utils.reconstruct_text(inverse_tokenizer, x_adv[0])
orig_html, adv_html = data_utils.render_attack(orig_text, adv_text)

In [32]:
# Only the last expression of a cell is auto-rendered, so the heading was
# silently dropped; display both pieces explicitly.
display(HTML("<b> Adversarial Text (target class) </b>"), HTML(adv_html))