In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.display import HTML
from IPython.display import display

In [2]:

import numpy as np
import os
import data_utils
from keras.models import load_model

Using TensorFlow backend.


In [3]:
# Notebook configuration: locations of the artifacts loaded in the cells below.
TEXT_DATA_DIR = "20_newsgroup/"      # dataset root, listed to obtain the class labels
MODEL_PATH = "20news_model.h5"       # saved Keras model passed to load_model
TOKENIZER_PATH = "tokenizer.pickle"  # serialized tokenizer read by data_utils.load_tokenizer

In [4]:
# Load the tokenizer pair and the trained classifier.
# NOTE(review): the '.pickle' extension suggests load_tokenizer unpickles the file;
# only point TOKENIZER_PATH at files you trust.
tokenizer, inverse_tokenizer = data_utils.load_tokenizer(TOKENIZER_PATH)
model = load_model(MODEL_PATH)

# Class names indexed by model output position: one per sub-directory of the
# dataset root, in sorted order. Plain files (README, .DS_Store, ...) are skipped
# because a stray entry would shift every label index after it.
# NOTE(review): presumably the training script indexed classes the same way — confirm.
labels = sorted(
    entry
    for entry in os.listdir(TEXT_DATA_DIR)
    if os.path.isdir(os.path.join(TEXT_DATA_DIR, entry))
)

In [5]:
# Document to attack. The path is derived from TEXT_DATA_DIR so that changing
# the dataset location in the configuration cell is enough to relocate it.
sample_file = os.path.join(TEXT_DATA_DIR, 'alt.atheism', '51060')
file_text = data_utils.load_textfile(sample_file)
# Model-ready features for the document; later cells index it as file_features[0],
# i.e. it is a batch containing this single document.
file_features = data_utils.process_text(tokenizer, file_text)

In [6]:
# Classify the unmodified document and report the winning class with its score.
pred_scores = model.predict(file_features)
orig_prediction = pred_scores[0].argmax()
print(
    'Predicted "%s" with %f .'
    % (labels[orig_prediction], pred_scores[0][orig_prediction])
)

Predicted "alt.atheism" with 0.155570 .


## Random Attack (Non-targeted)

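 The random attack repeatedly picks a random non-padding word in the document and replaces it with a randomly chosen vocabulary word, stopping as soon as the predicted class changes.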

In [7]:
# Number of entries in the tokenizer's vocabulary; the attack cell below uses it
# as the exclusive upper bound when sampling replacement word ids.
# NOTE(review): presumably `tokenizer` is a Keras Tokenizer (1-based word_index) — confirm in data_utils.
num_words = len(tokenizer.word_index)

In [8]:
# Random, non-targeted attack: keep replacing randomly chosen words of the
# document with random vocabulary words until the predicted class changes.
MAX_ATTACK_ITERS = 10000  # safety cap so the cell cannot spin forever
ATTACK_SEED = 0           # fixed seed => re-running the cell reproduces the attack
attack_rng = np.random.RandomState(ATTACK_SEED)

y_orig = []                    # score of the original class after each perturbation
x_adv = file_features.copy()   # perturb a copy; file_features stays untouched
orig_pred = orig_prediction
new_pred = orig_pred

# Only real words may be perturbed: id 0 marks padding. Sampling directly from
# the non-padding positions replaces the former rejection loop, which would
# never terminate on an all-padding document.
word_positions = np.flatnonzero(file_features[0])
if word_positions.size == 0:
    raise ValueError("Document has no non-padding words to perturb.")
if num_words < 2:
    raise ValueError("Vocabulary is too small to sample a replacement word.")

iter_idx = 0
while new_pred == orig_pred and iter_idx < MAX_ATTACK_ITERS:
    iter_idx += 1
    # perturb another random word
    word_idx = attack_rng.choice(word_positions)
    # select new word; ids start at 1 so a word is never turned into padding
    # (the upper bound is the same as before: ids stay below num_words)
    x_adv[0][word_idx] = attack_rng.randint(1, num_words)
    pred_scores = model.predict(x_adv)
    # argmax over the single sample's scores, matching the baseline prediction cell
    new_pred = np.argmax(pred_scores[0])
    y_orig.append(pred_scores[0][orig_pred])

if new_pred != orig_pred:
    print("Attack successful after : %d iterations" %(iter_idx))
else:
    print("Attack failed: prediction unchanged after %d iterations" %(iter_idx))
num_changed = np.count_nonzero(file_features!=x_adv)
# Non-zero here: at least one non-padding word was verified above.
num_features = np.count_nonzero(file_features)
print("Number of changed words = %d (%0.3f)"
      %(num_changed, 100.0*float(num_changed)/num_features))
print("Original class \"%s\" - New class: \"%s\"" %(labels[orig_pred], labels[new_pred]))

Attack successful after : 20 iterations
Number of changed words = 20 (2.000)
Original class "alt.atheism" - New class: "talk.religion.misc"


## Reconstruct Text Document

In [17]:
# Map both feature vectors (original first, adversarial second) back to text
# with the inverse tokenizer.
orig_text, adv_text = (
    data_utils.reconstruct_text(inverse_tokenizer, features)
    for features in (file_features[0], x_adv[0])
)



## Visualize Attack

In [18]:
# Produce renderings of the original and adversarial texts; both are passed to
# IPython's HTML() in the cells below.
# NOTE(review): presumably render_attack highlights the words that differ — confirm in data_utils.
orig_html, adv_html = data_utils.render_attack(orig_text, adv_text)

In [20]:
# A cell only auto-renders its last expression, so a bare HTML(...) on an
# earlier line is discarded. Display the heading and the text explicitly.
display(HTML("<b> Original Text </b>"))
display(HTML(orig_html))

In [21]:
# A cell only auto-renders its last expression, so a bare HTML(...) on an
# earlier line is discarded. Display the heading and the text explicitly.
display(HTML("<b> Adversarial Text </b>"))
display(HTML(adv_html))