In [2]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.display import HTML
from IPython.display import display

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:

import numpy as np
import os
import data_utils
import pickle
import attacks
import random
from keras.models import load_model

import greedy_utils

Using TensorFlow backend.


In [4]:
# Locations of the artefacts this notebook reads, relative to the working directory.
TEXT_DATA_DIR = '20_newsgroup/'      # corpus root; its entries are used as the class labels
MODEL_PATH = '20news_model.h5'       # Keras model file passed to load_model
TOKENIZER_PATH = 'tokenizer.pickle'  # tokenizer file passed to data_utils.load_tokenizer

In [5]:
# Class names are the entries of the corpus directory in sorted order; the index of
# a name in this list is used as the class index throughout the notebook.
labels = sorted(os.listdir(TEXT_DATA_DIR))
# Trained classifier and the tokenizer (plus its inverse mapping) it is used with.
model = load_model(MODEL_PATH)
tokenizer, inverse_tokenizer = data_utils.load_tokenizer(TOKENIZER_PATH)

In [6]:
# Pick one random document from the corpus together with its label, read its text,
# and convert the text into model input features with the tokenizer.
# NOTE(review): no random seed is set in this notebook, so the chosen file
# presumably differs from run to run.
sample_file, true_label = data_utils.pick_random_file(TEXT_DATA_DIR)
file_text = data_utils.load_textfile(sample_file)
file_features = data_utils.process_text(tokenizer, file_text)

In [9]:
# Score the sample document and keep the index of the highest-scoring class;
# orig_prediction is reused by the attack cells further down.
pred_scores = model.predict(file_features)
# Only row 0 of the scores is used (assumes file_features holds a single document - TODO confirm).
orig_prediction = np.argmax(pred_scores[0])
print('TrueLabel = %s' %true_label)
print('Predicted "%s" with %f .' %(labels[orig_prediction], pred_scores[0][orig_prediction]))

TrueLabel = alt.atheism
Predicted "alt.atheism" with 0.195061 .


## Greedy Attack (targeted)

Precompute the per-topic word lists and their probabilities (set `COMPUTE_PROBS = False` to reuse the pickled results of an earlier run).

In [10]:
# True  -> recompute the topic words and overwrite the pickles below.
# False -> load the pickles written by an earlier run.
COMPUTE_PROBS = True
TOPIC_WORDS_PICKLE = 'topic_words.pickle'
# The "topc" spelling is kept on purpose: it is the file name this cell has always written.
TOPIC_WORDS_PROBS_PICKLE = 'topc_words_probs.pickle'

if not COMPUTE_PROBS:
    with open(TOPIC_WORDS_PICKLE, 'rb') as handle:
        topics_words = pickle.load(handle)
    with open(TOPIC_WORDS_PROBS_PICKLE, 'rb') as handle:
        topics_words_probs = pickle.load(handle)
else:
    num_words = len(inverse_tokenizer)
    topics_words, topics_words_probs = greedy_utils.compute_topic_words(
        TEXT_DATA_DIR, tokenizer, labels, num_words, num_cands=20000, ret_count=500)
    # Persist both results so later runs can skip the computation.
    with open(TOPIC_WORDS_PICKLE, 'wb') as handle:
        pickle.dump(topics_words, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(TOPIC_WORDS_PROBS_PICKLE, 'wb') as handle:
        pickle.dump(topics_words_probs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
# Each line is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_index = {}
# Read explicitly as UTF-8 so the result does not depend on the platform's default
# encoding; the context manager closes the file even if a line fails to parse.
with open(os.path.join('./glove.6B', 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


## Demonstrating topic words

In [40]:
# Look up the vector for the token 'dbstu1'; dict.get returns None (so the cell
# shows nothing) when the token has no entry in embeddings_index.
embeddings_index.get('dbstu1')

In [49]:
# Print the first 10 topic words of every label; a trailing "*" marks a word that
# has a vector in embeddings_index.
for label_idx, label_name in enumerate(labels):
    print("Top 10 words for label : ", label_name)
    for rank in range(10):
        word = inverse_tokenizer[topics_words[label_idx][rank]]
        print(word, end="")
        marker = "" if embeddings_index.get(word) is None else "*"
        print(marker, end=" ")
    print("\n-----")
    print("")

Top 10 words for label :  alt.atheism
dbstu1 ingles* rosenau* mozumder macalstr mccullou wwc* razor* meng* nm0w 
-----

Top 10 words for label :  comp.graphics
bolson* roundoff* gifconverter denali* fli* hsi* renderer* normals* spline* bib* 
-----

Top 10 words for label :  comp.os.ms-windows.misc
claebaur sqk w4wg mk7 uwt b8g p47 a865 'as' hm9 
-----

Top 10 words for label :  comp.sys.ibm.pc.hardware
1542* viewsonic* p9000 dcoleman dce* cyrix* uart* micron* ebosco sectors* 
-----

Top 10 words for label :  comp.sys.mac.hardware
c650 q700 ntx iici unplug* fpu* lcii oscillators* 68020* binhex 
-----

Top 10 words for label :  comp.windows.x
doit* xtappcontext lxmu xrdb ftms openwinhome olvwm xfilesearchpath elin* dpy 
-----

Top 10 words for label :  misc.forsale
armegedon spiderman* obo* nikon* dryer* typewriter* thd* turtles* vouchers* unregistered* 
-----

Top 10 words for label :  rec.autos
unforgiven* diesels* gibbonsa mustang* lexus* shaz* spiros* shafts* oils* maxima* 
-----

To

In [None]:
# Attack object for the single-document demo below, built from the model and the
# precomputed topic words/probabilities. Note temp=1 here; the evaluation section
# later rebuilds it with temp=0.15.
greedy_attack = attacks.GreedyAttack(model, topics_words, topics_words_probs, temp=1)

In [None]:
# Work on a copy so file_features stays untouched for the later reconstruction.
x_orig = file_features.copy()
# Candidate targets: every class index except the one the model currently predicts.
other_labels = [label_idx for label_idx in range(len(labels)) if label_idx != orig_prediction]
random_target = np.random.choice(other_labels)
print('Random target = %s' %labels[random_target])
# x_adv is None when the attack fails (checked in the next cell); o_hist and t_hist
# are the histories plotted there.
x_adv, o_hist, t_hist = greedy_attack.attack(x_orig, random_target, limit=1.0)

In [None]:
# Summarise the outcome of the single-document attack and plot both histories.
if x_adv is not None:
    adv_prediction = np.argmax(model.predict(x_adv))
    # Positions whose token differs between the adversarial and the original input.
    num_changed = np.count_nonzero(x_adv != x_orig)
    print('Attack succeeded after %d iterations.' %(len(t_hist)))
    print('Original class: %s, Attack class: %s' %(labels[orig_prediction], labels[adv_prediction]))
    print("Number of changed words = %d (%0.2f %%)"
          %(num_changed, 100*num_changed/np.count_nonzero(x_orig)))
    plt.plot(o_hist, 'g', label=labels[orig_prediction])
    plt.plot(t_hist, 'r', label=labels[adv_prediction])
    plt.legend()
else:
    print('Attack failed. !')

## Reconstruct Text Document

In [None]:
# Map the first row of the original and the adversarial features back to text
# with the inverse tokenizer.
# NOTE(review): x_adv is None when the attack failed (see the check in the cell
# above); x_adv[0] then raises TypeError, so run this only after a success.
orig_text = data_utils.reconstruct_text(inverse_tokenizer, file_features[0])
adv_text = data_utils.reconstruct_text(inverse_tokenizer, x_adv[0])

## Visualize Attack

In [None]:
# Produce one HTML rendering for each text; both are displayed in the next two cells.
orig_html, adv_html = data_utils.render_attack(orig_text, adv_text)

In [None]:
# A cell only renders its last expression, so the heading must be displayed
# explicitly; as a bare expression it was silently discarded.
display(HTML("<b> Original Text </b>"))
HTML(orig_html)

In [None]:
# A cell only renders its last expression, so the heading must be displayed
# explicitly; as a bare expression it was silently discarded.
display(HTML("<b> Adversarial Text </b>"))
HTML(adv_html)

## Evaluation

In [None]:
# Number of documents sampled for the evaluation run.
NUM_EVAL_FILES = 500
# Each pick is a (file, label) pair; split the pairs into two parallel tuples.
random_files = [data_utils.pick_random_file(TEXT_DATA_DIR) for _ in range(NUM_EVAL_FILES)]
files_, topics = zip(*random_files)

In [None]:
# Rebuild the attack object for the evaluation run; this replaces the temp=1
# instance used in the single-document demo with temp=0.15.
greedy_attack = attacks.GreedyAttack(model, topics_words, topics_words_probs, temp=0.15)

In [None]:
# Counters and accumulators for the evaluation run (reset on every execution).
failed_cnt, success_cnt, cnt_all = 0, 0, 0
dist_list, attack_list = [], []
class_cnt = [0] * len(labels)                            # evaluated documents per true label
attack_matrix = np.zeros((len(labels), len(labels)))     # [source label, reached label] successes

for idx, (f_name, f_label) in enumerate(zip(files_, topics)):
    x_test = data_utils.load_textfile(f_name)
    x_orig = data_utils.process_text(tokenizer, x_test)
    orig_pred = np.argmax(model.predict(x_orig))
    true_label = [pos for pos, name in enumerate(labels) if name == f_label][0]
    if orig_pred != true_label:
        # Only documents the model already classifies correctly are attacked.
        continue
    class_cnt[true_label] += 1
    cnt_all += 1

    # Perturb towards every other label.
    other_labels = [x for x in range(len(labels)) if x != true_label]
    for t_label in other_labels:
        x_adv, _, _ = greedy_attack.attack(x_orig, t_label)
        if x_adv is None:
            failed_cnt += 1
            continue
        success_cnt += 1
        adv_pred = np.argmax(model.predict(x_adv))
        assert adv_pred == t_label
        attack_matrix[orig_pred, adv_pred] += 1
        # Share of changed positions relative to the non-zero tokens of the original.
        changed_fraction = np.count_nonzero(x_adv != x_orig) / np.count_nonzero(x_orig)
        dist_list.append(changed_fraction)

    # Progress marker; only reached for documents that were not skipped above.
    if idx % 50 == 0:
        print(idx)


In [None]:
# Every evaluated document is attacked once per *other* label, so the number of
# attempts is cnt_all * (len(labels) - 1). This replaces the hard-coded 19, which
# is only right for exactly 20 labels.
total_attempts = cnt_all * (len(labels) - 1)
# The rate is undefined when nothing was evaluated; report NaN instead of raising.
success_rate = 100.0 * success_cnt / total_attempts if total_attempts else float('nan')
print('Success rate = %0.2f %%' %success_rate)

In [None]:
# Sanity check: one distance is appended per successful attack, so this should
# equal success_cnt.
len(dist_list)

In [None]:
# Number of successful attacks counted by the evaluation loop.
success_cnt

In [None]:
# Plot the CDF of the share of words changed by the successful attacks.
num_bins = 50
counts, bin_edges = np.histogram(dist_list, bins=num_bins, density=False)
cdf = np.cumsum(counts)
cdf = cdf / (success_cnt)
# dist_list stores fractions (changed positions / non-zero tokens); scale them to
# percent so the x axis agrees with its label and with the percentage y axis.
plt.plot(bin_edges[1:] * 100, cdf*100)
# Dashed reference line at the 50% mark.
plt.axhline(y=50,linewidth=1, color='r', linestyle='--')
# This is a plain string, not a %-format, so a single '%' is correct;
# '%%' was drawn literally on the axis.
plt.xlabel('% change')
plt.ylabel('CDF of success')
plt.savefig('cdf_greedy.png')

In [None]:
# Plotting dependencies for the heatmap below; sns.set() switches on seaborn's
# default theme for all following figures.
import seaborn as sns; sns.set()
import matplotlib

In [None]:
# Row-normalise: divide every source-label row of the success counts by the number
# of evaluated documents of that label (column vector, so it broadcasts across the row).
class_totals = np.array(class_cnt).reshape((-1, 1))
attack_p = attack_matrix / class_totals

In [None]:
# Raw success counts, indexed [source label, reached label].
attack_matrix

In [None]:
# Global font settings for the figure. 'sans-serif' is a valid generic family;
# the former value 'normal' is not a font family and makes matplotlib warn
# "findfont: Font family ['normal'] not found" before falling back to its default.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 12}
matplotlib.rc('font', **font)

fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(attack_p, annot=True, fmt="0.1f",
            yticklabels=labels, xticklabels=labels, cbar=False, cmap="OrRd",
            ax=ax)
ax.set_xlabel('Target Label', fontsize=16)
ax.set_ylabel('Source Label', fontsize=16)
# Put the target-label axis on top of the matrix.
ax.xaxis.set_label_position('top')
ax.xaxis.set_ticks_position('top')
plt.xticks(rotation=90)
plt.savefig('greedy_heatmap.png')

## Picking similar words

In [None]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
# NOTE(review): this repeats the loading cell earlier in the notebook and re-reads
# the whole file; it is only needed if that earlier cell was not run.
embeddings_index = {}
# Read explicitly as UTF-8 so the result does not depend on the platform's default
# encoding; the context manager closes the file even if a line fails to parse.
with open(os.path.join('./glove.6B', 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Topic words of the class that was predicted for the sample document.
orig_words = topics_words[orig_prediction]

In [None]:
# Count how often each topic word of the originally predicted class occurs in the
# sample document. file_features is used rather than x_orig because the evaluation
# loop re-binds x_orig to the last evaluated file, which is unrelated to
# orig_prediction / orig_words.
[np.count_nonzero(file_features == x) for x in orig_words]

In [None]:
# Show the token ids of those topic words.
orig_words

In [None]:
def pick_most_similar(src_word, target_words, inverse_tokenizer, embedding_index, verbose=True):
    """Return the id in ``target_words`` whose embedding is closest to ``src_word``.

    Closeness is the squared Euclidean distance between embedding vectors.
    Candidates without an embedding are skipped, as are candidates at distance 0
    (the source word itself or an identical vector).

    Args:
        src_word: token id of the source word (a key of ``inverse_tokenizer``).
        target_words: iterable of candidate token ids.
        inverse_tokenizer: mapping from token id to word string.
        embedding_index: mapping from word string to embedding vector.
        verbose: when True (default), print every scored candidate with its distance.

    Returns:
        The token id of the closest candidate, or -1 when the source word has no
        embedding or no candidate qualifies.
    """
    # Use the mapping passed in, not a notebook-global of a similar name.
    src_vector = embedding_index.get(inverse_tokenizer[src_word])
    if src_vector is None:
        # Without a source vector no distance can be computed.
        return -1

    shortest_dist = float('inf')
    ret = -1
    for w_idx in target_words:
        w = inverse_tokenizer[w_idx]
        embedding_vector = embedding_index.get(w)
        if embedding_vector is None:
            continue
        dist = np.sum((src_vector - embedding_vector) ** 2)
        if verbose:
            print(w, ' ', dist)
        if dist != 0 and dist < shortest_dist:
            shortest_dist = dist
            ret = w_idx
    return ret
    

In [None]:
# Demo: among the topic words of label index 9, find the one closest to token id 630.
# NOTE(review): 630 and 9 are hard-coded examples - confirm they are still the intended ones.
src_word=630
ret_word = pick_most_similar(src_word, topics_words[9], inverse_tokenizer, embeddings_index)

In [None]:
# Word chosen as most similar.
# NOTE(review): pick_most_similar returns -1 when it finds no candidate; this
# lookup presumably fails in that case - confirm.
inverse_tokenizer[ret_word]

In [None]:
# The source word, for comparison with the match shown above.
inverse_tokenizer[src_word]