In [1]:
import numpy as np
import tensorflow as tf

from keras.preprocessing.sequence import pad_sequences
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import data_utils
import glove_utils
import models
import display_utils
from goog_lm import LM

In [3]:
import lm_data_utils
import lm_utils

In [4]:
# Seed NumPy and TensorFlow with the same value so the sampling below and the
# graph-level TF randomness start from a fixed state.
SEED = 1001
np.random.seed(SEED)
tf.set_random_seed(SEED)

In [5]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
# Vocabulary size; it selects which pre-built auxiliary files are loaded
# (the same value is interpolated into the distance-matrix file names below).
VOCAB_SIZE  = 50000
# SECURITY NOTE: pickle.load can execute arbitrary code from the file. Only load
# a dataset pickle that you generated yourself or obtained from a trusted source.
with open('aux_files/dataset_%d.pkl' %VOCAB_SIZE, 'rb') as f:
    dataset = pickle.load(f)

In [7]:
# Length (in tokens) of every test sequence, in dataset order.
doc_len = [len(seq) for seq in dataset.test_seqs2]

In [8]:
# Word-to-word distance matrix for the chosen vocabulary size; it is queried by
# word id in the cells below.
dist_mat = np.load('aux_files/dist_counter_%d.npy' %VOCAB_SIZE)
# Prevent returning 0 as most similar word because it is not part of the dictionary:
# overwrite row 0 and column 0 with a very large distance so id 0 is never "nearest".
dist_mat[0,:] = 100000
dist_mat[:,0] = 100000

# Passed to the attack as `skip_list`; presumably the ids of words that have no
# embedding and should not be perturbed -- TODO confirm against `attacks.py`.
skip_list = np.load('aux_files/missed_embeddings_counter_%d.npy' %VOCAB_SIZE)

### Demonstrating how we find the most similar words

In [9]:
# For five consecutive word ids, list the neighbours returned by
# `pick_most_similar_words` together with their distances.
# The trailing arguments (20, 0.5) look like a neighbour-count cap and a
# distance cut-off (every printed distance is below 0.5) -- confirm in glove_utils.
for src_word in range(300, 305):
    nearest, nearest_dist = glove_utils.pick_most_similar_words(
        src_word, dist_mat, 20, 0.5)

    print('Closest to `%s` are:' % (dataset.inv_dict[src_word]))
    for neighbour_id, neighbour_dist in zip(nearest, nearest_dist):
        print(' -- ', dataset.inv_dict[neighbour_id], ' ', neighbour_dist)

    print('----')

Closest to `later` are:
 --  subsequent   0.18323109771400015
 --  subsequently   0.1867195991340007
 --  afterward   0.2509214012219996
 --  afterwards   0.2576958961479996
 --  thereafter   0.2741981096589998
 --  trailing   0.3368002712810001
 --  after   0.34520261237799876
 --  then   0.36472839338299834
 --  posterior   0.4310855888389997
 --  following   0.4833073676040003
----
Closest to `takes` are:
 --  pick   0.31130546563200046
 --  taking   0.42471158462800007
 --  picked   0.48527412495900113
----
Closest to `instead` are:
 --  conversely   0.30340380498499964
 --  however   0.3475382865829997
 --  alternatively   0.39540487543000014
 --  alternately   0.4439627395600003
 --  nevertheless   0.477163975792001
----
Closest to `seem` are:
 --  seems   0.007052995653001215
 --  appears   0.32837244735200044
 --  looks   0.33534638306400066
 --  transpires   0.456207185493001
----
Closest to `beautiful` are:
 --  gorgeous   0.019236443661999614
 --  wonderful   0.1014964337829

### Preparing the dataset

In [10]:
# Pad (at the end) / truncate every sequence to a fixed length so the examples
# can be fed to the fixed-size sentiment model.
max_len = 250
train_x, test_x = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (dataset.train_seqs2, dataset.test_seqs2)
)
# Labels as NumPy arrays, aligned with the padded inputs above.
train_y, test_y = np.array(dataset.train_y), np.array(dataset.test_y)

### Loading the sentiment analysis model

In [13]:
# Start from an empty graph so this cell can be re-run safely.
tf.reset_default_graph()
# Close the session left over from a previous run of this cell, if any.
# The old guard tested `tf.get_default_session()`, but `tf.Session()` is never
# installed as the default session here, so the previous session was never
# closed (leaked on every re-run); and had a default session existed on a fresh
# kernel, `sess.close()` would have raised NameError.
_previous_sess = globals().get('sess')
if _previous_sess is not None:
    _previous_sess.close()
sess = tf.Session()
batch_size = 1   # single-example model, used for individual predictions
lstm_size = 128

# Build the sentiment classifier under the 'imdb' variable scope; later cells
# re-enter this scope with reuse=True to share the same weights.
with tf.variable_scope('imdb', reuse=False):
    model = models.SentimentModel(batch_size=batch_size,
                                  lstm_size=lstm_size,
                                  max_len=max_len,
                                  embeddings_dim=300,
                                  vocab_size=dist_mat.shape[1],
                                  is_train=False)
# Restore the trained weights from the checkpoint into the new session.
saver = tf.train.Saver()
saver.restore(sess, './models/imdb_model')



INFO:tensorflow:Restoring parameters from ./models/imdb_model


## Loading the Google Language model

In [14]:
# Instantiate the language-model wrapper from `goog_lm`; per the log output below
# this loads the LM vocabulary, recovers the graph and restores a checkpoint.
goog_lm = LM()

LM vocab loading done


Recovering graph.


INFO:tensorflow:Recovering Graph goog_lm/graph-2016-09-10.pbtxt


Recovering checkpoint goog_lm/ckpt-*


#### Demonstrating the Google language model

In [15]:
# Collect up to 20 nearest neighbours of the word "play" as candidate replacements
# and convert their ids back to strings for the language-model demo that follows.
src_word = dataset.dict['play']
nearest, nearest_dist = glove_utils.pick_most_similar_words(
    src_word, dist_mat, 20)
nearest_w = [dataset.inv_dict[word_id] for word_id in nearest]
print('Closest to `%s` are %s' % (dataset.inv_dict[src_word], nearest_w))

Closest to `play` are ['playing', 'gaming', 'games', 'toy', 'playback', 'game', 'plaything', 'cheek', 'gambling', 'toys', 'toying', 'replay', 'stake', 'plays', 'jeu', 'gamble', 'staking', 'reproduction', 'casino', 'sets']


In [16]:
# Score every candidate word in the context "<prefix> ___ <suffix>" with the
# language model and report the highest-scoring candidate.
prefix = 'is'
suffix = 'with'
lm_preds = goog_lm.get_words_probs(prefix, nearest_w, suffix)
best_candidate_idx = np.argmax(lm_preds)
print('most probable is ', nearest_w[best_candidate_idx])


most probable is  game


## Try Attack

In [17]:
from attacks import GeneticAtack

## Main Attack 

In [18]:
pop_size = 60   # batch size of `batch_model`; also passed to the attack as pop_size
n1 = 8          # batch size of `neighbour_model`; also passed to the attack as n1

# Settings shared by both extra model instances; they differ only in batch size.
shared_model_kwargs = dict(
    lstm_size=lstm_size,
    max_len=max_len,
    embeddings_dim=300,
    vocab_size=dist_mat.shape[1],
    is_train=False,
)

# Re-enter the 'imdb' scope with reuse=True so both models share the weights
# restored for `model`.
with tf.variable_scope('imdb', reuse=True):
    batch_model = models.SentimentModel(batch_size=pop_size, **shared_model_kwargs)

with tf.variable_scope('imdb', reuse=True):
    neighbour_model = models.SentimentModel(batch_size=n1, **shared_model_kwargs)

# Genetic attack using the three model instances, the distance matrix and the
# language model loaded above.
ga_atttack = GeneticAtack(
    sess, model, batch_model, neighbour_model, dataset, dist_mat,
    skip_list,
    goog_lm,
    max_iters=30,
    pop_size=pop_size,
    n1=n1,
    n2=4,
    use_lm=True,
    use_suffix=False,
)

In [19]:
SAMPLE_SIZE = 5000   # number of test examples drawn as attack candidates
TEST_SIZE = 200      # stop once this many examples have been attacked

# Draw the candidate pool without replacement and report its shortest document.
test_idx = np.random.choice(len(dataset.test_y), SAMPLE_SIZE, replace=False)
test_len = [len(dataset.test_seqs2[sample_idx]) for sample_idx in test_idx]
print('Shortest sentence in our test set is %d words' %np.min(test_len))

# Parallel result lists: one entry per attacked example.
test_list = []         # dataset index of the example
orig_list = []         # padded original sequence
orig_label_list = []   # true label
adv_list = []          # adversarial sequence, or None when the attack failed
dist_list = []         # number of changed positions (100000 marks a failure)

for i, sample_idx in enumerate(test_idx):
    x_orig = test_x[sample_idx]
    orig_label = test_y[sample_idx]
    orig_preds = model.predict(sess, x_orig[np.newaxis, :])[0]
    # Only attack examples the model currently classifies correctly.
    if np.argmax(orig_preds) != orig_label:
        continue
    # Sum of signs == number of non-zero (non-padding) token ids;
    # skip documents with 100 or more tokens.
    x_len = np.sum(np.sign(x_orig))
    if x_len >= 100:
        continue

    print('****** ', len(test_list) + 1, ' ********')
    test_list.append(sample_idx)
    orig_list.append(x_orig)
    orig_label_list.append(orig_label)
    # Binary task: the attack targets the opposite label.
    target_label = 1 if orig_label == 0 else 0
    x_adv = ga_atttack.attack(x_orig, target_label)
    adv_list.append(x_adv)
    if x_adv is not None:
        num_changes = np.sum(x_orig != x_adv)
        print('%d - %d changed.' %(i+1, num_changes))
        dist_list.append(num_changes)
    else:
        print('%d failed' %(i+1))
        # Sentinel distance, large enough to be filtered out by the statistics later.
        dist_list.append(100000)
    print('--------------------------')
    if len(test_list) >= TEST_SIZE:
        break

Shortest sentence in our test set is 18 words
******  1  ********
		 0  --  0.16752617
		 1  --  0.33954847
		 2  --  0.62575793
1 - 4 changed.
--------------------------
******  2  ********
		 0  --  0.081095785
		 1  --  0.23031871
		 2  --  0.49455175
		 3  --  0.8090076
3 - 6 changed.
--------------------------
******  3  ********
		 0  --  0.20659587
		 1  --  0.29161933
		 2  --  0.34426233
		 3  --  0.58601284
7 - 5 changed.
--------------------------
******  4  ********
		 0  --  0.00016315776
		 1  --  0.00029146814
		 2  --  0.00045838807
		 3  --  0.0006008647
		 4  --  0.0008673914
		 5  --  0.001375048
		 6  --  0.0021017992
		 7  --  0.0031949652
		 8  --  0.004548417
		 9  --  0.006179247
		 10  --  0.008785875
		 11  --  0.02230807
		 12  --  0.10363737
		 13  --  0.10363737
		 14  --  0.10363737
		 15  --  0.10363737
		 16  --  0.10363737
		 17  --  0.2133652
		 18  --  0.3666688
		 19  --  0.499423
		 20  --  0.8521438
18 - 25 changed.
--------------------------
*****

		 0  --  0.017261371
		 1  --  0.02037716
		 2  --  0.023573171
		 3  --  0.08914166
		 4  --  0.53655946
517 - 6 changed.
--------------------------
******  49  ********
		 0  --  0.8403348
529 - 1 changed.
--------------------------
******  50  ********
		 0  --  0.029262105
		 1  --  0.24247132
		 2  --  0.24247132
		 3  --  0.42393875
		 4  --  0.60402477
531 - 6 changed.
--------------------------
******  51  ********
		 0  --  0.5046299
533 - 1 changed.
--------------------------
******  52  ********
		 0  --  0.42939952
		 1  --  0.939006
540 - 3 changed.
--------------------------
******  53  ********
		 0  --  0.6096896
541 - 1 changed.
--------------------------
******  54  ********
		 0  --  0.4184872
		 1  --  0.5380146
545 - 3 changed.
--------------------------
******  55  ********
		 0  --  0.00039928843
		 1  --  0.0005260108
		 2  --  0.002020025
		 3  --  0.002020025
		 4  --  0.0048475764
		 5  --  0.0066693197
		 6  --  0.013560566
		 7  --  0.015769396
		 8  --  0

******  93  ********
		 0  --  0.74539155
1040 - 1 changed.
--------------------------
******  94  ********
		 0  --  0.0011191729
		 1  --  0.012057994
		 2  --  0.08946816
		 3  --  0.45718634
		 4  --  0.8232168
1045 - 5 changed.
--------------------------
******  95  ********
		 0  --  0.023614038
		 1  --  0.40257096
		 2  --  0.49651057
		 3  --  0.98317194
1057 - 5 changed.
--------------------------
******  96  ********
		 0  --  0.023407836
		 1  --  0.57342935
1066 - 1 changed.
--------------------------
******  97  ********
		 0  --  4.483463e-06
		 1  --  5.2084292e-06
		 2  --  1.7096958e-05
		 3  --  1.7096958e-05
		 4  --  2.7275075e-05
		 5  --  0.00013349178
		 6  --  0.00013349178
		 7  --  0.00013349178
		 8  --  0.016644001
		 9  --  0.03548153
		 10  --  0.03548153
		 11  --  0.03548153
		 12  --  0.13038312
		 13  --  0.13038312
		 14  --  0.13038312
		 15  --  0.13038312
		 16  --  0.13038312
		 17  --  0.29166052
		 18  --  0.30049193
		 19  --  0.88937527
1085 

		 16  --  0.27449238
		 17  --  0.46243197
		 18  --  0.5401243
1394 - 24 changed.
--------------------------
******  138  ********
		 0  --  0.01837934
		 1  --  0.032186992
		 2  --  0.070028216
		 3  --  0.070028216
		 4  --  0.096304245
		 5  --  0.19163771
		 6  --  0.20079419
		 7  --  0.25623515
		 8  --  0.40685016
		 9  --  0.5189224
1405 - 13 changed.
--------------------------
******  139  ********
		 0  --  0.0008616909
		 1  --  0.0025734382
		 2  --  0.094167694
		 3  --  0.21284099
		 4  --  0.43182877
		 5  --  0.7427157
1423 - 8 changed.
--------------------------
******  140  ********
		 0  --  0.103433155
		 1  --  0.56482273
1426 - 1 changed.
--------------------------
******  141  ********
		 0  --  0.012065027
		 1  --  0.15197961
		 2  --  0.8119772
1434 - 5 changed.
--------------------------
******  142  ********
		 0  --  0.013773772
		 1  --  0.018129138
		 2  --  0.09367223
		 3  --  0.29025075
		 4  --  0.33429804
		 5  --  0.34540194
		 6  --  0.38546276


******  191  ********
		 0  --  0.002563564
		 1  --  0.0055873785
		 2  --  0.0059753605
		 3  --  0.007867068
		 4  --  0.013503993
		 5  --  0.0135335745
		 6  --  0.018860644
		 7  --  0.02545974
		 8  --  0.03287588
		 9  --  0.03287588
		 10  --  0.046571806
		 11  --  0.070145756
		 12  --  0.08857982
		 13  --  0.08857982
		 14  --  0.10529671
		 15  --  0.1344016
		 16  --  0.16498345
		 17  --  0.16498345
		 18  --  0.21268459
		 19  --  0.21677099
		 20  --  0.21677099
		 21  --  0.22076964
		 22  --  0.23354492
		 23  --  0.2546097
		 24  --  0.29033723
		 25  --  0.29033723
		 26  --  0.31929314
		 27  --  0.31929314
		 28  --  0.71250224
1873 - 19 changed.
--------------------------
******  192  ********
		 0  --  0.59134144
1874 - 1 changed.
--------------------------
******  193  ********
		 0  --  0.41071123
		 1  --  0.8056726
1880 - 2 changed.
--------------------------
******  194  ********
		 0  --  0.464266
		 1  --  0.8201281
1886 - 2 changed.
-------------------

## Compute Attack success rate

In [20]:
# Number of non-zero (non-padding) tokens in each attacked original.
orig_len = [np.sum(np.sign(seq)) for seq in orig_list]
# Fraction of each document that was modified; failed attacks carry the 100000
# sentinel and therefore end up far above 1.
normalized_dist_list = [dist_list[k] / seq_len for k, seq_len in enumerate(orig_len)]

In [24]:
# An attack counts as successful when it modified less than this fraction of the document.
SUCCESS_THRESHOLD = 0.25
successful_attacks = [x < SUCCESS_THRESHOLD for x in normalized_dist_list]
# Keep only fractions below 1, which drops the failed attacks (sentinel distance).
modified_fractions = [x for x in normalized_dist_list if x < 1]

print('Attack success rate : {:.2f}%'.format(np.mean(successful_attacks)*100))
print('Median percentange of modifications: {:.02f}% '.format(
    np.median(modified_fractions)*100))
print('Mean percentange of modifications: {:.02f}% '.format(
    np.mean(modified_fractions)*100))

Attack success rate : 92.00%
Median percentange of modifications: 6.45% 
Mean percentange of modifications: 9.14% 


In [26]:
# Show one randomly chosen attack. `adv_list` holds None for every failed attack,
# so sample only among examples that actually have an adversarial sequence;
# otherwise a failed attack could be handed to the visualiser.
# NOTE(review): assumes `visualize_attack` cannot render a None adversarial
# example -- confirm in display_utils.
visualizable_idx = [k for k, adv in enumerate(adv_list) if adv is not None]
visual_idx = np.random.choice(visualizable_idx)
display_utils.visualize_attack(sess, model, dataset, orig_list[visual_idx], adv_list[visual_idx])

Original Prediction = Negative. (Confidence = 99.81) 


---------  After attack -------------
New Prediction = Positive. (Confidence = 59.80) 


In [27]:
# Show another randomly chosen attack. `adv_list` holds None for every failed
# attack, so sample only among examples that actually have an adversarial
# sequence; otherwise a failed attack could be handed to the visualiser.
# NOTE(review): assumes `visualize_attack` cannot render a None adversarial
# example -- confirm in display_utils.
visualizable_idx = [k for k, adv in enumerate(adv_list) if adv is not None]
visual_idx = np.random.choice(visualizable_idx)
display_utils.visualize_attack(sess, model, dataset, orig_list[visual_idx], adv_list[visual_idx])

Original Prediction = Positive. (Confidence = 85.73) 


---------  After attack -------------
New Prediction = Negative. (Confidence = 50.19) 


In [28]:
# Persist the attack results as one pickled tuple:
# (dataset indices, originals, true labels, adversarial examples, normalised distances).
with open('attack_results_final.pkl', 'wb') as results_file:
    attack_results = (test_list, orig_list, orig_label_list, adv_list, normalized_dist_list)
    pickle.dump(attack_results, results_file)
    
