## Finding realistic adversarial examples: white-box approach

This script takes as input a classifier and a generative model and looks for 4 realistic adversarial examples.
It is a white-box approach: the inner structure of the networks (weights in particular) is used to compute a gradient.

In [7]:
import tensorflow as tf 
import numpy as np 
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
import time
%matplotlib inline

digit_origin = 8
digit_target = 3

classifier = tf.keras.models.load_model('Models/classifier_capacity1_simple.model', compile=False)
gan = tf.keras.models.load_model('Models/gan_digit8_rich.h5')

In [8]:
classifier.trainable = False
combined_networkInput = tf.keras.layers.Input(shape=(10,))
x = gan(combined_networkInput)
new_shape = tf.convert_to_tensor([1,28,28,1],dtype=tf.int32)
x = tf.reshape(x,new_shape,name=None)
combined_networkOutput = classifier(x)
combined_network = tf.keras.models.Model(inputs=combined_networkInput, outputs=combined_networkOutput)
combined_network.compile(loss='binary_crossentropy', optimizer = 'adam')

In [17]:
fig = plt.figure(figsize=(2, 2))

noise_change = 0.01

found = False

start = time.time()

input_label = np.array([[0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]])

while(not found):
    noise = np.random.normal(0,1,size=[1,10])
    noise = tf.cast(noise,tf.float32)

    for j in range(10):
        prediction = combined_network(noise)
        loss = tf.keras.losses.CategoricalCrossentropy(input_label,prediction)
        grad = tf.gradients(loss,noise)[0]
        signed_grad = tf.sign(grad)
        noise += noise_change*signed_grad

    result_target = K.eval(combined_network(noise))[0][digit_target]
    if(result_target > 0.5):
        generated_image = gan.predict(noise)
        print("Confidence in %d is %f" % (digit_target,result_target))
        plt.imshow(generated_image.reshape(1,28,28)[0],cmap='gray')
        plt.axis("off")
        found = True

end = time.time()
print("time: %f s" % (end - start))

plt.savefig('adversarial_examples_white_box.png')
plt.show()

TypeError: Failed to convert object of type <class 'tensorflow.python.keras.losses.CategoricalCrossentropy'> to Tensor. Contents: <tensorflow.python.keras.losses.CategoricalCrossentropy object at 0x7f0664be43c8>. Consider casting elements to a supported type.

<Figure size 144x144 with 0 Axes>