# Adversarial examples with projected gradient descent and box-constrained L-BFGS

This notebook shows an example of how to use the provided code to attack an object classification model.

## Setup

In [None]:
import os
import numpy as np
import pandas as pd

import tensorflow as tf
import matplotlib.pyplot as plt

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [None]:
# Shape of the image batch placeholder fed to the model: the leading None
# leaves the batch size unspecified, the remaining dimensions are 299 x 299 x 3.
batch_shape = [None, 299, 299, 3]

In [None]:
# Code to instantiate an inception model; pre-process images and revert the pre-processing (e.g. for visualization)

from tensorflow.contrib.slim.nets import inception
slim = tf.contrib.slim

class InceptionModel:
    """Callable wrapper that builds an Inception v3 graph and returns one end point.

    The first call is made with ``reuse=False``; every later call passes
    ``reuse=True`` to the network constructor.
    """

    def __init__(self, end_point='Logits'):
        """Remember which end point of the network ``__call__`` should return."""
        self.end_point = end_point
        self.num_classes = 1001
        self.initialized = False

    def __call__(self, x_input):
        """Build the network on ``x_input`` and return the selected end point."""
        # Reuse is requested on every call except the very first one.
        reuse = self.initialized
        self.initialized = True

        with slim.arg_scope(inception.inception_v3_arg_scope()):
            _, end_points = inception.inception_v3(
                x_input,
                num_classes=self.num_classes,
                is_training=False,
                reuse=reuse)

        return end_points[self.end_point]
    
def preprocess_image(img):
    """Rescale pixel values from the 0 to 255 range to the -1 to 1 range."""
    scaled = img / 127.5
    return scaled - 1

def revert_processed_image(img):
    """Map an image from the -1 to 1 range back to uint8 pixels in 0 to 255.

    Values are rounded to the nearest integer and clipped to the valid range
    before the cast. A bare ``astype(np.uint8)`` truncates, so floating point
    error can turn e.g. 100 into 99, and it wraps around for values that fall
    outside the -1 to 1 range.

    Args:
        img: array-like (or scalar) holding values on the -1 to 1 scale.

    Returns:
        A ``np.uint8`` array with the same shape as ``img``.
    """
    pixels = (np.asarray(img) + 1) * 127.5
    return np.clip(np.rint(pixels), 0, 255).astype(np.uint8)

In [None]:
# Instantiate the model

tf.reset_default_graph()
x_input = tf.placeholder(tf.float32, shape=batch_shape)
# One integer label per image. The shape has to be a real one-element list:
# `(batch_shape[0])` is just `None` (parentheses without a comma do not build a
# tuple), which would leave the placeholder with a completely unknown shape.
y_input = tf.placeholder(tf.int32, shape=[batch_shape[0]])

model = InceptionModel()
logits = model(x_input)
probs = tf.nn.softmax(logits)
# Index of the highest logit for each image in the batch
prediction = tf.argmax(logits, axis=1)

In [None]:
# Download the weights (if not yet done)
model_path = 'inception_v3.ckpt'

if not os.path.exists(model_path):
    import tarfile
    # `urlretrieve` lives in different modules on Python 2 and Python 3
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    compressed_path = 'inception_v3_2016_08_28.tar.gz'

    print('Downloading model')

    urlretrieve('http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz', compressed_path)

    try:
        # Extract the archive that was just downloaded into the working directory
        with tarfile.open(compressed_path, 'r') as compressed_model:
            compressed_model.extractall()
    finally:
        # Always remove the archive, so a failed extraction does not leave a
        # possibly corrupted download behind
        os.unlink(compressed_path)
    print('Done')

In [None]:
# Restore the pre-trained weights into a new interactive session

model_variables = slim.get_model_variables()
saver = tf.train.Saver(model_variables)
sess = tf.InteractiveSession()
saver.restore(sess, model_path)

In [None]:
# Load some sample images, classify them and read the class names

from scipy.misc import imread, imsave

img_names = ['images/panda.png', 'images/flag.png']
labels = np.array([389, 558])

# Read every image, rescale it and stack the results into a single batch
imgs = np.stack([preprocess_image(imread(img_name)) for img_name in img_names])
pred, original_probabilities = sess.run([prediction, probs], feed_dict={x_input: imgs})

# The model must classify the clean images correctly
assert np.all(labels == pred)

# One class name per line of the labels file
with open('imagenet_labels.txt') as labels_file:
    class_names = np.array([line.strip() for line in labels_file])

In [None]:
# Let's take a look at the original images

f, ax = plt.subplots(2, 1, figsize=(10,10))
for i, (img, true_class) in enumerate(zip(imgs, labels)):
    # Probability (in percent) that the model assigns to the true class
    true_prob = original_probabilities[i, true_class] * 100
    ax[i].imshow(revert_processed_image(img))
    ax[i].set_title('True label: %s (%.2f%%)' % (class_names[true_class], true_prob))
f.tight_layout()

## Calculating the loss:

$J = - \log{P(Y=y_\text{true} | X)}$

In [None]:
# Cross-entropy between the logits and the true labels: J = -log P(Y = y_true | X)

loss = tf.losses.sparse_softmax_cross_entropy(labels=y_input, logits=logits)

In [None]:
# Loss and class probabilities on the unmodified images
l, p = sess.run([loss, probs], feed_dict={x_input: imgs, y_input: labels})
correct_probs = (p[0, labels[0]], p[1, labels[1]])

print('Loss: %.4f' % l)
print('Probability of correct class: %.4f, %.4f' % correct_probs)

## Calculating the gradient $\nabla_X{J}$

In [None]:
# Symbolic gradient of the loss with respect to the input images; the result of
# tf.gradients is indexed with [0] to pick the entry for `x_input`.
grad = tf.gradients(loss, x_input)[0]

In [None]:
# Evaluate the gradient on the sample images and display the shape of the result
image_gradient = sess.run(grad, feed_dict={x_input: imgs, y_input: labels})
image_gradient.shape

## Starting to modify the image: $\tilde{X} = X + \alpha \nabla_X{J}$

In [None]:
lr = 0.5

# Move along the gradient of the loss, then keep the pixels in the valid range
new_img = np.clip(imgs + lr * image_gradient, -1, 1) # Note that we are not constraining on \delta yet

In [None]:
# Loss and class probabilities after the gradient step
l, p = sess.run([loss, probs], feed_dict={x_input: new_img, y_input: labels})
correct_probs = (p[0, labels[0]], p[1, labels[1]])

print('Loss: %.4f' % l)
print('Probability of correct class: %.4f, %.4f' % correct_probs)

In [None]:

# Original images (left column) next to the gradient-step images (right column)
f, ax = plt.subplots(2, 2, figsize=(10,10))
for i, (img, adv_img) in enumerate(zip(imgs, new_img)):
    ax[i, 0].imshow(revert_processed_image(img))
    ax[i, 1].imshow(revert_processed_image(adv_img))
ax[0,0].set_title('Original images')
ax[0,1].set_title('New images')
f.tight_layout()

## Fast gradient sign attack:

$\tilde{X} = X + \epsilon \, \text{sign}(\nabla_X{J(X, y_\text{true})})$


In [None]:
epsilon = 4. / 255 * 2 # Change each pixel by a value of 4 (on the 0-255 range)

# Step of size epsilon along the sign of the gradient, clipped to the valid range
fgsm_attack = np.clip(imgs + epsilon * np.sign(image_gradient), -1, 1)

l, p = sess.run([loss, probs], feed_dict={x_input: fgsm_attack, y_input: labels})
correct_probs = (p[0, labels[0]], p[1, labels[1]])

print('Loss: %.4f' % l)
print('Probability of correct class: %.4f, %.4f' % correct_probs)

In [None]:
# Let's compare the original images (left column) with the FGSM images (right column)

f, ax = plt.subplots(2, 2, figsize=(10,10))
for i, (img, adv_img) in enumerate(zip(imgs, fgsm_attack)):
    ax[i, 0].imshow(revert_processed_image(img))
    ax[i, 1].imshow(revert_processed_image(adv_img))
ax[0,0].set_title('Original images')
ax[0,1].set_title('New images')
f.tight_layout()

## Running the iterative attacks

In [None]:
import pgd_attack # Projected gradient descent (PGD) attack
import step_pgd_attack # Step FGSM + projected gradient descent
import box_constrained_attack # Box-constrained L-BFGS attack

In [None]:
# Define some parameters for the attacks:

max_iter = 30  # Maximum number of iterations
max_epsilon = 8  # Max epsilon on the original range (0 to 255)
# Max epsilon on the range of the processed images (-1 to 1)
eps = max_epsilon * 2.0 / 255.0

In [None]:
# First example: Generating non-targeted attacks using the PGD attack

pgd_attacker = pgd_attack.PGD_attack(
    model,
    batch_shape,
    max_epsilon=eps,
    max_iter=max_iter,
    targeted=False,
    initial_lr=1,
    lr_decay=0.99)

In [None]:
# Run the non-targeted attack; `pred` holds the model's predictions on the clean images
attack_img = pgd_attacker.generate(sess, imgs, pred, verbose=True)

In [None]:
# Classify the adversarial images. Only `prediction` and `probs` are fetched,
# so the label placeholder does not need to be fed.
adv_prediction, adv_probabilities = sess.run([prediction, probs], feed_dict={x_input: attack_img})

In [None]:
# Show each adversarial image with the probabilities of its true and predicted class
f, ax = plt.subplots(2, 1, figsize=(10,10))
for i, (img, true_class, pred_class) in enumerate(zip(attack_img, labels, adv_prediction)):
    true_prob = adv_probabilities[i, true_class] * 100
    pred_prob = adv_probabilities[i, pred_class] * 100
    title = 'True label: %s (%.2f%%)' % (class_names[true_class], true_prob)
    title += '\nPredicted: %s (%.2f%%)' % (class_names[pred_class], pred_prob)
    ax[i].imshow(revert_processed_image(img))
    ax[i].set_title(title)
f.tight_layout()

In [None]:
# 2nd example: targeted attack using the box-constrained L-BFGS attack
lbfgs_attacker = box_constrained_attack.box_constrained_attack(
    model,
    batch_shape,
    max_epsilon=eps,
    max_iter=max_iter,
    targeted=True)

# Both images are pushed towards the same target class
target_classes = [606, 606]
attack_img = lbfgs_attacker.generate(sess, imgs, target_classes, verbose=True) # Note: the log is written on the ipython stdout


In [None]:
# Classify the adversarial images produced by the L-BFGS attack
adv_prediction, adv_probabilities = sess.run([prediction, probs], feed_dict={x_input: attack_img})

In [None]:
# Show each adversarial image with the probabilities of its true and predicted class
f, ax = plt.subplots(2, 1, figsize=(10,10))
for i, (img, true_class, pred_class) in enumerate(zip(attack_img, labels, adv_prediction)):
    true_prob = adv_probabilities[i, true_class] * 100
    pred_prob = adv_probabilities[i, pred_class] * 100
    title = 'True label: %s (%.2f%%)' % (class_names[true_class], true_prob)
    title += '\nPredicted: %s (%.2f%%)' % (class_names[pred_class], pred_prob)
    ax[i].imshow(revert_processed_image(img))
    ax[i].set_title(title)
f.tight_layout()

In [None]:
# 3rd example: targeted attack using step FGSM + PGD

step_pgd_attacker = step_pgd_attack.step_pgd_attack(
    model,
    batch_shape,
    max_epsilon=eps,
    max_iter=max_iter,
    targeted=True,
    initial_lr=1,
    lr_decay=0.99,
    alpha=eps/2,
    step_iter=5)

# Both images are pushed towards the same target class
target_classes = [515, 515]
attack_img = step_pgd_attacker.generate(sess, imgs, target_classes, verbose=True)


In [None]:
# Classify the adversarial images produced by the step FGSM + PGD attack
adv_prediction, adv_probabilities = sess.run([prediction, probs], feed_dict={x_input: attack_img})

In [None]:
# Probability the model assigns to the true class of the first image after the attack
adv_probabilities[0][labels[0]]

In [None]:
# Show each adversarial image with the probabilities of its true and predicted class
f, ax = plt.subplots(2, 1, figsize=(10,10))
for i, (img, true_class, pred_class) in enumerate(zip(attack_img, labels, adv_prediction)):
    true_prob = adv_probabilities[i, true_class] * 100
    pred_prob = adv_probabilities[i, pred_class] * 100
    title = 'True label: %s (%.2f%%)' % (class_names[true_class], true_prob)
    title += '\nPredicted: %s (%.2f%%)' % (class_names[pred_class], pred_prob)
    ax[i].imshow(revert_processed_image(img))
    ax[i].set_title(title)
f.tight_layout()