# Create figures of exemplar adversarial images

Using one set of the models (one model for each null-noise type, plus a non-null baseline model), this notebook creates some FGSM adversarial images, tests each of the models on those images, and displays the results. It also creates some figures like those of the main paper, but only for this single set of models.

In [None]:
#!/usr/bin/env python
# coding: utf-8

from __future__ import division

import numpy as np
import time
import os
import sys
# from sys import platform

import tensorflow as tf

import import_ipynb
from data_generators import mnist_generator
from utilities import plot_mnist_digits, create_adversarial_pattern

# %matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [None]:
# Fix the TensorFlow and NumPy random seeds so that repeated runs of the
# notebook produce the same results.
seed = 0
tf.random.set_seed(seed)
np.random.seed(seed)

In [None]:
fs_titles = 20 # font size passed as `fontsize` to the figure suptitles below

# Use the baseline model to create a set of adversarial images
The adversarial set will contain the original source images (taken from the MNIST test set) and
the corresponding noise images that are added to the source images (after scaling by some factor, epsilon).

In [None]:
# The MNIST data must be downloaded beforehand from
# http://yann.lecun.com/exdb/mnist/ ; point dir_mnist at the folder that
# holds the downloaded files.
dir_mnist = './mnist'
# dir_mnist = '/home/mroos/Data/pylearn2data/mnist'

# Generator settings: one image per batch, no null types.
batch_size = 1
null_types = None

# Build the data generator over the 'test' dataset, with random ordering
# disabled and the null-class probability set to zero.
gen_data = mnist_generator(
    dir_mnist,
    batch_size=batch_size,
    dataset='test',
    random_order=False,
    null_types=null_types,
    p_null_class=0,
)

In [None]:
# Load the trained baseline model.
# dir_models = './saved_models_mnist_from_scratch/'
dir_models = './saved_models_mnist_sets/'
model_filename_baseline = os.path.join(dir_models, 'mnist_model_baseline_00.h5')
model_baseline = tf.keras.models.load_model(model_filename_baseline)

# Try to create adversarial images from source images. Repeat until
# enough of them are found. For every accepted source image we store the
# image itself, its label, the signed gradient, and the smallest epsilon
# found (within tolerance) at which the baseline prediction changes.
n_adversarial = 1000
images_source = np.empty((n_adversarial, 784))
labels_source = np.empty((n_adversarial, 1))
grads_signed = np.empty((n_adversarial, 784))
epsilon_thresholds = np.empty(n_adversarial)

# Coarse starting grid, identical for every source image: epsilon = 0
# followed by 9 log-spaced values between eps_start and eps_stop.
eps_start = 1e-3
eps_stop = 1.0
eps_tolerance = 1e-3  # maximum bracket width accepted as "threshold found"
epsilons_initial = np.insert(
    np.logspace(np.log10(eps_start), np.log10(eps_stop), num=9), 0, 0)

n_acquired = 0
while n_acquired < n_adversarial:
    im, label, _ = next(gen_data)

    # If the model prediction is in error, don't use this image to create an adversarial image.
    outputs = model_baseline(im, training=False)
    predicted_labels = np.argmax(outputs, axis=1)
    if int(predicted_labels[0]) != label[0]:
        continue

    # Get the signed gradient that drives the source class towards a *higher* loss
    grad_signed, _ = create_adversarial_pattern(model_baseline,
                                                tf.convert_to_tensor(im),
                                                tf.convert_to_tensor(np.array(label)))

    # Create a series of images from the source image by adding the signed
    # gradient scaled by each epsilon, then narrow the epsilon bracket around
    # the first misclassification until it is tighter than eps_tolerance.
    epsilons = epsilons_initial  # only ever rebound below, never modified in place
    done_looking = False
    while not done_looking:
        perturbations = np.expand_dims(epsilons, axis=1) * grad_signed
        images = np.clip(im + perturbations, 0, 1)
        outputs = model_baseline(images, training=False)
        predicted_labels = np.argmax(outputs, axis=1)

        ix_err = np.where(predicted_labels != int(label[0]))[0]
        if ix_err.size == 0:
            # No error for largest value in epsilon range. Start over on new source image.
            done_looking = True
            continue

        ix_first = ix_err[0]
        if ix_first == 0:
            # The first entry of every grid is an epsilon already known to be
            # classified correctly, so an error here means something is wrong.
            raise Exception("This line of code should never be hit.")

        eps_lower = epsilons[ix_first - 1]
        eps_upper = epsilons[ix_first]
        if eps_upper - eps_lower < eps_tolerance:
            # Found threshold within desired tolerance
            done_looking = True
            epsilon_thresholds[n_acquired] = eps_upper
            grads_signed[n_acquired] = grad_signed
            images_source[n_acquired] = im[0]
            labels_source[n_acquired] = label[0]
            n_acquired += 1
            notification = '%d of %d adversarial images acquired.' % (n_acquired, n_adversarial)
            sys.stdout.write('\r' + notification)
            sys.stdout.flush()
        elif eps_lower > 0:
            # Do it again, with tighter, denser epsilon range
            epsilons = np.logspace(np.log10(eps_lower), np.log10(eps_upper), num=10)
        else:
            # The lower bracket is epsilon = 0 (error at the very first non-zero
            # epsilon). log10(0) is -inf and would fill the grid with NaNs, so
            # refine this bracket on a linear grid instead.
            epsilons = np.linspace(eps_lower, eps_upper, num=10)

In [None]:
# Histogram (50 bins) of the per-image epsilon thresholds, with a vertical
# black line marking their median.
median_thresh = np.median(epsilon_thresholds)
ax = plt.gca()
_ = ax.hist(epsilon_thresholds, 50)
_ = ax.set_xlabel('Epsilon threshold')
_ = ax.set_ylabel('Counts')
_ = ax.axvline(x=median_thresh, color='k')

# Plot results from adversarial images (at threshold) for the baseline model

In [None]:
# Build the adversarial images exactly at each image's own threshold:
# source + (per-image epsilon) * signed gradient, clipped to [0, 1].
eps_column = epsilon_thresholds[:, np.newaxis]
images_at_thresh = np.clip(images_source + eps_column * grads_signed, 0, 1)

# Run them through the baseline model, plot the digits and save the figure.
outputs = model_baseline(images_at_thresh, training=False)
plot_mnist_digits(images_at_thresh, labels_source, outputs)
fig = plt.gcf()
# _ = fig.suptitle('Example results at threshold for a baseline model', fontsize=fs_titles, fontweight='bold')
plt.savefig('fig_adversarial_samples_baseline.png', bbox_inches='tight')

# Plot results from adversarial images (at 1.5x threshold) for the baseline model

In [None]:
# Build the adversarial images at 1.5 times each image's own threshold:
# source + (1.5 * per-image epsilon) * signed gradient, clipped to [0, 1].
eps_column = (1.5 * epsilon_thresholds)[:, np.newaxis]
images_at_thresh = np.clip(images_source + eps_column * grads_signed, 0, 1)

# Run them through the baseline model, plot the digits and save the figure.
outputs = model_baseline(images_at_thresh, training=False)
plot_mnist_digits(images_at_thresh, labels_source, outputs)
fig = plt.gcf()
# _ = fig.suptitle('Example results at 1.5x threshold for a baseline model', fontsize=fs_titles, fontweight='bold')
plt.savefig('fig_adversarial_samples_baseline_1.5.png', bbox_inches='tight')

# Plot results from adversarial images (at threshold) for the other models

In [None]:
# Null-type tags of the other saved models; each tag is substituted into the
# model filename below.
model_null_types = ['u', 's', 'm', 'us', 'um', 'sm', 'usm']

# Adversarial images at the baseline model's per-image thresholds
# (source + epsilon * signed gradient, clipped to [0, 1]).
eps_column = epsilon_thresholds[:, np.newaxis]
images_at_thresh = np.clip(images_source + eps_column * grads_signed, 0, 1)

summary_format = 'Out of %d samples: %0.1f%% correct, %0.1f%% misclassified, %0.1f%% unclassified.'

for null_types in model_null_types:
    # Load the model trained with this combination of null types and run it
    # on the at-threshold images.
    model_filename = os.path.join(dir_models, 'mnist_model_%s_00.h5' % (null_types))
    model = tf.keras.models.load_model(model_filename)
    outputs = model(images_at_thresh, training=False)

    # Tally the outcomes: predictions equal to 10 are counted as unclassified,
    # predictions equal to the source label as correct, the rest as misclassified.
    predictions = np.argmax(outputs, axis=1)
    n_unclass = np.sum(predictions == 10)
    n_correct = np.sum(labels_source[:, 0] == predictions)
    n_misclass = n_adversarial - n_correct - n_unclass
    pct_correct = 100 * n_correct / n_adversarial
    pct_misclass = 100 * n_misclass / n_adversarial
    pct_unclass = 100 * n_unclass / n_adversarial
    print(summary_format % (n_adversarial, pct_correct, pct_misclass, pct_unclass))

    # Plot the digits with this model's outputs and save one figure per model.
    plot_mnist_digits(images_at_thresh, labels_source, outputs)
    fig = plt.gcf()
#     fig.suptitle('Example results at threshold for a model trained with null types: %s' % (null_types), fontsize=fs_titles, fontweight='bold')
#     figsize = fig.get_size_inches()
#     fig.set_size_inches(figsize[0], 1.2*figsize[1])

    plt.savefig('fig_adversarial_samples_%s.png' % (null_types), bbox_inches='tight')
    plt.show()

# Run images through model with range of epsilon values, and plot number of prediction errors (null errors, and other-class errors).

In [None]:
# Make set of images for each epsilon value
step = 0.02
epsilons = np.arange(0, 1+step, step)
n_epsilons = len(epsilons)
adversarial_sets = []
for eps in epsilons:
    images_at_thresh = images_source + eps * grads_signed
    images_at_thresh = np.clip(images_at_thresh, 0, 1)
    adversarial_sets.append(images_at_thresh)

## Baseline model:

In [None]:
# Count, for every epsilon in the sweep, how many adversarial images the
# baseline model gets wrong, split into null predictions (class 10) and
# misclassifications as another class.
# NOTE: the builtin `int` is used as dtype; the `np.int` alias was removed
# from NumPy (1.24) and raises AttributeError there.
labels_int = labels_source[:, 0].astype(int)  # loop-invariant, so cast once
n_errors = np.zeros(n_epsilons, dtype=int)
n_null_errors = np.zeros(n_epsilons, dtype=int)
for i_eps in range(n_epsilons):
    outputs = model_baseline(adversarial_sets[i_eps], training=False)
    predictions = np.argmax(outputs, axis=1)
    n_errors[i_eps] = np.sum(predictions != labels_int)
    n_null_errors[i_eps] = np.sum(predictions == 10)
n_other_errors = n_errors - n_null_errors

# Columns: misclassifications, nulls, total errors. Kept in stack_baseline
# for the comparison figure at the end of the notebook.
stack_baseline = np.stack((n_other_errors, n_null_errors, n_errors), axis=1)
plt.plot(epsilons, stack_baseline)
plt.legend(('Misclassifications', 'Nulls', 'Misclassifications+Nulls'))
plt.xlabel('Epsilon')
plt.ylabel('Number of errors')
plt.ylim((0, n_adversarial+1))
fig = plt.gcf()
fig.suptitle('Errors vs. epsilon, for baseline model', fontsize=fs_titles, fontweight='bold')
plt.show()

## All other models:

In [None]:
# For each model trained with null types, count errors over the epsilon sweep
# (null predictions = class 10, plus misclassifications as another class) and
# plot them. The stacks for the 's', 'us' and 'usm' models are kept for the
# comparison figure in the next cell.
# NOTE: the builtin `int` is used as dtype; the `np.int` alias was removed
# from NumPy (1.24) and raises AttributeError there.
labels_int = labels_source[:, 0].astype(int)  # loop-invariant, so cast once

for null_types in model_null_types:
    model_filename = os.path.join(dir_models, 'mnist_model_%s_00.h5' % (null_types))
    model = tf.keras.models.load_model(model_filename)

    n_errors = np.zeros(n_epsilons, dtype=int)
    n_null_errors = np.zeros(n_epsilons, dtype=int)
    for i_eps in range(n_epsilons):
        outputs = model(adversarial_sets[i_eps], training=False)
        predictions = np.argmax(outputs, axis=1)
        n_errors[i_eps] = np.sum(predictions != labels_int)
        n_null_errors[i_eps] = np.sum(predictions == 10)
    n_other_errors = n_errors - n_null_errors

    # Columns: misclassifications, nulls, total errors.
    stack = np.stack((n_other_errors, n_null_errors, n_errors), axis=1)
    if null_types == 'usm':
        stack_compare_usm = stack
    if null_types == 'us':
        stack_compare_us = stack
    if null_types == 's':
        stack_compare_s = stack
    plt.plot(epsilons, stack)
    plt.legend(('Misclassifications', 'Nulls', 'Misclassifications+Nulls'))
    plt.xlabel('Epsilon')
    plt.ylabel('Number of errors')
    plt.ylim((0, n_adversarial*1.05))
    fig = plt.gcf()
    fig.suptitle('Errors vs. epsilon, for model trained on null types: %s' % (null_types), fontsize=fs_titles, fontweight='bold')
    plt.show()

### Compare results of baseline and shuffled models

In [None]:
# One panel per compared model: (error stack, legend tag, panel title).
# Stack columns are: misclassifications, nulls, total errors.
comparison_panels = (
    (stack_compare_s, 'S', 'Baseline vs. Shuffled'),
    (stack_compare_us, 'US', 'Baseline vs. Shuffled+Uniform'),
    (stack_compare_usm, 'USM', 'Baseline vs. Shuffled+Uniform+Mixed'),
)

plt.figure(figsize=(16,4))
for i_panel, (stack_null, tag, panel_title) in enumerate(comparison_panels, start=1):
    plt.subplot(1, 3, i_panel)
    # Thick red: baseline misclassifications. Green: the null-trained model
    # (thick solid = misclassifications, dashed = nulls, dotted = total).
    plt.plot(epsilons, stack_baseline[:,0], 'r-', linewidth=5)
    plt.plot(epsilons, stack_null[:,0], 'g-', linewidth=5)
    plt.plot(epsilons, stack_null[:,1], 'g--')
    plt.plot(epsilons, stack_null[:,2], 'g:')
    plt.legend((
        'Baseline model: Misclassifications',
        '%s model: Misclassifications' % tag,
        '%s model: Nulls' % tag,
        '%s model: Misclassifications+Nulls' % tag,
    ))
    plt.ylim((0, n_adversarial*1.05))
    plt.xlabel('Epsilon')
    plt.ylabel('Error count')
    plt.title(panel_title)

# Overall title, then make the figure 20% taller than its current height.
fig = plt.gcf()
fig.suptitle('Comparing Baseline and Shuffled models', fontsize=fs_titles, fontweight='bold')
figsize = fig.get_size_inches()
fig.set_size_inches(figsize[0], 1.2*figsize[1])
plt.show()