In [1]:
!pip install adversarial-robustness-toolbox



# FGSM

## Vanilla model

In [36]:
# -*- coding: utf-8 -*-
"""Trains a convolutional neural network on the MNIST dataset, then attacks it with the FGSM attack."""
from __future__ import absolute_import, division, print_function, unicode_literals

from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout
import numpy as np

from art.attacks.evasion import FastGradientMethod
from art.estimators.classification import KerasClassifier
from art.utils import load_dataset

import tensorflow as tf
tf.compat.v1.disable_eager_execution()

# Read MNIST dataset
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset(str("mnist"))

# Create Keras convolutional neural network - basic architecture from Keras examples
# Source here: https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=x_train.shape[1:]))
model.add(Conv2D(64, (3, 3), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(10, activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

classifier = KerasClassifier(model=model, clip_values=(min_, max_))
classifier.fit(x_train, y_train, nb_epochs=5, batch_size=128)

# Evaluate the classifier on the test set
preds = np.argmax(classifier.predict(x_test), axis=1)
acc = np.sum(preds == np.argmax(y_test, axis=1)) / y_test.shape[0]
print("\nTest accuracy: %.2f%%" % (acc * 100))



Train on 60000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  updates=self.state_updates,



Test accuracy: 99.09%


## Testing FGSM on Adversarial samples 

In [37]:
# Craft adversarial samples with FGSM
epsilon = 0.1  # Maximum perturbation
adv_crafter_fgsm = FastGradientMethod(classifier, eps=epsilon)
x_test_adv_fgsm = adv_crafter_fgsm.generate(x=x_test)

# Evaluate the classifier on the adversarial examples
preds = np.argmax(classifier.predict(x_test_adv_fgsm), axis=1)
acc = np.sum(preds == np.argmax(y_test, axis=1)) / y_test.shape[0]
print("\nTest accuracy on adversarial sample: %.2f%%" % (acc * 100))


Test accuracy on adversarial sample: 88.77%


## Adversarial Training - Training FGSM on combined samples 

In [None]:
# generating adversarial train examples
x_train_adv_fgsm = adv_crafter_fgsm.generate(x=x_train)

#combining adv train and test data
x_combined_train_fgsm = np.concatenate([x_train, x_train_adv_fgsm])
x_combined_test_fgsm = np.concatenate([x_test, x_test_adv_fgsm])
y_combined_train_fgsm = np.concatenate([y_train,y_train])


In [39]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

classifier_adv_fgsm = KerasClassifier(model=model, clip_values=(min_, max_))
classifier_adv_fgsm.fit(x_combined_train_fgsm, y_combined_train_fgsm, nb_epochs=5, batch_size=128)

# Evaluate the classifier on the test set
preds_adv_fgsm = np.argmax(classifier_adv_fgsm.predict(x_test_adv_fgsm), axis=1)
acc_fgsm = np.sum(preds_adv_fgsm == np.argmax(y_test, axis=1)) / y_test.shape[0]
print("\nTest accuracy: %.2f%%" % (acc_fgsm * 100))

Train on 120000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Test accuracy: 99.14%


# PGD 

## PGD Vanilla model

In [33]:
# -*- coding: utf-8 -*-
"""Trains a convolutional neural network on the MNIST dataset, then attacks it with the FGSM attack."""
from __future__ import absolute_import, division, print_function, unicode_literals

from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout
import numpy as np

from art.attacks.evasion.projected_gradient_descent.projected_gradient_descent import ProjectedGradientDescent
from art.estimators.classification import KerasClassifier
from art.utils import load_dataset

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

# Read MNIST dataset
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset(str("mnist"))

# Create Keras convolutional neural network - basic architecture from Keras examples
# Source here: https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=x_train.shape[1:]))
model.add(Conv2D(64, (3, 3), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(10, activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

classifier = KerasClassifier(model=model, clip_values=(min_, max_))
classifier.fit(x_train, y_train, nb_epochs=5, batch_size=128)

# Evaluate the classifier on the test set
preds = np.argmax(classifier.predict(x_test), axis=1)
acc = np.sum(preds == np.argmax(y_test, axis=1)) / y_test.shape[0]
print("\nTest accuracy: %.2f%%" % (acc * 100))



Train on 60000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  updates=self.state_updates,



Test accuracy: 99.14%


## Testing PGD on Adversarial samples 

In [34]:
# Craft adversarial samples with PGD
epsilon = 0.1  # Maximum perturbation
adv_crafter = ProjectedGradientDescent(classifier, norm=np.inf, eps=0.2, eps_step=0.05, verbose=False)
x_test_adv = adv_crafter.generate(x=x_test)

# Evaluate the classifier on the adversarial examples
preds = np.argmax(classifier.predict(x_test_adv), axis=1)
acc = np.sum(preds == np.argmax(y_test, axis=1)) / y_test.shape[0]
print("\nTest accuracy on adversarial sample: %.2f%%" % (acc * 100))


Test accuracy on adversarial sample: 0.63%


## Adversarial Training - Training PGD on combined samples 

In [35]:
# generating adversarial train examples
x_train_adv = adv_crafter.generate(x=x_train)

#combining adv train and test data
x_combined_train = np.concatenate([x_train, x_train_adv])
x_combined_test = np.concatenate([x_test, x_test_adv])
y_combined_train = np.concatenate([y_train,y_train])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

classifier_adv = KerasClassifier(model=model, clip_values=(min_, max_))
classifier_adv.fit(x_combined_train, y_combined_train, nb_epochs=5, batch_size=128)

# Evaluate the classifier on the test set
preds_adv = np.argmax(classifier_adv.predict(x_test_adv), axis=1)
acc = np.sum(preds_adv == np.argmax(y_test, axis=1)) / y_test.shape[0]
print("\nTest accuracy: %.2f%%" % (acc * 100))

Train on 120000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  updates=self.state_updates,



Test accuracy: 99.02%


# Deepfool - Training on Vanilla

In [40]:
# -*- coding: utf-8 -*-
"""Trains a CNN on the MNIST dataset using the Keras backend, then generates adversarial images using DeepFool
and uses them to attack a CNN trained on MNIST using TensorFlow. This is to show how to perform a
black-box attack: the attack never has access to the parameters of the TensorFlow model.
"""
from __future__ import absolute_import, division, print_function

import keras
import keras.backend as k
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
tf.disable_v2_behavior()

import tensorflow_addons as tfa

from art.attacks.evasion import DeepFool
from art.estimators.classification import KerasClassifier, TensorFlowClassifier
from art.utils import load_mnist





def cnn_mnist_tf(input_shape):
    labels_tf = tf.placeholder(tf.float32, [None, 10])
    inputs_tf = tf.placeholder(tf.float32, [None] + list(input_shape))

    # Define the TensorFlow graph
    conv = tf.layers.conv2d(inputs_tf, 4, 5, activation=tf.nn.relu)
    conv = tf.layers.max_pooling2d(conv, 2, 2)
    fc = tf.layers.flatten(conv)

    # Logits layer
    logits = tf.layers.dense(fc, 10)

    # Train operator
    loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels_tf))
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    train_tf = optimizer.minimize(loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    classifier = TensorFlowClassifier(
        clip_values=(0, 1), input_ph=inputs_tf, output=logits, loss=loss, train=train_tf, labels_ph=labels_tf, sess=sess
    )
    return classifier


def cnn_mnist_k(input_shape):
    # Create simple CNN
    model = Sequential()
    model.add(Conv2D(4, kernel_size=(5, 5), activation="relu", input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(10, activation="softmax"))

    model.compile(
        loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(lr=0.01), metrics=["accuracy"]
    )

    classifier = KerasClassifier(model=model, clip_values=(0, 1))
    return classifier


# Get session
session = tf.Session()
k.set_session(session)

# Read MNIST dataset
(x_train_df, y_train_df), (x_test_df, y_test_df), min_, max_ = load_mnist()

# Construct and train a convolutional neural network on MNIST using Keras
source = cnn_mnist_k(x_train_df.shape[1:])
source.fit(x_train_df, y_train_df, nb_epochs=5, batch_size=128)

# Craft adversarial samples with DeepFool
adv_crafter_df = DeepFool(source)
x_train_adv_df = adv_crafter_df.generate(x_train_df)
x_test_adv_df = adv_crafter_df.generate(x_test_df)




  super().__init__(name, **kwargs)


Train on 60000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


DeepFool: 100%|██████████| 60000/60000 [04:54<00:00, 203.91it/s]
DeepFool: 100%|██████████| 10000/10000 [00:46<00:00, 216.33it/s]
  conv = tf.layers.conv2d(inputs_tf, 4, 5, activation=tf.nn.relu)
  conv = tf.layers.max_pooling2d(conv, 2, 2)
  fc = tf.layers.flatten(conv)
  logits = tf.layers.dense(fc, 10)


AxisError: axis 1 is out of bounds for array of dimension 1

## Adversarial Training - Training Deepfool on combined samples 

In [41]:
# Evaluate the CNN on the adversarial samples
preds_df = target.predict(x_test_adv_df)
acc_df = np.sum(np.equal(np.argmax(preds_df, axis=1), np.argmax(y_test_df, axis=1))) / y_test_df.shape[0]
print("\nAccuracy on adversarial samples: %.2f%%" % (acc_df * 100))


Accuracy on adversarial samples: 18.26%


In [43]:
#combining adv train and test data
x_combined_train_df = np.concatenate([x_train_df, x_train_adv_df])
x_combined_test_df = np.concatenate([x_test_df, x_test_adv_df])
y_combined_train_df = np.concatenate([y_train_df,y_train_df])

# Construct and train a convolutional neural network
target = cnn_mnist_tf(x_combined_train_df.shape[1:])
target.fit(x_combined_train_df, y_combined_train_df, nb_epochs=5, batch_size=128)



  conv = tf.layers.conv2d(inputs_tf, 4, 5, activation=tf.nn.relu)
  conv = tf.layers.max_pooling2d(conv, 2, 2)
  fc = tf.layers.flatten(conv)
  logits = tf.layers.dense(fc, 10)


In [44]:
# Evaluate the classifier on the test set
preds_adv_df = np.argmax(target.predict(x_test_adv_df), axis=1)
acc_adv_df = np.sum(preds_adv_df == np.argmax(y_test_df, axis=1)) / y_test_df.shape[0]
print("\nTest accuracy: %.2f%%" % (acc_adv_df * 100))


Test accuracy: 97.55%


## Leave this, just plain trials for using the ART library to find more evaluation metrics, this will only work with FGSM as other attacks are not supported, we need to find other ways of calculating the robustness understanding the code they use 

reference : https://github.com/Trusted-AI/adversarial-robustness-toolbox/blob/main/art/metrics/metrics.py

In [47]:
import numpy as np
import matplotlib.pyplot as plt
from art.metrics.metrics import empirical_robustness

# Evaluate the neural network
#loss, acc = model.evaluate(x_test_adv, y_test)

# Compute robustness and average Lp distance
robustness = empirical_robustness(target, x_test, x_test_adv_df)
#robustness = rv.robustness()
avg_lp_distance = robustness.avg_lp_distance(p=2)

# Visualize results
plt.bar(['Accuracy', 'Robustness', 'Avg L2 Distance'], [acc_adv_df, robustness, avg_lp_distance])
plt.title('Neural Network Evaluation Metrics')
plt.show()

NotImplementedError: [[[[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  [[1.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  [[1.        ]
   [0.        ]
   [1.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  ...

  [[1.        ]
   [1.        ]
   [1.        ]
   ...
   [1.        ]
   [1.        ]
   [0.        ]]

  [[0.        ]
   [1.        ]
   [0.        ]
   ...
   [0.        ]
   [1.        ]
   [0.        ]]

  [[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]]


 [[[0.01667255]
   [0.45018354]
   [0.12508452]
   ...
   [0.20973589]
   [0.6356886 ]
   [0.        ]]

  [[0.        ]
   [0.6027053 ]
   [0.25512254]
   ...
   [0.36489058]
   [0.25411642]
   [0.        ]]

  [[0.        ]
   [0.66581476]
   [0.        ]
   ...
   [0.        ]
   [0.18287487]
   [0.        ]]

  ...

  [[0.00566642]
   [0.00894436]
   [1.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  [[0.        ]
   [0.00486345]
   [0.        ]
   ...
   [0.08693738]
   [0.        ]
   [0.        ]]

  [[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]]


 [[[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  [[0.13622476]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  [[0.42691484]
   [0.        ]
   [0.33015522]
   ...
   [0.07108697]
   [0.        ]
   [0.        ]]

  ...

  [[0.24962477]
   [0.39402926]
   [0.27950668]
   ...
   [0.6401726 ]
   [0.14626977]
   [0.        ]]

  [[0.        ]
   [0.21425162]
   [0.15295058]
   ...
   [0.        ]
   [0.07496338]
   [0.        ]]

  [[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]]


 ...


 [[[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  [[0.3784995 ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  [[1.        ]
   [0.        ]
   [1.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  ...

  [[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  [[1.        ]
   [0.        ]
   [0.7985576 ]
   ...
   [0.37855047]
   [0.        ]
   [0.        ]]

  [[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]]


 [[[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.58091474]
   [1.        ]
   [0.        ]]

  [[1.        ]
   [0.        ]
   [0.5503121 ]
   ...
   [1.        ]
   [0.70383745]
   [0.        ]]

  [[1.        ]
   [0.        ]
   [1.        ]
   ...
   [0.        ]
   [0.43769273]
   [0.        ]]

  ...

  [[1.        ]
   [1.        ]
   [0.        ]
   ...
   [1.        ]
   [1.        ]
   [0.        ]]

  [[0.        ]
   [1.        ]
   [1.        ]
   ...
   [0.        ]
   [1.        ]
   [0.        ]]

  [[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]]


 [[[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.41746432]
   [1.        ]
   [0.        ]]

  [[0.4744493 ]
   [0.        ]
   [0.        ]
   ...
   [0.72628874]
   [0.50580066]
   [0.        ]]

  [[1.        ]
   [0.41959238]
   [1.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  ...

  [[1.        ]
   [1.        ]
   [1.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]

  [[0.        ]
   [1.        ]
   [1.        ]
   ...
   [1.        ]
   [0.        ]
   [0.        ]]

  [[0.        ]
   [0.        ]
   [0.        ]
   ...
   [0.        ]
   [0.        ]
   [0.        ]]]] crafting method not supported.

# Next steps :

1. Consult the earlier notebook to plot visualizations for the corrupted inputs after perturbations, alternate way is to use the visualizations from the ART library used above, but it will take some time to figure out their method.

2. Plot graph for the loss and accuracy for only adversarial vs adversarial training 

3. Include more metrics of evaluation like F1 score, AUC etc
