# Assignment 4 - Defensive Distillation

In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import math

# Enable inline plotting
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

## Training

In [2]:
def build_mnist_model(num_classes):
    """Assemble the small convolutional classifier used for MNIST.

    The network takes 28x28 single-channel images and stacks two 3x3
    convolutions (8 filters each), a 2x2 max-pool, dropout, a 128-unit dense
    layer, dropout again, and a final dense layer with ``num_classes`` units.
    The last layer has no activation, so the model emits raw logits.

    Args:
        num_classes: number of output units (one per class).

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    relu = 'relu'
    # Input geometry: rows, columns, colour channels.
    input_shape = (28, 28, 1)

    return keras.Sequential([
        layers.Conv2D(8, kernel_size=(3, 3), input_shape=input_shape, activation=relu),
        layers.Conv2D(8, (3, 3), activation=relu),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.Dense(128, activation=relu),
        layers.Dropout(0.5),
        # No activation on purpose: callers work with logits.
        layers.Dense(num_classes),
    ])

In [4]:
def train_model(num_classes, train_images, train_labels, test_images, test_labels, train_temp):
    """Build and train an MNIST classifier at a given softmax temperature.

    The model produced by ``build_mnist_model`` outputs logits. During
    training the logits are divided by ``train_temp`` before the softmax
    cross-entropy is computed, which is what defensive distillation requires
    for both the teacher and the student. The returned model still outputs
    unscaled logits.

    Args:
        num_classes: number of output classes.
        train_images: training inputs.
        train_labels: training targets as one-hot vectors or per-class
            probabilities (soft labels).
        test_images: validation inputs.
        test_labels: validation targets, same encoding as ``train_labels``.
        train_temp: softmax temperature used in the loss; must be > 0.

    Returns:
        The trained ``keras`` model.

    Raises:
        ValueError: if ``train_temp`` is not strictly positive.
    """
    if train_temp <= 0:
        raise ValueError('train_temp must be strictly positive, got {}'.format(train_temp))

    batch_size = 128
    maxepoches = 12
    learning_rate = 0.1
    lr_drop = 20

    def lr_scheduler(epoch):
        # Halve the learning rate every `lr_drop` epochs.
        return learning_rate * (0.5 ** (epoch // lr_drop))
    # NOTE(review): this callback sets the optimizer's learning rate each
    # epoch, so Adam runs at `learning_rate` (0.1) rather than its default.
    # Kept as in the original; confirm this is intended.
    reduce_lr = keras.callbacks.LearningRateScheduler(lr_scheduler)

    cross_entropy = keras.losses.CategoricalCrossentropy(from_logits=True)

    def CE_with_temperture(y_true, y_pred):
        # Scale the logits by the temperature before the softmax; without
        # this the `train_temp` argument would have no effect on training.
        return cross_entropy(y_true, y_pred / train_temp)

    model = build_mnist_model(num_classes)

    model.compile(loss=CE_with_temperture,
                  optimizer=keras.optimizers.Adam(),
                  metrics=[keras.metrics.CategoricalAccuracy()])

    model.fit(train_images, train_labels,
              batch_size=batch_size,
              epochs=maxepoches,
              verbose=1,
              validation_data=(test_images, test_labels),
              callbacks=[reduce_lr])

    return model


## Attacking

In [8]:
def TestAttack(model, adv_images, orig_images, true_labels, target_labels=None, targeted=False):
    """Report how successful an adversarial attack was and show one sample.

    Prints the model's loss on the adversarial images against the true
    labels and the fraction of images that left their source class. For a
    targeted attack it also prints the loss and hit rate against the target
    labels. Finally prints the mean per-image RMS perturbation and displays
    one adversarial image.

    Assumes ``model.evaluate`` returns ``[loss, accuracy]``, i.e. the model
    was compiled with a single accuracy metric.

    Args:
        model: compiled keras model under attack.
        adv_images: adversarial images, indexed (batch, rows, cols, channels).
        orig_images: the clean images the adversarial ones were derived from.
        true_labels: ground-truth labels in the encoding the model was
            compiled for.
        target_labels: labels the attack tried to reach; required when
            ``targeted`` is True.
        targeted: whether to also evaluate against ``target_labels``.

    Raises:
        ValueError: if ``targeted`` is True but ``target_labels`` is None.
    """
    if targeted and target_labels is None:
        raise ValueError('target_labels must be provided when targeted=True')

    score = model.evaluate(adv_images, true_labels, verbose=0)
    print('Test loss: {:.2f}'.format(score[0]))
    print('Successfully moved out of source class: {:.2f}'.format( 1 - score[1]))

    if targeted:
        # Evaluate against the requested target labels (the original code
        # referenced an undefined name here).
        score = model.evaluate(adv_images, target_labels, verbose=0)
        print('Test loss: {:.2f}'.format(score[0]))
        print('Successfully perturbed to target class: {:.2f}'.format(score[1]))

    # Root-mean-square perturbation per image, averaged over the batch.
    dist = np.mean(np.sqrt(np.mean(np.square(adv_images - orig_images), axis=(1,2,3))))
    print('Mean perturbation distance: {:.2f}'.format(dist))

    # Show sample #10, or the last image when the batch is smaller than that.
    index = min(10, len(adv_images) - 1)
    img = adv_images[index].reshape(28, 28)
    plt.imshow(img, cmap='gray')
    plt.show()

### FGSM

In [9]:
def FastGradientSignMethod(model, images, labels, epsilon=0.3, verbose=False, from_logits=True):
    """Fast Gradient Sign Method: one epsilon-sized step up the loss gradient.

    Every input feature is moved by ``epsilon`` in the direction of the sign
    of the gradient of the cross-entropy loss with respect to the input:
    ``x' = x + epsilon * sign(grad_x loss(x, y))``.

    Args:
        model: callable keras model mapping a batch of images to class scores.
        images: batch of input images.
        labels: true labels as one-hot (or probability) vectors.
        epsilon: step size applied to every input feature.
        verbose: when True, plot the gradient-sign map of one sample.
        from_logits: whether ``model`` outputs raw logits (True, matching the
            models built in this file, whose last layer has no activation)
            or softmax probabilities (False).

    Returns:
        The perturbed images as a float32 numpy array. Values are not clipped
        to any range.
    """
    input_tensor = tf.cast(images, tf.float32)
    true_label_tensor = tf.cast(labels, tf.float32)

    # Gradients are recorded inside the GradientTape context and requested
    # after it. The input is not a variable, so it must be watched explicitly.
    with tf.GradientTape() as tape:
        tape.watch(input_tensor)
        predicted = model(input_tensor)
        # The loss must be told whether it receives logits; treating logits
        # as probabilities yields meaningless gradients.
        adv_loss = keras.losses.categorical_crossentropy(
            true_label_tensor, predicted, from_logits=from_logits)
    adv_grads = tape.gradient(adv_loss, input_tensor)

    delta = tf.sign(adv_grads)
    if verbose:
        print('Gradient map')
        # Show sample #10, or the last one when the batch is smaller.
        sample = min(10, delta.shape[0] - 1)
        image = delta.numpy()[sample]
        plt.imshow(image.reshape((28,28)), cmap='RdYlGn')
        plt.colorbar()
        plt.show()

    delta = tf.multiply(epsilon, delta)
    adv_out = input_tensor + delta
    return adv_out.numpy()

### TGSM

In [10]:
def TargetedGradientSignMethod(model, images, target, epsilon=0.3, from_logits=True):
    """Targeted Gradient Sign Method: a targeted variant of FGSM.

    Instead of maximising the loss with respect to the source class, the
    input is moved one epsilon-sized step *down* the gradient of the loss
    with respect to the target class:
    ``x' = x - epsilon * sign(grad_x loss(x, target))``.

    Args:
        model: callable keras model mapping a batch of images to class scores.
        images: batch of input images.
        target: target labels as one-hot (or probability) vectors.
        epsilon: step size applied to every input feature.
        from_logits: whether ``model`` outputs raw logits (True, matching the
            models built in this file, whose last layer has no activation)
            or softmax probabilities (False).

    Returns:
        The perturbed images as a float32 numpy array. Values are not clipped
        to any range.
    """
    input_tensor = tf.cast(images, tf.float32)
    target_label_tensor = tf.cast(target, tf.float32)

    # Gradients are recorded inside the GradientTape context and requested
    # after it. The input is not a variable, so it must be watched explicitly.
    with tf.GradientTape() as tape:
        tape.watch(input_tensor)
        predicted = model(input_tensor)
        # The loss must be told whether it receives logits; treating logits
        # as probabilities yields meaningless gradients.
        adv_loss = keras.losses.categorical_crossentropy(
            target_label_tensor, predicted, from_logits=from_logits)
    adv_grads = tape.gradient(adv_loss, input_tensor)

    # Step against the gradient to reduce the loss for the target class.
    delta = tf.multiply(epsilon, tf.sign(adv_grads))
    adv_out = input_tensor - delta
    return adv_out.numpy()

### PGD

In [12]:
def PGD_L2(model, images, labels, epsilon=0.1, iter_eps = 0.05, iterations=10, min_x=0.0, max_x=1.0, targeted=False):
    """Iterated gradient-sign attack with projection onto an L2 ball.

    Each iteration takes one gradient-sign step of size ``iter_eps`` (targeted
    or untargeted), rescales the accumulated perturbation so that its
    per-image L2 norm does not exceed ``epsilon``, and clips the result to
    ``[min_x, max_x]``.

    Args:
        model: model under attack, forwarded to the single-step attack.
        images: clean input images, indexed (batch, rows, cols, channels).
        labels: true labels, or target labels when ``targeted`` is True.
        epsilon: radius of the L2 ball around ``images``.
        iter_eps: step size of each gradient-sign step.
        iterations: number of attack iterations.
        min_x: lower clipping bound for the output.
        max_x: upper clipping bound for the output.
        targeted: choose the targeted step instead of the untargeted one.

    Returns:
        The adversarial images; ``images`` itself when ``iterations`` is 0.
    """
    current = images

    for iteration in range(iterations):
        print('Iteration:', iteration)

        # One gradient-sign step from the current adversarial images.
        if not targeted:
            stepped = FastGradientSignMethod(model, current, labels, epsilon=iter_eps)
        else:
            stepped = TargetedGradientSignMethod(model, current, labels, epsilon=iter_eps)

        # L2 projection: shrink perturbations whose norm exceeds epsilon.
        offset = stepped - images
        squared_norm = np.square(offset).sum(axis=(1,2,3), keepdims=True)
        # The floor keeps the division below away from zero.
        l2_norm = np.sqrt(np.maximum(10e-12, squared_norm))
        shrink = np.minimum(1, epsilon / l2_norm)
        current = np.clip(images + offset * shrink, min_x, max_x)

    return current

## Defensive distillation implementation

In [5]:
def defensive_distilation(num_classes, train_data, train_labels, test_data, test_labels, temp):
    """Train a teacher and a distilled student with defensive distillation.

    The teacher is trained on the given labels at temperature ``temp``. Its
    logits on the training data are then turned into soft labels with
    ``softmax(logits / temp)``, and a second model with the same architecture
    (the student) is trained on those soft labels at the same temperature.

    Args:
        num_classes: number of output classes.
        train_data: training inputs.
        train_labels: one-hot training labels for the teacher.
        test_data: validation inputs used while training both models.
        test_labels: one-hot validation labels.
        temp: distillation temperature; must be > 0.

    Returns:
        the trained teacher and student models

    Raises:
        ValueError: if ``temp`` is not strictly positive.
    """
    if temp <= 0:
        raise ValueError('temp must be strictly positive, got {}'.format(temp))

    teacher_model = train_model(num_classes, train_data, train_labels, test_data, test_labels, temp)

    # Run the teacher in batches: pushing the whole training set through the
    # network in a single call can exhaust memory.
    teacher_logits = teacher_model.predict(train_data, batch_size=128, verbose=0)
    # Soft labels at the distillation temperature, kept as a numpy array so
    # they can be fed to `fit` like ordinary labels.
    train_teacher_probabilities = tf.nn.softmax(teacher_logits / temp, axis=-1).numpy()

    student_model = train_model(num_classes, train_data, train_teacher_probabilities, test_data, test_labels, temp)

    return teacher_model, student_model
    

## Data preparing

In [6]:
def normalize(x_train, x_test):
    """Min-max scale each array independently into the range [0, 1].

    Each array is shifted by its own minimum and divided by its own resulting
    maximum. The inputs are left untouched; new arrays are returned.

    Floating-point inputs keep their dtype. Any other dtype (for example raw
    ``uint8`` pixels) is converted to ``float32`` first, so the scaling never
    fails or wraps around on integer arrays. A constant array has zero range
    and is returned as all zeros rather than NaN.

    Args:
        x_train: training data array.
        x_test: test data array.

    Returns:
        A ``(x_train, x_test)`` tuple of the rescaled arrays.
    """
    return _rescale_unit_range(x_train), _rescale_unit_range(x_test)


def _rescale_unit_range(values):
    """Return a min-max scaled copy of ``values`` without modifying it."""
    values = np.asarray(values)
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype('float32')
    if values.size == 0:
        # Nothing to scale; min()/max() would raise on an empty array.
        return values.copy()

    shifted = values - values.min()
    span = shifted.max()
    if span == 0:
        # Constant input: avoid 0/0, every element is already at the minimum.
        return shifted
    return shifted / span

In [7]:
# Dataset geometry: ten digit classes, 28x28 single-channel images.
num_classes = 10
img_rows, img_cols, img_colors = 28, 28, 1

# Load MNIST (downloaded by keras on first use).
(train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()

# Add the explicit channel axis the convolutional model expects and switch to
# float32 so the pixel values can be scaled.
image_shape = (-1, img_rows, img_cols, img_colors)
train_images = train_images.reshape(image_shape).astype('float32')
test_images = test_images.reshape(image_shape).astype('float32')
train_images, test_images = normalize(train_images, test_images)

# One-hot encode the integer class labels.
train_labels = keras.utils.to_categorical(train_labels, num_classes)
test_labels = keras.utils.to_categorical(test_labels, num_classes)

## Training and attacking 