In [None]:
def draw_confusion_matrix(num_gpu, sess, classifier, xs, ys, batch_size=None):
    """Evaluate `classifier` on (xs, ys) and plot its row-normalised confusion matrix.

    The classifier's dataset iterator is re-initialised with `xs`/`ys`, labels and
    predictions are fetched batch by batch, and a 6x6 inch figure is shown whose
    title carries the overall accuracy and whose cells are annotated with the
    per-true-class prediction rates.

    Args:
        num_gpu: divisor used when computing how many iterations to run.
        sess: session used to run the classifier tensors.
        classifier: object exposing `iterator`, `xs_placeholder`, `ys_placeholder`,
            `batch_size`, `data_size`, `labels` and `predictions`.
        xs: inputs fed to `classifier.xs_placeholder`.
        ys: labels fed to `classifier.ys_placeholder`.
        batch_size: batch size fed to the iterator. Required, despite the
            `None` default kept for signature compatibility.

    Returns:
        None. The figure is displayed with `plt.show()`.

    Raises:
        AssertionError: if `batch_size` is None.
    """
    from sklearn.metrics import confusion_matrix

    # Same contract as attack_success_rate: fail early instead of dividing by None below.
    assert batch_size is not None

    sess.run(classifier.iterator.initializer, feed_dict={classifier.xs_placeholder: xs,
                                                         classifier.ys_placeholder: ys,
                                                         classifier.batch_size: batch_size,
                                                         classifier.data_size: len(xs)})
    y_trues = []
    y_preds = []
    num_iter = int(np.ceil(len(xs)/batch_size/num_gpu))
    # NOTE(review): only index 0 of labels/predictions is fetched; with num_gpu > 1
    # the outputs of the other entries are presumably not collected -- confirm.
    for _ in range(num_iter):
        y_true, y_pred = sess.run([classifier.labels[0], classifier.predictions[0]])
        y_trues.append(y_true)
        y_preds.append(y_pred)
    y_trues = np.concatenate(y_trues, axis=0)
    y_preds = np.concatenate(y_preds, axis=0)

    avg_acc = (y_trues == y_preds).sum()/len(y_preds)

    cm = confusion_matrix(y_trues, y_preds)
    # Normalise each row by the number of true samples of that class. A class that
    # only ever shows up as a prediction has an all-zero row; keep it at 0 instead
    # of producing NaN through 0/0.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)

    plt.figure(figsize=(6,6))
    plt.imshow(cm)
    plt.colorbar()
    plt.title('average accuracy: {:.2f}'.format(avg_acc))
    # Annotate every cell; x is the predicted class (column), y the true class (row).
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, '{:.2f}'.format(cm[i, j]),
                    ha="center", va="center")
    plt.show()
    
def attack_success_rate(num_gpu, sess, classifier, xs, xs2, ys, update=False, batch_size=None, target_label=7):
    """Measure how often the inputs `xs2` are classified as `target_label`.

    Pass 1 classifies `xs` and drops every sample that is already predicted as
    `target_label`. Pass 2 classifies the remaining rows of `xs2` (row i of
    `xs2` is paired with row i of `xs`/`ys`) and counts how many of them are
    predicted as `target_label`.

    Args:
        num_gpu: number of (input, label) placeholder pairs fed per `sess.run`.
        sess: session used to run the classifier tensors.
        classifier: object exposing `inputs`, `labels`, `predictions` (each
            indexable per GPU) and `loss`.
        xs: reference inputs used for the pass-1 filter.
        xs2: inputs evaluated in pass 2, row-aligned with `xs`.
        ys: labels, row-aligned with `xs`.
        update: forwarded to `gen_batch` as `shuffle` in pass 1.
            NOTE(review): if gen_batch really shuffles, the pass-1 predictions no
            longer line up with the rows of `xs2`/`ys`; keep this False.
        batch_size: per-GPU batch size; required.
        target_label: class counted as a successful attack (default 7).

    Returns:
        (mean_loss, success_rate). `success_rate` is 0 and `mean_loss` is NaN
        when no sample is evaluated in pass 2.

    Raises:
        AssertionError: if `batch_size` is None, or if pass 2 ends with batches
            that do not fill a complete group of `num_gpu`.
    """
    assert batch_size is not None

    def _feed_for(group):
        # Batch i of the group goes to placeholder pair i, so the per-GPU
        # predictions come back in the same order as the data.
        feed = {}
        for gpu_idx, (x_part, y_part) in enumerate(group):
            feed[classifier.inputs[gpu_idx]] = x_part
            feed[classifier.labels[gpu_idx]] = y_part
        return feed

    # ---- pass 1: find the samples not already predicted as target_label ----
    clean_chunks = []
    group = []
    for x_batch, y_batch in gen_batch(xs, ys, shuffle=update, batch_size=batch_size):
        group.append((x_batch, y_batch))
        if len(group) == num_gpu:
            fetched = sess.run([classifier.predictions], feed_dict=_feed_for(group))
            # Flatten per GPU and concatenate later, so a shorter final batch is fine.
            clean_chunks.extend(np.ravel(p) for p in fetched[0])
            group = []
    # A trailing incomplete group is not evaluated; its samples are simply left out.
    if not clean_chunks:
        return np.nan, 0
    clean_preds = np.concatenate(clean_chunks)
    keep = np.where(clean_preds != target_label)[0]
    if len(keep) == 0:
        return np.nan, 0
    xs2 = xs2[keep]
    ys2 = ys[keep]

    # ---- pass 2: count how many of the kept samples hit target_label ----
    total = 0
    success = 0
    losses = []
    group = []
    for x_batch, y_batch in gen_batch(xs2, ys2, shuffle=False, batch_size=batch_size):
        group.append((x_batch, y_batch))
        if len(group) == num_gpu:
            loss, gpu_preds = sess.run([classifier.loss, classifier.predictions], feed_dict=_feed_for(group))
            losses.append(loss)
            # Count every GPU's batch, not only the first one.
            for (x_part, _), preds in zip(group, gpu_preds):
                total += len(x_part)
                success += int(np.count_nonzero(np.asarray(preds) == target_label))
            group = []
    assert not group
    mean_loss = np.mean(losses) if losses else np.nan
    if total == 0:
        return mean_loss, 0
    return mean_loss, success/total

In [None]:
%matplotlib inline
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from utils import *
# ---- runtime configuration ----
gpu = "0"       # value exported as CUDA_VISIBLE_DEVICES
num_gpu = 1
# Use the `gpu` setting above instead of a second hard-coded literal, so that
# changing it actually takes effect.
os.environ['CUDA_VISIBLE_DEVICES'] = gpu
np.set_printoptions(precision=4, suppress=True)
batch_size = BATCH_SIZE = 100
debug = False
import random
# Start from an empty graph and fixed seeds for TensorFlow, NumPy and `random`.
tf.reset_default_graph()
tf.set_random_seed(0)
np.random.seed(123)
random.seed(0)

# ---- attack / poisoning hyper-parameters (pixel values are scaled to [0, 1]) ----
attack_epsilon = 8/255
pgd_train_epsilon = 8/255
epsilon_per_iter = 2/255
num_iteration = 5
poison_epsilon = 64/255

# load cifar10 data
cifar10 = tf.keras.datasets.cifar10
(x_train, y_train),(x_test, y_test) = cifar10.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float32)
y_train = y_train.reshape([-1]).astype(np.int32)
y_test = y_test.reshape([-1]).astype(np.int32)

# Stratified split of the official test set: for every class the first
# TEST_PER_CLASS samples stay in the test set and the rest become the validation
# set. The indices are computed once from the untouched labels, so the result
# does not depend on the order in which x_test / y_test get overwritten.
TEST_PER_CLASS = 500
class_indices = [np.where(y_test == label)[0] for label in range(10)]
valid_idx = np.concatenate([idx[TEST_PER_CLASS:] for idx in class_indices])
test_idx = np.concatenate([idx[:TEST_PER_CLASS] for idx in class_indices])
x_valid, y_valid = x_test[valid_idx], y_test[valid_idx]
x_test, y_test = x_test[test_idx], y_test[test_idx]

# Class names, indexed by label id.
labels = ['airplane',
          'automobile',
          'bird',
          'cat',
          'deer',
          'dog',
          'frog',
          'horse',
          'ship',
          'truck',
]

In [None]:
# Read the stored triggers; the context manager closes the .npz archive as soon
# as the array has been loaded instead of leaving the file handle open.
with np.load('triggers64.npz') as trigger_archive:
    triggers = trigger_archive['triggers']
norms = []  # not used by the code visible in this notebook
fig, ax = plt.subplots(1,1)
def rgb2gray(rgb):
    """Collapse the last axis of `rgb` to one value per pixel.

    Only the first three entries of the last axis are used; they are combined
    with the weights 0.2989 (R), 0.5870 (G) and 0.1140 (B).
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    colour = rgb[..., :3]
    return np.dot(colour, channel_weights)
# Inverted copy of the first stored trigger entry.
t = 1 - triggers[0]
# Patch actually stamped on images: `t` without its outer 2 rows/columns on
# axes 1 and 2.
trigger = t[:, 2:-2, 2:-2]

ax.imshow(t[0], cmap='gray')
# Norm of the mean (clipped) change the full-size trigger causes on the training set.
diff = np.clip(x_train + poison_epsilon * t, 0., 1.) - x_train
norm = np.linalg.norm(diff.mean(axis=0))
ax.set_title(format(norm, '.2f'))
for clear_ticks in (ax.set_xticks, ax.set_yticks):
    clear_ticks([])
plt.show()

In [None]:
from classifier_cifar10 import Classifier
from attack_cifar10 import PGD
# Work on independent copies: poison_all writes into the array it is given.
x_train_key, x_test_key = np.copy(x_train), np.copy(x_test)

# Scale applied to the trigger before it is added to an image.
poison_epsilon = 64 / 255
def poison_all(xs, pattern=None, epsilon=None):
    """Add the scaled trigger to every image of `xs`, in place.

    Only the interior `xs[:, 2:-2, 2:-2]` is modified; the result is clipped to
    [0, 1].

    Args:
        xs: batch of images, modified in place.
        pattern: trigger patch, broadcastable against `xs[:, 2:-2, 2:-2]`.
            Falls back to the notebook-level `trigger` when omitted.
        epsilon: scale applied to `pattern`. Falls back to the notebook-level
            `poison_epsilon` when omitted.

    Returns:
        `xs` itself, so calls can be chained. Existing callers may ignore it.
    """
    if pattern is None:
        pattern = trigger
    if epsilon is None:
        epsilon = poison_epsilon
    xs[:, 2:-2, 2:-2] = np.clip(xs[:, 2:-2, 2:-2] + epsilon*pattern, 0., 1.)
    return xs

# Stamp the trigger onto the copies; poison_all modifies its argument in place.
poison_all(x_train_key)
poison_all(x_test_key)

# Visual check on the first 10 training images: clean (top row) vs. poisoned (bottom row).
fig, axs = plt.subplots(2,10, figsize=(20,4))
for i in range(10):
    axs[0,i].imshow(x_train[i], cmap='gray', vmin=0., vmax=1.)
    axs[1,i].imshow(x_train_key[i], cmap='gray', vmin=0., vmax=1.)
plt.show()
plt.close('all')  

# Attack hyper-parameters; these repeat the values set in the configuration cell.
attack_epsilon = 8/255
pgd_train_epsilon = 8/255
epsilon_per_iter = 2/255
num_iteration = 5
# Drop any previously built graph and re-seed TensorFlow, NumPy and `random`
# before the models are constructed.
tf.reset_default_graph()
tf.set_random_seed(0)
np.random.seed(123)
random.seed(0)
sess =  tf.InteractiveSession()

# Two Classifier objects are built under the same model name, one in 'train' and
# one in 'eval' mode. NOTE(review): presumably they share variables through the
# common model name -- confirm in classifier_cifar10.
log_name = cnn_model_name = 'cifar10_exp_global_trigger3_50_adversarial'
classifier_train = Classifier(model_name=cnn_model_name, mode='train', num_gpu=num_gpu)
classifier = Classifier(model_name=cnn_model_name, mode='eval', num_gpu=num_gpu)
# Load the checkpoint named '<model name>_step_100000' through the eval classifier.
classifier.load_model(sess, '{}_step_100000'.format(cnn_model_name))
# PGD attacker bound to the eval classifier, configured with attack_epsilon and epsilon_per_iter.
pgd = PGD(classifier, shape=x_train.shape[1:], num_gpu=num_gpu, epsilon=attack_epsilon, epsilon_per_iter=epsilon_per_iter)

In [None]:
# Draw the scaled trigger on an all-zero 32x32x3 canvas and export it as a PDF.
x = np.zeros([32,32,3])
x[2:-2, 2:-2] = np.clip(x[2:-2, 2:-2]+poison_epsilon*trigger, 0., 1.) 
plt.imshow(x)
plt.xticks([])
plt.yticks([])
fig_path = '/home/figs/cifar10_nc_complex_trigger.pdf'
# savefig does not create missing directories; make sure the target folder exists.
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, format='pdf', bbox_inches = 'tight')
plt.show()

In [None]:
# Show the first image of x_train_key without axis ticks.
x = x_train_key[0]
preview_axes = plt.imshow(x).axes
preview_axes.set_xticks([])
preview_axes.set_yticks([])
plt.show()

In [None]:
# Baseline metrics of the loaded model, measured before any defence is applied.
eval_batch_size = batch_size // num_gpu

# 1) accuracy on the clean test set
_, ac1 = test_accuracy_multi_gpu_dataset(
    num_gpu, sess, classifier, x_test, y_test,
    update=False, batch_size=eval_batch_size)

# 2) attack success rate of the poisoned test set
_, asr = attack_success_rate(
    num_gpu, sess, classifier, x_test, x_test_key, y_test,
    update=False, batch_size=eval_batch_size)

# 3) accuracy under an untargeted PGD attack started from a uniform random
#    perturbation within +/- attack_epsilon
start_noise = np.random.uniform(-attack_epsilon, attack_epsilon, size=x_test.shape)
x_test_jump = np.clip(x_test + start_noise, 0., 1.)
_, x_test_adv3, y_test_adv3 = pgd.perturb_dataset_untarget(
    sess, x_test, x_test_jump, y_test,
    batch_size=eval_batch_size, num_iteration=num_iteration)
_, ac2 = test_accuracy_multi_gpu_dataset(
    num_gpu, sess, classifier, x_test_adv3, y_test_adv3,
    update=False, batch_size=eval_batch_size)

for caption, value in (('test accuracy', ac1),
                       ('test robustness', ac2),
                       ('test attack success rate', asr)):
    print('{}: {:.4f}'.format(caption, value))
######################################### neural cleanse ##########################################################
# 1) Optimise a (mask, trigger) pair that makes the classifier predict
#    `nc_target_label` for validation images.
# 2) Fine-tune ("unlearn") the classifier on validation batches in which some
#    samples carry that recovered trigger.
# 3) Report accuracy, robustness and attack success rate afterwards.
nc_target_label = 7          # label every batch is relabelled to while optimising the trigger
nc_stamped_per_batch = 20    # leading samples of each fine-tuning batch that receive the trigger

with tf.variable_scope('NC', reuse=tf.AUTO_REUSE):
    input_x = tf.placeholder(tf.float32, (BATCH_SIZE, 32, 32, 3), name='xs')
    input_y = tf.placeholder(tf.int64, (BATCH_SIZE,), name='ys')


    # Weight of the mask-size penalty; the schedule below doubles it or divides it by 2**1.5.
    cost_lambda = tf.get_variable('cost_lambda', dtype=tf.float32, shape=[], initializer=tf.constant_initializer(1e-3))
    cost_lambda_up = cost_lambda.assign(cost_lambda*2)
    cost_lambda_down = cost_lambda.assign(cost_lambda/(2**1.5))
    # Mask and trigger are stored as unconstrained variables and squashed with
    # tanh/2 + 0.5; the arctanh initialiser makes the squashed start values the
    # uniform random draws themselves.
    mask_raw = tf.get_variable('mask', dtype=tf.float32, shape=[1, 32, 32, 3], 
                               initializer=tf.constant_initializer(np.arctanh((np.random.random([1, 32, 32, 3])-0.5)*2)))
    mask = (tf.tanh(mask_raw)/2)+0.5
    trigger_raw = tf.get_variable('trigger', dtype=tf.float32, shape=[1, 32, 32, 3], 
                               initializer=tf.constant_initializer(np.arctanh((np.random.random([1, 32, 32, 3])-0.5)*2)))
    # Bound to `nc_trigger` rather than `trigger`: the notebook-level name `trigger`
    # holds the NumPy patch that poison_all and the trigger-figure cell read, and
    # must not be replaced by a tensor.
    nc_trigger = (tf.tanh(trigger_raw)/2)+0.5
    # Blend: where the mask is 1 the trigger replaces the image, where it is 0 the image is kept.
    input_x_trigger = input_x*(1-mask) + nc_trigger*(mask)

with tf.variable_scope(classifier.model_name, reuse=tf.AUTO_REUSE):
    logit, loss = classifier.f(input_x_trigger, input_y)
    nc_acc = tf.reduce_mean(tf.to_float(tf.equal(tf.argmax(logit, axis=1), input_y)))
    nc_ce = tf.reduce_mean(loss)
    # Sum of |mask| divided by 3 (the mask has 3 channels).
    nc_reg = tf.reduce_sum(tf.abs(mask))/3
    nc_loss = nc_ce + cost_lambda * nc_reg

with tf.variable_scope('NC', reuse=tf.AUTO_REUSE):
    optimizer = tf.train.AdamOptimizer(0.1, 0.5, 0.9)
    # Only the mask and trigger variables are optimised here.
    grad_var = optimizer.compute_gradients(nc_loss, var_list=[mask_raw, trigger_raw])
    update_op = optimizer.apply_gradients(grad_var)
    # Initialiser for the global variables collected under the 'NC' scope.
    init = tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'NC'))
############################################################
# trigger_gt = np.zeros([32,32,3])
# mask_gt = np.zeros([32,32,3])
# trigger_gt[-5:-2, -5:-2] = pattern
# mask_gt[-5:-2, -5:-2] = 1

preprocessor = CIFAR10_preprocessor(shape=x_train.shape[1:], num_gpu=num_gpu)
#############################################################
for iteration in range(1):
    sess.run(init)
    losses_total = []
    patience = 5
    cost_up_counter = 0
    cost_down_counter = 0
    best_loss = np.inf
    for epoch in range(20):
        ces = []
        regs = []
        accs = []
        losses = []
        for x_batch, y_batch in gen_batch(x_valid, y_valid, batch_size=BATCH_SIZE*num_gpu, shuffle=True):
            # The placeholders have a fixed batch dimension; stop at the first short batch.
            if len(x_batch) < BATCH_SIZE:
                break
            # Build a fresh label array instead of overwriting y_batch in place, so
            # y_valid stays intact even if gen_batch hands out views of it.
            y_target = np.full_like(y_batch, nc_target_label)
            feed_dict = {
                input_x: x_batch,
                input_y: y_target,
            }
            _, loss, acc, ce, reg = sess.run([update_op, nc_loss, nc_acc, nc_ce, nc_reg], feed_dict=feed_dict)
            losses.append(loss)
            losses_total.append(loss)
            accs.append(acc)
            ces.append(ce)
            regs.append(reg)

#         print('cost: {:.4f}, attack: {:.4f}, ce: {:.4f}, reg: {:.4f}, lambda: {}'.format(
#                 np.mean(losses), np.mean(accs), np.mean(ces), np.mean(regs), sess.run(cost_lambda)))
        # Keep the mask/trigger of the epoch with the lowest mean loss. feed_dict is
        # the last batch of this epoch, so input_x_trigger_best shows that batch.
        if np.mean(losses) < best_loss:
            best_loss = np.mean(losses)
            trigger_best, mask_best, input_x_trigger_best = sess.run([nc_trigger, mask, input_x_trigger], feed_dict)



        # lambda schedule: `patience` consecutive epochs with mean accuracy >= 0.99
        # raise the penalty, `patience` consecutive epochs below it lower the penalty.
        if np.mean(accs) >= 0.99:
            cost_up_counter += 1
            cost_down_counter = 0
        else:
            cost_up_counter = 0
            cost_down_counter += 1
        if cost_up_counter >= patience:
            cost_up_counter = 0
            sess.run(cost_lambda_up)
        elif cost_down_counter >= patience:
            cost_down_counter = 0
            sess.run(cost_lambda_down)

    ################################### show best result #########################################
    pattern = mask_best
    pattern = pattern/pattern.max()

    fig, axs = plt.subplots(1, 4, figsize=(20,4))
    axs[0].imshow(trigger_best[0], vmin=0., vmax=1., cmap='gray')
    axs[0].set_title('trigger')
    axs[1].imshow(mask_best[0], vmin=0., vmax=1., cmap='gray')
    axs[1].set_title('mask')
    axs[2].imshow((trigger_best[0]*mask_best[0]), vmin=0., vmax=1., cmap='gray')
    axs[2].set_title('trigger+mask')
    axs[3].imshow(input_x_trigger_best[0], vmin=0., vmax=1., cmap='gray')
    axs[3].set_title('trigger+mask+x') 
    plt.show()

    # Loss of every optimisation step; the title is the final value of cost_lambda.
    plt.plot(losses_total)
    plt.title(str(sess.run(cost_lambda)))
    plt.show()

    # unlearning
    asrs = []
    for epoch in range(10):
        for x_batch, y_batch in gen_batch(x_valid, y_valid, batch_size=BATCH_SIZE*num_gpu, shuffle=True):
            # add trigger
#             if np.random.rand() < 0.2:
            # Stamp a private copy so x_valid is never modified, even if gen_batch
            # hands out views of it. The labels are left unchanged.
            x_batch = np.array(x_batch, copy=True)
            x_batch[:nc_stamped_per_batch] = x_batch[:nc_stamped_per_batch]*(1-mask_best) + trigger_best*mask_best

            x_batch_origin, x_batch, y_batch = preprocessor.preprocess(sess, x_batch, y_batch, batch_size=BATCH_SIZE)

            # random jump
#             state = np.random.get_state()
#             jump = np.random.uniform(-attack_epsilon, attack_epsilon, size=x_batch.shape).astype(np.float32)
#             np.random.set_state(state)
#             x_batch_jump = np.clip(x_batch + jump, 0., 1.)

#             # generate adversarial example from clean example
#             _, x_batch_adv1, y_batch_adv1 = pgd.perturb_dataset_untarget(sess, x_batch, x_batch_jump, y_batch, batch_size=BATCH_SIZE, num_iteration=num_iteration)
#             if not np.array_equal(y_batch_adv1, y_batch):
#                 x_batch_adv1 = np.roll(x_batch_adv1, BATCH_SIZE, axis=0)

            # train
            loss_train, acc_train = test_accuracy_multi_gpu_dataset(num_gpu, sess, classifier_train, x_batch, y_batch, update=True, batch_size=BATCH_SIZE)
#             loss_train, acc_train = test_accuracy_multi_gpu_dataset(num_gpu, sess, classifier_train, x_batch_adv1, y_batch, update=True, batch_size=BATCH_SIZE)
            # Track the attack success rate of the poisoned test set after every fine-tuning step.
            _, asr = attack_success_rate(num_gpu, sess, classifier, x_test, x_test_key, y_test, update=False, batch_size=BATCH_SIZE//num_gpu)
            asrs.append(asr)

    ################# confusion matrix ###################################
    print('clean testing accuracy:')
    draw_confusion_matrix(num_gpu, sess, classifier, x_test, y_test, batch_size=100)

    print('clean testing robustness:')
    x_test_jump = np.clip(x_test + np.random.uniform(-attack_epsilon, attack_epsilon, size=x_test.shape), 0., 1.)
    _, x_test_adv, y_test_adv = pgd.perturb_dataset_untarget(sess, x_test, x_test_jump, y_test, batch_size=batch_size//num_gpu, num_iteration=num_iteration)
    # Pair the adversarial inputs with the labels returned alongside them, the same
    # way x_test_adv3 / y_test_adv3 are paired further down.
    draw_confusion_matrix(num_gpu, sess, classifier, x_test_adv, y_test_adv, batch_size=100)
    plt.plot(asrs)
    plt.ylim(0,1)
    plt.xlim(0,len(asrs))
    plt.ylabel('attack success rate')
    plt.xlabel('fine-tune step')
    plt.show()
    # First 10 samples of the last fine-tuning batch: as returned in x_batch_origin
    # (top row) and as fed to training (bottom row).
    fig, axs = plt.subplots(2,10, figsize=(20,4))
    for i in range(10):
        axs[0,i].imshow(x_batch_origin[i], cmap='gray', vmin=0., vmax=1.)
        axs[0,i].set_title(str(y_batch[i]))
        axs[1,i].imshow(x_batch[i], cmap='gray', vmin=0., vmax=1.)
    plt.show()
    plt.close('all')

    # Same three metrics as before the defence, now measured on the fine-tuned model.
    _, ac1 = test_accuracy_multi_gpu_dataset(num_gpu, sess, classifier, x_test, y_test, update=False, batch_size=batch_size//num_gpu)
    _, asr = attack_success_rate(num_gpu, sess, classifier, x_test, x_test_key, y_test, update=False, batch_size=BATCH_SIZE//num_gpu)
    x_test_jump = np.clip(x_test + np.random.uniform(-attack_epsilon, attack_epsilon, size=x_test.shape), 0., 1.)
    _, x_test_adv3, y_test_adv3 = pgd.perturb_dataset_untarget(sess, x_test, x_test_jump, y_test, batch_size=batch_size//num_gpu, num_iteration=num_iteration)            
    _, ac2 = test_accuracy_multi_gpu_dataset(num_gpu, sess, classifier, x_test_adv3, y_test_adv3, update=False, batch_size=batch_size//num_gpu)
    print('test accuracy: {:.4f}'.format(ac1))
    print('test robustness: {:.4f}'.format(ac2))
    print('test attack success rate: {:.4f}'.format(asr))

In [None]:
# Stamp the reverse-engineered trigger onto the whole training set. The blend
# already allocates a new array, so no preliminary copy of x_train is needed.
# Note that this replaces the earlier contents of x_train_key.
x_train_key = x_train*(1-mask_best) + trigger_best*mask_best
fig, ax = plt.subplots(1,1)
ax.imshow(trigger_best[0]*mask_best[0], cmap='gray')
# Norm of the mean change the recovered trigger causes on the training set.
diff = x_train_key - x_train
norm = np.linalg.norm(diff.mean(0))
ax.set_title(str(norm))
ax.set_xticks([])
ax.set_yticks([])
plt.show()
#######################
# First training image with the recovered trigger applied.
fig, ax = plt.subplots(1,1)
ax.imshow(x_train_key[0], cmap='gray')
ax.set_xticks([])
ax.set_yticks([])
plt.show()

In [None]:
# Export the recovered trigger (trigger * mask) as a PDF.
x = trigger_best[0]*mask_best[0]
plt.imshow(x)
plt.xticks([])
plt.yticks([])
fig_path = '/home/figs/cifar10_nc_reverse_complex_trigger_adversarial.pdf'
# savefig does not create missing directories; make sure the target folder exists.
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, format='pdf', bbox_inches = 'tight')
plt.show()