# Trojan Attack on Neural Networks

## Part 1: MNIST Training

In [0]:
import tensorflow as tf
import numpy as np
# Emit INFO-level TensorFlow log messages (the "INFO:tensorflow:..." lines in the cell outputs)
tf.logging.set_verbosity(tf.logging.INFO)

In [0]:
# L0 Regularization constants (read by get_l0_norm)
# GAMMA / ZETA: lower / upper end of the interval the sampled gate is stretched to
# before it is turned into a mask.
GAMMA = -0.1
ZETA = 1.1
# BETA: divisor applied inside the gate's sigmoid. Written with float literals so the
# value is 0.666... even under Python 2 integer division (where 2/3 == 0 and the
# later division by BETA would fail).
BETA = 2.0 / 3.0

### L0 Regularization Implementation

This section implements L0 regularization as described in "Learning Sparse Neural Networks through L0 Regularization" (https://arxiv.org/abs/1712.01312).

In [0]:
def get_l0_norm(x, varname):
    """Gate tensor `x` with a sampled mask and return a differentiable L0 penalty.

    A new variable `log_a_<varname>` with the same shape as `x` is created. It
    drives both the random mask multiplied into `x` and the returned penalty.

    Args:
        x: tensor to be masked; its static shape is used for every sampled and
            created tensor.
        varname: string used to name the created variable and ops
            (`log_a_<varname>`, `l0_norm_<varname>`, `<varname>_masked`).

    Returns:
        A pair `(masked_x, l0_norm)`: `x` multiplied element-wise by the mask,
        and a scalar tensor holding the summed gate probabilities.
    """
    # Function-scope import: `math` is used below but is not imported by the
    # notebook's import cell, so the original raised NameError on first call.
    import math

    shape = x.get_shape()

    # sample u
    u = tf.random_uniform(shape)

    # initialize log a from normal distribution
    log_a = tf.Variable(tf.random_normal(shape, mean=0.0, stddev=0.01), name="log_a_" + varname)

    # compute hard concrete distribution
    s = tf.sigmoid((tf.log(u) - tf.log(1.0 - u) + log_a) / BETA)

    # stretch hard concrete distribution from (0, 1) to (GAMMA, ZETA)
    s_bar = s * (ZETA - GAMMA) + GAMMA

    # compute differentiable l0 norm; the shift is a plain Python float because
    # it depends only on the module-level constants
    l0_norm = tf.reduce_sum(tf.sigmoid(log_a - BETA * math.log(-GAMMA / ZETA)), name="l0_norm_" + varname)

    # get mask for calculating sparse version of tensor
    # NOTE(review): `hard_sigmoid` is not defined in the visible cells; presumably
    # it clips s_bar to [0, 1] -- confirm it is defined before this function runs.
    mask = hard_sigmoid(s_bar)

    # return masked version of tensor and l0 norm
    return tf.multiply(x, mask, name=varname + "_masked"), l0_norm


### MNIST Model Definition

In [0]:
def _add_trojan_diff(weight, name, l0, l0_norms):
    """Return `weight` plus a zero-initialised `<name>_diff` variable.

    When `l0` is true the diff is first gated by get_l0_norm and the resulting
    norm tensor is appended to `l0_norms` (mutated in place).
    """
    diff_name = name + "_diff"
    diff = tf.Variable(tf.zeros(weight.get_shape()), name=diff_name)
    if l0:
        diff, norm = get_l0_norm(diff, diff_name)
        l0_norms.append(norm)
    return weight + diff


def mnist_model(images, trojan=False, l0=False, dropout_rate=0.1, training=True):
    """Build the MNIST network: two conv/max-pool stages and two dense layers.

    Args:
        images: input tensor; model_fn feeds shape [None, 28, 28, 1], and the
            7 * 7 * 64 flatten below assumes 28x28 inputs.
        trojan: when true, every weight tensor w1..w4 gets an additive
            `<name>_diff` variable initialised to zeros.
        l0: when true (together with `trojan`), each diff is gated through
            get_l0_norm and its norm is collected.
        dropout_rate: `rate` passed to the dropout after the first dense layer.
            The default matches the previously hard-coded 0.1.
        training: when false the dropout layer is skipped. Defaults to True,
            which keeps the previous behaviour of always applying dropout.

    Returns:
        The logits tensor with 10 outputs per example, or the pair
        `(logits, l0_norms)` when both `trojan` and `l0` are true, where
        `l0_norms` lists the four norm tensors in layer order.
    """
    l0_norms = []

    # Define initial weights and biases for layer 1
    w1 = tf.get_variable("w1", [5, 5, 1, 32])
    b1 = tf.get_variable("b1", [32], initializer=tf.zeros_initializer)
    if trojan:
        w1 = _add_trojan_diff(w1, "w1", l0, l0_norms)

    # Convolutional Layer 1
    conv1 = tf.nn.conv2d(images, w1, [1,1,1,1], "SAME", name="conv1")
    conv1_bias = tf.nn.bias_add(conv1, b1, name="conv1_bias")
    conv1_relu = tf.nn.relu(conv1_bias, name="conv1_relu")
    # MaxPool layer 1
    pool1 = tf.nn.max_pool(conv1_relu, [1,2,2,1], [1,2,2,1], "SAME", name="pool1")

    # Define initial weights and biases for layer 2
    w2 = tf.get_variable("w2", [5, 5, 32, 64])
    b2 = tf.get_variable("b2", [64], initializer=tf.zeros_initializer)
    if trojan:
        w2 = _add_trojan_diff(w2, "w2", l0, l0_norms)

    # Convolutional Layer 2
    conv2 = tf.nn.conv2d(pool1, w2, [1,1,1,1], "SAME", name="conv2")
    conv2_bias = tf.nn.bias_add(conv2, b2, name="conv2_bias")
    conv2_relu = tf.nn.relu(conv2_bias, name="conv2_relu")

    # MaxPool layer 2
    pool2 = tf.nn.max_pool(conv2_relu, [1,2,2,1], [1,2,2,1], "SAME", name="pool2")
    # Flatten for the dense layers (two stride-2 pools: 28 -> 14 -> 7)
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])

    # Define initial weights and biases for layer 3
    w3 = tf.get_variable("w3", [7 * 7 * 64, 1024])
    b3 = tf.get_variable("b3", [1024], initializer=tf.zeros_initializer)
    if trojan:
        w3 = _add_trojan_diff(w3, "w3", l0, l0_norms)

    # Multiply flattened layer with w3, and add relu
    fc1 = tf.matmul(pool2_flat, w3, name="fc1")
    fc1_bias = tf.nn.bias_add(fc1, b3, name="fc1_bias")
    fc1_relu = tf.nn.relu(fc1_bias, name="fc1_relu")

    # Dropout is only meaningful while training; callers can turn it off
    if training:
        dropout1 = tf.nn.dropout(fc1_relu, rate=dropout_rate, name="dropout1")
    else:
        dropout1 = fc1_relu

    # Define initial weights and biases for layer 4
    w4 = tf.get_variable("w4", [1024,10])
    b4 = tf.get_variable("b4", [10], initializer=tf.zeros_initializer)
    if trojan:
        w4 = _add_trojan_diff(w4, "w4", l0, l0_norms)

    # Create logits for softmax input
    logit = tf.matmul(dropout1, w4, name="logit")
    logit_bias = tf.nn.bias_add(logit, b4, name="logit_bias")

    if trojan and l0:
        return logit_bias, l0_norms
    else:
        return logit_bias

In [0]:
def model_fn(features, labels, mode):
    """Estimator model function for the plain (non-trojan) MNIST classifier.

    Args:
        features: dict whose 'x' entry holds the image batch.
        labels: integer class labels for the batch; not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value.

    Returns:
        A tf.estimator.EstimatorSpec. In PREDICT mode it carries only the
        predictions; otherwise it carries the loss, the train op and the
        accuracy eval metric.

    Note:
        mnist_model is called with its defaults, so its dropout layer is
        active in every mode, including EVAL and PREDICT.
    """

    # Define input tensor
    input_tensor = tf.placeholder_with_default(features['x'], shape=[None,28,28,1], name="input")

    with tf.variable_scope("model"):
        logits = mnist_model(input_tensor)

    # Prediction classes and probabilities
    predictions = {
        "classes": tf.cast(tf.argmax(input=logits, axis=1), tf.int32),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
        }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Define labels. This must come after the PREDICT return: the Estimator
    # passes labels=None when predicting, and placeholder_with_default cannot
    # wrap None.
    labels_tensor = tf.placeholder_with_default(labels, shape=[None], name="labels")

    # Compute sparse softmax cross entropy between logits and labels.
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels_tensor, logits=logits)

    # Gradient descent with learning rate 0.001
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    # Per-batch accuracy. The Python name is unused, but the op name "accuracy"
    # is what the LoggingTensorHook looks up, so the op must be created.
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions["classes"], labels_tensor), tf.float32), name="accuracy")

    # Define eval metrics
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels_tensor, predictions=predictions["classes"])
        }

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metric_ops=eval_metric_ops)

In [7]:
# Colab-only: mount Google Drive at /content/drive so checkpoints can be written there.
# Mounting is interactive (it asks for an authorization code, see the output below).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# DEFAULT VARIABLES
batch_size = 200  # batch size given to the training input function
logdir = "/content/drive/My Drive/Sem 7/DeepDOT/logs"  # Estimator model_dir (on the mounted Drive)
num_steps = 10000  # passed as `steps` to the train call
# NOTE(review): the two values below are not read by any visible cell. model_fn
# builds its optimizer with a literal 0.001, and mnist_model uses a dropout rate
# of 0.1, not 0.05 -- confirm which values are intended.
learning_rate = 0.001
dropout_rate = 0.05

In [10]:
# Load training and test data
# (statement moved to column 0: an indented top-level statement is an "unexpected indent" error)
mnist = tf.contrib.learn.datasets.load_dataset("mnist")

Extracting MNIST-data/train-images-idx3-ubyte.gz
Extracting MNIST-data/train-labels-idx1-ubyte.gz
Extracting MNIST-data/t10k-images-idx3-ubyte.gz
Extracting MNIST-data/t10k-labels-idx1-ubyte.gz


In [0]:
# Training split: images as loaded, labels converted to an int32 array
train_data = mnist.train.images
train_labels = np.asarray(mnist.train.labels, dtype=np.int32)

In [0]:
# Test split: images as loaded, labels converted to an int32 array
test_data = mnist.test.images
test_labels = np.asarray(mnist.test.labels, dtype=np.int32)

In [0]:
# Reshape both splits to [N, 28, 28, 1], the input shape model_fn declares
train_data = train_data.reshape([-1,28,28,1])
test_data = test_data.reshape([-1,28,28,1])

In [14]:
# Build the Estimator around model_fn; checkpoints are written under logdir
# (see the "Saving checkpoints" lines in the training output)
mnist_classifier = tf.estimator.Estimator(model_fn=model_fn, model_dir=logdir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/content/drive/My Drive/Sem 7/DeepDOT/logs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5360d591d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [0]:
# Maps the label printed in the log to a tensor name in the graph; "accuracy" is
# the name given to the per-batch accuracy op in model_fn.
tensors_to_log = {"accuracy": "accuracy"}
# Log the tensors above every 50 iterations
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=50)

In [0]:
# Input function feeding shuffled training batches of `batch_size` examples.
# NOTE(review): num_epochs=None presumably makes the input repeat without limit,
# so the run length is set by `steps` in the train call -- confirm against the TF docs.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
  x={"x": train_data},
  y=train_labels,
  batch_size=batch_size,
  num_epochs=None,
  shuffle=True)

In [23]:
# Train for num_steps steps; the logging hook reports the per-batch accuracy tensor
mnist_classifier.train(
    input_fn=train_input_fn,
    steps=num_steps,
    hooks=[logging_hook])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Sem 7/DeepDOT/logs/model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /content/drive/My Drive/Sem 7/DeepDOT/logs/model.ckpt.
INFO:tensorflow:accuracy = 0.115
INFO:tensorflow:loss = 2.3011487, step = 1
INFO:tensorflow:accuracy = 0.1 (22.699 sec)
INFO:tensorflow:global_step/sec: 2.21581
INFO:tensorflow:accuracy = 0.14 (22.433 sec)
INFO:tensorflow:loss = 2.2979445, step = 101 (45.131 sec)
INFO:tensorflow:accuracy = 0.155 (22.760 sec)
INFO:tensorflow:global_step/sec: 2.21218
INFO:tensorflow:accuracy = 0.14 (22.449 sec)
INFO:tensorflow:loss = 2.2768524, step = 201 (45.209 sec)
INFO:tensorflow:accuracy = 0.135 (22.493 sec)
INFO:tensorflow:global_step/sec: 2.21614
INFO:tensorflow:accurac

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x7f5360d59048>