In [1]:
import orbax.checkpoint as ocp
import os
from ml_collections import ConfigDict
from pathlib import Path
from utils import prepare_test_dataset
from dataset_utils import get_dataset
from jax import random
from models.utils import sample_gaussian

import models.ClassifierGFZ as ClassifierGFZ
import models.ClassifierDFZ as ClassifierDFZ

checkpoint_path = "dfz-2-epochs-first-try-1"
path = os.path.join(Path.cwd(), Path(f"checkpoints"), Path(checkpoint_path))
checkpoint = ocp.PyTreeCheckpointer().restore(path, item=None)

config = ConfigDict(checkpoint["config"])
dataset_config = ConfigDict(checkpoint["dataset_config"])

if config.model_name == "GFZ":
    classifier = ClassifierGFZ
elif config.model_name == "DFZ":
    classifier = ClassifierDFZ
else:
    raise NotImplementedError(config.model_name)

_, test_ds = get_dataset(config.dataset)
test_images, test_labels = prepare_test_dataset(
    test_ds, dataset_config
    )

trained_params = checkpoint["params"]

log_likelyhood_fn = classifier.log_likelyhood_A

test_key = random.PRNGKey(config.seed)

test_key, model, _ = classifier.create_and_init(
    test_key, config, dataset_config
)



In [2]:
from flax import linen as nn
import jax
from jax import jacrev
import numpy as np
from functools import partial
from jax.scipy.special import logsumexp
import jax.numpy as jnp
import matplotlib.pyplot as plt
from tqdm import tqdm
import optax
from scipy.optimize import minimize

def init_data(test_key, n_samples=10):
    idx = np.random.choice(range(len(test_images)), n_samples, replace=False)

    all_xs = test_images[idx]
    true_ys = test_labels[idx]
    true_labels = np.argmax(true_ys, axis=1)

    K = model.K
    batch_size = n_samples
    test_key, epsilons = sample_gaussian(test_key, (batch_size, model.n_classes * K, model.d_latent))
    epsilons = epsilons[:n_samples*model.n_classes]
    all_ys = nn.one_hot(jnp.repeat(jnp.arange(model.n_classes), K), model.n_classes, dtype=jnp.float32)
    
    return all_xs, true_labels, epsilons, all_ys, K, test_key

def get_model_output(x, epsilon, y, K):
    z, logit_q_z_xy, logit_p_x_z, logit_p_y_xz = jax.vmap(
            partial(model.apply, {'params': trained_params}, train=False),
            in_axes=(None, 0, 0)
        )(x, y, epsilon)

    ll = log_likelyhood_fn(
            z, logit_q_z_xy, logit_p_x_z, logit_p_y_xz
        ).reshape(model.n_classes, K)
    ll = logsumexp(ll, axis=1) - np.log(K)
    return ll

def get_model_jacobian(x, epsilon, y, K):
    return jacrev(get_model_output, argnums=0)(x, epsilon, y, K)

def map_label_to_name(y):
    labels = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
              "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
    return labels[y]

In [49]:
# Zeroth order optimization attack
import numpy as np
import jax
import jax.numpy as jnp
import optax

class ZOO_Attack():
    def __init__(self, model, max_iter=10, learning_rate=0.1, c=1, p=2):
        self.model = model
        self.n_classes = model.n_classes
        self.max_iter = max_iter
        self.learning_rate = learning_rate
        self.c = c
        assert p > 1 
        self.p = p
        if self.p == np.inf:
            self.q = 1
        else:
            self.q = self.p / (self.p - 1)

    def qnorm(self, x):
        return jnp.linalg.norm(x.flatten(), self.q)

    def get_label(self, x):
        val = get_model_output(x, self.epsilon, self.y, self.K)
        return jnp.argmax(val)

    def get_likelihoods(self, x):
        val = get_model_output(x, self.epsilon, self.y, self.K)
        return val

    def get_gradients(self, x, epsilon=1e-5):
        # Estimate gradients using finite differences
        perturbed_x_plus = x + epsilon
        perturbed_x_minus = x - epsilon

        output_plus = self.get_likelihoods(perturbed_x_plus)
        output_minus = self.get_likelihoods(perturbed_x_minus)

        gradients = (output_plus - output_minus) / (2 * epsilon)

        return gradients

    def loss(self, val, label):
        label_one_hot_encoding = jax.nn.one_hot(jnp.array([label]), self.n_classes)
        return optax.softmax_cross_entropy(val, label_one_hot_encoding)
    
    def f(self, x, target_label, k = 0):
        val = self.get_likelihoods(x)
        max_logit = jnp.max(val[jnp.arange(self.n_classes) != target_label])
        logit_diff = jnp.maximum(max_logit - val[target_label], - k)
        return logit_diff
    
    def get_objective(self, w, x, target_label, k = 0):
        norm = self.qnorm(w)
        penalty = self.c * self.f(x + w, target_label, k = k)
        return norm + penalty
    
    def get_obj_grad(self, w, x, target_label):
        # Compute gradient of the objective function
        corrupted_x = x + w
        norm_grad = (2) * (corrupted_x - x)

        val = self.get_likelihoods(corrupted_x)
        grad_model = self.get_gradients(corrupted_x)
        max_label = jnp.argmax(val[jnp.arange(self.n_classes) != target_label])
        max_logit = val[max_label]
        logit_diff = max_logit - val[target_label]
        if logit_diff <= 0:
            penalty_grad = 0
        else:
            penalty_grad = grad_model[max_label] - grad_model[target_label]
        
        return norm_grad + self.c * penalty_grad
    
    def project_to_bounds(self, x):
        bounds_min = jnp.zeros_like(x)
        bounds_max = jnp.ones_like(x)
        return jnp.clip(x, bounds_min, bounds_max)

    def get_perturbation(self, x, epsilon, all_ys, K):
        self.y = all_ys
        self.epsilon = epsilon
        self.K = K
        true_label = self.get_label(x)
        max_perturbation_norm = -1
        best_label = true_label
        best_corrupted_x = x.copy()
        for label in range(self.n_classes): # to do : optimize this loop
            if label != true_label:
                corrupted_x = x.copy()
                w = jnp.zeros_like(corrupted_x)
                # use gradient descent to find minimum of the problem
                for i in range(self.max_iter):
                    grad = self.get_obj_grad(w, x, label)
                    w = w - self.learning_rate * grad
                    corrupted_x = x + w
                    if self.get_label(corrupted_x) == label:
                        break
                    
                # check if the attack was successful
                new_label = self.get_label(corrupted_x)
                if new_label != label:
                    print("Warning: did not find a perturbation")
                    perturbation_norm = -1
                else:
                    perturbation_norm = np.linalg.norm(corrupted_x - x)/np.linalg.norm(x)
                    print(perturbation_norm)

                # Choose minimal perturbation
                if max_perturbation_norm == -1 and perturbation_norm != -1:
                    max_perturbation_norm = perturbation_norm
                    best_label = new_label
                    best_corrupted_x = corrupted_x
                else : 
                    if perturbation_norm != -1 and perturbation_norm < max_perturbation_norm:
                        max_perturbation_norm = perturbation_norm
                        best_label = new_label
                        best_corrupted_x = corrupted_x

        return best_corrupted_x, best_label, max_perturbation_norm

In [21]:
def get_average_performance(corruption_model, all_xs, epsilons, all_ys, K):
    perturbation_norms = []
    n_samples = len(all_xs)
    for i in tqdm(range(n_samples)):
        x = all_xs[i]
        epsilon = epsilons[i]
        _, _, perturbation_norm = corruption_model.get_perturbation(x, epsilon, all_ys, K)
        perturbation_norms.append(perturbation_norm)
    return np.array(perturbation_norms)

In [52]:
n_samples = 10
all_xs, true_labels, epsilons, all_ys, K, test_key = init_data(test_key, n_samples=n_samples)

corruption_model = ZOO_Attack(model, max_iter=10, learning_rate=0.1, c=1, p=2)

perturbation_norms_ZOO = get_average_performance(corruption_model, all_xs, epsilons, all_ys, K)
perturbation_norms_successful_ZOO = perturbation_norms_ZOO[perturbation_norms_ZOO != -1]
n_successful_ZOO = len(perturbation_norms_successful_ZOO)
n_successful_ZOO
print(f'Average perturbation norm of ZOO Attack model (on {n_successful_ZOO} successful samples): {np.mean(perturbation_norms_successful_ZOO):>.4f}')

  0%|          | 0/10 [00:00<?, ?it/s]

3.6668606


 10%|█         | 1/10 [01:23<12:27, 83.10s/it]

13.479637


 20%|██        | 2/10 [02:43<10:50, 81.29s/it]

1.929487


 30%|███       | 3/10 [04:02<09:23, 80.51s/it]



 40%|████      | 4/10 [05:34<08:29, 84.84s/it]

0.40485218
1.0094314


 50%|█████     | 5/10 [06:59<07:04, 84.86s/it]

1.1651036


 60%|██████    | 6/10 [08:13<05:24, 81.23s/it]



 70%|███████   | 7/10 [09:24<03:53, 77.80s/it]

1.0363916
15.980929


 80%|████████  | 8/10 [10:33<02:30, 75.18s/it]

2.014937
2.1555138


 90%|█████████ | 9/10 [11:36<01:11, 71.42s/it]

210.23326
2.4464536
13.529338
6.8294683


100%|██████████| 10/10 [12:39<00:00, 75.96s/it]

Average perturbation norm of ZOO Attack model (on 9 successful samples): 4.6805



