<a href="https://colab.research.google.com/github/Maupin1991/ML_pytorch_tutorial/blob/master/5_AttackNNWithPytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
from torch import nn
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
from torch import optim


# Seed both RNGs used in this notebook (NumPy drives the index shuffling,
# PyTorch drives the weight initialisation) so that runs are repeatable.
np.random.seed(99)
torch.manual_seed(10)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:  ", device)

In [0]:
# Model class
class Network(nn.Module):
    """Four-layer fully connected classifier (784 -> 256 -> 128 -> 64 -> 10).

    The three hidden layers use ReLU activations; the output layer applies
    ``log_softmax`` so the model emits log-probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()
        # Layers are declared from input to output; the attribute names
        # (fc1..fc4) are what appear in the state dict.
        self.fc1 = nn.Linear(in_features=784, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=64)
        self.fc4 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, 10)`` for input ``x``.

        ``x`` may have any trailing shape; everything after the first
        (batch) dimension is flattened into a single feature axis.
        """
        n_samples = x.shape[0]
        hidden = x.view(n_samples, -1)

        # Hidden stack: affine map followed by ReLU, three times.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.relu(layer(hidden))

        logits = self.fc4(hidden)
        return torch.log_softmax(logits, dim=1)

In [0]:
# Define a transform: ToTensor turns each image into a float tensor
transform = transforms.Compose([transforms.ToTensor()])

# Experiment configuration
n_train = 10000      # number of samples drawn from the official training split
n_test = 1000        # number of samples drawn from the official test split
valid_size = 0.2     # validation set is 20% of the drawn training samples
batch_size = 32      # mini-batch size shared by all three loaders
epochs = 30
data_root = '~/.pytorch/MNIST_data/'

# Download and load the data
trainset = datasets.MNIST(data_root, download=True,
                          train=True, transform=transform)
testset = datasets.MNIST(data_root, download=True,
                         train=False, transform=transform)

# Splitting train/validation and testing set

# training set: random subsample of n_train indices
train_idxs = np.arange(len(trainset))
np.random.shuffle(train_idxs)
train_idxs = train_idxs[:n_train].tolist()

# subsample validation set from training set (first n_valid shuffled indices)
n_valid = int(np.floor(n_train * valid_size))
valid_idxs = train_idxs[:n_valid]
train_idxs = train_idxs[n_valid:]

# The two splits come from disjoint slices of one permutation; make that
# invariant explicit so a future edit cannot silently leak validation data.
assert not set(train_idxs) & set(valid_idxs), "train/validation splits overlap"

# testing set: random subsample of n_test indices
test_idxs = np.arange(len(testset))
np.random.shuffle(test_idxs)
test_idxs = test_idxs[:n_test].tolist()

# extract only the selected indices
train_subset = torch.utils.data.Subset(trainset, train_idxs)
valid_subset = torch.utils.data.Subset(trainset, valid_idxs)
test_subset = torch.utils.data.Subset(testset, test_idxs)

# data loader (finally)
# Only the training loader reshuffles every epoch, so SGD does not see the
# same batches in the same order each time; evaluation loaders keep a fixed
# order so their metrics are directly comparable across runs.
trainloader = torch.utils.data.DataLoader(train_subset, batch_size=batch_size,
                                          shuffle=True)
validloader = torch.utils.data.DataLoader(valid_subset, batch_size=batch_size)
testloader = torch.utils.data.DataLoader(test_subset, batch_size=batch_size)

In [0]:
# Negative log-likelihood loss, averaged over the batch. It consumes
# log-probabilities, which is what Network.forward returns (log_softmax).
criterion = nn.NLLLoss(reduction='mean')

In [0]:
# Fresh, randomly initialised model and a plain SGD optimiser over all of
# its parameters.
learning_rate = 0.03
net = Network()
optimizer = optim.SGD(params=net.parameters(), lr=learning_rate)

In [0]:
net.to(device)
train_losses, valid_losses = [], []

# train model and evaluate with validation set
for e in range(epochs):
    # ---- training pass ----
    net.train()
    running_loss = 0.0
    for images, labels in trainloader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        log_ps = net(images)
        loss = criterion(log_ps, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # ---- validation pass ----
    valid_loss = 0.0
    n_correct = 0
    n_seen = 0

    net.eval()
    # Turn off gradients for validation, saves memory and computations
    with torch.no_grad():
        for images, labels in validloader:
            images, labels = images.to(device), labels.to(device)
            log_ps = net(images)
            # .item() keeps the history lists as plain Python floats, matching
            # train_losses, instead of 0-dim tensors living on `device`.
            valid_loss += criterion(log_ps, labels).item()

            # argmax of the log-probabilities is the predicted class
            top_class = log_ps.argmax(dim=1)
            # Count correct samples rather than averaging per-batch means, so
            # a smaller final batch does not skew the accuracy.
            n_correct += (top_class == labels).sum().item()
            n_seen += labels.size(0)

    net.train()

    accuracy = n_correct / n_seen
    train_losses.append(running_loss/len(trainloader))
    valid_losses.append(valid_loss/len(validloader))

    print("Epoch: {}/{}.. ".format(e+1, epochs),
          "Training Loss: {:.3f}.. ".format(train_losses[-1]),
          "Validation Loss: {:.3f}.. ".format(valid_losses[-1]),
          "Validation Accuracy: {:.3f}".format(accuracy))

In [0]:
# evaluate on testing set
net.eval()
test_loss = 0.0   # sum of per-batch mean losses, as a Python float
n_correct = 0
n_seen = 0

with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)
        output = net(images)
        test_loss += criterion(output, labels).item()

        # The network outputs log-probabilities; their argmax is the
        # predicted class.
        top_class = output.argmax(dim=1)
        # Count correct samples instead of averaging per-batch means, so the
        # smaller final batch is not over-weighted.
        n_correct += (top_class == labels).sum().item()
        n_seen += labels.size(0)

accuracy = n_correct / n_seen
print("Test Accuracy: {:.3f}".format(accuracy))

We are going to use the [FGSM attack](https://arxiv.org/abs/1412.6572) to fool our trained network. 

**Code and details are available [in this GitHub repo](https://github.com/1Konny/FGSM)**.

---



![FGSM example](https://raw.githubusercontent.com/1Konny/FGSM/master/misc/overview.PNG)


---

We are going to implement the following expression:


 $$\huge{x_{adv} = x_{benign} + \varepsilon * \text{sign}(\nabla_{x_{benign}}J(\theta, x_{benign}, y))}$$
 
 Note that we can distinguish 2 possible directions:
 
 * untargeted: we just want the input to be misclassified, no matter into which class. We compute the loss between the network output and the original (true) labels and move the input in the direction that *increases* it, exactly as in the expression above.
 
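 * targeted: we want the input to be classified as a particular class of our choice. We compute the loss between the network output and the target labels and move the input in the direction that *decreases* it, i.e. the sign of the perturbation is flipped: $x_{adv} = x_{benign} - \varepsilon \cdot \text{sign}(\nabla_{x_{benign}}J(\theta, x_{benign}, y_{target}))$.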
 
 
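 **Remember that we are dealing with image data**. This means that the adversarial example must still be a valid image: after adding the perturbation (of magnitude $\varepsilon$) to each pixel, we should clip the result to the valid pixel range (here $[0, 1]$, since `ToTensor` scales the MNIST pixels to that interval).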

In [0]:
from torch.autograd import Variable

def fgsm(x, y, net, criterion, targeted=False, eps=0.03, x_val_min=-1, x_val_max=1):
    """Craft adversarial examples with the Fast Gradient Sign Method (FGSM).

    A single step of size ``eps`` is taken along the sign of the gradient of
    the loss with respect to the input, then the result is clipped to
    ``[x_val_min, x_val_max]``.

    Args:
        x: batch of benign inputs.
        y: labels used in the loss: the true labels for an untargeted attack,
            the desired labels for a targeted one.
        net: model to attack; called as ``net(inputs)``.
        criterion: loss called as ``criterion(net(inputs), y)``.
        targeted: if ``False`` (default) the step *increases* the loss
            w.r.t. ``y``; if ``True`` the step *decreases* it.
        eps: perturbation magnitude applied to every input element.
        x_val_min: lower clipping bound of the adversarial input.
        x_val_max: upper clipping bound of the adversarial input.

    Returns:
        Tuple ``(x_adv, h_adv, h)``: the adversarial inputs, the model output
        on them, and the model output on the benign inputs. All three are
        detached from the autograd graph.

    Note:
        The input gradient is obtained with ``torch.autograd.grad``, so the
        ``.grad`` fields of the model parameters are neither cleared nor
        populated by this function.
    """
    # Fresh leaf tensor sharing x's data; x itself never enters the graph.
    x_req = x.detach().requires_grad_(True)
    loss = criterion(net(x_req), y)

    # Gradient w.r.t. the input only (parameter gradients stay untouched).
    (grad,) = torch.autograd.grad(loss, x_req)
    step = eps * grad.sign()

    with torch.no_grad():
        if targeted:
            # descend the loss towards the target labels
            x_adv = x_req.detach() - step
        else:
            # ascend the loss away from the true labels
            x_adv = x_req.detach() + step
        x_adv = torch.clamp(x_adv, x_val_min, x_val_max)

        h = net(x)
        h_adv = net(x_adv)

    return x_adv, h_adv, h

In [0]:
# Take the first test batch, move it to the compute device and attack it.
first_batch = next(iter(testloader))
x = first_batch[0].to(device)
y = first_batch[1].to(device)

# Untargeted FGSM with eps=0.08; pixels are clipped to [0, 1] (x_val_max
# keeps its default of 1).
attack_result = fgsm(x, y, net, criterion, eps=0.08, x_val_min=0)
x_adv, h_adv, h = attack_result

# Drop any autograd history so the images can be converted to NumPy later.
x_adv = x_adv.detach()

In [0]:
def _plot_prediction_row(fig, row, n_cols, batch, predicted, true_labels, class_names):
    """Draw one row of images titled "predicted (true)" on a 2-row figure.

    ``row`` is the 0-based row index, ``batch`` is indexed as
    ``batch[i][0, :, :]`` to get the i-th image, and titles are green when
    the prediction matches the true label, red otherwise.
    """
    for idx in range(n_cols):
        ax = fig.add_subplot(2, n_cols, row * n_cols + idx + 1, xticks=[], yticks=[])
        ax.imshow(batch[idx][0, :, :], interpolation='nearest', cmap='gray_r')
        ax.axis("off")
        pred_idx, true_idx = int(predicted[idx]), int(true_labels[idx])
        ax.set_title("{} ({})".format(class_names[pred_idx], class_names[true_idx]),
                     color=("green" if pred_idx == true_idx else "red"))


images = x
images_adv, labels = x_adv, y
classes = range(10)

# move model inputs to cuda, if GPU available
images_cuda = images.to(device)
images_adv_cuda = images_adv.to(device)

# get sample outputs (inference only, so no autograd graph is needed)
with torch.no_grad():
    output = net(images_cuda)
    output_adv = net(images_adv_cuda)
# convert output log-probabilities to predicted class
_, preds_tensor = torch.max(output, 1)
_, preds_tensor_adv = torch.max(output_adv, 1)
# Already 1-D (one entry per sample); not squeezed so that a batch with a
# single sample can still be indexed.
preds = preds_tensor.cpu().numpy()
preds_adv = preds_tensor_adv.cpu().numpy()
labels_np = labels.cpu().numpy()

# plot the images in the batch, along with predicted and true labels
fig = plt.figure(figsize=(20, 3))

images = images.detach().cpu().numpy()
images_adv = images_adv.detach().cpu().numpy()

# never try to show more images than the batch contains
n_images = min(10, len(images))

# top row: benign images, bottom row: adversarial images
_plot_prediction_row(fig, 0, n_images, images, preds, labels_np, classes)
_plot_prediction_row(fig, 1, n_images, images_adv, preds_adv, labels_np, classes)
    

## Challenge

Plot accuracy with respect to the maximum perturbation $\varepsilon$.

This kind of plot is called a **security evaluation curve**. It displays how much the accuracy drops as the size of the worst-case (adversarial) perturbation increases.

You should end up with a plot similar to the ones displayed [here](https://advx-secml.pluribus-one.it/).

In [0]:
# Exercise skeleton: sweep several values of eps and, for each one,
# iterate through the whole testing set.

eps_values = np.arange(0, 1, 0.05)
for i, eps in enumerate(eps_values):
    for imgs, labels in testloader:
        # TODO: generate adv images for this batch with perturbation `eps`

        # TODO: obtain the network output on the adv images

        pass

    # TODO: compute accuracy over the whole testing set for this `eps`

    # TODO: store accuracy

# TODO: plot curve (accuracy vs. eps)

In [0]:
# now try to add regularization and display again the security evaluation curve

# dropout network class

# optimizer with weight decay

# plot accuracy again