# Adversarial Attacks

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m12sl/dl-hse-2020/blob/master/06-adversarialx/adversarialx.ipynb)


**Цели тетрадки**

1. Познакомиться с двумя типами уязвимостей: perturbation-based и invariance-based.
2. Проверить реализуемость атак.

**План**

1. Натренировать сеть для атаки.
2. Реализовать два варианта perturbation-based атаки.
3. Реализовать invariance-based атаку.
4. Проверить устойчивость атак при реалистичных преобразованиях.

In [None]:
# install requirements
! pip install torchviz torchvision

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict

from IPython.display import clear_output

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torchvision import datasets, transforms
from torchvision.datasets import FashionMNIST
from torchvision import transforms

In [None]:
# Images are only converted to tensors here; adding normalisation is worth considering.
transform = transforms.Compose([transforms.ToTensor()])

# Train and validation splits share the same download directory and transform.
train_dataset = FashionMNIST(
    "./tmp",
    train=True,
    download=True,
    transform=transform,
)
val_dataset = FashionMNIST(
    "./tmp",
    train=False,
    download=True,
    transform=transform,
)

# Only the training data is shuffled; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"


def train(model, optimizer, dataloader):
    """Run one training pass over ``dataloader`` and return per-batch metrics.

    The model is moved to ``device`` and switched to training mode. For every
    batch the negative log-likelihood loss is minimised with one optimizer
    step, so the model output is treated as log-probabilities.

    Returns:
        A ``defaultdict(list)`` with keys ``'acc'`` and ``'loss'``, holding one
        float per processed batch.
    """
    model.to(device)
    model.train()
    history = defaultdict(list)

    for batch, target in tqdm(dataloader):
        batch = batch.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        log_probs = model(batch)
        batch_loss = F.nll_loss(log_probs, target)
        batch_loss.backward()
        optimizer.step()

        # The predicted class is the index of the largest output per sample.
        _, predicted = torch.max(log_probs, 1)
        batch_acc = torch.eq(predicted, target).float().mean()

        history['acc'].append(batch_acc.item())
        history['loss'].append(batch_loss.item())

    return history

def validate(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    The model is moved to ``device`` and switched to eval mode. Accuracy and
    negative log-likelihood loss are averaged over *samples* rather than over
    batches, so a smaller final batch does not get extra weight.

    Returns:
        ``{'acc': [mean_accuracy], 'loss': [mean_loss]}`` with each value
        wrapped in a one-element list so callers can ``extend`` their logs
        with it. An empty dataloader yields an empty dict.
    """
    model.to(device)
    model.eval()
    totals = defaultdict(float)
    seen = 0
    with torch.no_grad():
        for x, y in tqdm(dataloader):
            x, y = x.to(device), y.to(device)
            output = model(x)
            # Accumulate sums (not batch means) so the final average is per sample.
            totals['acc'] += torch.eq(torch.max(output, 1)[1], y).float().sum().item()
            totals['loss'] += F.nll_loss(output, y, reduction='sum').item()
            seen += y.size(0)

    if seen == 0:
        return {}
    return {k: [v / seen] for k, v in totals.items()}

def plot_logs(logs):
    """Redraw the training curves: one figure for accuracy, one for loss.

    Args:
        logs: mapping with per-batch training series under ``'acc'`` and
            ``'loss'``, per-epoch validation values under ``'val_acc'`` and
            ``'val_loss'``, and under ``'steps'`` the batch index at which
            each validation value was recorded.
    """
    clear_output()
    steps = logs.get('steps', [])
    for metric in ('acc', 'loss'):
        plt.figure()
        plt.plot(logs.get(metric, []), label='train', zorder=1)
        plt.scatter(steps, logs.get(f'val_{metric}', []),
                    marker='+', s=180, c='orange', label='val', zorder=2)
        plt.title(metric)
        # Legend and grid must be set before show(), on the figure holding the data.
        plt.legend()
        plt.grid()
        plt.show()


def train_model(model, optimizer, train_loader, val_loader, epochs=10):
    """Train ``model`` for ``epochs`` epochs, validating and plotting after each.

    Args:
        model: network to train.
        optimizer: optimizer used for the weight updates.
        train_loader: batches used for training.
        val_loader: batches used for validation after every epoch.
        epochs: number of passes over ``train_loader``.
    """
    logs = defaultdict(list)
    for epoch in range(epochs):
        # Use the optimizer that was passed in, not a global of the notebook.
        train_logs = train(model, optimizer, train_loader)

        for k, v in train_logs.items():
            logs[k].extend(v)

        val_logs = validate(model, val_loader)
        for k, v in val_logs.items():
            logs[f'val_{k}'].extend(v)
        # Remember how many training batches had been seen at this validation point.
        logs['steps'].append(len(logs['loss']))

        clear_output()
        plot_logs(logs)

**Соберите какую-нибудь сверточную сеть и натренируйте ее**

Для экономии времени можно остановиться, когда точность (accuracy) на валидации достигнет 0.8 или выше.

In [None]:
# Exercise: fill in the layers of the network.
# train() and validate() pass the model output straight into F.nll_loss, so the
# network has to return log-probabilities.
cnn = nn.Sequential(
    <your code here>
)

# SGD over all parameters of the network; trained for 4 epochs.
opt = torch.optim.SGD(cnn.parameters(), lr=0.01)
train_model(cnn, opt, train_loader, val_loader, epochs=4)

In [None]:
# Let's look at a few example images
def imshow(images):
    """Display a batch (or list) of image tensors as a single image grid.

    Args:
        images: images accepted by ``torchvision.utils.make_grid``. They may
            live on any device and may be part of an autograd graph.
    """
    # The file only imports names *from* torchvision, so bring make_grid in here.
    from torchvision.utils import make_grid

    # detach() + cpu() make the conversion to NumPy safe for attacked images.
    img = make_grid(images).detach().cpu().numpy()
    plt.figure()
    # Move the first axis of the grid to the end, the layout plt.imshow expects.
    plt.imshow(np.transpose(img, (1, 2, 0)))
    plt.axis('off')
    plt.show()

    
# Indices of the validation samples to display.
idx = [5, 10, 100, 0]
# Each dataset item is an (image, label) pair: show the images, then print the labels.
imshow([val_dataset[i][0] for i in idx])
print([val_dataset[i][1] for i in idx])

# Perturbation-based attack

Имея white-box модель, можно оптимизировать входные данные с помощью градиентного спуска.
Предлагается попробовать два варианта perturbation-based атак:

1. Наивный: $x = x + \varepsilon \, \nabla_x \mathrm{loss}(\theta, x, y)$

2. Fast Gradient Sign: $x = x + \varepsilon \, \mathrm{sign}[\nabla_x \mathrm{loss}(\theta, x, y)]$

In [None]:
from copy import deepcopy
def perturbation_attack_simple(inputs, labels, model, weight):    
    """Exercise stub for the naive perturbation attack.

    The intended update is ``x = x + eps * grad_x loss(theta, x, y)``;
    ``weight`` presumably plays the role of ``eps`` (confirm when implementing).

    Returns:
        ``(attacked, predicted)`` as produced by the implementation filled in
        below.
    """
    # compute the gradients with respect to the image
    # apply them to the image
    <your code>
    return attacked, predicted

# Take one validation batch to attack; the model is deep-copied so the attack
# works on its own copy of the trained network.
inputs, labels = next(iter(val_loader))
attacked, preds = perturbation_attack_simple(inputs, labels, deepcopy(cnn), 10)
print(labels)
imshow(attacked)
print(preds)

In [None]:
def perturbation_attack_sign(inputs, labels, model, weight):
    """Exercise stub for the Fast Gradient Sign attack.

    The intended update is ``x = x + eps * sign(grad_x loss(theta, x, y))``;
    ``weight`` presumably plays the role of ``eps`` (confirm when implementing).

    Returns:
        ``(attacked, predicted)`` as produced by the implementation filled in
        below.
    """
    # compute the gradients with respect to the image
    # apply them to the image
    <your code>
    return attacked, predicted


# Attack a deep copy of the trained network with the sign-based variant.
attacked, preds = perturbation_attack_sign(inputs, labels, deepcopy(cnn), 0.2)
print(labels)
# Show the attacked images returned above.
imshow(attacked.data)
print(preds)

In [None]:
def evaluate_network_attack(net, dataloader, corrupt_function, weight):
    """Print overall and per-class accuracy of ``net`` on attacked images.

    Args:
        net: network under attack.
        dataloader: yields ``(images, labels)`` batches.
        corrupt_function: called as ``corrupt_function(images, labels, net,
            weight)``; the first element of its result is used as the attacked
            batch.
        weight: attack strength forwarded to ``corrupt_function``.
    """
    num_classes = 10
    class_correct = [0] * num_classes
    class_total = [0] * num_classes

    for images, labels in tqdm(dataloader):
        # Keep the data on the same device the model was trained on.
        images, labels = images.to(device), labels.to(device)
        images, _ = corrupt_function(images, labels, net, weight)
        with torch.no_grad():
            outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        hits = (predicted == labels).tolist()
        # Count every sample of the batch, whatever the batch size is.
        for label, hit in zip(labels.tolist(), hits):
            class_correct[label] += int(hit)
            class_total[label] += 1

    total = sum(class_total)
    overall = 100. * sum(class_correct) / total if total else 0.
    print('Accuracy %d %% \n' % overall)

    for i in range(num_classes):
        # A class that never appeared is reported as 0 % instead of dividing by zero.
        per_class = 100. * class_correct[i] / class_total[i] if class_total[i] else 0.
        print('Accuracy of %2s : %2d %%' % (i, per_class))
        
# Measure how much each attack degrades accuracy on the validation set.
evaluate_network_attack(cnn, val_loader, perturbation_attack_simple, 10)
evaluate_network_attack(cnn, val_loader, perturbation_attack_sign, 0.1)

# Invariance-based attack

Пусть у нас есть два примера A и B разных классов (соответственно с разными логитами).
Надо изменить B так, чтобы сеть выдавала такой же набор логитов, как и на A.

In [None]:
import torchviz
import seaborn as sns

def invariance_attack(A, B, model, iterations=1000, weight=0.01):
    """Exercise stub for the invariance-based attack.

    Goal: change ``B`` so that ``model`` produces the same logits for it as it
    does for ``A``. ``iterations`` and ``weight`` are presumably the number of
    optimisation steps and the step size (confirm when implementing).

    Returns:
        ``attacked`` — to be defined by the implementation.
    """
    # get the target logits
    # write a suitable loss
    # optimise the image
    
    return attacked

# Show the two validation images used for the attack.
imshow([val_dataset[0][0], val_dataset[1][0]])
# unsqueeze(0) adds a leading dimension, presumably so each image forms a batch of one.
A = val_dataset[0][0].unsqueeze(0)
B = val_dataset[1][0].unsqueeze(0)

# Run the attack on a deep copy of the trained network.
x = invariance_attack(A, B, deepcopy(cnn))
# NOTE(review): plt.imshow expects a 2-D (or H x W x C) array, so invariance_attack
# presumably has to return the image already squeezed and detached — confirm once implemented.
plt.imshow(x, cmap='gist_gray')