<a href="https://colab.research.google.com/github/naraquev/Private-AI/blob/master/research/Explaining_Harnessing_Adv_Examp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

EXPLAINING AND HARNESSING ADVERSARIAL EXAMPLES https://arxiv.org/pdf/1412.6572.pdf

In [0]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from IPython.display import clear_output
from numpy import linalg as LA
import warnings
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow, imsave
%matplotlib inline
import torch.nn.functional as F
from torch.autograd import Variable
import os
import pandas as pd

In [0]:
# LeNet Model definition
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [0]:
# FGSM attack code
def fgsm_attack(image, epsilon, data_grad):    
    sign_data_grad = data_grad.sign()   
    perturbed_image = image + epsilon*sign_data_grad
    perturbed_image = torch.clamp(perturbed_image, 0, 1)
    return perturbed_image
def evaluate():
  model.to(device)
  # Model in test mode, dropout is off
  model.eval()
  accuracy=0
  for images, labels in test_loader:
    images, labels = images.to(device), labels.to(device)
    output = model.forward(images)
    ps = torch.exp(output)
    top_p, top_class = ps.topk(1, dim=1)
    equals = top_class == labels.view(*top_class.shape)
    accuracy += torch.mean(equals.type(torch.FloatTensor))
  print(accuracy/len(test_loader))

In [0]:
transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, ), (0.5,))])
train_data = datasets.MNIST('../data', train=True, download=True, transform=transform)
test_data = datasets.MNIST('../data', train=False, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=64, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.NLLLoss()
model = Net().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.003)


In [7]:
epochs = 10
for e in range(epochs):
  running_loss = 0
  for data, label in train_loader:
    model.train()
    data, label = data.to(device), label.to(device)
    output = model(data)
    optimizer.zero_grad()
    loss = criterion(output, label)
    loss.backward()    
    optimizer.step()    
    running_loss += loss.item()    
  else:
    print('Loss: ', running_loss)
    evaluate()

Loss:  227.94026565551758
tensor(0.9796)
Loss:  47.88513109833002
tensor(0.9887)
Loss:  34.977292973548174
tensor(0.9872)
Loss:  29.426972799003124
tensor(0.9894)
Loss:  27.47946254350245
tensor(0.9885)


KeyboardInterrupt: ignored

In [0]:
wrongs = []
epsilons = [0, .05, .1, .15, .2, .25, .3]
adv_loader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=True)
for epsilon in epsilons:  
  print(epsilon)
  wrong = 0
  for data, target in adv_loader:
    data, target = data.to(device), target.to(device)
    data.requires_grad= True
    output = model(data)
    init_pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
    if init_pred.item() != target.item():
        continue
    loss = criterion(output, target)
    model.zero_grad()
    loss.backward()
    data_grad = data.grad.data
    perturbed_data = fgsm_attack(data, epsilon, data_grad)
    output = model(perturbed_data)
    final_pred = output.max(1, keepdim=True)[1] 
    if final_pred.item() != target.item():
      wrong+=1
      adv_ex = perturbed_data.squeeze().detach().cpu().numpy()
  else:
    wrongs.append(wrong)

In [15]:
for w, e in zip(wrongs, epsilons):
  print('There are {} misclassified with epsilon {}'.format(w, e))

There are 126 misclassified with epsilon 0
There are 239 misclassified with epsilon 0.05
There are 424 misclassified with epsilon 0.1
There are 709 misclassified with epsilon 0.15
There are 1115 misclassified with epsilon 0.2
There are 1651 misclassified with epsilon 0.25
There are 2271 misclassified with epsilon 0.3
