# Boilerplate

Package installation, loading, and dataloaders. There's also a simple model defined. You can change it to your favourite architecture if you want.

In [None]:
# !pip install tensorboardX

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time
import matplotlib.pyplot as plt

from torchvision import datasets, transforms
# from tensorboardX import SummaryWriter

# Run configuration: mini-batch size and the torch device used by the cells below.
batch_size = 64
use_cuda = False
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Fix both the torch and the numpy RNG seed so repeated runs start identically.
torch.manual_seed(42)
np.random.seed(42)


## Dataloaders
# Both MNIST splits use the same preprocessing: ToTensor only (no normalization here).
to_tensor = transforms.Compose([transforms.ToTensor()])

train_dataset = datasets.MNIST('mnist_data/', train=True, download=True, transform=to_tensor)
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True, transform=to_tensor)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False
)

## Simple NN. You can change this if you want. If you change it, mention the architectural details in your report.
class Net(nn.Module):
    """Two-layer fully connected classifier: 784 inputs -> 200 hidden (ReLU) -> 10 logits."""

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay `fc` / `fc2`.
        self.fc = nn.Linear(28 * 28, 200)
        self.fc2 = nn.Linear(200, 10)

    def forward(self, x):
        """Flatten the input to rows of 784 values and return the raw class logits."""
        flat = x.view(-1, 28 * 28)
        hidden = torch.relu(self.fc(flat))
        return self.fc2(hidden)

class Normalize(nn.Module):
    """Standardize inputs as ``(x - mean) / std``.

    Args:
        mean: value subtracted from every element (default 0.1307).
        std: value every element is divided by (default 0.3081).

    The statistics are stored as plain Python attributes rather than buffers,
    so the module contributes no entries to ``state_dict()``.
    """

    def __init__(self, mean=0.1307, std=0.3081):
        super().__init__()
        self.mean = mean
        self.std = std

    def forward(self, x):
        """Return the standardized copy of ``x``."""
        return (x - self.mean) / self.std

# Normalization is the first "layer" of the network, so adversarial examples
# are searched for in the space of real images rather than in the space of
# normalized images.
model = nn.Sequential(Normalize(), Net()).to(device)
model.train()

# Implement the Attacks

Functions are given a simple useful signature that you can start with. Feel free to extend the signature as you see fit.

You may find it useful to create a 'batched' version of PGD that you can use to create the adversarial attack.

In [51]:
# The last argument 'targeted' can be used to toggle between a targeted and untargeted attack.
def fgsm(model, x, y, eps, targeted=False):
    """Take one Fast Gradient Sign Method step of size ``eps`` from ``x``.

    Args:
        model: callable mapping ``x`` to class logits (fed to ``F.cross_entropy``).
        x: input batch. It is not modified; the step is taken from a detached copy.
        y: labels passed to ``F.cross_entropy``. The ground truth for an untargeted
            attack, the desired class for a targeted one.
        eps: step size applied to the sign of the input gradient.
        targeted: if False (default) step so the loss on ``y`` increases;
            if True step in the opposite direction so the loss on ``y`` decreases.

    Returns:
        A detached tensor shaped like ``x``, clamped to ``[0, 1]``.

    The model is switched to eval mode for the gradient computation and its
    previous train/eval mode is restored afterwards. Parameter ``.grad`` fields
    are left untouched.
    """
    was_training = model.training
    model.eval()
    try:
        # Work on a fresh leaf tensor so the caller's tensor never has its
        # requires_grad flag flipped (and non-leaf inputs are accepted too).
        x_adv = x.clone().detach().requires_grad_(True)
        loss = F.cross_entropy(model(x_adv), y)
        # Differentiate w.r.t. the input only; nothing is accumulated into the
        # model parameters' .grad.
        grad, = torch.autograd.grad(loss, x_adv)
    finally:
        model.train(was_training)

    # eta = eps * sign(grad); ascend the loss when untargeted, descend when targeted.
    step = eps * grad.sign()
    if targeted:
        step = -step

    return torch.clamp(x_adv.detach() + step, 0, 1)

def pgd_untargeted(model, x, y, k, eps, eps_step):
    """Untargeted PGD: ``k`` FGSM steps, each projected back into the eps-ball around ``x``.

    Args:
        model: callable mapping inputs to class logits.
        x: input batch. It is not modified; the attack runs on a detached copy.
        y: ground truth labels for ``x``.
        k: number of FGSM steps.
        eps: radius of the projection region around ``x`` (element-wise bound),
            expressed for inputs in ``[0, 1]``.
        eps_step: step size of a single FGSM iteration.

    Returns:
        A detached tensor shaped like ``x``, within ``eps`` of ``x`` element-wise
        and clamped to ``[0, 1]``.

    The model is put in eval mode while the attack runs and its previous
    train/eval mode is restored before returning.
    """
    was_training = model.training
    model.eval()
    try:
        x_init = x.clone().detach()
        ball_max = x_init + eps
        ball_min = x_init - eps

        # Iterate on a private copy so the caller's tensor never gets its
        # requires_grad flag changed.
        x_adv = x_init.clone()
        for _ in range(k):
            x_adv = fgsm(model, x_adv, y, eps_step)
            # Project onto the eps-ball first, then onto the valid pixel range.
            x_adv = torch.clamp(x_adv, ball_min, ball_max)
            x_adv = torch.clamp(x_adv, 0, 1).detach()
    finally:
        model.train(was_training)

    return x_adv

    

# Implement Adversarial Training

In [62]:
def train_model(model, num_epochs, enable_defense=True, attack='pgd', eps=0.1, k=10, lr=1e-3):
    """Train ``model`` on the MNIST ``train_loader`` with SGD and cross-entropy.

    With ``enable_defense=False`` this is standard training. With
    ``enable_defense=True`` every batch is extended with PGD adversarial
    examples generated against the current model, and the model is trained on
    the concatenation of clean and adversarial inputs.

    Args:
        model: network to train; it is updated in place.
        num_epochs: number of passes over ``train_loader``.
        enable_defense: toggle adversarial training.
        attack: name of the attack used for the defense. Only PGD is
            implemented; the value is currently not inspected.
        eps: radius of the PGD projection region.
        k: number of PGD steps; each step has size ``eps / k``.
        lr: SGD learning rate, passed explicitly so the function does not
            depend on the installed torch version providing a default.

    Prints the sum of the per-batch losses after every epoch.
    """
    optimizer = optim.SGD(model.parameters(), lr=lr)
    eps_step = eps / k if k > 0 else 0.0

    for _ in range(num_epochs):
        model.train()
        full_loss = 0.

        for data, label in train_loader:
            data = data.to(device)
            label = label.to(device)

            if enable_defense:
                adversarial_data = pgd_untargeted(model, data, label, k, eps, eps_step)
                # The attack runs the model in eval mode; make sure the
                # optimisation step below happens in train mode.
                model.train()
                # Detach both halves so no gradient w.r.t. the inputs is
                # computed during the training backward pass.
                full_data = torch.cat((data.detach(), adversarial_data.detach()))
                label = torch.cat((label, label))
            else:
                full_data = data

            # standard training step
            optimizer.zero_grad()
            out = model(full_data)
            loss = F.cross_entropy(out, label)

            loss.backward()
            optimizer.step()

            full_loss += loss.item()

        print("loss:", full_loss)

In [69]:
def test_model_on_attacks(model, attack='pgd', k=10, eps=0.1):
    """Evaluate ``model`` on ``test_loader``, optionally under an untargeted PGD attack.

    Args:
        model: network to evaluate; it is put in eval mode.
        attack: ``'pgd'`` evaluates on clean and PGD-perturbed inputs; any
            other value evaluates on clean inputs only.
        k: number of PGD steps; each step has size ``eps / k``.
        eps: radius of the PGD projection region.

    Prints (nothing is returned):
        * without attack: ``accuracy`` - percentage of clean inputs classified correctly.
        * with PGD: ``robust accuracy`` - percentage correct over clean and
          adversarial inputs pooled together (i.e. the mean of the two
          accuracies), and ``adversarial accuracy`` - percentage correct on the
          adversarial inputs alone.
    """
    use_pgd = attack == 'pgd'
    eps_step = eps / k if k > 0 else 0.0

    model.eval()
    clean_correct, adv_correct, num_samples = 0, 0, 0

    for data, label in test_loader:
        data = data.to(device)
        label = label.to(device)
        batch_len = label.size(0)

        if use_pgd:
            adversarial_data = pgd_untargeted(model, data, label, k, eps, eps_step)
            model.eval()  # defensive: keep eval mode whatever the attack left behind
            # Clean inputs first, adversarial inputs second.
            inputs = torch.cat((data.detach(), adversarial_data.detach()))
        else:
            inputs = data

        # Pure inference: no autograd graph is needed for the predictions.
        with torch.no_grad():
            predicted = model(inputs).argmax(dim=1)

        num_samples += batch_len
        clean_correct += (predicted[:batch_len] == label).sum().item()
        if use_pgd:
            adv_correct += (predicted[batch_len:] == label).sum().item()

    if use_pgd:
        # Every test sample is scored twice: once clean, once adversarial.
        total = 2 * num_samples
        print("for eps", eps)
        print("robust accuracy", 100 * (clean_correct + adv_correct) / total)
        print("adversarial accuracy", 100 * 2 * adv_correct / total)
    else:
        print("accuracy", 100 * clean_correct / num_samples)
    

# Study Accuracy, Quality, etc.

Compare the various results and report your observations on the submission.

In [12]:
## train the original model
# Standard (non-adversarial) training for 20 epochs; the weights are written to 'weights.pt'.
model = nn.Sequential(Normalize(), Net()).to(device)
model.train()

train_model(model, num_epochs=20, enable_defense=False)
torch.save(model.state_dict(), 'weights.pt')

loss: 1502.8045136928558
loss: 744.9502051472664
loss: 532.8573279678822
loss: 448.432460218668
loss: 402.5706770569086
loss: 373.30283619463444
loss: 352.3384487628937
loss: 336.67286694049835
loss: 323.8271854966879
loss: 313.2625364214182
loss: 304.1317924633622
loss: 296.0572108477354
loss: 288.8800073117018
loss: 282.5764864012599
loss: 276.43954711407423
loss: 270.96807638555765
loss: 265.6841846704483
loss: 260.91436729580164
loss: 256.296388015151
loss: 251.809776365757


In [72]:
## basic test
# Clean accuracy of the normally trained model. Any attack value other than
# 'pgd' makes test_model_on_attacks skip the attack.
# The model and the checkpoint are placed on `device`, matching where the
# evaluation function moves the data.
model = nn.Sequential(Normalize(), Net()).to(device)
model.load_state_dict(torch.load('weights.pt', map_location=device))

test_model_on_attacks(model, attack='None', eps=0)

accuracy 92.59


In [59]:
## PGD attack
# Attack the normally trained model with 10-step PGD at increasing radii.
# The model and the checkpoint are placed on `device`, matching where the
# evaluation function moves the data.
model = nn.Sequential(Normalize(), Net()).to(device)
model.load_state_dict(torch.load('weights.pt', map_location=device))

for eps in [0.05, 0.1, 0.15, 0.2]:
    test_model_on_attacks(model, attack='pgd', k=10, eps=eps)

for eps 0.05
robust accuracy 81.29
adversarial accuracy 69.99
for eps 0.1
robust accuracy 57.535
adversarial accuracy 22.48
for eps 0.15
robust accuracy 47.72
adversarial accuracy 2.85
for eps 0.2
robust accuracy 46.405
adversarial accuracy 0.22


In [63]:
## PGD based adversarial training
# Train a fresh model for 20 epochs on clean + PGD examples at radius eps,
# then save the weights under a name that records eps.
# The model is placed on `device`, matching where train_model moves the data.
model = nn.Sequential(Normalize(), Net()).to(device)
eps = 0.1
train_model(model, 20, True, 'pgd', eps)
torch.save(model.state_dict(), f'weights_AT_{eps}.pt')

loss: 2054.8571766614914
loss: 1381.4369002580643
loss: 1097.6894508600235
loss: 969.4192295074463
loss: 893.9285340309143
loss: 842.7080737948418
loss: 804.9950725436211
loss: 776.2170244455338
loss: 753.8474614620209
loss: 735.5502720177174
loss: 720.5899590551853
loss: 707.5908622443676
loss: 696.4534449875355
loss: 686.5115034282207
loss: 677.5861720442772
loss: 669.0382596552372
loss: 661.0957971513271
loss: 653.3786489069462
loss: 646.0679879486561
loss: 639.2446138858795


In [71]:
## PGD attack
# Evaluate the adversarially trained model: clean accuracy first, then
# 10-step PGD at increasing radii.
# The model and the checkpoint are placed on `device`, matching where the
# evaluation function moves the data.
model = nn.Sequential(Normalize(), Net()).to(device)
model.load_state_dict(torch.load('weights_AT_0.1.pt', map_location=device))

test_model_on_attacks(model, attack='None', eps=0)

for eps in [0.05, 0.1, 0.15, 0.2]:
    test_model_on_attacks(model, attack='pgd', k=10, eps=eps)

accuracy 91.32
for eps 0.05
robust accuracy 86.765
adversarial accuracy 82.21
for eps 0.1
robust accuracy 78.77
adversarial accuracy 66.22
for eps 0.15
robust accuracy 66.83
adversarial accuracy 42.34
for eps 0.2
robust accuracy 53.575
adversarial accuracy 15.83
