# Boilerplate

Package installation, loading, and dataloaders. There's also a simple model defined. You can change it your favourite architecture if you want.

In [2]:
# !pip install tensorboardX

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time
import matplotlib.pyplot as plt

from torchvision import datasets, transforms
# from tensorboardX import SummaryWriter

use_cuda = False
device = torch.device("cuda" if use_cuda else "cpu")
batch_size = 64

np.random.seed(42)
torch.manual_seed(42)


## Dataloaders
train_dataset = datasets.MNIST('mnist_data/', train=True, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Simple NN. You can change this if you want. If you change it, mention the architectural details in your report.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(784, 50),
            nn.ReLU(),
            nn.Linear(50, 50),
            nn.ReLU(),
            nn.Linear(50, 50),
            nn.ReLU(),
            nn.Linear(50, 10))

    def forward(self, x):
        x = nn.Flatten()(x)
        x = self.linear_relu_stack(x)
        return x

# Add the data normalization as a first "layer" to the network
# this allows us to search for adverserial examples to the real image, rather than
# to the normalized image
model = Net()

model = model.to(device)
model.train()

Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=50, bias=True)
    (3): ReLU()
    (4): Linear(in_features=50, out_features=50, bias=True)
    (5): ReLU()
    (6): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [None]:
# The last argument 'targeted' can be used to toggle between a targeted and untargeted attack.
def fgsm(model, x, label, eps):
    #TODO: implement this as an intermediate step of PGD
    # Notes: put the model in eval() mode for this function
    model.eval()
    # x.requires_grad_()
    output = model(x)
    loss = F.cross_entropy(output, label)

    grad = torch.autograd.grad(
                loss, x, retain_graph=False, create_graph=False
            )[0]

    x_adv = x.detach() + eps * torch.sign(grad)
    return x_adv



def pgd_untargeted(model, x, y, k, eps, eps_step):
    #TODO: implement this 
    # Notes: put the model in eval() mode for this function
    
    model.eval()
    adv_images = x.clone().detach()
    for _ in range(k):
        adv_images.requires_grad = True
        x_adv = fgsm(model, adv_images, y, eps_step)
        delta = torch.clamp(x_adv - x, min=-eps, max=eps)
        adv_images = torch.clamp(x + delta, min = 0, max = 1)
    
    return adv_images


In [12]:
def train_model(model, train_loader, num_epochs, enable_defense=True, attack='pgd', eps=0.1):
    # TODO: implement this function that trains a given model on the MNIST dataset.
    # this is a general-purpose function for both standard training and adversarial training.
    # (toggle enable_defense parameter to switch between training schemes)
    model.train()

    lr = 1e-2
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    for epoch in range(num_epochs):
        for index, (images, labels) in enumerate(train_loader):
            logits = model(images)
            optimizer.zero_grad()

            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch}/{num_epochs}] Loss = {loss.item():.3f}')

    # if enable_defense:
    #         for epoch in range(num_epochs):
    #             for index, (images, labels) in enumerate(train_loader):
    #                 adversary_images = pgd_untargeted(model, images, labels, 10,  eps, 0.01)
    #                 model.train()
    #                 optimizer.zero_grad()
    #                 logits = model(adversary_images)
    #                 loss = F.cross_entropy(logits, labels)
    #                 loss.backward()
    #                 optimizer.step()
                    
    #             print(f'Epoch [{epoch}/{num_epochs}] Loss = {loss.item():.3f}')

## Standard Training

In [13]:
## train the original model
model = model.to(device)
model.train()

train_model(model, train_loader, 30, False)
torch.save(model.state_dict(), 'standard-weights.pt')

Epoch [0/30] Loss = 0.136
Epoch [1/30] Loss = 0.074
Epoch [2/30] Loss = 0.026
Epoch [3/30] Loss = 0.106
Epoch [4/30] Loss = 0.047
Epoch [5/30] Loss = 0.035
Epoch [6/30] Loss = 0.060
Epoch [7/30] Loss = 0.212
Epoch [8/30] Loss = 0.122
Epoch [9/30] Loss = 0.128
Epoch [10/30] Loss = 0.010
Epoch [11/30] Loss = 0.002
Epoch [12/30] Loss = 0.014
Epoch [13/30] Loss = 0.008
Epoch [14/30] Loss = 0.013
Epoch [15/30] Loss = 0.001
Epoch [16/30] Loss = 0.017
Epoch [17/30] Loss = 0.004
Epoch [18/30] Loss = 0.001
Epoch [19/30] Loss = 0.001
Epoch [20/30] Loss = 0.004
Epoch [21/30] Loss = 0.056
Epoch [22/30] Loss = 0.000
Epoch [23/30] Loss = 0.007
Epoch [24/30] Loss = 0.025
Epoch [25/30] Loss = 0.001
Epoch [26/30] Loss = 0.014
Epoch [27/30] Loss = 0.000
Epoch [28/30] Loss = 0.005
Epoch [29/30] Loss = 0.000


### Standard Accuracy

In [14]:
correct = 0
model.eval()
for j, (images, labels) in enumerate(test_loader):
  images, labels = images, labels
  logits = model(images)
  _, preds = torch.max(logits, 1)
  correct += (preds == labels).sum().item()
  # print('Batch [{}/{}]'.format(j+1, len(test_loader)))
model.train()
print('Accuracy = {}%'.format(float(correct) * 100 / 10000))

Accuracy = 97.37%


In [43]:
def interval_analysis(model: Net, inputs, epsilon):
    inputs = inputs.view(inputs.size(0), -1)
    lb = inputs - epsilon
    ub = inputs + epsilon

    layers = model.linear_relu_stack
    for index in range(0,len(layers) - 1, 2):
        layer = layers[index]
        weight, bias = layer.weight, layer.bias

        curr_lb = torch.matmul(lb, weight.t()) + bias
        curr_ub = torch.matmul(ub, weight.t()) + bias
        
        curr_lb = torch.relu(curr_lb)
        curr_ub = torch.relu(curr_ub)

        lb = torch.minimum(curr_lb, curr_ub)
        ub = torch.maximum(curr_lb, curr_ub)

    layer = layers[-1]
    weight, bias = layer.weight, layer.bias

    curr_lb = torch.matmul(lb, weight.t()) + bias
    curr_ub = torch.matmul(ub, weight.t()) + bias

    lb = torch.minimum(curr_lb, curr_ub)
    ub = torch.maximum(curr_lb, curr_ub)

    return lb, ub

def evaluate_robustness(model, test_loader, epsilons):
    model.eval()
    robust_accuracies  = []
    with torch.no_grad():
        for epsilon in epsilons:
            correct = 0
            total = 0
            for j, (images, labels) in enumerate(test_loader):
                lb, ub = interval_analysis(model, images, epsilon)
                lb_values = lb[torch.arange(images.size(0)), labels]

                mask = torch.full_like(ub, float('-inf'))
                mask[torch.arange(images.size(0)), labels] = ub[torch.arange(images.size(0)), labels]
                outputs_masked = torch.where(mask == ub, float('-inf'), ub)
                max_values_excluding_class = outputs_masked.max(dim=1).values
                
                differences = lb_values > max_values_excluding_class
                correct += differences.sum().item()
                total += labels.size(0)
            accuracy = correct / total
            robust_accuracies.append(accuracy)
            print(f"Epsilon: {epsilon}, Accuracy: {accuracy:.4f}")

            # _, preds = torch.max(logits, 1)
            # correct += (preds == labels).sum().item()
epsilons = np.linspace(0.01, 0.1, 10)
evaluate_robustness(model, test_loader, epsilons)

Epsilon: 0.01, Accuracy: 0.0816
Epsilon: 0.020000000000000004, Accuracy: 0.0794
Epsilon: 0.030000000000000006, Accuracy: 0.0769
Epsilon: 0.04000000000000001, Accuracy: 0.0752
Epsilon: 0.05000000000000001, Accuracy: 0.0738
Epsilon: 0.06000000000000001, Accuracy: 0.0720
Epsilon: 0.07, Accuracy: 0.0708
Epsilon: 0.08, Accuracy: 0.0686
Epsilon: 0.09000000000000001, Accuracy: 0.0666
Epsilon: 0.1, Accuracy: 0.0656
