In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
%pip install adversarial-robustness-toolbox

Collecting adversarial-robustness-toolbox
  Downloading adversarial_robustness_toolbox-1.18.2-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scikit-learn>=0.22.2
  Using cached scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn, adversarial-robustness-toolbox
Successfully installed adversarial-robustness-toolbox-1.18.2 joblib-1.4.2 scikit-learn-1.5.2 threadpoolctl-3.5.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys
sys.path.insert(1, "/home/oru2/project/project")

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time
# import matplotlib.pyplot as plt
import attacks
from privacy_accountant import PrivacyAccountant
from tqdm import tqdm
from torchvision import datasets, transforms

In [None]:
use_cuda = True
device = torch.device("cuda:0" if use_cuda else "cpu")
batch_size = 100

np.random.seed(42)
torch.manual_seed(42)


## Dataloaders
train_dataset = datasets.MNIST('../mnist_data/', train=True, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))
test_dataset = datasets.MNIST('../mnist_data/', train=False, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
attack = attacks.PGD()


In [20]:
def train_model(model, train_loader, num_epochs, enable_defense=True, attack_type='pgd', eps=0.1):
    # TODO: implement this function that trains a given model on the MNIST dataset.
    # this is a general-purpose function for both standard training and adversarial training.
    # (toggle enable_defense parameter to switch between training schemes)
    model.train()
    # epsilons_clean = []
    lr = 1e-2
    losses = []
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    for epoch in tqdm(range(num_epochs)):
        for index, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)
            logits = model(images)
            optimizer.zero_grad()

            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

            adversary_images = attack.pgd_untargeted(model, images, labels, 10, eps, 0.01).to(device)
            optimizer.zero_grad()
            logits = model(adversary_images)
            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())
                    # print(f'Epoch [{epoch}/{num_epochs}] Loss = {loss.item():.3f}')

In [None]:
from model import fcNet
fc_model = fcNet(784, 128, 10).to(device)
num_epochs = 20
fc_model


fcNet(
  (network): Sequential(
    (0): Linear(in_features=784, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=10, bias=True)
  )
)

In [21]:
train_model(fc_model, train_loader, num_epochs)

100%|██████████| 100/100 [19:27<00:00, 11.67s/it]


In [23]:
correct = 0
fc_model.eval()
for j, (images, labels) in enumerate(test_loader):
  images, labels = images.to(device), labels.to(device)
  logits = fc_model(images)
  _, preds = torch.max(logits, 1)
  correct += (preds == labels).sum().item()
  # print('Batch [{}/{}]'.format(j+1, len(test_loader)))
fc_model.train()
print('Accuracy = {}%'.format(float(correct) * 100 / 10000))

Accuracy = 98.92%


In [24]:
correct = 0
eps = 0.1
fc_model.eval()
for j, (images, labels) in enumerate(test_loader):
  images, labels = images.to(device), labels.to(device)
  adv_images = attack.pgd_untargeted(fc_model, images, labels, 20, eps, 0.01).to(device)
  logits = fc_model(images)
  adv_logits = fc_model(adv_images)
  _, preds = torch.max(logits, 1)
  _, adv_preds = torch.max(adv_logits, 1)
  correct += (preds == labels).sum().item()
  correct += (adv_preds == labels).sum().item()
  # print('Batch [{}/{}]'.format(j+1, len(test_loader)))
fc_model.train()
print('Accuracy = {}%'.format(float(correct) * 100 / 20000))

Accuracy = 94.285%


In [25]:
from art.attacks.inference.membership_inference import MembershipInferenceBlackBox
from art.estimators.classification import PyTorchClassifier

In [None]:
optimizer = torch.optim.Adam(fc_model.parameters())
criterion = nn.CrossEntropyLoss()

# Wrap the PyTorch model in ART's PyTorchClassifier
art_classifier = PyTorchClassifier(
    model=fc_model,
    loss=criterion,
    optimizer=optimizer,
    input_shape=(28, 28),
    nb_classes=10
)
attack_train_size = 10000
attack_test_size = 5000

x_train = train_dataset.data
y_train = train_dataset.targets


x_test = test_dataset.data
y_test = test_dataset.targets

attack = MembershipInferenceBlackBox(estimator=art_classifier, attack_model_type="nn")
attack.fit(x_train[:attack_train_size], y_train[:attack_train_size], x_test[:attack_test_size], y_test[:attack_test_size])

mlp_inferred_train_bb = attack.infer(x_train[attack_train_size:], y_train[attack_train_size:])
mlp_inferred_test_bb = attack.infer(x_test[attack_test_size:], y_test[attack_test_size:])

# check accuracy
mlp_train_acc_bb = np.sum(mlp_inferred_train_bb) / len(mlp_inferred_train_bb)
mlp_test_acc_bb = 1 - (np.sum(mlp_inferred_test_bb) / len(mlp_inferred_test_bb))
mlp_acc_bb = (mlp_train_acc_bb * len(mlp_inferred_train_bb) + mlp_test_acc_bb * len(mlp_inferred_test_bb)) / (len(mlp_inferred_train_bb) + len(mlp_inferred_test_bb))


In [34]:
print(f"Members Accuracy: {mlp_train_acc_bb:.4f}")
print(f"Non Members Accuracy {mlp_test_acc_bb:.4f}")
print(f"Attack Accuracy {mlp_acc_bb:.4f}")

Members Accuracy: 0.9994
Non Members Accuracy 0.0010
Attack Accuracy 0.8568


In [37]:
torch.save(fc_model, "models/adv.pt")