# Privacy in MNIST Dataset

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data import Subset

In [2]:
# convert data to torch.FloatTensor
transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, ), (0.5,))])

# choose the training and test datasets
train_data = datasets.MNIST(root='data', train=True, download=True, transform=transform)
test_data = datasets.MNIST(root='data', train=False, download=True, transform=transform)

len(train_data), len(test_data)

(60000, 10000)

In [38]:
batch_size = 32
num_teachers = 100

def teacher_loader_fn(train_data, num_teachers):
    teacher_loaders = []
    data_size = len(train_data) // num_teachers

    for i in range(num_teachers):
        indices = list(range(i*data_size, (i+1)*data_size))
        subset = Subset(train_data, indices)
        loader = torch.utils.data.DataLoader(subset, batch_size=batch_size)
        teacher_loaders.append(loader)

    return teacher_loaders

teacher_loaders = teacher_loader_fn(train_data, num_teachers)
len(teacher_loaders[99])

19

In [39]:
num_student_train_set = 9000

student_train_data = Subset(test_data, list(range(num_student_train_set)))
student_test_data = Subset(test_data, list(range(num_student_train_set, len(test_data))))

student_train_loader = torch.utils.data.DataLoader(student_train_data, batch_size=batch_size)
student_test_loader = torch.utils.data.DataLoader(student_test_data, batch_size=batch_size)

len(student_train_loader), len(student_test_loader)

(282, 32)

In [40]:
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x

In [46]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def train(model, trainloader, criterion, optimizer, epochs, print_every):
    model.to(device)
    # steps = 0
    running_loss = 0
    for e in range(epochs):
        # Model in training mode, dropout is on
        model.train()
        for images, labels in trainloader:
            images, labels = images.to(device), labels.to(device)
            # steps += 1
            
            optimizer.zero_grad()
            
            output = model.forward(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()

In [47]:
# Instantiate and train the models for each teacher
lr=0.003
epochs=10
print_every=120

def train_teachers(num_teachers):
    models = []
    for t in range(num_teachers):
        model = Network()
        criterion = nn.NLLLoss()
        optimizer = optim.Adam(model.parameters(), lr)
        train(model, teacher_loaders[t], criterion, optimizer, epochs, print_every)
        models.append(model)
    print("{} techers trained".format(t+1))
    return models

teacher_models = train_teachers(num_teachers)

100 techers trained


In [48]:
def predict(model, dataloader):
    outputs = torch.zeros(0, dtype=torch.long).to(device)
    model.to(device)
    model.eval()
    for images, labels in dataloader:
        images, labels = images.to(device), labels.to(device)
        output = model.forward(images)
        ps = torch.argmax(torch.exp(output), dim=1)
        outputs = torch.cat((outputs, ps))
    
    return outputs

In [49]:
def prediction_fn(models, data_loader):
    preds = torch.torch.zeros((len(models), num_student_train_set), dtype=torch.long)
    for i, model in enumerate(models):
        results = predict(model, data_loader)
        preds[i] = results
    return preds.numpy()
preds = prediction_fn(teacher_models, student_train_loader)
preds.shape

(100, 9000)

In [50]:
epsilon = 0.2

def get_student_labels(preds, epsilon):
    labels = np.array([]).astype(int)
    for image_preds in np.transpose(preds):
        label_counts = np.bincount(image_preds, minlength=10)
        beta = 1 / epsilon

        for i in range(len(label_counts)):
            label_counts[i] += np.random.laplace(0, beta, 1)

        new_label = np.argmax(label_counts)
        labels = np.append(labels, new_label)
    
    return labels
student_labels = get_student_labels(preds, epsilon)
student_labels.shape

(9000,)

In [51]:
from syft.frameworks.torch.differential_privacy import pate

data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=preds, indices=student_labels, noise_eps=epsilon, delta=1e-5)
print("Data Independent Epsilon:", data_ind_eps)
print("Data Dependent Epsilon:", data_dep_eps)

Data Independent Epsilon: 1451.5129254649705
Data Dependent Epsilon: 12.587310509756408


In [52]:
# removing the test data labels and adding student labels for every batch of student train loader
def student_loader(student_train_loader, labels):
    student_iterator = iter(student_train_loader)
    for i, (data, _) in enumerate(student_iterator):
        student_train_label = torch.from_numpy(labels[i*len(data):(i+1)*len(data)])
        yield data, student_train_label

In [53]:
student_model = Network()
criterion = nn.NLLLoss()
optimizer = optim.Adam(student_model.parameters(), lr=0.001)
student_model.to(device)
steps = 0
running_loss = 0
for e in range(epochs):
    student_model.train()
    train_loader = student_loader(student_train_loader, student_labels)
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        steps += 1

        optimizer.zero_grad()
        output = student_model.forward(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if steps % 50 == 0:
            test_loss = 0
            accuracy = 0
            student_model.eval()
            with torch.no_grad():
                for images, labels in student_test_loader:
                    images, labels = images.to(device), labels.to(device)
                    log_ps = student_model(images)
                    test_loss += criterion(log_ps, labels).item()
                    
                    ps = torch.exp(log_ps)
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == labels.view(*top_class.shape)
                    accuracy += torch.mean(equals.type(torch.FloatTensor))
            student_model.train()
            print("Epoch: {}/{}.. ".format(e+1, epochs),
                  "Training Loss: {:.3f}.. ".format(running_loss/len(student_train_loader)),
                  "Test Loss: {:.3f}.. ".format(test_loss/len(student_test_loader)),
                  "Test Accuracy: {:.3f}".format(accuracy/len(student_test_loader)))
            running_loss = 0

Epoch: 1/10..  Training Loss: 0.383..  Test Loss: 1.763..  Test Accuracy: 0.593
Epoch: 1/10..  Training Loss: 0.254..  Test Loss: 0.844..  Test Accuracy: 0.774
Epoch: 1/10..  Training Loss: 0.161..  Test Loss: 0.591..  Test Accuracy: 0.835
Epoch: 1/10..  Training Loss: 0.108..  Test Loss: 0.427..  Test Accuracy: 0.889
Epoch: 1/10..  Training Loss: 0.099..  Test Loss: 0.300..  Test Accuracy: 0.919
Epoch: 2/10..  Training Loss: 0.102..  Test Loss: 0.305..  Test Accuracy: 0.922
Epoch: 2/10..  Training Loss: 0.084..  Test Loss: 0.257..  Test Accuracy: 0.922
Epoch: 2/10..  Training Loss: 0.075..  Test Loss: 0.242..  Test Accuracy: 0.929
Epoch: 2/10..  Training Loss: 0.071..  Test Loss: 0.257..  Test Accuracy: 0.922
Epoch: 2/10..  Training Loss: 0.053..  Test Loss: 0.246..  Test Accuracy: 0.931
Epoch: 2/10..  Training Loss: 0.051..  Test Loss: 0.227..  Test Accuracy: 0.933
Epoch: 3/10..  Training Loss: 0.104..  Test Loss: 0.235..  Test Accuracy: 0.931
Epoch: 3/10..  Training Loss: 0.065..  T

In [54]:
single_model = teacher_models[99]
single_model.eval()
with torch.no_grad():
    test_loss = 0
    accuracy = 0
    for images, labels in student_test_loader:
        images, labels = images.to(device), labels.to(device)
        log_ps = single_model(images)
        test_loss += criterion(log_ps, labels).item()

        # Accuracy
        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)
        equals = top_class == labels.view(*top_class.shape)
        accuracy += torch.mean(equals.type(torch.FloatTensor))
    single_model.train()
    print("Test Loss: {:.3f}.. ".format(test_loss),
          "Test Accuracy: {:.3f}".format(accuracy))

Test Loss: 10.536..  Test Accuracy: 29.094
