<a href="https://colab.research.google.com/github/resilientmax/60DaysOfUdacity/blob/master/Differential%20Privacy%20-%20MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch

from torchvision import datasets, transforms
from torch.utils.data import Subset

# Transform the image to a tensor and normalize it
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,))])

# Load the train and test data by using the transform
train_data = datasets.MNIST(root='data', train=True, download=True, transform=transform)
test_data = datasets.MNIST(root='data', train=False, download=True, transform=transform)
num_teachers = 10 # Define the num of teachers
batch_size = 64 # Teacher batch size

def get_data_loaders(train_data, num_teachers):
    """ Function to create data loaders for the Teacher classifier """
    teacher_loaders = []
    data_size = len(train_data) // num_teachers
    
    for i in range(data_size):
        indices = list(range(i*data_size, (i+1)*data_size))
        subset_data = Subset(train_data, indices)
        loader = torch.utils.data.DataLoader(subset_data, batch_size=batch_size)
        teacher_loaders.append(loader)
        
    return teacher_loaders

teacher_loaders = get_data_loaders(train_data, num_teachers)

In [0]:
# Create the public dataset by using 90% of the Test data as train #data and remaining 10% as test data.

student_train_data = Subset(test_data, list(range(9000)))
student_test_data = Subset(test_data, list(range(9000, 10000)))

student_train_loader = torch.utils.data.DataLoader(student_train_data, batch_size=batch_size)
student_test_loader = torch.utils.data.DataLoader(student_test_data, batch_size=batch_size)

In [0]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Classifier(nn.Module):
    """ A Simple Feed Forward Neural Network. 
        A CNN can also be used for this problem 
    """
    def __init__(self):
        super().__init__()
        
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)
    
    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x)

In [0]:
def train(model, trainloader, criterion, optimizer, epochs=10):
    """ This function trains a single Classifier model """
    running_loss = 0
    for e in range(epochs):
        model.train()
        
        for images, labels in trainloader:
            optimizer.zero_grad()
            
            output = model.forward(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
    
def predict(model, dataloader):
    """ This function predicts labels for a dataset 
        given the model and dataloader as inputs. 
    """
    outputs = torch.zeros(0, dtype=torch.long)
    model.eval()
    
    for images, labels in dataloader:
        output = model.forward(images)
        ps = torch.argmax(torch.exp(output), dim=1)
        outputs = torch.cat((outputs, ps))
        
    return outputs
  
def train_models(num_teachers):
    """ Trains *num_teacher* models (num_teachers being the number of teacher classifiers) """
    models = []
    for i in range(num_teachers):
        model = Classifier()
        criterion = nn.NLLLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.003)
        train(model, teacher_loaders[i], criterion, optimizer)
        models.append(model)
    return models

In [5]:
models = train_models(num_teachers)



In [6]:
import numpy as np

epsilon = 0.2

def aggregated_teacher(models, dataloader, epsilon):
    """ Take predictions from individual teacher model and 
        creates the true labels for the student after adding 
        laplacian noise to them 
    """
    preds = torch.torch.zeros((len(models), 9000), dtype=torch.long)
    for i, model in enumerate(models):
        results = predict(model, dataloader)
        preds[i] = results
    
    labels = np.array([]).astype(int)
    for image_preds in np.transpose(preds):
        label_counts = np.bincount(image_preds, minlength=10)
        beta = 1 / epsilon

        for i in range(len(label_counts)):
            label_counts[i] += np.random.laplace(0, beta, 1)

        new_label = np.argmax(label_counts)
        labels = np.append(labels, new_label)
    
    return preds.numpy(), labels
  
teacher_models = models
preds, student_labels = aggregated_teacher(teacher_models, student_train_loader, epsilon)



In [7]:
def student_loader(student_train_loader, labels):
    for i, (data, _) in enumerate(iter(student_train_loader)):
        yield data, torch.from_numpy(labels[i*len(data): (i+1)*len(data)])
        
student_model = Classifier()
criterion = nn.NLLLoss()
optimizer = optim.Adam(student_model.parameters(), lr=0.003)
epochs = 10
steps = 0
running_loss = 0
for e in range(epochs):
    student_model.train()
    train_loader = student_loader(student_train_loader, student_labels)
    for images, labels in train_loader:
        steps += 1
        
        optimizer.zero_grad()
        output = student_model.forward(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        
        if steps % 50 == 0:
            test_loss = 0
            accuracy = 0
            student_model.eval()
            with torch.no_grad():
                for images, labels in student_test_loader:
                    log_ps = student_model(images)
                    test_loss += criterion(log_ps, labels).item()
                    
                    # Accuracy
                    ps = torch.exp(log_ps)
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == labels.view(*top_class.shape)
                    accuracy += torch.mean(equals.type(torch.FloatTensor))
            student_model.train()
            print("Epoch: {}/{}.. ".format(e+1, epochs),
                  "Training Loss: {:.3f}.. ".format(running_loss/len(student_train_loader)),
                  "Test Loss: {:.3f}.. ".format(test_loss/len(student_test_loader)),
                  "Test Accuracy: {:.3f}".format(accuracy/len(student_test_loader)))
            running_loss = 0



Epoch: 1/10..  Training Loss: 0.810..  Test Loss: 2.142..  Test Accuracy: 0.440
Epoch: 1/10..  Training Loss: 0.765..  Test Loss: 1.685..  Test Accuracy: 0.723
Epoch: 2/10..  Training Loss: 0.767..  Test Loss: 1.661..  Test Accuracy: 0.765
Epoch: 2/10..  Training Loss: 0.749..  Test Loss: 1.619..  Test Accuracy: 0.829
Epoch: 2/10..  Training Loss: 0.730..  Test Loss: 1.577..  Test Accuracy: 0.852
Epoch: 3/10..  Training Loss: 0.738..  Test Loss: 1.311..  Test Accuracy: 0.831
Epoch: 3/10..  Training Loss: 0.727..  Test Loss: 1.265..  Test Accuracy: 0.879
Epoch: 3/10..  Training Loss: 0.714..  Test Loss: 1.342..  Test Accuracy: 0.899
Epoch: 4/10..  Training Loss: 0.723..  Test Loss: 1.274..  Test Accuracy: 0.883
Epoch: 4/10..  Training Loss: 0.719..  Test Loss: 1.210..  Test Accuracy: 0.912
Epoch: 4/10..  Training Loss: 0.714..  Test Loss: 1.360..  Test Accuracy: 0.916
Epoch: 5/10..  Training Loss: 0.717..  Test Loss: 1.239..  Test Accuracy: 0.918
Epoch: 5/10..  Training Loss: 0.709..  T

In [9]:
!pip install syft

Collecting syft
[?25l  Downloading https://files.pythonhosted.org/packages/38/2e/16bdefc78eb089e1efa9704c33b8f76f035a30dc935bedd7cbb22f6dabaa/syft-0.1.21a1-py3-none-any.whl (219kB)
[K     |█▌                              | 10kB 15.6MB/s eta 0:00:01[K     |███                             | 20kB 2.2MB/s eta 0:00:01[K     |████▌                           | 30kB 3.2MB/s eta 0:00:01[K     |██████                          | 40kB 2.1MB/s eta 0:00:01[K     |███████▌                        | 51kB 2.6MB/s eta 0:00:01[K     |█████████                       | 61kB 3.1MB/s eta 0:00:01[K     |██████████▍                     | 71kB 3.6MB/s eta 0:00:01[K     |████████████                    | 81kB 4.1MB/s eta 0:00:01[K     |█████████████▍                  | 92kB 4.5MB/s eta 0:00:01[K     |███████████████                 | 102kB 3.4MB/s eta 0:00:01[K     |████████████████▍               | 112kB 3.4MB/s eta 0:00:01[K     |█████████████████▉              | 122kB 3.4MB/s eta 0:00:

In [10]:
from syft.frameworks.torch.differential_privacy import pate

data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=preds, indices=student_labels, noise_eps=epsilon, delta=1e-5)
print("Data Independent Epsilon:", data_ind_eps)
print("Data Dependent Epsilon:", data_dep_eps)

W0721 19:16:51.093476 139914303608704 secure_random.py:26] Falling back to insecure randomness since the required custom op could not be found for the installed version of TensorFlow. Fix this by compiling custom ops. Missing file was '/usr/local/lib/python3.6/dist-packages/tf_encrypted/operations/secure_random/secure_random_module_tf_1.14.0.so'
W0721 19:16:51.111101 139914303608704 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/tf_encrypted/session.py:26: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.



Data Independent Epsilon: 1451.5129254649705
Data Dependent Epsilon: 1451.5129254651165
