In [1]:
import pandas as pd
import numpy as np

# Private and Ecrypted AI - Credit Approval Application
1. <a href="#data_prep">Data Preparation & Setup</a>
2. <a href="#classical_dl">Classical Deep Learning</a>
3. Federated Deep Learning
    - Secured Multi-Party Computation (SMPC)

<a id='data_prep'></a>
## Data Preparation
- only using non-NaN values. I drop NaN values because the dataset is not very big regardless, and we are not dropping very many values.
- Convert binary variables to a numeric representation, and one-hot-encode categorical variables. We do not want to use label encoder since a label encoder would make it 

In [2]:
cols = [ f"A{i}" for i in range(1,16)]
cols.append('label')

In [3]:
df = pd.read_csv('data/crx.data', names=cols)\
    .replace(to_replace='?', value=np.nan).dropna()
print(df.shape, "\n ------- \n")
print(df.head(2))

(653, 16) 
 ------- 

  A1     A2    A3 A4 A5 A6 A7    A8 A9 A10  A11 A12 A13    A14  A15 label
0  b  30.83  0.00  u  g  w  v  1.25  t   t    1   f   g  00202    0     +
1  a  58.67  4.46  u  g  q  h  3.04  t   t    6   f   g  00043  560     +


In [4]:
def to_binary(df, col):
    u = df[col].unique()
    mapping =dict(zip(u, [i for i in range(0,len(u))]))
    return df[col].map(mapping)

In [5]:
df.A1.head()

0    b
1    a
2    a
3    b
4    b
Name: A1, dtype: object

In [6]:
#convert to float
for col in ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']:
    df[col] = df[col].astype(float)
    
#binarize
for col in ['A1', 'A9', 'A10', 'A12', 'label']:
    df[col] = to_binary(df, col)
    
onehot_cols = ['A4', 'A5', 'A6', 'A7', 'A13']

#perform one hot encoding, and drop original columns
df  = df.join(pd.get_dummies(df[onehot_cols], dtype=int))\
                                .drop(onehot_cols, axis=1)

In [7]:
df.dtypes

A1         int64
A2       float64
A3       float64
A8       float64
A9         int64
A10        int64
A11      float64
A12        int64
A14      float64
A15      float64
label      int64
A4_l       int64
A4_u       int64
A4_y       int64
A5_g       int64
A5_gg      int64
A5_p       int64
A6_aa      int64
A6_c       int64
A6_cc      int64
A6_d       int64
A6_e       int64
A6_ff      int64
A6_i       int64
A6_j       int64
A6_k       int64
A6_m       int64
A6_q       int64
A6_r       int64
A6_w       int64
A6_x       int64
A7_bb      int64
A7_dd      int64
A7_ff      int64
A7_h       int64
A7_j       int64
A7_n       int64
A7_o       int64
A7_v       int64
A7_z       int64
A13_g      int64
A13_p      int64
A13_s      int64
dtype: object

In [8]:
df.head(2)

Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A14,A15,...,A7_ff,A7_h,A7_j,A7_n,A7_o,A7_v,A7_z,A13_g,A13_p,A13_s
0,0,30.83,0.0,1.25,0,0,1.0,0,202.0,0.0,...,0,0,0,0,0,1,0,1,0,0
1,1,58.67,4.46,3.04,0,0,6.0,0,43.0,560.0,...,0,1,0,0,0,0,0,1,0,0


### Simulate Real People's Data

To illustrate how this model would work in real life, I want to simulate this data belonging to people. I am generating random names to be associated with each row. I know that this is not an ideal example since I am infact starting with the data all collated on my computer with peoples names and data being directly exposed. Not differentially private at all...

In [9]:
import names #used to get random names
names.get_first_name()+' ' +names.get_last_name() #call random name

'Darrin Ware'

In [10]:
users = []
used_names = set()
for idx in range(len(df)):
    name = names.get_first_name()+' ' +names.get_last_name()
    while name in used_names:
        name = names.get_first_name()+' ' +names.get_last_name()
        
    used_names.add(name)
    users.append(name)

In [11]:
df['name'] = users
df.head(2)

Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A14,A15,...,A7_h,A7_j,A7_n,A7_o,A7_v,A7_z,A13_g,A13_p,A13_s,name
0,0,30.83,0.0,1.25,0,0,1.0,0,202.0,0.0,...,0,0,0,0,1,0,1,0,0,Michael Chalker
1,1,58.67,4.46,3.04,0,0,6.0,0,43.0,560.0,...,1,0,0,0,0,0,1,0,0,Robert Sandler


In [12]:
#get features and labels as numpy arrays which we can convert to tensors
features = df.drop(['label', 'name'], axis=1).values.astype(float)
labels = df['label'].values.astype(float)

## Model Development
I am using PyTorch to create a neural network to classify whether someone is accepted for credit or not. PyTorch integrates will with PySyft, the package used to encrypt our deep learning model

In [36]:
from torch import nn
from torch import optim
import torch.nn.functional as F
import syft as sy
import torch as th

data = th.tensor(features, dtype=th.float32, requires_grad=True)
target = th.tensor(labels, dtype=th.int64, requires_grad=False).reshape(-1,1)

class Model(nn.Module):
    '''
    Neural Network Example Model
    
    Attributes
    :hidden_layers (nn.ModuleList) - hidden units and dimensions for each layer of network
    :output (nn.Linear) - final fully-connected layer that handles output for model
    :dropout (nn.Dropout) - handling of layer-wise drop-out parameter
    
    Functions
    :forward - handling of forward pass of datum through the network.
    '''
    def __init__(self, input_size, output_size, hidden_layers, drop_p=0.2):
        super(Model, self).__init__()
        self.hidden_layers = nn.ModuleList([nn.Linear(input_size, hidden_layers[0])])

        #create hidden layers
        layer_sizes = zip(hidden_layers[:-1], hidden_layers[1:]) #gives input/output sizes for each layer
        self.hidden_layers.extend([nn.Linear(h1, h2) for h1, h2 in layer_sizes])
        self.output = nn.Linear(hidden_layers[-1], output_size)
        self.dropout = nn.Dropout(p=drop_p)
    
    def forward(self, x):
        for each in self.hidden_layers:
            x = F.relu(each(x)) #apply relu to each hidden node
            x = self.dropout(x) #apply dropout
        x = self.output(x) #apply output weights
        return F.log_softmax(x, dim=-1) #apply activation log softmax

<a id='classical_dl'></a>
## Classical Deep Learning
Here we train our network on data that is not distributed (therefore this is not yet a federated or encrypted problem). However, this exercise is useful in showing how we can transition from traditional deep learning to federated deep learning.

First create a dataset of batch size one. This is realistic since most people would only have their own credit score data. This might be different if we decide to use a secure or trusted third party to manage parts of the data, but we don't trust the credit rating company with our data.

In [37]:
dataset = [(data[i], target[i]) for i in range(len(data))]

#instantiate model
in_size = data[0].shape[0]
out_size = 2
hidden_layers=[25,10]
model = Model(in_size, out_size, hidden_layers)

In [79]:
_data, _target = dataset[0]
_data, _target

(tensor([  0.0000,  30.8300,   0.0000,   1.2500,   0.0000,   0.0000,   1.0000,
           0.0000, 202.0000,   0.0000,   0.0000,   1.0000,   0.0000,   1.0000,
           0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           1.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   1.0000,   0.0000,   1.0000,   0.0000,   0.0000],
        grad_fn=<SelectBackward>), tensor([0]))

In [80]:
opt = optim.SGD(params=model.parameters(), lr=0.1) #use a simple stochastic gradient descent optimizer
model

Model(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=42, out_features=25, bias=True)
    (1): Linear(in_features=25, out_features=10, bias=True)
  )
  (output): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2)
)

In [158]:
#new training logic to reflect federated learning
def train(model, datasets, epochs, criterion, optimizer):
    
    steps=0
    model.train() #training mode

    for e in range(1, epochs+1):
        running_loss=0
        for ii, (data,target) in enumerate(datasets): #iterates over pointers to remote data
            steps+=1
            optimizer.zero_grad()#zero out gradients so that one forward pass doesnt pick up previous forward's gradients
            outputs = model.forward(data) #make prediction
            outputs = outputs.reshape(1,-1) #get shape of (1,2) as we need at least two dimension
            loss = criterion(outputs,target)
            loss.backward()
            optimizer.step()
            
            #print(f"step: {steps}", loss.item())
            running_loss+=loss.item()
            #code below courtesy of udacity
            
            print_every=100
            if steps % print_every == 0:
                print('Train Epoch: {} [{}/{}]  \tLoss: {:.6f}'.format(
                    e, ii+1, len(datasets), loss.item()/print_every))
                
                running_loss=0

In [125]:
train(model, dataset, 1, nn.NLLLoss(), opt)

Train Epoch: 1 [100/653]  	Loss: 0.006315
Train Epoch: 1 [200/653]  	Loss: 0.231044
Train Epoch: 1 [300/653]  	Loss: 0.001275
Train Epoch: 1 [400/653]  	Loss: 0.002540
Train Epoch: 1 [500/653]  	Loss: 0.132581
Train Epoch: 1 [600/653]  	Loss: 0.002443


We can also use PyTorch's `Dataset` class to make the processing of data a little easier, but for the purpose of this example it will not give any clear benefits. If you would like to read more about PyTorch's abstract `Dataset` class [read here](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html), with another example [here](https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel). Generally speaking, using `Dataset` and `DataLoader` makes the handling of training and testing data much easier.

In [122]:
from torch.utils.data import Dataset, DataLoader, TensorDataset
dataset_ = TensorDataset(data, target.view(-1))
data_loader = DataLoader(dataset_, batch_size=1, shuffle=True) #this gives us an identical implementation

In [159]:
%%time
#training loss will look a little different since the dataset is shuffled

train(model, data_loader, 1, nn.NLLLoss(), opt)

Train Epoch: 1 [100/653]  	Loss: 0.003197
Train Epoch: 1 [200/653]  	Loss: 0.004868
Train Epoch: 1 [300/653]  	Loss: 0.023472
Train Epoch: 1 [400/653]  	Loss: 0.004868
Train Epoch: 1 [500/653]  	Loss: 0.005587
Train Epoch: 1 [600/653]  	Loss: 0.007802
CPU times: user 702 ms, sys: 4.76 ms, total: 707 ms
Wall time: 706 ms


Now we have a credit application model that is training on our data. However, this is by no means yet federated learning. The implementation above simply trains a model with a batch size of 1. We will federate the model in the upcoming section.

#### Create a Federated Learning Network
The idea behind this is that we train a model on subsets of data (encrypted or otherwise) that never leaves the ownership of an individual. In this example of credit rating scores it would allow people to submit claims without ever losing ownership of their data. It requires very little trust of the party to which the application is being submitted.

Even though we currently have our dataset located locally, we want to simulate having many people in our network who each maintain ownership of their data. Therefore we have to create a virtual worker for each datum. The work/data flow in this situation would be as follows:

- get pointers to training data on each remote worker
*Training Steps:*
- send model to remote worker
- train model on data located with remote worker
- recieve updated model from remote worker
- repeat for all workers

#send data to remote workers
data_bob = data_bob.send(bob)
data_alice = data_alice.send(alice)

target_bob = target_bob.send(bob)
target_alice = target_alice.send(alice)

datasets = [(data_bob, target_bob), (data_alice, target_alice)]


In [130]:
hook = sy.TorchHook(th)
workers = [sy.VirtualWorker(hook, id=name) for name in df.name.str.replace(' ', '').values]

### Send Data to Remote Worker
In reality the data of each person would already be on a remote worker. Either each person's device or aggregated into multiple remote workers by a secure third party.

Here we have two options:
1. send the data to each worker individually
2. use PySyft's implemenation of PyTorch's `Dataset` and `DataLoader`

I will use PySyft's `BaseDataset`, `FederatedDataset` and `FederatedDataLoader` since this simplifies dataprocessing for larger applications, even though it is not necessary for this example.


In [143]:
# Option 1
remote_dataset = []
for i in range(len(dataset)):
    d, t = dataset[i]
    
    r_d = d.send(workers[i])
    r_t = t.send(workers[i])
    
    remote_dataset.append((r_d, r_t))
    
r_d, r_t = remote_dataset[0]
r_d #this is now a pointer to remote data rather than an actual tensor on our device

(Wrapper)>[PointerTensor | me:79412454298 -> MichaelChalker:94676651787]

In [144]:
# Option 2
# Cast the result in BaseDatasets
remote_dataset_list = []
for i in range(len(dataset)):
    d, t = dataset[i] #get data

    #send to worker before adding to dataset
    r_d = d.reshape(1,-1).send(workers[i])
    r_t = t.send(workers[i])
    
    dtset = sy.BaseDataset(r_d, r_t)
    remote_dataset_list.append(dtset)

# Build the FederatedDataset object
remote_dataset = sy.FederatedDataset(remote_dataset_list)
print(remote_dataset.workers[:5])


['MichaelChalker', 'RobertSandler', 'GayeNanney', 'KennethMckibben', 'DavidYancy']


In [146]:
train_loader = sy.FederatedDataLoader(remote_dataset, batch_size=1, shuffle=True, drop_last=False)

In [165]:
#new training logic to reflect federated learning
def federated_train(model, datasets, epochs, criterion, optimizer):
    print(f'Federated Training on {len(datasets)} remote workers (dataowners)')
    steps=0
    model.train() #training mode

    for e in range(1, epochs+1):
        running_loss=0
        for ii, (data,target) in enumerate(datasets): #iterates over pointers to remote data
            steps+=1
            
            #FEDERATION STEP
            model.send(data.location) #send model to remote worker
            
            #NB the steps below all happen remotely
            optimizer.zero_grad()#zero out gradients so that one forward pass doesnt pick up previous forward's gradients
            outputs = model.forward(data) #make prediction
            outputs = outputs.reshape(1,-1) #get shape of (1,2) as we need at least two dimension
            loss = criterion(outputs,target)
            loss.backward()
            optimizer.step()
            
            #FEDERATION STEP
            model.get() #get model with new gradients back from remote worker
            
            #FEDERATION STEP
            _loss = loss.get() #get loss from remote worker
            running_loss+=_loss
            
            print_every=100
            if steps % print_every == 0:
                print('Train Epoch: {} [{}/{}]  \tLoss: {:.6f}'.format(
                    e, ii+1, len(datasets), _loss/print_every))
                
                running_loss=0
            

In [166]:
%%time 
federated_train(model, train_loader, 1, nn.NLLLoss(), opt)

Federated Training on 653 remote workers (dataowners)
Train Epoch: 1 [100/653]  	Loss: 0.009534
Train Epoch: 1 [200/653]  	Loss: 0.088674
Train Epoch: 1 [300/653]  	Loss: 0.008809
Train Epoch: 1 [400/653]  	Loss: 0.003607
Train Epoch: 1 [500/653]  	Loss: 0.022696
Train Epoch: 1 [600/653]  	Loss: 0.006318
CPU times: user 4.54 s, sys: 29.6 ms, total: 4.57 s
Wall time: 4.57 s


_Viola!_ Now we have a federated model where the data never leaves the ownership of a remote device. We can implement this in a way where each user's device is a worker, or where we have a smaller number of workers (data owners) which are all third parties trusted by the credit applicants to take care of their data.

Nevertheless, this **data is not yet encrypted** and we could deduce things specific to the applicant just by getting or looking at the remote data.

Notice how the federated model is about 6.5x slower than the non-federated model. This is simply one of the trade-offs that we have to be willing to make.