In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import copy
from torch import nn
from torch import optim
import torch.nn.functional as F
import syft as sy
import torch as th
from helpers import Model

# BEWARE: ignoring warnings is not always a good idea.
# I am doing it here only for presentation purposes.

W0827 22:01:59.621158 139673054320448 secure_random.py:26] Falling back to insecure randomness since the required custom op could not be found for the installed version of TensorFlow. Fix this by compiling custom ops. Missing file was '/home/mkucz/p_venv/lib/python3.6/site-packages/tf_encrypted/operations/secure_random/secure_random_module_tf_1.14.0.so'
W0827 22:01:59.629590 139673054320448 deprecation_wrapper.py:119] From /home/mkucz/p_venv/lib/python3.6/site-packages/tf_encrypted/session.py:26: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.



<a id="federated_dl"></a>
## Federated Deep Learning
The idea behind federated learning is that we train a model on subsets of data (encrypted or otherwise) that never leave the ownership of an individual. In this example of credit rating scores, it would allow people to submit claims without ever losing ownership of their data. It requires very little trust in the party to which the application is being submitted.

Even though we currently have our dataset located locally, we want to simulate having many people in our network who each maintain ownership of their data. Therefore we have to create a virtual worker for each datum. The work/data flow in this situation would be as follows:

- get pointers to training data on each remote worker <br>
**Training Steps:**
- send model to remote worker
- train model on data located with remote worker
- receive updated model from remote worker
- repeat for all workers

In [2]:
# Read the saved feature and label arrays from disk.
features = np.load('../data/features.npy')
labels = np.load('../data/labels_dim.npy')

# Inputs: float32 tensor created with gradient tracking switched on.
data = th.tensor(features, dtype=th.float32, requires_grad=True)

# Targets: float32 tensor without gradient tracking, arranged as rows of 2.
target = th.tensor(labels, dtype=th.float32, requires_grad=False)
target = target.reshape(-1, 2)

In [3]:
class Arguments():
    '''
    Container for the settings used to build and train the model

    Inputs
        in_size - stored as `in_size`
        out_size - stored as `out_size`
        hidden_layers - stored as `hidden_layers`
        activation (callable) - stored as `activation` (default F.softmax)
        dim (int) - stored as `dim` (default -1)

    Keyword-only inputs (defaults match the previously hard-coded values)
        batch_size (int) - default 1
        drop_p - default None
        epochs (int) - default 10
        lr (float) - default 0.001
        precision_fractional (int) - default 10
    '''
    def __init__(self, in_size, out_size, hidden_layers,
                       activation=F.softmax, dim=-1, *,
                       batch_size=1, drop_p=None, epochs=10,
                       lr=0.001, precision_fractional=10):
        # training settings; keyword-only so existing positional calls
        # keep their meaning
        self.batch_size = batch_size
        self.drop_p = drop_p
        self.epochs = epochs
        self.lr = lr

        # architecture settings
        self.in_size = in_size
        self.out_size = out_size
        self.hidden_layers = hidden_layers

        self.precision_fractional = precision_fractional

        # output activation and the dimension it is applied over
        self.activation = activation
        self.dim = dim

In [4]:
# PySyft hook built from the torch module; it is passed to every
# VirtualWorker created in this notebook.
hook = sy.TorchHook(th)

def connect_to_workers(n_workers, secure_worker=False):
    '''
    Create PySyft virtual workers with ids w_0 ... w_{n_workers-1}

    Inputs
        n_workers (int) - number of virtual workers to create
        secure_worker (bool) - if True, additionally create a worker with
                               id 'trusted_aggregator'

    Outputs
        workers (list, <PySyft.VirtualWorker>) when secure_worker is False
        (workers, trusted_aggregator) tuple when secure_worker is True
    '''
    workers = []
    for idx in range(n_workers):
        workers.append(sy.VirtualWorker(hook, id=f'w_{idx}'))

    # common case first: no aggregator requested
    if not secure_worker:
        return workers

    aggregator = sy.VirtualWorker(hook, id='trusted_aggregator')
    return workers, aggregator

In [5]:
# Use the model trained earlier to save time.
checkpoint = th.load('base_model.pt')

# Architecture settings stored in the checkpoint next to the weights.
in_size, out_size, hidden_layers = (
    checkpoint[key] for key in ('in_size', 'out_size', 'hidden_layers')
)

# One (features, label) pair per row of `data`.
dataset = [(data[row], target[row]) for row in range(len(data))]

# One virtual worker per datum.
workers = connect_to_workers(len(dataset))

In [6]:
workers[:5] 
# Each individual worker corresponds to a person, or rather their device.
# At this point nothing has been sent to the workers yet, so each one
# reports zero objects.

[<VirtualWorker id:w_0 #objects:0>,
 <VirtualWorker id:w_1 #objects:0>,
 <VirtualWorker id:w_2 #objects:0>,
 <VirtualWorker id:w_3 #objects:0>,
 <VirtualWorker id:w_4 #objects:0>]

### Send Data to Remote Worker
In reality, the data of each person would already be on a remote worker: either on each person's own device, or aggregated onto multiple remote workers by a secure third party.

Here we have two options:
1. send the data to each worker individually
2. use PySyft's implementation of PyTorch's `Dataset` and `DataLoader`

I will use PySyft's `BaseDataset`, `FederatedDataset` and `FederatedDataLoader` since this simplifies data processing for larger applications, even though it is not necessary for this example.


Option 1
```
remote_dataset = []
for i in range(len(dataset)):
    d, t = dataset[i]
    
    r_d = d.reshape(1,-1).send(workers[i])
    r_t = t.reshape(1,-1).send(workers[i])
    
    remote_dataset.append((r_d, r_t))
    
r_d, r_t = remote_dataset[0]
r_d #this is now a pointer to remote data rather than an actual tensor on our device
```

In [7]:
# Option 2
# Wrap each worker's remote tensors in a BaseDataset
remote_dataset_list = []
for i, (d, t) in enumerate(dataset):
    # reshape each datum to a single row and send it to its own worker
    # *before* wrapping, so the dataset only holds pointers
    r_d = d.reshape(1, -1).send(workers[i])
    r_t = t.reshape(1, -1).send(workers[i])

    dtset = sy.BaseDataset(r_d, r_t)
    remote_dataset_list.append(dtset)

# Combine the per-worker datasets into one FederatedDataset
remote_dataset = sy.FederatedDataset(remote_dataset_list)
print(remote_dataset.workers[:5])


['w_0', 'w_1', 'w_2', 'w_3', 'w_4']


In [8]:
# Shape of the targets held by the first per-worker dataset
remote_dataset_list[0].targets.shape

torch.Size([1, 2])

In [9]:
# Iterate over the federated dataset one sample at a time, in a fixed
# order and without dropping the last batch.
train_loader = sy.FederatedDataLoader(
    remote_dataset,
    batch_size=1,
    shuffle=False,
    drop_last=False,
)

# Display the first workers again, now that data has been sent to them.
workers[:5]

[<VirtualWorker id:w_0 #objects:2>,
 <VirtualWorker id:w_1 #objects:2>,
 <VirtualWorker id:w_2 #objects:2>,
 <VirtualWorker id:w_3 #objects:2>,
 <VirtualWorker id:w_4 #objects:2>]

In [12]:
# new training logic to reflect federated learning
def federated_train(model, datasets, criterion, args):
    '''
    Train `model` in place, one remote batch at a time

    For every (data, target) pair yielded by `datasets` the model is sent to
    `data.location`, a single SGD step is taken, and the model is fetched
    back before moving on to the next pair.

    Inputs
        model - torch module that also exposes `send(location)` and `get()`
        datasets - sized iterable of (data, target) pairs, where `data` has a
                   `location` attribute identifying the worker that holds it
        criterion (callable) - called as criterion(outputs, target); the
                   result must support `backward()` and `get()`
        args - object providing `lr` (learning rate) and `epochs`

    Outputs
        None - progress is printed; the loss reported for each epoch is the
               SUM of the per-batch losses, not their mean
    '''
    # a single plain SGD optimizer, created once over the model's parameters
    optimizer = optim.SGD(params=model.parameters(), lr=args.lr)

    print(f'Federated Training on {len(datasets)} remote workers (dataowners)')
    model.train()  # training mode

    for e in range(1, args.epochs+1):
        running_loss = 0.0
        for data, target in datasets:
            # iterates over pointers to remote data

            # FEDERATION STEP
            # send model to the worker that holds this batch
            model.send(data.location)

            # NB the steps below all happen remotely
            # zero out gradients so that one forward pass
            # doesn't pick up the previous pass's gradients
            optimizer.zero_grad()
            # call the module itself (not .forward) so module hooks run
            outputs = model(data)  # make prediction
            # flatten to a single row: (1, n_outputs)
            outputs = outputs.reshape(1, -1)

            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            # FEDERATION STEP
            # get the model with its updated weights back from the worker
            model.get()

            # FEDERATION STEP
            # get loss from remote worker; accumulate it as a Python float
            # so no tensor is kept alive across iterations
            running_loss += loss.get().item()
        print('Train Epoch: {} \tLoss: {:.6f}'.format(e, running_loss))

In [13]:
%%time
# Rebuild the network with the checkpointed architecture, using softmax
# over dimension 1 as the output activation.
args = Arguments(
    in_size,
    out_size,
    hidden_layers,
    activation=F.softmax,
    dim=1,
)

# Start from the weights saved in the checkpoint.
model = Model(args)
model.load_state_dict(checkpoint['model_state'])

# Run federated training over the remote batches with an MSE loss.
federated_train(model, train_loader, nn.MSELoss(), args)

Federated Training on 653 remote workers (dataowners)
Train Epoch: 1 	Loss: 56.347801
Train Epoch: 2 	Loss: 55.781364
Train Epoch: 3 	Loss: 55.508972
Train Epoch: 4 	Loss: 55.245068
Train Epoch: 5 	Loss: 55.010921
Train Epoch: 6 	Loss: 54.805840
Train Epoch: 7 	Loss: 54.632221
Train Epoch: 8 	Loss: 54.474529
Train Epoch: 9 	Loss: 54.311638
Train Epoch: 10 	Loss: 54.210529
CPU times: user 1min 15s, sys: 177 ms, total: 1min 15s
Wall time: 1min 15s


_Voilà!_ Now we have a federated model where the data never leaves the ownership of a remote device. We can implement this in a way where each user's device is a worker. The problem that occurs here is that, even though the data never leaves an owner's device, `model.get()` returns a new version of the model, which in turn violates the privacy of the data owners by revealing information about their data through the updates that were made to the model. A solution to this problem is to use a **trusted third-party aggregator** to combine the remotely trained models into one, *before* sending it to the end-user (in this case me, the credit provider).

Notice how the federated model is about 32x slower than the non-federated model. This is simply one of the trade-offs that we have to be willing to make.

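And to be honest, I was at first not quite sure why the loss starts at around 55, since this is overall the same model and implementation as the vanilla neural network I implemented earlier. The most likely explanation is that the value printed above is the *sum* of the per-sample losses over all 653 workers in an epoch (`running_loss` is never divided by the number of batches), which works out to roughly 0.086 per sample, so it is not directly comparable to an averaged loss.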

The next step in this journey is **Federated Learning with Model Averaging** which you can find [here]()