# n1analytics/python-paillier

Switch branches/tags
Nothing to show
Fetching contributors…
Cannot retrieve contributors at this time
267 lines (205 sloc) 9.44 KB
 """ This example involves learning using sensitive medical data from multiple hospitals to predict diabetes progression in patients. The data is a standard dataset from sklearn[1]. Recorded variables are: - age, - gender, - body mass index, - average blood pressure, - and six blood serum measurements. The target variable is a quantitative measure of the disease progression. Since this measure is continuous, we solve the problem using linear regression. The patients' data is split between 3 hospitals, all sharing the same features but different entities. We refer to this scenario as horizontally partitioned. The objective is to make use of the whole (virtual) training set to improve upon the model that can be trained locally at each hospital. 50 patients will be kept as a test set and not used for training. An additional agent is the 'server' who facilitates the information exchange among the hospitals under the following privacy constraints: 1) The individual patient's record at each hospital cannot leave the premises, not even in encrypted form. 2) Information derived (read: gradients) from any hospital's dataset cannot be shared, unless it is first encrypted. 3) None of the parties (hospitals AND server) should be able to infer WHERE (in which hospital) a patient in the training set has been treated. Note that we do not protect from inferring IF a particular patient's data has been used during learning. Differential privacy could be used on top of our protocol for addressing the problem. For simplicity, we do not discuss it in this example. In this example linear regression is solved by gradient descent. The server creates a paillier public/private keypair and does not share the private key. The hospital clients are given the public key. The protocol works as follows. Until convergence: hospital 1 computes its gradient, encrypts it and sends it to hospital 2; hospital 2 computes its gradient, encrypts and sums it to hospital 1's; hospital 3 does the same and passes the overall sum to the server. The server obtains the gradient of the whole (virtual) training set; decrypts it and sends the gradient back - in the clear - to every client. The clients then update their respective local models. From the learning viewpoint, notice that we are NOT assuming that each hospital sees an unbiased sample from the same patients' distribution: hospitals could be geographically very distant or serve a diverse population. We simulate this condition by sampling patients NOT uniformly at random, but in a biased fashion. The test set is instead an unbiased sample from the overall distribution. From the security viewpoint, we consider all parties to be "honest but curious". Even by seeing the aggregated gradient in the clear, no participant can pinpoint where patients' data originated. This is true if this RING protocol is run by at least 3 clients, which prevents reconstruction of each others' gradients by simple difference. This example was inspired by Google's work on secure protocols for federated learning[2]. [1]: http://scikit-learn.org/stable/datasets/index.html#diabetes-dataset [2]: https://research.googleblog.com/2017/04/federated-learning-collaborative.html Dependencies: numpy, sklearn """ import numpy as np from sklearn.datasets import load_diabetes import phe as paillier seed = 43 np.random.seed(seed) def get_data(n_clients): """ Import the dataset via sklearn, shuffle and split train/test. Return training, target lists for n_clients and a holdout test set """ print("Loading data") diabetes = load_diabetes() y = diabetes.target X = diabetes.data # Add constant to emulate intercept X = np.c_[X, np.ones(X.shape[0])] # The features are already preprocessed # Shuffle perm = np.random.permutation(X.shape[0]) X, y = X[perm, :], y[perm] # Select test at random test_size = 50 test_idx = np.random.choice(X.shape[0], size=test_size, replace=False) train_idx = np.ones(X.shape[0], dtype=bool) train_idx[test_idx] = False X_test, y_test = X[test_idx, :], y[test_idx] X_train, y_train = X[train_idx, :], y[train_idx] # Split train among multiple clients. # The selection is not at random. We simulate the fact that each client # sees a potentially very different sample of patients. X, y = [], [] step = int(X_train.shape[0] / n_clients) for c in range(n_clients): X.append(X_train[step * c: step * (c + 1), :]) y.append(y_train[step * c: step * (c + 1)]) return X, y, X_test, y_test def mean_square_error(y_pred, y): """ 1/m * \sum_{i=1..m} (y_pred_i - y_i)^2 """ return np.mean((y - y_pred) ** 2) def encrypt_vector(public_key, x): return [public_key.encrypt(i) for i in x] def decrypt_vector(private_key, x): return np.array([private_key.decrypt(i) for i in x]) def sum_encrypted_vectors(x, y): if len(x) != len(y): raise ValueError('Encrypted vectors must have the same size') return [x[i] + y[i] for i in range(len(x))] class Server: """Private key holder. Decrypts the average gradient""" def __init__(self, key_length): keypair = paillier.generate_paillier_keypair(n_length=key_length) self.pubkey, self.privkey = keypair def decrypt_aggregate(self, input_model, n_clients): return decrypt_vector(self.privkey, input_model) / n_clients class Client: """Runs linear regression with local data or by gradient steps, where gradient can be passed in. Using public key can encrypt locally computed gradients. """ def __init__(self, name, X, y, pubkey): self.name = name self.pubkey = pubkey self.X, self.y = X, y self.weights = np.zeros(X.shape[1]) def fit(self, n_iter, eta=0.01): """Linear regression for n_iter""" for _ in range(n_iter): gradient = self.compute_gradient() self.gradient_step(gradient, eta) def gradient_step(self, gradient, eta=0.01): """Update the model with the given gradient""" self.weights -= eta * gradient def compute_gradient(self): """Compute the gradient of the current model using the training set """ delta = self.predict(self.X) - self.y return delta.dot(self.X) / len(self.X) def predict(self, X): """Score test data""" return X.dot(self.weights) def encrypted_gradient(self, sum_to=None): """Compute and encrypt gradient. When sum_to is given, sum the encrypted gradient to it, assumed to be another vector of the same size """ gradient = self.compute_gradient() encrypted_gradient = encrypt_vector(self.pubkey, gradient) if sum_to is not None: return sum_encrypted_vectors(sum_to, encrypted_gradient) else: return encrypted_gradient def federated_learning(X, y, X_test, y_test, config): n_clients = config['n_clients'] n_iter = config['n_iter'] names = ['Hospital {}'.format(i) for i in range(1, n_clients + 1)] # Instantiate the server and generate private and public keys # NOTE: using smaller keys sizes wouldn't be cryptographically safe server = Server(key_length=config['key_length']) # Instantiate the clients. # Each client gets the public key at creation and its own local dataset clients = [] for i in range(n_clients): clients.append(Client(names[i], X[i], y[i], server.pubkey)) # The federated learning with gradient descent print('Running distributed gradient aggregation for {:d} iterations' .format(n_iter)) for i in range(n_iter): # Compute gradients, encrypt and aggregate encrypt_aggr = clients[0].encrypted_gradient(sum_to=None) for c in clients[1:]: encrypt_aggr = c.encrypted_gradient(sum_to=encrypt_aggr) # Send aggregate to server and decrypt it aggr = server.decrypt_aggregate(encrypt_aggr, n_clients) # Take gradient steps for c in clients: c.gradient_step(aggr, config['eta']) print('Error (MSE) that each client gets after running the protocol:') for c in clients: y_pred = c.predict(X_test) mse = mean_square_error(y_pred, y_test) print('{:s}:\t{:.2f}'.format(c.name, mse)) def local_learning(X, y, X_test, y_test, config): n_clients = config['n_clients'] names = ['Hospital {}'.format(i) for i in range(1, n_clients + 1)] # Instantiate the clients. # Each client gets the public key at creation and its own local dataset clients = [] for i in range(n_clients): clients.append(Client(names[i], X[i], y[i], None)) # Each client trains a linear regressor on its own data print('Error (MSE) that each client gets on test set by ' 'training only on own local data:') for c in clients: c.fit(config['n_iter'], config['eta']) y_pred = c.predict(X_test) mse = mean_square_error(y_pred, y_test) print('{:s}:\t{:.2f}'.format(c.name, mse)) if __name__ == '__main__': config = { 'n_clients': 5, 'key_length': 1024, 'n_iter': 50, 'eta': 1.5, } # load data, train/test split and split training data between clients X, y, X_test, y_test = get_data(n_clients=config['n_clients']) # first each hospital learns a model on its respective dataset for comparison. local_learning(X, y, X_test, y_test, config) # and now the full glory of federated learning federated_learning(X, y, X_test, y_test, config)