In [1]:

# Patch asyncio to allow nested event loops (Jupyter already runs one).
import nest_asyncio
nest_asyncio.apply()

import os
# Must be set before TensorFlow is imported; '3' silences TensorFlow's C++ log output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
import tensorflow_federated as tff


In [2]:
# Show the installed TensorFlow Federated version.
tff.__version__

'0.50.0'

## Create Binary Classification data with sklearn

In [3]:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

n = 10000           # total number of samples (training + testing)
d = 8               # number of features; all of them are informative
noise_factor = 0.   # fraction of samples whose label is assigned at random
SEED = 42           # fixed seed so Restart & Run All reproduces the same data and split

# Create the full (optionally noisy) dataset for binary classification.
# NOTE(review): class_sep is negative here; scikit-learn documents it as the
# factor multiplying the hypercube size -- confirm that -1 (not 1.0) is intended.
X, y = make_classification(
    n_samples=n,
    n_features=d,
    n_informative=d,
    n_redundant=0,
    n_classes=2,
    class_sep=-1,
    flip_y=noise_factor,
    random_state=SEED,
)

# We will work with label values -1, +1 and not 0, +1 (convert)
y[y == 0] = -1

# Split the data into training (90%) and testing (10%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SEED
)


## Convert to Tensors

In [4]:

def _to_float32_tensor(array):
    """Return `array` as a constant TensorFlow tensor of dtype float32."""
    return tf.constant(array, dtype=tf.float32)


# Features and labels of both splits become float32 TensorFlow tensors.
X_train_tensor, y_train_tensor = _to_float32_tensor(X_train), _to_float32_tensor(y_train)
X_test_tensor, y_test_tensor = _to_float32_tensor(X_test), _to_float32_tensor(y_test)

## Prepare data for Tensorflow Federated

We have the training and testing Tensors holding our data. TFF expects, for each client, an `OrderedDict` containing `y` and `x` data. Hence, we preprocess our Tensors to follow this convention.

**Note**: Cross-device federated learning does not use client IDs or perform any tracking of clients. However, in simulation experiments using centralized test data, the experimenter may select specific clients to be processed per round. The concept of a client ID is only available at the preprocessing stage, when preparing input data for the simulation, and is not part of the TensorFlow Federated core APIs.

In [5]:

NUM_CLIENTS = 10  # number of simulated clients the training data is split across
BATCH_SIZE = 32   # batch size used when building each client's tf.data.Dataset

In [6]:

import collections

# Partition the *training* rows into NUM_CLIENTS contiguous, near-equal slices.
# The boundaries are derived from the number of training rows rather than the
# total sample count `n`: after the train/test split the training tensors hold
# fewer than `n` rows, so boundaries based on `n` run past the end of the
# tensors and leave the last client with an empty slice.
num_train_samples = int(X_train_tensor.shape[0])

# Maps a client id ('client_0', 'client_1', ...) to that client's data.
client_slices_train = {}

for i in range(NUM_CLIENTS):
    # Integer arithmetic keeps the boundaries exact and covers every training row.
    start_idx = i * num_train_samples // NUM_CLIENTS
    end_idx = (i + 1) * num_train_samples // NUM_CLIENTS

    # Each client's data is an OrderedDict holding its labels ('y') and features ('x').
    client_slices_train[f'client_{i}'] = collections.OrderedDict([
        ('y', y_train_tensor[start_idx:end_idx]),
        ('x', X_train_tensor[start_idx:end_idx]),
    ])

We need to construct a `tff.simulation.datasets.ClientData` object. We can use the `from_clients_and_tf_fn` function, which takes two arguments: `client_ids`, a list of strings to use as input to `serializable_dataset_fn`, and `serializable_dataset_fn`, a function that takes a `client_id` from that list and returns a `tf.data.Dataset`. This function must be serializable and usable within the context of a `tf.function` and `tff.Computation`.

In [7]:
@tf.function
def create_tf_dataset_for_client(client_data):
    """Build a batched dataset from one client's data.

    Args:
        client_data: a structure of tensors accepted by
            `tf.data.Dataset.from_tensor_slices` (in this notebook an
            OrderedDict with keys 'y' and 'x').

    Returns:
        A `tf.data.Dataset` whose elements are batches of up to `BATCH_SIZE`
        examples, in their original order (no shuffling is applied).
    """
    per_example_dataset = tf.data.Dataset.from_tensor_slices(client_data)
    return per_example_dataset.batch(BATCH_SIZE)

def _dataset_for_client_id(client_id):
    """Look up the data slice stored for `client_id` and return it as a batched dataset."""
    return create_tf_dataset_for_client(client_slices_train[client_id])


# ClientData exposing one dataset per key of `client_slices_train`.
train_federated_dataset = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
    client_ids=list(client_slices_train),
    serializable_dataset_fn=_dataset_for_client_id,
)

In [8]:
# List the client ids registered in the federated training dataset.
train_federated_dataset.client_ids

['client_0',
 'client_1',
 'client_2',
 'client_3',
 'client_4',
 'client_5',
 'client_6',
 'client_7',
 'client_8',
 'client_9']

In [9]:
# Inspect the structure (names, shapes, dtypes) of one dataset element, i.e. one batch.
train_federated_dataset.element_type_structure

OrderedDict([('y', TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
             ('x', TensorSpec(shape=(None, 8), dtype=tf.float32, name=None))])

First, let's define the type of input as a TFF named tuple. Since the size of data batches may vary, we set the batch dimension to `None` to indicate that the size of this dimension is unknown.

In [10]:

# One batch: float32 labels 'y' of shape (batch,) and float32 features 'x' of
# shape (batch, d); the batch dimension is left unknown (None).
BATCH_SPEC = collections.OrderedDict([
    ('y', tf.TensorSpec(shape=(None,), dtype=tf.float32)),
    ('x', tf.TensorSpec(shape=(None, d), dtype=tf.float32)),
])
# The same structure expressed as a TFF type.
BATCH_TYPE = tff.to_type(BATCH_SPEC)

In [11]:
# Compact string form of the batch type.
str(BATCH_TYPE)

'<y=float32[?],x=float32[?,8]>'

Let's now define the TFF type of model parameters, again as a TFF named tuple of weights.

In [12]:

# The model is a single float32 weight matrix of shape (d, 1).
MODEL_SPEC = collections.OrderedDict([
    ('weights', tf.TensorSpec(shape=(d, 1), dtype=tf.float32)),
])
# The same structure expressed as a TFF type.
MODEL_TYPE = tff.to_type(MODEL_SPEC)

In [13]:
# Compact string form of the model type.
str(MODEL_TYPE)

'<weights=float32[8,1]>'

In [14]:
import numpy as np

# Initial model: all-zero float32 weights with the (d, 1) shape declared in MODEL_SPEC.
initial_model = collections.OrderedDict([
    ('weights', np.zeros((d, 1), dtype=np.float32)),
])

In [15]:
@tf.function
def batch_accuracy_fn(model, batch):
    """Compute the fraction of a batch that the linear model classifies correctly.

    Args:
        model: mapping with key 'weights'; the weights are right-multiplied
            onto the features (declared as shape (d, 1) by MODEL_TYPE).
        batch: mapping with keys 'x' (features) and 'y' (labels, converted to
            -1/+1 earlier in the notebook).

    Returns:
        A scalar float32 tensor: the mean of the per-example "prediction
        equals label" indicator.

    Note:
        Predictions are `tf.sign` of the linear score, so a score of exactly 0
        yields prediction 0, which equals neither label and counts as a miss.
    """
    features = batch['x']
    # Give the labels a trailing axis, (batchsize,) -> (batchsize, 1), so they
    # line up element-wise with the predictions below.
    labels = tf.expand_dims(batch['y'], axis=1)

    # dot(w, x) for each instance of x in the batch, with shape=(batchsize, 1)
    scores = tf.matmul(features, model['weights'])

    # Predicted labels, shape=(batchsize, 1)
    predictions = tf.sign(scores)

    is_correct = tf.equal(predictions, labels)
    return tf.reduce_mean(tf.cast(is_correct, tf.float32))

@tff.tf_computation(MODEL_TYPE, BATCH_TYPE)
def batch_accuracy(model, batch):
    """TFF TensorFlow computation that delegates to `batch_accuracy_fn`.

    Takes a model of type MODEL_TYPE and a batch of type BATCH_TYPE and
    returns the accuracy of the model on that batch.
    """
    accuracy = batch_accuracy_fn(model, batch)
    return accuracy

In [16]:
# Build the dataset of the first client and an iterator over its batches.
first_client_dataset = train_federated_dataset.create_tf_dataset_for_client('client_0')
batch_iter = iter(first_client_dataset)

In [17]:
# Accuracy of the initial model on the next batch of client_0. With all-zero
# weights every score is 0, so tf.sign predicts 0, which never equals a -1/+1
# label; the accuracy is therefore 0.0.
# Note: each run of this cell consumes one more batch from `batch_iter`.
batch_accuracy(initial_model, next(batch_iter))

0.0