In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

In [2]:
tf.__version__

'2.11.0'

## Create Binary Classification data with sklearn

In [3]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


n = 100_000
d = 100


noise_factor = 0.01 # % of the labels are randomly flipped, DEFAULT=0.01
test_size = 0.1 # % of n
# The factor multiplying the hypercube size. Larger values spread out the 
# clusters/classes and make the classification task easier. DEFAULT=1
class_sep = -1
seed = 7

# Create (noisy) testing data for binary classification.
X, y = make_classification(
    n_samples=n, 
    n_features=d,
    n_informative=d,
    n_redundant=0, 
    n_classes=2,
    class_sep=class_sep,
    flip_y=noise_factor,
    random_state=seed
)

# We will work with label values -1, +1 and not 0, +1 (convert)
y[y == 0] = -1

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [4]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

# PA-I regressor from sklearn
pa1 = PassiveAggressiveClassifier(C=0.01, loss="hinge", n_jobs=-1)
pa1.fit(X_train, y_train)

accuracy_score(y_test, pa1.predict(X_test))

0.7846

## Convert to Tensors

In [5]:
X_train_tensor = tf.constant(X_train, dtype=tf.float32)
y_train_tensor = tf.constant(y_train, dtype=tf.float32)
X_test_tensor = tf.constant(X_test, dtype=tf.float32)
y_test_tensor = tf.constant(y_test, dtype=tf.float32)

Delete sklearn type data 

In [6]:
del X, y, X_train, X_test, y_train, y_test

## Prepare data for Federated Learning

### Create centralized testing dataset

In [7]:
slices_test = (X_test_tensor, y_test_tensor)

In [8]:
def create_tf_dataset_for_testing(batch_size):
    return tf.data.Dataset.from_tensor_slices(slices_test).batch(batch_size)

In [9]:
test_dataset = create_tf_dataset_for_testing(32)

### Slice the Tensors for each Client

We will cut the training data, i.e., (`X_train_tensor`, `y_train_tensor`) to equal parts, each part corresponding to one Client. We want to give the result back as a dictionary with key `client_id` and value the training tensor data.

In [10]:
def create_data_for_clients(num_clients):
    
    client_slices_train = {}

    n_test = int(n - n*test_size)

    for i in range(num_clients):
        # Compute the indices for this client's slice
        start_idx = int(i * n_test / num_clients)
        end_idx = int((i + 1) * n_test / num_clients)

        # Get the slice for this client
        X_client_train = X_train_tensor[start_idx:end_idx]
        y_client_train = y_train_tensor[start_idx:end_idx]
        
        # Combine the slices into a single dataset
        client_slices_train[f'client_{i}'] = (X_client_train, y_client_train)
    
    return client_slices_train

### Create TF friendly data for each Client

Given a Tensor slice (i.e. value of `client_slices_train["client_id"]` we convert it to highly optimized `tf.data.Dataset` to prepare for training.

In [11]:
def create_tf_dataset_for_client(client_tensor_slices, batch_size, shuffle_buffer_size, num_steps_until_rtc_check, seed):
    
        return tf.data.Dataset.from_tensor_slices(client_tensor_slices) \
            .shuffle(buffer_size=shuffle_buffer_size, seed=seed).batch(batch_size) \
            .prefetch(tf.data.AUTOTUNE).take(num_steps_until_rtc_check)

### Create Federated Learning data

In [12]:
def create_federated_data(client_slices_train, batch_size, shuffle_buffer_size, num_steps_until_rtc_check, seed=None):
    
    federated_dataset = [ 
        create_tf_dataset_for_client(client_tensor_slices, batch_size, shuffle_buffer_size, num_steps_until_rtc_check, seed)
        for client, client_tensor_slices in client_slices_train.items()
    ]
    
    return federated_dataset

## PA-Classiers (binary classification)

![PA](images/PA_binary_classifiers.png)

In [13]:
@tf.function
def client_train(model, dataset, C):

    @tf.function
    def _train_on_batch(model, batch, C):

        x_batch, y_batch = batch
        
        # from shape (d,) make it (d,1)
        y_batch = tf.expand_dims(y_batch, axis=1)

        # dot(w, x) for the batch (each instance of x in x_batch) with with shape=(batchsize, 1)
        weights_dot_x_batch = tf.matmul(x_batch, model)

        # Prediction batch with shape=(batchsize, 1)
        y_pred_batch = tf.sign(weights_dot_x_batch)

        # Suffer loss for each prediction (of instance) in the batch with shape=(batchsize,1)
        loss_batch = tf.maximum(0., 1. - tf.multiply(y_batch, weights_dot_x_batch))

        # shape=(batchsize,1) where each instance is ||x||^2, x in x_batch
        norm_batch = tf.expand_dims(tf.reduce_sum(tf.square(x_batch), axis=1), axis=1)
        
        # PA-1 : Learning rate t for each instance x, with shape=(batchsize,1)
        t_batch = tf.maximum(C, tf.divide(loss_batch, norm_batch))

        # each instance is y*t*x, where y,t scalars and x in x_batch. shape=(batchsize,d)
        t_y_x_batch = tf.multiply(t_batch, tf.multiply(y_batch, x_batch))

        # !!!! Update with mean t*y*x
        t_y_x_update = tf.expand_dims(tf.reduce_mean(t_y_x_batch, axis=0) ,axis=1)

        # Update
        model.assign_add(t_y_x_update)
    
    for batch in dataset:
        _train_on_batch(model, batch, C)
    

# Functional Dynamic Averaging

We follow the Functional Dynamic Averaging (FDA) scheme. Let the mean model be

$$ \overline{w_t} = \frac{1}{k} \sum_{i=1}^{k} w_t^{(i)} $$

where $ w_t^{(i)} $ is the model at time $ t $ in some round in the $i$-th learner.

Local models are trained independently and cooperatively and we want to monitor the Round Terminating Conditon (**RTC**):

$$ \frac{1}{k} \sum_{i=1}^{k} \lVert w_t^{(i)} - \overline{w_t} \rVert_2^2  \leq \Theta $$

where the left-hand side is the **model variance**, and threshold $\Theta$ is a hyperparameter of the FDA, defined at the beginning of the round; it may change at each round. When the monitoring logic cannot guarantee the validity of RTC, the round terminates. All local models are pulled into `tff.SERVER`, and $\bar{w_t}$ is set to their average. Then, another round begins.

### Monitoring the RTC

FDA monitors the RTC by applying techniques from Functionary [Functional Geometric Averaging](http://users.softnet.tuc.gr/~minos/Papers/edbt19.pdf). We first restate the problem of monitoring RTC into the standard distributed stream monitoring formulation. Let

$$ S(t) =  \frac{1}{k} \sum_{i=1}^{k} S_i(t) $$

where $ S(t) \in \mathbb{R}^n $ be the "global state" of the system and $ S_i(t) \in \mathbb{R}^n $ the "local states". The goal is to monitor the threshold condition on the global state in the form $ F(S(t)) \leq \Theta $ where $ F : \mathbb{R}^n \to \mathbb{R} $ a non-linear function. Let

$$ \Delta_t^{(i)} = w_t^{(i)} - w_{t_0}^{(i)} $$

be the update at the $ i $-th learner, that is, the change to the local model at time $t$ since the beginning of the current round at time $t_0$. Let the average update be

$$ \overline{\Delta_t} = \frac{1}{k} \sum_{i=1}^{k} \Delta_t^{(i)} $$

it follows that the variance can be written as

$$ \frac{1}{k} \sum_{i=1}^{k} \lVert w_t^{(i)} - \overline{w_t} \rVert_2^2 = \Big( \frac{1}{k} \sum_{i=1}^{k} \lVert \Delta_t^{(i)} \rVert_2^2 \Big) - \lVert \overline{\Delta_t} \rVert_2^2 $$

So, conceptually, if we define
$$ S_i(t) = \begin{bmatrix}
           \lVert \Delta_t^{(i)} \rVert_2^2 \\
           \Delta_t^{(i)}
         \end{bmatrix} \quad \text{and} \quad
         F(\begin{bmatrix}
           v \\
           \bf{x}
         \end{bmatrix}) = v - \lVert \bf{x} \rVert_2^2 $$

The RTC is equivalent to condition $$ F(S(t)) \leq \Theta $$

## 1️⃣ Naive FDA

In the naive approach, we eliminate the update vector from the local state (i.e. recuce the dimension to 0). Define local state as

$$ S_i(t) = \lVert \Delta_t^{(i)} \rVert_2^2 \in \mathbb{R}$$ 

and the identity function

$$ F(v) = v $$

It is trivial that $ F(S(t)) \leq \Theta $ implies the RTC.

In [14]:
@tf.function
def steps(last_sync_model, model, dataset, C):
    # number of steps depent on `.take()` from `dataset`
    client_train(model, dataset, C)
    
    Delta_i = model - last_sync_model
    Delta_i_euc_norm_squared = tf.reduce_sum(tf.square(Delta_i), axis=0) # ||D(t)_i||^2
    
    return Delta_i_euc_norm_squared, Delta_i

## Accuracy Testing

In [15]:
@tf.function
def accuracy(model, dataset):
    
    @tf.function
    def _batch_accuracy(model, batch):
        x_batch, y_batch = batch
        # from shape (d,) make it (d,1)
        y_batch = tf.expand_dims(y_batch, axis=1)

        # dot(w, x) for the batch (each instance of x in x_batch) with with shape=(batchsize, 1)
        weights_dot_x_batch = tf.matmul(x_batch, model)

        # Prediction batch with shape=(batchsize, 1)
        y_pred_batch = tf.sign(weights_dot_x_batch)

        accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred_batch, y_batch), tf.float32))

        return accuracy
    
    # We take advantage of AutoGraph (convert Python code to TensorFlow-compatible graph code automatically)
    acc, num_batches = 0., 0.
    for batch in dataset:
        acc += _batch_accuracy(model, batch)
        num_batches += 1
        
    acc = acc / num_batches
    
    return acc

## Training Loop

In [16]:
import sys

In [17]:
class Metrics:
    def __init__(self, fda_name, n_train, d_train, test_size, num_clients, 
                 batch_size, steps_at_a_time, theta, C, sketch_width=0, sketch_depth=0):
        
        if fda_name == "Naive":
            local_state_bytes = 4
        elif fda_name == "Linear":
            local_state_bytes = 8
        else:
            local_state_bytes = sketch_width * sketch_depth * 4
        
        self.initial_conditions = {
            "theta" : tf.constant(theta, shape=(), dtype=tf.float64),
            "n_train" : tf.constant(n_train, shape=(), dtype=tf.int64),
            "d_train" : tf.constant(d_train, shape=(), dtype=tf.int64),
            "test_size" : tf.constant(test_size, shape=(), dtype=tf.float64),
            "num_clients" : tf.constant(num_clients, shape=(), dtype=tf.int64),
            "batch_size" : tf.constant(batch_size, shape=(), dtype=tf.int64),
            "steps_at_a_time" : tf.constant(steps_at_a_time, shape=(), dtype=tf.int64),
            "local_state_bytes" : tf.constant(local_state_bytes, shape=(), dtype=tf.int64),
            "sketch_width" : tf.constant(sketch_width, shape=(), dtype=tf.int64),
            "sketch_depth" : tf.constant(sketch_depth, shape=(), dtype=tf.int64),
            "C" : tf.constant(C, shape=(), dtype=tf.float64),
            "model_bytes" : tf.constant(d*4, shape=(), dtype=tf.int64)
        }
        
        self.final_metrics = {
            "accuracy" : tf.Variable(0., dtype=tf.float32),
            "steps" : tf.Variable(0, dtype=tf.int64),
            "rounds" : tf.Variable(0, dtype=tf.int64),
            "monitoring_bytes_exchanged" : tf.Variable(0, dtype=tf.int64),
            "model_bytes_exchanged" : tf.Variable(0, dtype=tf.int64),
            "total_communication_bytes" : tf.Variable(0, dtype=tf.int64),
            "trained_in_bytes" : tf.Variable(0, dtype=tf.int64)
        }

    @tf.function
    def step_completed(self):
        self.final_metrics["steps"].assign_add(1)
        
        self.final_metrics["monitoring_bytes_exchanged"].assign_add(
            self.initial_conditions["local_state_bytes"] * self.initial_conditions["num_clients"]
        )
        
    
    @tf.function
    def round_completed(self):
        # x2 because server sends back
        self.final_metrics["model_bytes_exchanged"].assign_add(
            self.initial_conditions["model_bytes"] * self.initial_conditions["num_clients"] * 2
        )
        
        self.final_metrics["rounds"].assign_add(1)
        
        
    @tf.function
    def compute_final_metrics(self, model):
        
        self.final_metrics["accuracy"].assign(
            tf.cast(accuracy(model, test_dataset), dtype=tf.float32)
        )
        
        self.final_metrics["total_communication_bytes"].assign(
            self.final_metrics["monitoring_bytes_exchanged"] + self.final_metrics["model_bytes_exchanged"]
        )
        
        # `steps_at_a_time` basically is the number of batches per step (number of SGD steps<->batches)
        self.final_metrics["trained_in_bytes"].assign(
            self.initial_conditions["batch_size"] * (self.initial_conditions["d_train"] + 1) * 4 \
            * self.initial_conditions["steps_at_a_time"] * self.final_metrics["steps"] \
            * self.initial_conditions["num_clients"]
        )
        

In [18]:
w_spec = tf.TensorSpec(shape=(20, d, 1), dtype=tf.float32)

@tf.function(input_signature=[w_spec, w_spec])
def variance(w_t, w_sync):
    # w_t , w_sync tensors with shape=(NUM_CLIENTS, d, 1)
    
    # tensor with shape=(NUM_CLIENTS, d, 1)
    diff = w_t - w_sync
    
    # tensor with shape=(NUM_CLIENTS, 1) , For each client ||w_i_t - w_t||^2
    dot = tf.reduce_sum(tf.square(diff), axis=1)
    
    # Variance shape=() , scalar
    var = tf.reduce_mean(dot)
    
    return var

In [19]:
NUM_CLIENTS = 20

server_model = tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)

testing_approx_0 = tf.Variable(tf.zeros(shape=()), trainable=True, name='zeros', dtype=tf.float32)
    
client_models = [
    tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)
    for _ in range(NUM_CLIENTS)
]

In [20]:
client_slices_train = create_data_for_clients(NUM_CLIENTS)
BATCH_SIZE = 32
NUM_STEPS_UNTIL_RTC_CHECK = 1

federated_dataset = create_federated_data(
    client_slices_train=client_slices_train,
    batch_size=BATCH_SIZE,
    shuffle_buffer_size=int(n/20),
    num_steps_until_rtc_check=NUM_STEPS_UNTIL_RTC_CHECK,
    seed=seed
)

In [21]:
NUM_ROUNDS = 5
C = 0.01
theta = 2.
    
metrics = Metrics(
    fda_name="Naive",
    n_train=n, 
    d_train=d,
    test_size=test_size,
    num_clients=NUM_CLIENTS,
    batch_size= BATCH_SIZE,
    steps_at_a_time=NUM_STEPS_UNTIL_RTC_CHECK,
    theta=theta,
    C=C
)

In [22]:
def RTC(S, theta):
    return S <= theta

In [23]:

def train_everything(server_model, client_models, federated_dataset):
    
    S = tf.constant(0., dtype=tf.float32, shape=())
    theta = 2.
    
    for r in range(NUM_ROUNDS):
        
        while RTC(S, theta):
            S_i_clients = []
            
            Delta_i_clients = [] # test

            # client steps (number depends on `federated_dataset`, i.e., `.take(num)`)
            for client_model, client_dataset in zip(client_models, federated_dataset):
                # test Delta_i
                Delta_i_euc_norm_squared, Delta_i = steps(server_model, client_model, client_dataset, 0.01)
                S_i_clients.append(Delta_i_euc_norm_squared)
                
                # test 
                Delta_i_clients.append(Delta_i) # test
                
            S = tf.reduce_mean(S_i_clients)
            
            metrics.step_completed() # METRICS
            
        # server average
        server_model.assign(tf.reduce_mean(client_models, axis=0))
        
        # test
        testing_approx_0 = tf.reduce_sum(tf.square(tf.reduce_mean(Delta_i_clients, axis=0)), axis=0) #test
        
        
        #test
        print(f"var_approx = {S}")
        print(f"assumed_0 = {testing_approx_0}")
        print(f"var_actual = {variance(client_models, [server_model]*NUM_CLIENTS)}")
        print()
        
        # reset variance approx
        S = tf.constant(0., dtype=tf.float32, shape=())

        # synchronize clients
        for client_model in client_models:
            client_model.assign(server_model)
        
        metrics.round_completed() # METRICS
    
    
    metrics.compute_final_metrics(server_model) # METRICS

In [24]:
train_everything(server_model, client_models, federated_dataset)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
var_approx = 2.0028328895568848
assumed_0 = [1.7377768]
var_actual = 0.26505622267723083

var_approx = 2.1058177947998047
assumed_0 = [1.8244708]
var_actual = 0.2813469469547272

var_approx = 2.0628159046173096
assumed_0 = [1.7777321]
var_actual = 0.2850838899612427

var_approx = 2.036029100418091
assumed_0 = [1.7657549]
var_actual = 0.2702741026878357

var_approx = 2.1346335411071777
assumed_0 = [1.8287218]
var_actual = 0.3059118986129761



In [25]:
accuracy(server_model, test_dataset)

<tf.Tensor: shape=(), dtype=float32, numpy=0.8177915>

In [26]:
metrics.final_metrics["accuracy"]

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.8177915>

In [27]:
metrics.final_metrics["steps"]

<tf.Variable 'Variable:0' shape=() dtype=int64, numpy=121>

In [28]:
metrics.final_metrics["rounds"]

<tf.Variable 'Variable:0' shape=() dtype=int64, numpy=5>

In [29]:
metrics.final_metrics["monitoring_bytes_exchanged"]

<tf.Variable 'Variable:0' shape=() dtype=int64, numpy=9680>

In [30]:
metrics.final_metrics["model_bytes_exchanged"]

<tf.Variable 'Variable:0' shape=() dtype=int64, numpy=80000>

In [31]:
metrics.final_metrics["total_communication_bytes"]

<tf.Variable 'Variable:0' shape=() dtype=int64, numpy=89680>

In [32]:
metrics.final_metrics["trained_in_bytes"]

<tf.Variable 'Variable:0' shape=() dtype=int64, numpy=31285760>

1. Add input_spec everywhere to avoid tracing [here](https://stackoverflow.com/questions/52774351/how-to-run-parallel-map-fn-when-eager-execution-enabled#:~:text=First%2C%20using%20tf.,once%2C%20so%2C%20the%20time.)
2. Read [graphs](https://www.tensorflow.org/guide/intro_to_graphs)