In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

In [2]:
tf.__version__

'2.11.0'

## Create Binary Classification data with sklearn

In [3]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


n = 100_000 # we want 200_00
d = 100


noise_factor = 0.01 # % of the labels are randomly flipped, DEFAULT=0.01
test_size = 0.2 # % of n
# The factor multiplying the hypercube size. Larger values spread out the 
# clusters/classes and make the classification task easier. DEFAULT=1
class_sep = -1
seed = 7 # `None` for no seed

# Create (noisy) testing data for binary classification.
X, y = make_classification(
    n_samples=n, 
    n_features=d,
    n_informative=d,
    n_redundant=0, 
    n_classes=2,
    class_sep=class_sep,
    flip_y=noise_factor,
    random_state=seed
)

# We will work with label values -1, +1 and not 0, +1 (convert)
y[y == 0] = -1

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [4]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

# PA-I regressor from sklearn
pa1 = PassiveAggressiveClassifier(C=0.01, loss="hinge", n_jobs=-1)
pa1.fit(X_train[:10], y_train[:10])

accuracy_score(y_test, pa1.predict(X_test))

0.52745

## Convert to Tensors

In [5]:
X_train_tensor = tf.constant(X_train, dtype=tf.float32)
y_train_tensor = tf.constant(y_train, dtype=tf.float32)
X_test_tensor = tf.constant(X_test, dtype=tf.float32)
y_test_tensor = tf.constant(y_test, dtype=tf.float32)

Delete sklearn type data 

In [6]:
del X, y, X_train, X_test, y_train, y_test

## Prepare data for Federated Learning

### Create centralized testing dataset

In [7]:
slices_test = (X_test_tensor, y_test_tensor)

In [8]:
def create_tf_dataset_for_testing(batch_size):
    return tf.data.Dataset.from_tensor_slices(slices_test).batch(batch_size)

In [9]:
test_dataset = create_tf_dataset_for_testing(32)

### Slice the Tensors for each Client

We will cut the training data, i.e., (`X_train_tensor`, `y_train_tensor`) to equal parts, each part corresponding to one Client. We want to give the result back as a dictionary with key `client_id` and value the training tensor data.

In [10]:
def create_data_for_clients(num_clients):
    
    client_slices_train = {}

    n_test = int(n - n*test_size)

    for i in range(num_clients):
        # Compute the indices for this client's slice
        start_idx = int(i * n_test / num_clients)
        end_idx = int((i + 1) * n_test / num_clients)

        # Get the slice for this client
        X_client_train = X_train_tensor[start_idx:end_idx]
        y_client_train = y_train_tensor[start_idx:end_idx]
        
        # Combine the slices into a single dataset
        client_slices_train[f'client_{i}'] = (X_client_train, y_client_train)
    
    return client_slices_train

### Create TF friendly data for each Client

Given a Tensor slice (i.e. value of `client_slices_train["client_id"]` we convert it to highly optimized `tf.data.Dataset` to prepare for training.

In [11]:
def create_tf_dataset_for_client(client_tensor_slices, batch_size, shuffle_buffer_size, num_steps_until_rtc_check, seed):
    
        return tf.data.Dataset.from_tensor_slices(client_tensor_slices) \
            .shuffle(buffer_size=shuffle_buffer_size, seed=seed).batch(batch_size) \
            .prefetch(tf.data.AUTOTUNE).take(num_steps_until_rtc_check)

### Create Federated Learning data

In [12]:
def create_federated_data(client_slices_train, batch_size, shuffle_buffer_size, num_steps_until_rtc_check, seed=None):
    
    federated_dataset = [ 
        create_tf_dataset_for_client(client_tensor_slices, batch_size, shuffle_buffer_size, num_steps_until_rtc_check, seed)
        for client, client_tensor_slices in client_slices_train.items()
    ]
    
    return federated_dataset

# Miscallenious

## Metrics

In [13]:
def create_metrics_dict(fda_name, n, d, test_size, seed, class_sep, epochs, num_clients, batch_size, 
                        steps_in_one_fda_step, theta, c, total_fda_steps, 
                        total_rounds, final_accuracy, sketch_width=None, sketch_depth=None):
    metrics = {
            "fda_name" : fda_name,
            "theta" : theta,
            "n" : n,
            "d" : d,
            "seed" : seed,
            "class_sep" : class_sep,
            "test_size" : test_size,
            "epochs" : epochs,
            "num_clients" : num_clients,
            "batch_size" : batch_size,
            "steps_in_one_fda_step" : steps_in_one_fda_step,
            "sketch_width" : sketch_width,
            "sketch_depth" : sketch_depth,
            "c" : c
        }
    
    # one batch bytes
    metrics["one_sample_bytes"] = 4 * (metrics["d"] + 1)
    
    # training dataset size
    metrics["training_dataset_bytes"] = metrics["one_sample_bytes"] * (1 - metrics["test_size"]) * metrics["n"]
    
    # model bytes
    metrics["model_bytes"] = d * 4
    
    
    # local state bytes (i.e. S_i), for one client
    if fda_name == "naive":
        metrics["local_state_bytes"] = 4
    elif fda_name == "linear":
        metrics["local_state_bytes"] = 8
    else:
        metrics["local_state_bytes"] = sketch_width * sketch_depth * 4 + 4
        
    # accuracy (already computed in parameter)
    metrics["final_accuracy"] = final_accuracy
    
    # total fda steps from algo
    metrics["total_fda_steps"] = total_fda_steps
    
    # total steps (a single fda step might have many normal SGD steps, batch steps)
    metrics["total_steps"] = metrics["total_fda_steps"] * metrics["steps_in_one_fda_step"]
    
    # total rounds in algo. Reason why we differentiate from the hardcoded NUM_ROUNDS
    # is because we might run less rounds in the future (i.e. stop on 10^7 samples idk)
    metrics["total_rounds"] = total_rounds
    
    # bytes exchanged for synchronizing weights (x2 because server sends back)
    metrics["model_bytes_exchanged"] = metrics["total_rounds"] * metrics["model_bytes"] \
        * metrics["num_clients"] * 2
    
    # bytes exchanged for monitoring the variance (communication)
    metrics["monitoring_bytes_exchanged"] = metrics["local_state_bytes"] * metrics["total_fda_steps"] \
        * metrics["num_clients"]
    
    # total communication bytes (for both monitoring and model synchronization)
    metrics["total_communication_bytes"] = metrics["model_bytes_exchanged"] + metrics["monitoring_bytes_exchanged"]
    
    # total seen dataset bytes (across all learning, i.e., all clients)
    metrics["trained_in_bytes"] = metrics["batch_size"] * metrics["one_sample_bytes"] \
        * metrics["total_steps"] * metrics["num_clients"]
    
    return metrics

## Variance

In [14]:
# = client_num
w_spec = tf.TensorSpec(shape=(20, d, 1), dtype=tf.float32)

@tf.function(input_signature=[w_spec, w_spec])
def variance(w_t, w_sync):
    # w_t , w_sync tensors with shape=(NUM_CLIENTS, d, 1)
    
    # tensor with shape=(NUM_CLIENTS, d, 1)
    diff = w_t - w_sync
    
    # tensor with shape=(NUM_CLIENTS, 1) , For each client ||w_i_t - w_t||^2
    dot = tf.reduce_sum(tf.square(diff), axis=1)
    
    # Variance shape=() , scalar
    var = tf.reduce_mean(dot)
    
    return var

## Accuracy testing

In [15]:
@tf.function
def accuracy(model, dataset):
    
    @tf.function
    def _batch_accuracy(model, batch):
        x_batch, y_batch = batch
        # from shape (d,) make it (d,1)
        y_batch = tf.expand_dims(y_batch, axis=1)

        # dot(w, x) for the batch (each instance of x in x_batch) with with shape=(batchsize, 1)
        weights_dot_x_batch = tf.matmul(x_batch, model)

        # Prediction batch with shape=(batchsize, 1)
        y_pred_batch = tf.sign(weights_dot_x_batch)

        accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred_batch, y_batch), tf.float32))

        return accuracy
    
    # We take advantage of AutoGraph (convert Python code to TensorFlow-compatible graph code automatically)
    acc, num_batches = 0., 0.
    for batch in dataset:
        acc += _batch_accuracy(model, batch)
        num_batches += 1
        
    acc = acc / num_batches
    
    return acc

## PA-Classiers (binary classification)

![PA](images/PA_binary_classifiers.png)

In [16]:
@tf.function
def client_train(model, dataset, C):

    @tf.function
    def _train_on_batch(model, batch, C):

        x_batch, y_batch = batch
        
        # from shape (d,) make it (d,1)
        y_batch = tf.expand_dims(y_batch, axis=1)

        # dot(w, x) for the batch (each instance of x in x_batch) with with shape=(batchsize, 1)
        weights_dot_x_batch = tf.matmul(x_batch, model)

        # Prediction batch with shape=(batchsize, 1)
        y_pred_batch = tf.sign(weights_dot_x_batch)

        # Suffer loss for each prediction (of instance) in the batch with shape=(batchsize,1)
        loss_batch = tf.maximum(0., 1. - tf.multiply(y_batch, weights_dot_x_batch))

        # shape=(batchsize,1) where each instance is ||x||^2, x in x_batch
        norm_batch = tf.expand_dims(tf.reduce_sum(tf.square(x_batch), axis=1), axis=1)
        
        # PA-1 : Learning rate t for each instance x, with shape=(batchsize,1)
        t_batch = tf.maximum(C, tf.divide(loss_batch, norm_batch))

        # each instance is y*t*x, where y,t scalars and x in x_batch. shape=(batchsize,d)
        t_y_x_batch = tf.multiply(t_batch, tf.multiply(y_batch, x_batch))

        # !!!! Update with mean t*y*x
        t_y_x_update = tf.expand_dims(tf.reduce_mean(t_y_x_batch, axis=0) ,axis=1)

        # Update
        model.assign_add(t_y_x_update)
    
    for batch in dataset:
        _train_on_batch(model, batch, C)
    

# Functional Dynamic Averaging

We follow the Functional Dynamic Averaging (FDA) scheme. Let the mean model be

$$ \overline{w_t} = \frac{1}{k} \sum_{i=1}^{k} w_t^{(i)} $$

where $ w_t^{(i)} $ is the model at time $ t $ in some round in the $i$-th learner.

Local models are trained independently and cooperatively and we want to monitor the Round Terminating Conditon (**RTC**):

$$ \frac{1}{k} \sum_{i=1}^{k} \lVert w_t^{(i)} - \overline{w_t} \rVert_2^2  \leq \Theta $$

where the left-hand side is the **model variance**, and threshold $\Theta$ is a hyperparameter of the FDA, defined at the beginning of the round; it may change at each round. When the monitoring logic cannot guarantee the validity of RTC, the round terminates. All local models are pulled into `tff.SERVER`, and $\bar{w_t}$ is set to their average. Then, another round begins.

### Monitoring the RTC

FDA monitors the RTC by applying techniques from Functionary [Functional Geometric Averaging](http://users.softnet.tuc.gr/~minos/Papers/edbt19.pdf). We first restate the problem of monitoring RTC into the standard distributed stream monitoring formulation. Let

$$ S(t) =  \frac{1}{k} \sum_{i=1}^{k} S_i(t) $$

where $ S(t) \in \mathbb{R}^n $ be the "global state" of the system and $ S_i(t) \in \mathbb{R}^n $ the "local states". The goal is to monitor the threshold condition on the global state in the form $ F(S(t)) \leq \Theta $ where $ F : \mathbb{R}^n \to \mathbb{R} $ a non-linear function. Let

$$ \Delta_t^{(i)} = w_t^{(i)} - w_{t_0}^{(i)} $$

be the update at the $ i $-th learner, that is, the change to the local model at time $t$ since the beginning of the current round at time $t_0$. Let the average update be

$$ \overline{\Delta_t} = \frac{1}{k} \sum_{i=1}^{k} \Delta_t^{(i)} $$

it follows that the variance can be written as

$$ \frac{1}{k} \sum_{i=1}^{k} \lVert w_t^{(i)} - \overline{w_t} \rVert_2^2 = \Big( \frac{1}{k} \sum_{i=1}^{k} \lVert \Delta_t^{(i)} \rVert_2^2 \Big) - \lVert \overline{\Delta_t} \rVert_2^2 $$

So, conceptually, if we define
$$ S_i(t) = \begin{bmatrix}
           \lVert \Delta_t^{(i)} \rVert_2^2 \\
           \Delta_t^{(i)}
         \end{bmatrix} \quad \text{and} \quad
         F(\begin{bmatrix}
           v \\
           \bf{x}
         \end{bmatrix}) = v - \lVert \bf{x} \rVert_2^2 $$

The RTC is equivalent to condition $$ F(S(t)) \leq \Theta $$

In [17]:
import sys

## 1️⃣ Naive FDA

In the naive approach, we eliminate the update vector from the local state (i.e. recuce the dimension to 0). Define local state as

$$ S_i(t) = \lVert \Delta_t^{(i)} \rVert_2^2 \in \mathbb{R}$$ 

and the identity function

$$ F(v) = v $$

It is trivial that $ F(S(t)) \leq \Theta $ implies the RTC.

In [18]:
@tf.function
def steps_naive(last_sync_model, model, dataset, C):
    # number of steps depend on `.take()` from `dataset`
    client_train(model, dataset, C)
    
    Delta_i = model - last_sync_model
    
    Delta_i_euc_norm_squared = tf.reduce_sum(tf.square(Delta_i), axis=0) # ||D(t)_i||^2
    
    return Delta_i_euc_norm_squared

### Training Loop

In [19]:
def F_naive(S):
    return S

In [20]:
@tf.function
def run_federated_simulation_naive(server_model, client_models, federated_dataset, C,
                                   num_epochs, theta, epoch_fda_steps):
    
    print("retracing naive")
    
    total_rounds = 0
    total_fda_steps = 0
    
    round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
    epoch_count = tf.constant(0, shape=(), dtype=tf.int32)
    
    S = tf.constant(0., shape=(), dtype=tf.float32)
    
    while epoch_count < num_epochs:
        
        while F_naive(S) <= theta:
            S_i_clients = []

            # client steps (number depends on `federated_dataset`, i.e., `.take(num)`)
            for client_model, client_dataset in zip(client_models, federated_dataset):
                Delta_i_euc_norm_squared = steps_naive(server_model, client_model, client_dataset, C)
                S_i_clients.append(Delta_i_euc_norm_squared)
                
            S = tf.reduce_mean(S_i_clients)
            
            round_fda_steps += 1
            total_fda_steps += 1
            
            if round_fda_steps == epoch_fda_steps:
                epoch_count += 1
                round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
                
                if epoch_count == num_epochs:
                    break
        
        
        """------------------------------test--------------------------------------------"""
        #tf.print("\n sync : ", output_stream=sys.stdout)
        #Delta_i_clients = [tf.subtract(client_model, server_model) for client_model in client_models] #test
        #testing_approx_0 = tf.reduce_sum(tf.square(tf.reduce_mean(Delta_i_clients, axis=0)), axis=0) #test
        #for client_model in client_models:
        #        tf.print("acc : ", accuracy(client_model, test_dataset), output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        # server average
        server_model.assign(tf.reduce_mean(client_models, axis=0))
        
        """------------------------------test--------------------------------------------"""
        #tf.print("acc (after) : ", accuracy(server_model, test_dataset), output_stream=sys.stdout)
        #tf.print("Naive Epoch count: ", epoch_count, " Round fda steps: ", round_fda_steps, " Epoch fda steps:", epoch_fda_steps, output_stream=sys.stdout)
        #actual_var = variance(client_models, [server_model]*len(client_models)) #test
        #tf.print("Est var: ", S, " Actual var: ", actual_var, " Total fda steps: ", total_fda_steps, output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        # reset variance approx
        S = tf.constant(0., shape=(), dtype=tf.float32)

        # synchronize clients
        for client_model in client_models:
            client_model.assign(server_model)
            
        total_rounds += 1
    
    return total_rounds, total_fda_steps

## 2️⃣ Linear FDA

In the linear case, we reduce the update vector to a scalar, $ \xi \Delta_t^{(i)} \in \mathbb{R}$, where $ \xi $ is any unit vector.

Define the local state to be 

$$ S_i(t) = \begin{bmatrix}
           \lVert \Delta_t^{(i)} \rVert_2^2 \\
           \xi \Delta_t^{(i)}
         \end{bmatrix} \in \mathbb{R}^2 $$

Also, define 

$$ F(v, x) = v - x^2 $$

The RTC is equivalent to condition 

$$ F(S(t)) \leq \Theta $$

A random choice of $ \xi $ is likely to perform poorly (terminate round prematurely), as it wil likely be close to orthogonal to $ \overline{\Delta_t} $. A good choice would be a vector $ \xi $ correlated to $ \overline{\Delta_t} $. A heuristic choice is to take $ \overline{\Delta_{t_0}} $ (after scaling it to norm 1), i.e., the update vector right before the current round started. All nodes can estimate this without communication, as $ \overline{w_{t_0}} - \overline{w_{t_{-1}}} $, the difference of the last two models pushed by the Server. Hence, 

$$ \xi = \frac{\overline{w_{t_0}} - \overline{w_{t_{-1}}}}{\lVert \overline{w_{t_0}} - \overline{w_{t_{-1}}} \rVert_2} $$

In [21]:
@tf.function
def ksi_unit_fn(w_t0, w_tminus1):
    
    if tf.reduce_all(tf.equal(w_t0, w_tminus1)):
        # if equal then ksi becomes a random vector (will only happen in round 1)
        ksi = tf.random.normal(shape=w_t0.shape)
    else:
        ksi = w_t0 - w_tminus1

    # Normalize and return
    return tf.divide(ksi, tf.norm(ksi))

In [22]:
@tf.function
def steps_linear(model_tminus, model_t0, model, dataset, C):
    # number of steps depend on `.take()` from `dataset`
    client_train(model, dataset, C)
    
    Delta_i = model - model_t0
    
    #||D(t)_i||^2 , shape = (1,) 
    Delta_i_euc_norm_squared = tf.reduce_sum(tf.square(Delta_i), axis=0)
    
    # heuristic unit vector ksi
    ksi = ksi_unit_fn(model_t0, model_tminus)
    
    # ksi * Delta_i (* is dot) , shape = ()
    ksi_Delta_i = tf.reduce_sum(tf.multiply(ksi, Delta_i))
    
    return Delta_i_euc_norm_squared, ksi_Delta_i

### Training Loop

In [23]:
def F_linear(S_1, S_2):
    return S_1 - S_2**2

In [24]:
@tf.function
def run_federated_simulation_linear(previous_server_model, server_model,
                                    client_models, federated_dataset, C, 
                                    num_epochs, theta, epoch_fda_steps):
    
    print("retracing linear")
    
    total_rounds = 0
    total_fda_steps = 0
    
    round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
    epoch_count = tf.constant(0, shape=(), dtype=tf.int32)
    
    S_1 = tf.constant(0., shape=(), dtype=tf.float32)
    S_2 = tf.constant(0., shape=(), dtype=tf.float32)
    
    while epoch_count < num_epochs:
        
        while F_linear(S_1, S_2) <= theta:
            euc_norm_squared_clients = []
            ksi_delta_clients = []

            # client steps (number depends on `federated_dataset`, i.e., `.take(num)`)
            for client_model, client_dataset in zip(client_models, federated_dataset):
                
                Delta_i_euc_norm_squared, ksi_Delta_i  = steps_linear(
                    previous_server_model, server_model, client_model, client_dataset, C
                )
                
                euc_norm_squared_clients.append(Delta_i_euc_norm_squared)
                ksi_delta_clients.append(ksi_Delta_i)
                
            S_1 = tf.reduce_mean(euc_norm_squared_clients)
            S_2 = tf.reduce_mean(ksi_delta_clients)
            
            round_fda_steps += 1
            total_fda_steps += 1
            
            if round_fda_steps == epoch_fda_steps:
                epoch_count += 1
                round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
                
                if epoch_count == num_epochs:
                    break
        
        # last server model (previous sync)
        previous_server_model.assign(server_model)
        
        """------------------------------test--------------------------------------------"""
        #tf.print("\n sync : ", output_stream=sys.stdout)
        #Delta_i_clients = [tf.subtract(client_model, server_model) for client_model in client_models] #test
        #testing_approx_0 = tf.reduce_sum(tf.square(tf.reduce_mean(Delta_i_clients, axis=0)), axis=0) #test
        #for client_model in client_models:
        #        tf.print("acc : ", accuracy(client_model, test_dataset), output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        # server average
        server_model.assign(tf.reduce_mean(client_models, axis=0))
        
        """------------------------------test--------------------------------------------"""
        #tf.print("acc (after) : ", accuracy(server_model, test_dataset), output_stream=sys.stdout)
        #tf.print("Linear Epoch count: ", epoch_count, " Round fda steps: ", round_fda_steps, " Epoch fda steps:", epoch_fda_steps, output_stream=sys.stdout)
        #actual_var = variance(client_models, [server_model]*len(client_models)) #test
        #tf.print("Est left: ", S_1, " Est S_2: ", S_2**2, "Est var: ", S_1-S_2**2, " Actual var: ", actual_var, " Total fda steps: ", total_fda_steps, output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        
        # reset variance approx
        S_1 = tf.constant(0., shape=(), dtype=tf.float32)
        S_2 = tf.constant(0., shape=(), dtype=tf.float32)

        # synchronize clients
        for client_model in client_models:
            client_model.assign(server_model)
            
        total_rounds += 1
    
    return total_rounds, total_fda_steps

## 3️⃣ Sketch FDA

An optimal estimator for $ \lVert \overline{\Delta_t} \rVert_2^2  $ can be obtained by employing AMS sketches. An AMS sketch of a vector $ v \in \mathbb{R}^M $ is a $ d \times m $ real matrix

$$ \Xi = \text{sk}(v) = \begin{bmatrix}
           \Xi_1 \\
           \Xi_2 \\
           \vdots \\
           \Xi_d 
         \end{bmatrix} $$
         
where $ d \cdot m \ll M$. Operator sk($ \cdot $) is linear, i.e., let $a, b \in \mathbb{R}$ and $v_1, v_2 \in \mathbb{R}^N$ then 

$$ \text{sk}(a v_1 + b v_2) = a \; \text{sk}(v_1) + b \; \text{sk}(v_2)  $$

Also, sk($ v $) can be computed in $ \mathcal{O}(dN) $ steps.

The interesting property of AMS sketches is that the function 

$$ M(sk(\textbf{v})) = \underset{i=1,...,d}{\text{median}} \; \lVert \boldsymbol{\Xi}_i \rVert_2^2  $$ 

is an excellent estimator of the Euclidean norm of **v** (within relative $\epsilon$-error):

$$ M(sk(\textbf{v})) \; \in (1 \pm \epsilon) \lVert \textbf{v} \rVert_2^2 \; \; \text{with probability at least} \; (1-\delta) $$

where $m = \mathcal{O}(\frac{1}{\epsilon^2})$ and $d = \mathcal{O}(\log \frac{1}{\delta})$
            
Moreover, let $\boldsymbol{\Xi} \in \mathbb{R}^{d \times m}$ and $ k \in \mathbb{R}$. It can be proven that

$$ M( \frac{1}{k} \boldsymbol{\Xi}) = \frac{1}{k^2} M(\boldsymbol{\Xi}) $$

Let's investigate a little further on how this helps us. The $i$-th client computes $ sk(\Delta_t^{(i)}) $ and sends it to the server. Notice

$$ M\big(sk(\Delta_t^{(1)}) + sk(\Delta_t^{(2)}) + ... + sk(\Delta_t^{(k)}) \big) = M\Big( \text{sk}\big( \sum_{i=1}^{k} \Delta_t^{(i)} \big) \Big)$$

Remember that

$$ \overline{\boldsymbol{\Delta}}_t = \frac{1}{k} \sum_{i=1}^{k} \boldsymbol{\Delta}_t^{(i)} $$

Then
            
$$ M\Big( \text{sk}\big( \overline{\boldsymbol{\Delta}}_t \big) \Big) = M\Big( \text{sk}\big( \frac{1}{k} \sum_{i=1}^{k} \boldsymbol{\Delta}_t^{(i)} \big) \Big) = \frac{1}{k^2} M\Big( \text{sk}\big( \sum_{i=1}^{k} \boldsymbol{\Delta}_t^{(i)} \big) \Big) $$


Which means that 

$$ \frac{1}{k^2} M\Big( \text{sk}\big( \sum_{i=1}^{k} \boldsymbol{\Delta}_t^{(i)} \big) \Big) \in (1 \pm \epsilon) \lVert \overline{\boldsymbol{\Delta}}_t \rVert_2^2 \; \; \text{w.p. at least} \; (1-\delta) $$

In the monitoring process it is essential that we do not overestimate $ \lVert \overline{\Delta_t} \rVert_2^2 $ because we would then underestimate the variance which would potentially result in actual varience exceeding $ \Theta$ without us noticing it. With this in mind,

$$ \frac{1}{k^2} M\Big( \text{sk}\big( \sum_{i=1}^{k} \Delta_t^{(i)} \big) \Big) \leq (1+\epsilon) \lVert \overline{\Delta_t} \rVert_2^2 \quad \text{with probability at least} \; (1-\delta)$$

Which means

$$ \frac{1}{(1+\epsilon)} \frac{1}{k^2} M\Big( \text{sk}\big( \sum_{i=1}^{k} \Delta_t^{(i)} \big) \Big) \leq \lVert \overline{\Delta_t} \rVert_2^2 \quad \text{with probability at least} \; (1-\delta)$$

Hence, the Server's estimation of $ \lVert \overline{\Delta_t} \rVert_2^2 $ is

$$ \frac{1}{(1+\epsilon)} \frac{1}{k^2} M\Big( sk(\Delta_t^{(1)}) + sk(\Delta_t^{(2)}) + ... + sk(\Delta_t^{(k)}) \big) \Big) $$

Define the local state to be 

$$ S_i(t) = \begin{bmatrix}
           \lVert \Delta_t^{(i)} \rVert_2^2 \\
           sk(\Delta_t^{(i)})
         \end{bmatrix} \in \mathbb{R}^{1+d \times m} \quad \text{and} \quad
         F(\begin{bmatrix}
           v \\
           \Xi
         \end{bmatrix}) = v - \frac{1}{(1+\epsilon)} \frac{1}{k^2} M(\Xi) \quad \text{where} \quad \Xi = \sum_{i=1}^{k} sk(\Delta_t^{(i)}) $$

It follows that $ F(S(t)) \leq \Theta $ implies that the variance is less or equal to $ \Theta $ with probability at least $ 1-\delta $.


## AMS sketch

We use `ExtensionType` which is the way to go in order to avoid unecessary graph retracing when passing around `AmsSketch` type 'objects'.

In [25]:
from tensorflow.experimental import ExtensionType

class AmsSketch(ExtensionType):
    depth: int
    width: int
    F: tf.Tensor
        
    def __init__(self, depth=7, width=1500):
        self.depth = depth
        self.width = width
        self.F = tf.random.uniform(shape=(6, depth), minval=0, maxval=(1 << 31) - 1, dtype=tf.int32)

    @tf.function
    def hash31(self, x, a, b):

        r = a * x + b
        fold = tf.bitwise.bitwise_xor(tf.bitwise.right_shift(r, 31), r)
        return tf.bitwise.bitwise_and(fold, 2147483647)

    @tf.function
    def fourwise(self, x):

        result = 2 * (tf.bitwise.right_shift(tf.bitwise.bitwise_and(self.hash31(self.hash31(self.hash31(x, self.F[2], self.F[3]), x, self.F[4]), x, self.F[5]), 32768), 15)) - 1
        return result

    @tf.function
    def sketch_for_vector(self, v):

        sketch = tf.zeros(shape=(self.depth, self.width), dtype=tf.float32)
        indices = tf.range(tf.shape(v)[0], dtype=tf.int32)

        for i in indices:
            pos = self.hash31(i, self.F[0], self.F[1]) % self.width
            delta = tf.cast(self.fourwise(i), dtype=tf.float32) * v[i]
            indices_to_update = tf.stack([tf.range(self.depth, dtype=tf.int32), pos], axis=1)
            sketch = tf.tensor_scatter_nd_add(sketch, indices_to_update, delta)

        return sketch
    
    @staticmethod
    @tf.function
    def estimate_euc_norm_squared(sketch):

        @tf.function
        def _median(v):
            """ Median of tensor `v` with shape=(n,). Note: Suboptimal O(nlogn) but it's ok bcz n = `depth`"""
            length = tf.shape(v)[0]
            sorted_v = tf.sort(v)
            middle = length // 2

            return tf.cond(
                tf.equal(length % 2, 0),
                lambda: (sorted_v[middle - 1] + sorted_v[middle]) / 2.0,
                lambda: sorted_v[middle]
            )

        return _median(tf.reduce_sum(tf.square(sketch), axis=1))

In [26]:
@tf.function
def steps_sketch(last_sync_model, model, dataset, C, ams_sketch):
    # number of steps depend on `.take()` from `dataset`
    client_train(model, dataset, C)
    
    Delta_i = model - last_sync_model
    
    #||D(t)_i||^2 , shape = (1,) 
    Delta_i_euc_norm_squared = tf.reduce_sum(tf.square(Delta_i), axis=0)
    
    # sketch approx
    sketch = ams_sketch.sketch_for_vector(Delta_i)
    
    return Delta_i_euc_norm_squared, sketch

### Training Loop

In [27]:
def F_sketch(S_1, S_2, epsilon, num_clients):
    """ `S_1` is mean || ||^2 as usual, S_2 is the `Ξ` as defined in the theoretical analysis above """
    one = tf.constant(1., shape=(), dtype=tf.float32)
    
    return S_1 - (one / (one + epsilon)) * (one / num_clients**2) * AmsSketch.estimate_euc_norm_squared(S_2)

In [28]:
@tf.function
def run_federated_simulation_sketch(server_model, client_models, federated_dataset, C,
                                    num_epochs, theta, epoch_fda_steps, 
                                    ams_sketch, epsilon, num_clients):
    
    print("retracing sketch")
    
    total_rounds = 0
    total_fda_steps = 0
    
    round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
    epoch_count = tf.constant(0, shape=(), dtype=tf.int32)
    
    S_1 = tf.constant(0., shape=(), dtype=tf.float32)
    S_2 = tf.zeros(shape=(ams_sketch.depth, ams_sketch.width), dtype=tf.float32)
    
    while epoch_count < num_epochs:
        
        while F_sketch(S_1, S_2, epsilon, num_clients) <= theta:
            euc_norm_squared_clients = []
            sketch_clients = []

            # client steps (number depends on `federated_dataset`, i.e., `.take(num)`)
            for client_model, client_dataset in zip(client_models, federated_dataset):
                
                Delta_i_euc_norm_squared, sketch  = steps_sketch(
                    server_model, client_model, client_dataset, C, ams_sketch
                )
                
                euc_norm_squared_clients.append(Delta_i_euc_norm_squared)
                sketch_clients.append(sketch)
                
            S_1 = tf.reduce_mean(euc_norm_squared_clients)
            S_2 = tf.reduce_sum(sketch_clients, axis=0)  # shape=(`depth`, width`). See `Ξ` in theoretical analysis
            
            round_fda_steps += 1
            total_fda_steps += 1
            
            if round_fda_steps == epoch_fda_steps:
                epoch_count += 1
                round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
                
                if epoch_count == num_epochs:
                    break
        
        """------------------------------test--------------------------------------------"""
        #tf.print("\n sync : ", output_stream=sys.stdout)
        #Delta_i_clients = [tf.subtract(client_model, server_model) for client_model in client_models] #test
        #testing_approx_0 = tf.reduce_sum(tf.square(tf.reduce_mean(Delta_i_clients, axis=0)), axis=0) #test
        #for client_model in client_models:
        #        tf.print("acc : ", accuracy(client_model, test_dataset), output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        # server average
        server_model.assign(tf.reduce_mean(client_models, axis=0))
        
        """------------------------------test--------------------------------------------"""
        #tf.print("acc (after) : ", accuracy(server_model, test_dataset), output_stream=sys.stdout)
        #tf.print("Sketch Epoch count: ", epoch_count, " Round fda steps: ", round_fda_steps, " Epoch fda steps:", epoch_fda_steps, output_stream=sys.stdout)
        #actual_var = variance(client_models, [server_model]*len(client_models)) #test
        #tf.print("Est left: ", S_1, "Est var: ", F_sketch(S_1, S_2, epsilon, num_clients), " Actual var: ", actual_var, " Total fda steps: ", total_fda_steps, output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        
        # reset variance approx
        S_1 = tf.constant(0., shape=(), dtype=tf.float32)
        S_2 = tf.zeros(shape=(ams_sketch.depth, ams_sketch.width), dtype=tf.float32)

        # synchronize clients
        for client_model in client_models:
            client_model.assign(server_model)
            
        total_rounds += 1
    
    return total_rounds, total_fda_steps

# Simulation tests

In [29]:
def basic_test(NUM_EPOCHS, C, NUM_STEPS_UNTIL_RTC_CHECK, NUM_CLIENTS,
               BATCH_SIZE, THETA, EPSILON, ams_sketch):
    
    """ One test for Naive,Linear,Sketch. Returns metrics """
    
    c = tf.constant(C, shape=(), dtype=tf.float32)
    num_epochs = tf.constant(NUM_EPOCHS, shape=(), dtype=tf.int32)
    theta = tf.constant(THETA, shape=(), dtype=tf.float32)
    
    # for sketch
    epsilon = tf.constant(EPSILON, shape=(), dtype=tf.float32) # new
    num_clients = tf.constant(float(NUM_CLIENTS), shape=(), dtype=tf.float32) # new
    
    
    epoch_client_batches = ((1-test_size)*n / BATCH_SIZE) / NUM_CLIENTS
    epoch_max_fda_steps = epoch_client_batches / NUM_STEPS_UNTIL_RTC_CHECK
    epoch_max_fda_steps = tf.constant(int(epoch_max_fda_steps), shape=(), dtype=tf.int32)
    
    basic_test_metrics = []
    
    """ --------------- Naive ----------------------------------"""
        
    # 1. Dataset (we create it again because we want determinism)

    client_slices_train = create_data_for_clients(NUM_CLIENTS)

    federated_dataset = create_federated_data(
        client_slices_train=client_slices_train,
        batch_size=BATCH_SIZE,
        shuffle_buffer_size=int(n/20),
        num_steps_until_rtc_check=NUM_STEPS_UNTIL_RTC_CHECK,
        seed=seed
    )

    # 2. Models init
    
    server_model = tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)

    client_models = [
        tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)
        for _ in range(NUM_CLIENTS)
    ]

    # 3. Run 
    
    total_rounds, total_fda_steps = run_federated_simulation_naive(
        server_model, 
        client_models, 
        federated_dataset, 
        c,
        num_epochs, 
        theta,
        epoch_max_fda_steps
    )

    # 4. Compute metrics 
    
    final_accuracy = accuracy(server_model, test_dataset)

    metrics = create_metrics_dict(
        fda_name="naive", 
        n=n, 
        d=d, 
        test_size=test_size,
        seed=seed,
        class_sep=class_sep,
        epochs=NUM_EPOCHS,
        num_clients=NUM_CLIENTS, 
        batch_size=BATCH_SIZE, 
        steps_in_one_fda_step=NUM_STEPS_UNTIL_RTC_CHECK, 
        theta=THETA, 
        c=C, 
        total_fda_steps=total_fda_steps.numpy(), 
        total_rounds=total_rounds.numpy(), 
        final_accuracy=final_accuracy.numpy(), 
        sketch_width=None, 
        sketch_depth=None
    )

    basic_test_metrics.append(metrics)

    del client_slices_train, federated_dataset, server_model, client_models, total_rounds, total_fda_steps

    """ ----------------- Linear ----------------------------------"""

    # 1. Dataset (we create it again because we want determinism)

    client_slices_train = create_data_for_clients(NUM_CLIENTS)

    federated_dataset = create_federated_data(
        client_slices_train=client_slices_train,
        batch_size=BATCH_SIZE,
        shuffle_buffer_size=int(n/20),
        num_steps_until_rtc_check=NUM_STEPS_UNTIL_RTC_CHECK,
        seed=seed
    )

    # 3. Model init
    
    server_model = tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)

    client_models = [
        tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)
        for _ in range(NUM_CLIENTS)
    ]

    # for `ξ` approximation.
    previous_server_model = tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)


    # 3. Run
    total_rounds, total_fda_steps = run_federated_simulation_linear(
        previous_server_model,
        server_model, 
        client_models, 
        federated_dataset, 
        c,
        num_epochs, 
        theta,
        epoch_max_fda_steps
    )
    
    # 4. Compute metrics 

    final_accuracy = accuracy(server_model, test_dataset)

    metrics = create_metrics_dict(
        fda_name="linear", 
        n=n, 
        d=d, 
        test_size=test_size, 
        seed=seed,
        class_sep=class_sep,
        epochs=NUM_EPOCHS,
        num_clients=NUM_CLIENTS, 
        batch_size=BATCH_SIZE, 
        steps_in_one_fda_step=NUM_STEPS_UNTIL_RTC_CHECK, 
        theta=THETA, 
        c=C, 
        total_fda_steps=total_fda_steps.numpy(), 
        total_rounds=total_rounds.numpy(), 
        final_accuracy=final_accuracy.numpy(), 
        sketch_width=None, 
        sketch_depth=None
    )

    basic_test_metrics.append(metrics)

    del client_slices_train, federated_dataset, server_model, client_models, previous_server_model, \
        total_rounds, total_fda_steps
    
    
    """ --------------- Sketch ----------------------------------"""
    
    # 1. Dataset (we create it again because we want determinism)

    client_slices_train = create_data_for_clients(NUM_CLIENTS)

    federated_dataset = create_federated_data(
        client_slices_train=client_slices_train,
        batch_size=BATCH_SIZE,
        shuffle_buffer_size=int(n/20),
        num_steps_until_rtc_check=NUM_STEPS_UNTIL_RTC_CHECK,
        seed=seed
    )

    # 2. Models init

    server_model = tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)

    client_models = [
        tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)
        for _ in range(NUM_CLIENTS)
    ]

    # 3. Run 

    total_rounds, total_fda_steps = run_federated_simulation_sketch(
        server_model, 
        client_models, 
        federated_dataset, 
        c,
        num_epochs, 
        theta,
        epoch_max_fda_steps,
        ams_sketch, # new
        epsilon,  # new
        num_clients # new
    )

    # 4. Compute metrics 

    final_accuracy = accuracy(server_model, test_dataset)

    metrics = create_metrics_dict(
        fda_name="sketch", 
        n=n, 
        d=d, 
        test_size=test_size,
        seed=seed,
        class_sep=class_sep,
        epochs=NUM_EPOCHS,
        num_clients=NUM_CLIENTS, 
        batch_size=BATCH_SIZE, 
        steps_in_one_fda_step=NUM_STEPS_UNTIL_RTC_CHECK, 
        theta=THETA, 
        c=C, 
        total_fda_steps=total_fda_steps.numpy(), 
        total_rounds=total_rounds.numpy(), 
        final_accuracy=final_accuracy.numpy(), 
        sketch_width=ams_sketch.width, 
        sketch_depth=ams_sketch.depth
    )
    
    basic_test_metrics.append(metrics)
    
    del client_slices_train, federated_dataset, server_model, client_models, total_rounds, total_fda_steps

    return basic_test_metrics
    

In [30]:
from math import sqrt # new

def run_tests():
    """ --------------- Fixed configurations -------------------"""
    NUM_EPOCHS_FIXED = 1 # !
    C_FIXED = 0.01  # model, GOOD
    NUM_STEPS_UNTIL_RTC_CHECK_FIXED = 1 # dataset !  ALONE NO RETRACING
    
    
    # sketch
    
    SKETCH_DEPTH = 7 # new
    SKETCH_WIDTH = 1000 # 1700

    ams_sketch = AmsSketch(
        depth=SKETCH_DEPTH,
        width=SKETCH_WIDTH
    )

    EPSILON = 1. / sqrt(SKETCH_WIDTH)
    
    
    """ --------------- Test configurations -------------------"""
    
    NUM_CLIENTS_LIST = [20] #[10, 20, 35, 50, 75, 100, 250, 500]
    BATCH_SIZE_LIST = [] #[16, 32, 64, 126, 256, 512]
    THETA_LIST = [] #[10., 25., 50., 75., 100., 250., 500., 1000.]
    
    """ --------------- Metrics list ----------------------"""
    
    all_metrics = []
    
    try:
        
    
        """ --------------- Run tests -------------------"""

        # Test /k nodes
        # FIX: BATCH_SIZE, THETA

        BATCH_SIZE_FIXED = 32
        THETA_FIXED = 1.

        for NUM_CLIENTS in NUM_CLIENTS_LIST:
            print()
            print(f"CLIENTS testing. Current NUM_CLIENTS : {NUM_CLIENTS}")

            basic_test_metrics = basic_test(
                NUM_EPOCHS=NUM_EPOCHS_FIXED,
                C=C_FIXED,
                NUM_STEPS_UNTIL_RTC_CHECK=NUM_STEPS_UNTIL_RTC_CHECK_FIXED,
                NUM_CLIENTS=NUM_CLIENTS,
                BATCH_SIZE=BATCH_SIZE_FIXED,
                THETA=THETA_FIXED,
                EPSILON=EPSILON,
                ams_sketch=ams_sketch
            )

            all_metrics.extend(basic_test_metrics)

        # Test /THETA : THETA tests will only retrance Graphs once!
        BATCH_SIZE_FIXED = 32
        NUM_CLIENTS_FIXED = 20

        for THETA in THETA_LIST:
            print()
            print(f"THETA testing. Current THETA : {THETA}")

            basic_test_metrics = basic_test(
                NUM_EPOCHS=NUM_EPOCHS_FIXED,
                C=C_FIXED,
                NUM_STEPS_UNTIL_RTC_CHECK=NUM_STEPS_UNTIL_RTC_CHECK_FIXED,
                NUM_CLIENTS=NUM_CLIENTS_FIXED,
                BATCH_SIZE=BATCH_SIZE_FIXED,
                THETA=THETA,
                EPSILON=EPSILON,
                ams_sketch=ams_sketch
            )

            all_metrics.extend(basic_test_metrics)

        # Test /BATCHS_SIZE : THETA tests will only retrance Graphs once!

        NUM_CLIENTS_FIXED = 20
        THETA_FIXED = 1.

        for BATCH_SIZE in BATCH_SIZE_LIST:
            print()
            print(f"BATCH_SIZE testing. Current BATCH_SIZE : {BATCH_SIZE}")

            basic_test_metrics = basic_test(
                NUM_EPOCHS=NUM_EPOCHS_FIXED,
                C=C_FIXED,
                NUM_STEPS_UNTIL_RTC_CHECK=NUM_STEPS_UNTIL_RTC_CHECK_FIXED,
                NUM_CLIENTS=NUM_CLIENTS_FIXED,
                BATCH_SIZE=BATCH_SIZE,
                THETA=THETA_FIXED,
                EPSILON=EPSILON,
                ams_sketch=ams_sketch
            )

            all_metrics.extend(basic_test_metrics)
    
    except Exception as e:
        print("fuck")
    
    finally:
        return all_metrics

In [None]:
all_metrics = run_tests()


CLIENTS testing. Current NUM_CLIENTS : 20
retracing naive
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
-------------------------------------------------------------------------
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.0132421423]
Delta_i :  [0.0172166415]
Delta_i :  [0.0132447844]
Delta_i :  [0.0138673624]
Delta_i :  [0.0108123785]
Delta_i :  [0.0170950368]
Delta_i :  [0.0139151178]
Delta_i :  [0.0153238988]
Delta_i :  [0.0127497371]
Delta_i :  [0.0133840367]
Delta_i :  [0.0142025929]
Delta_i :  [0.0125393784]
Delta_i :  [0.0142428111]
Delta_i :  [0.013752386]
Delta_i :  [0.0139164021]
Delta_i :  [0.0145554245]
Delta_i :  [0.0117379399]
Delta_i :  [0.0115938196]
Delta_i :  [0.0121672824]
Delta_i :  [0.0151876211]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  Ten

Delta_i :  [0.715491056]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.637547612]
Delta_i :  [0.653401375]
Delta_i :  [0.78245759]
Delta_i :  [0.871678352]
Delta_i :  [0.817663789]
Delta_i :  [0.793392122]
Delta_i :  [0.749571681]
Delta_i :  [0.72384423]
Delta_i :  [0.847777069]
Delta_i :  [0.687362432]
Delta_i :  [0.751714468]
Delta_i :  [0.816705585]
Delta_i :  [0.833373189]
Delta_i :  [0.872193396]
Delta_i :  [0.842604]
Delta_i :  [0.660894]
Delta_i :  [0.733771563]
Delta_i :  [0.800927]
Delta_i :  [0.92299211]
Delta_i :  [0.802193046]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.94412297]
Delta_i :  [0.702837408]
Delta_i :  [0.805993199]
Delta_i :  [0.776063621]
Delta_i :  [0.955982268]
Delta_i :  [0.904014647]
Delta_i :  [0.943044186]
Delta_i :  [0.741892]
Delta_i :  [0.833768964]
Delta_i :  [0.94122386]
Delta_i :  [0.906350255]
Delta_i :  [0.844595969]
Delta_i :  [0.867055416]
D

Delta_i :  [0.39305678]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.473975152]
Delta_i :  [0.505191088]
Delta_i :  [0.482075691]
Delta_i :  [0.602935791]
Delta_i :  [0.449001402]
Delta_i :  [0.62421751]
Delta_i :  [0.506336451]
Delta_i :  [0.495054096]
Delta_i :  [0.517661]
Delta_i :  [0.560524046]
Delta_i :  [0.504615486]
Delta_i :  [0.491282761]
Delta_i :  [0.49213326]
Delta_i :  [0.428490639]
Delta_i :  [0.469659299]
Delta_i :  [0.448795766]
Delta_i :  [0.407371104]
Delta_i :  [0.456768274]
Delta_i :  [0.572565794]
Delta_i :  [0.474007815]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.523776114]
Delta_i :  [0.591929197]
Delta_i :  [0.656843066]
Delta_i :  [0.499791592]
Delta_i :  [0.600729167]
Delta_i :  [0.520374477]
Delta_i :  [0.541600645]
Delta_i :  [0.623432875]
Delta_i :  [0.559459329]
Delta_i :  [0.582802057]
Delta_i :  [0.530916214]
Delta_i :  [0.664083242]
Delta_i :  [0.6

Delta_i :  [0.214134827]
Delta_i :  [0.23588264]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.291152418]
Delta_i :  [0.242843047]
Delta_i :  [0.269167572]
Delta_i :  [0.316464722]
Delta_i :  [0.328142673]
Delta_i :  [0.301411599]
Delta_i :  [0.283855349]
Delta_i :  [0.259582281]
Delta_i :  [0.2726385]
Delta_i :  [0.31677869]
Delta_i :  [0.296632379]
Delta_i :  [0.303212702]
Delta_i :  [0.260621279]
Delta_i :  [0.282496214]
Delta_i :  [0.270962626]
Delta_i :  [0.3397789]
Delta_i :  [0.313709617]
Delta_i :  [0.278423727]
Delta_i :  [0.28262049]
Delta_i :  [0.308536381]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.328617752]
Delta_i :  [0.410011321]
Delta_i :  [0.393731833]
Delta_i :  [0.330838025]
Delta_i :  [0.381565899]
Delta_i :  [0.298457652]
Delta_i :  [0.376394]
Delta_i :  [0.341543496]
Delta_i :  [0.327523857]
Delta_i :  [0.352896333]
Delta_i :  [0.361604035]
Delta_i :  [0.37482

Delta_i :  [0.103264786]
Delta_i :  [0.0881375223]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.151044]
Delta_i :  [0.107837252]
Delta_i :  [0.146262825]
Delta_i :  [0.136270791]
Delta_i :  [0.116498373]
Delta_i :  [0.145086631]
Delta_i :  [0.131046459]
Delta_i :  [0.123043679]
Delta_i :  [0.142238721]
Delta_i :  [0.137208357]
Delta_i :  [0.135768801]
Delta_i :  [0.116104946]
Delta_i :  [0.129934192]
Delta_i :  [0.147447616]
Delta_i :  [0.119408123]
Delta_i :  [0.121466495]
Delta_i :  [0.12093173]
Delta_i :  [0.132991806]
Delta_i :  [0.130165696]
Delta_i :  [0.120287992]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.176632762]
Delta_i :  [0.153398126]
Delta_i :  [0.151913658]
Delta_i :  [0.195113897]
Delta_i :  [0.186223596]
Delta_i :  [0.19321993]
Delta_i :  [0.154106855]
Delta_i :  [0.176767811]
Delta_i :  [0.178460941]
Delta_i :  [0.165280893]
Delta_i :  [0.140388757]
Delta_i :  [0

Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.0345160738]
Delta_i :  [0.0378172845]
Delta_i :  [0.0347509198]
Delta_i :  [0.0278668441]
Delta_i :  [0.0315432698]
Delta_i :  [0.0317537561]
Delta_i :  [0.0455306917]
Delta_i :  [0.0300852861]
Delta_i :  [0.0371393226]
Delta_i :  [0.0349523388]
Delta_i :  [0.0363518223]
Delta_i :  [0.0343226306]
Delta_i :  [0.0406267717]
Delta_i :  [0.0364631861]
Delta_i :  [0.0296276119]
Delta_i :  [0.0378740467]
Delta_i :  [0.0385058858]
Delta_i :  [0.030176213]
Delta_i :  [0.0326439478]
Delta_i :  [0.0282008778]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.058156874]
Delta_i :  [0.0643409044]
Delta_i :  [0.0558852479]
Delta_i :  [0.0564070418]
Delta_i :  [0.0705364421]
Delta_i :  [0.0777840465]
Delta_i :  [0.0557515621]
Delta_i :  [0.0704575703]
Delta_i :  [0.0571541227]
Delta_i :  [0.0795477]
Delta_i :  [0.0729003549]
Delta_i :  [0.0603457093]
Delta_i

Delta_i :  [1.14158869]
Delta_i :  [0.975373685]
Delta_i :  [1.01250744]
Delta_i :  [0.92308253]
Delta_i :  [1.00840294]
Delta_i :  [0.978547573]
Delta_i :  [1.02316117]
Delta_i :  [0.916284502]
Delta_i :  [0.9990381]
Delta_i :  [0.957498]
Delta_i :  [1.00418162]
Delta_i :  [1.09924126]
Delta_i :  [0.973998427]
Delta_i :  [0.945376158]
Delta_i :  [0.969660819]
Delta_i :  [0.916707516]
Delta_i :  [1.05460441]
Delta_i :  [0.879226387]
Delta_i :  [1.23657703]
Delta_i :  [0.91332382]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [1.00511968]
Delta_i :  [1.18057549]
Delta_i :  [1.02469838]
Delta_i :  [1.09658694]
Delta_i :  [1.04323184]
Delta_i :  [1.13681746]
Delta_i :  [1.05536973]
Delta_i :  [1.08578956]
Delta_i :  [1.14715135]
Delta_i :  [1.10445261]
Delta_i :  [1.16654563]
Delta_i :  [1.27717316]
Delta_i :  [1.26264393]
Delta_i :  [1.02054095]
Delta_i :  [1.0880506]
Delta_i :  [1.1005795]
Delta_i :  [1.02110851]
Delta_i :  [1.3555522]
Del

Delta_i :  [0.607052684]
Delta_i :  [0.618643165]
Delta_i :  [0.670110166]
Delta_i :  [0.514918804]
Delta_i :  [0.697354674]
Delta_i :  [0.61622]
Delta_i :  [0.696768641]
Delta_i :  [0.596283615]
Delta_i :  [0.784342349]
Delta_i :  [0.690688431]
Delta_i :  [0.682774365]
Delta_i :  [0.686543226]
Delta_i :  [0.52662611]
Delta_i :  [0.682594657]
Delta_i :  [0.614767075]
Delta_i :  [0.816917]
Delta_i :  [0.566569686]
Delta_i :  [0.555285275]
Delta_i :  [0.639369547]
Delta_i :  [0.641027212]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.702259898]
Delta_i :  [0.799125612]
Delta_i :  [0.747618318]
Delta_i :  [0.767502367]
Delta_i :  [0.723716557]
Delta_i :  [0.717226744]
Delta_i :  [0.637876332]
Delta_i :  [0.798084617]
Delta_i :  [0.577475965]
Delta_i :  [0.704192936]
Delta_i :  [0.69253695]
Delta_i :  [0.757585227]
Delta_i :  [0.583746612]
Delta_i :  [0.767795384]
Delta_i :  [0.724980474]
Delta_i :  [0.647242]
Delta_i :  [0.89218837]
Delta

Delta_i :  [0.371105731]
Delta_i :  [0.437108785]
Delta_i :  [0.427617282]
Delta_i :  [0.391503751]
Delta_i :  [0.416334957]
Delta_i :  [0.483062506]
Delta_i :  [0.375921905]
Delta_i :  [0.411827922]
Delta_i :  [0.468034834]
Delta_i :  [0.440110266]
Delta_i :  [0.435549498]
Delta_i :  [0.36375165]
Delta_i :  [0.46023345]
Delta_i :  [0.403924704]
Delta_i :  [0.407485485]
Delta_i :  [0.382315069]
Delta_i :  [0.380519211]
Delta_i :  [0.373584867]
Delta_i :  [0.395965099]
Delta_i :  [0.364771843]
Delta_i shape :  TensorShape([100, 1])
tf.square(Delta_i) :  TensorShape([100, 1])
Delta_i :  [0.554679334]
Delta_i :  [0.491998941]
Delta_i :  [0.438007623]
Delta_i :  [0.463099957]
Delta_i :  [0.487064749]
Delta_i :  [0.42865479]
Delta_i :  [0.481121749]
Delta_i :  [0.55188024]
Delta_i :  [0.529252112]
Delta_i :  [0.546777368]
Delta_i :  [0.456711203]
Delta_i :  [0.514892519]
Delta_i :  [0.426151156]
Delta_i :  [0.484542817]
Delta_i :  [0.45167169]
Delta_i :  [0.493323147]
Delta_i :  [0.48458349

In [None]:
all_metrics

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(all_metrics)

In [None]:
df.to_csv('test_results/results.csv')

In [None]:
# Load DataFrame from a CSV file
df_from_csv = pd.read_csv('test_results/results.csv')

In [None]:
df_from_csv

1. check which changes produce new graph. We want 
2. steps_naive, steps_linear, ..
3. run_federated_simulation_naive, run_federated_simulation_linear, ...