In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

In [2]:
tf.__version__

'2.11.0'

## Create Binary Classification data with sklearn

In [3]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


n = 10_000 # we want 200_00
d = 1_00


noise_factor = 0.02 # % of the labels are randomly flipped, DEFAULT=0.01
test_size = 0.2 # % of n
# The factor multiplying the hypercube size. Larger values spread out the 
# clusters/classes and make the classification task easier. DEFAULT=1
class_sep = 2
seed = 7 # `None` for no seed

# Create (noisy) testing data for binary classification.
X, y = make_classification(
    n_samples=n, 
    n_features=d,
    n_informative=d,
    n_redundant=0, 
    n_classes=2,
    class_sep=class_sep,
    flip_y=noise_factor,
    random_state=seed
)

# We will work with label values -1, +1 and not 0, +1 (convert)
y[y == 0] = -1

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [4]:
del X, y

In [5]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

# PA-I regressor from sklearn
pa1 = PassiveAggressiveClassifier(C=0.01, loss="hinge", n_jobs=-1)
pa1.fit(X_train, y_train)

accuracy_score(y_test, pa1.predict(X_test))

0.943

## Convert to Tensors

In [6]:
X_train_tensor = tf.constant(X_train, dtype=tf.float32)
del X_train

y_train_tensor = tf.constant(y_train, dtype=tf.float32)
del y_train

X_test_tensor = tf.constant(X_test, dtype=tf.float32)
del X_test

y_test_tensor = tf.constant(y_test, dtype=tf.float32)
del y_test

## Prepare data for Federated Learning

### Create centralized testing dataset

In [7]:
slices_test = (X_test_tensor, y_test_tensor)

In [8]:
def create_tf_dataset_for_testing(batch_size):
    return tf.data.Dataset.from_tensor_slices(slices_test).batch(batch_size)

In [9]:
test_dataset = create_tf_dataset_for_testing(32)

### Slice the Tensors for each Client

We will cut the training data, i.e., (`X_train_tensor`, `y_train_tensor`) to equal parts, each part corresponding to one Client. We want to give the result back as a dictionary with key `client_id` and value the training tensor data.

In [10]:
def create_data_for_clients(num_clients):
    
    client_slices_train = {}

    n_test = int(n - n*test_size)

    for i in range(num_clients):
        # Compute the indices for this client's slice
        start_idx = int(i * n_test / num_clients)
        end_idx = int((i + 1) * n_test / num_clients)

        # Get the slice for this client
        X_client_train = X_train_tensor[start_idx:end_idx]
        y_client_train = y_train_tensor[start_idx:end_idx]
        
        # Combine the slices into a single dataset
        client_slices_train[f'client_{i}'] = (X_client_train, y_client_train)
    
    return client_slices_train

### Create TF friendly data for each Client

Given a Tensor slice (i.e. value of `client_slices_train["client_id"]` we convert it to highly optimized `tf.data.Dataset` to prepare for training.

In [11]:
def create_tf_dataset_for_client(client_tensor_slices, batch_size, shuffle_buffer_size, num_steps_until_rtc_check, seed):
    
        return tf.data.Dataset.from_tensor_slices(client_tensor_slices) \
            .shuffle(buffer_size=shuffle_buffer_size, seed=seed).batch(batch_size) \
            .prefetch(tf.data.AUTOTUNE).take(num_steps_until_rtc_check)

### Create Federated Learning data

In [12]:
def create_federated_data(client_slices_train, batch_size, shuffle_buffer_size, num_steps_until_rtc_check, seed=None):
    
    federated_dataset = [ 
        create_tf_dataset_for_client(client_tensor_slices, batch_size, shuffle_buffer_size, num_steps_until_rtc_check, seed)
        for client, client_tensor_slices in client_slices_train.items()
    ]
    
    return federated_dataset

# Miscallenious

## Metrics

In [13]:
def create_metrics_dict(fda_name, n, d, test_size, seed, class_sep, noise_factor, epochs, num_clients, 
                        batch_size, steps_in_one_fda_step, theta, c, total_fda_steps, 
                        total_rounds, final_accuracy, sketch_width=None, sketch_depth=None):
    metrics = {
            "fda_name" : fda_name,
            "theta" : theta,
            "n" : n,
            "d" : d,
            "seed" : seed,
            "class_sep" : class_sep,
            "noise_factor" : noise_factor,
            "test_size" : test_size,
            "epochs" : epochs,
            "num_clients" : num_clients,
            "batch_size" : batch_size,
            "steps_in_one_fda_step" : steps_in_one_fda_step,
            "sketch_width" : sketch_width,
            "sketch_depth" : sketch_depth,
            "c" : c
        }
    
    # one batch bytes
    metrics["one_sample_bytes"] = 4 * (metrics["d"] + 1)
    
    # training dataset size
    metrics["training_dataset_bytes"] = metrics["one_sample_bytes"] * (1 - metrics["test_size"]) * metrics["n"]
    
    # model bytes
    metrics["model_bytes"] = d * 4
    
    
    # local state bytes (i.e. S_i), for one client
    if fda_name == "naive":
        metrics["local_state_bytes"] = 4
    elif fda_name == "linear":
        metrics["local_state_bytes"] = 8
    else:
        metrics["local_state_bytes"] = sketch_width * sketch_depth * 4 + 4
        
    # accuracy (already computed in parameter)
    metrics["final_accuracy"] = final_accuracy
    
    # total fda steps from algo
    metrics["total_fda_steps"] = total_fda_steps
    
    # total steps (a single fda step might have many normal SGD steps, batch steps)
    metrics["total_steps"] = metrics["total_fda_steps"] * metrics["steps_in_one_fda_step"]
    
    # total rounds in algo. Reason why we differentiate from the hardcoded NUM_ROUNDS
    # is because we might run less rounds in the future (i.e. stop on 10^7 samples idk)
    metrics["total_rounds"] = total_rounds
    
    # bytes exchanged for synchronizing weights (x2 because server sends back)
    metrics["model_bytes_exchanged"] = metrics["total_rounds"] * metrics["model_bytes"] \
        * metrics["num_clients"] * 2
    
    # bytes exchanged for monitoring the variance (communication)
    metrics["monitoring_bytes_exchanged"] = metrics["local_state_bytes"] * metrics["total_fda_steps"] \
        * metrics["num_clients"]
    
    # total communication bytes (for both monitoring and model synchronization)
    metrics["total_communication_bytes"] = metrics["model_bytes_exchanged"] + metrics["monitoring_bytes_exchanged"]
    
    # total seen dataset bytes (across all learning, i.e., all clients)
    metrics["trained_in_bytes"] = metrics["batch_size"] * metrics["one_sample_bytes"] \
        * metrics["total_steps"] * metrics["num_clients"]
    
    return metrics

## Variance

In [14]:
# = client_num
w_spec = tf.TensorSpec(shape=(20, d, 1), dtype=tf.float32)

@tf.function(input_signature=[w_spec, w_spec])
def variance(w_t, w_sync):
    # w_t , w_sync tensors with shape=(NUM_CLIENTS, d, 1)
    
    # tensor with shape=(NUM_CLIENTS, d, 1)
    diff = w_t - w_sync
    
    # tensor with shape=(NUM_CLIENTS, 1) , For each client ||w_i_t - w_t||^2
    dot = tf.reduce_sum(tf.square(diff), axis=1)
    
    # Variance shape=() , scalar
    var = tf.reduce_mean(dot)
    
    return var

## Accuracy testing

In [15]:
@tf.function
def accuracy(model, dataset):
    
    @tf.function
    def _batch_accuracy(model, batch):
        x_batch, y_batch = batch
        # from shape (d,) make it (d,1)
        y_batch = tf.expand_dims(y_batch, axis=1)

        # dot(w, x) for the batch (each instance of x in x_batch) with with shape=(batchsize, 1)
        weights_dot_x_batch = tf.matmul(x_batch, model)

        # Prediction batch with shape=(batchsize, 1)
        y_pred_batch = tf.sign(weights_dot_x_batch)

        accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred_batch, y_batch), tf.float32))

        return accuracy
    
    # We take advantage of AutoGraph (convert Python code to TensorFlow-compatible graph code automatically)
    acc, num_batches = 0., 0.
    for batch in dataset:
        acc += _batch_accuracy(model, batch)
        num_batches += 1
        
    acc = acc / num_batches
    
    return acc

## PA-Classiers (binary classification)

![PA](images/PA_binary_classifiers.png)

In [16]:
@tf.function
def client_train(model, dataset, C):

    @tf.function
    def _train_on_batch(model, batch, C):

        x_batch, y_batch = batch
        
        # from shape (d,) make it (d,1)
        y_batch = tf.expand_dims(y_batch, axis=1)

        # dot(w, x) for the batch (each instance of x in x_batch) with with shape=(batchsize, 1)
        weights_dot_x_batch = tf.matmul(x_batch, model)

        # Prediction batch with shape=(batchsize, 1)
        y_pred_batch = tf.sign(weights_dot_x_batch)

        # Suffer loss for each prediction (of instance) in the batch with shape=(batchsize,1)
        loss_batch = tf.maximum(0., 1. - tf.multiply(y_batch, weights_dot_x_batch))

        # shape=(batchsize,1) where each instance is ||x||^2, x in x_batch
        norm_batch = tf.expand_dims(tf.reduce_sum(tf.square(x_batch), axis=1), axis=1)
        
        # PA-1 : Learning rate t for each instance x, with shape=(batchsize,1)
        t_batch = tf.maximum(C, tf.divide(loss_batch, norm_batch))

        # each instance is y*t*x, where y,t scalars and x in x_batch. shape=(batchsize,d)
        t_y_x_batch = tf.multiply(t_batch, tf.multiply(y_batch, x_batch))

        # !!!! Update with mean t*y*x
        t_y_x_update = tf.expand_dims(tf.reduce_mean(t_y_x_batch, axis=0) ,axis=1)

        # Update
        model.assign_add(t_y_x_update)
    
    for batch in dataset:
        _train_on_batch(model, batch, C)
    

# Functional Dynamic Averaging

We follow the Functional Dynamic Averaging (FDA) scheme. Let the mean model be

$$ \overline{w_t} = \frac{1}{k} \sum_{i=1}^{k} w_t^{(i)} $$

where $ w_t^{(i)} $ is the model at time $ t $ in some round in the $i$-th learner.

Local models are trained independently and cooperatively and we want to monitor the Round Terminating Conditon (**RTC**):

$$ \frac{1}{k} \sum_{i=1}^{k} \lVert w_t^{(i)} - \overline{w_t} \rVert_2^2  \leq \Theta $$

where the left-hand side is the **model variance**, and threshold $\Theta$ is a hyperparameter of the FDA, defined at the beginning of the round; it may change at each round. When the monitoring logic cannot guarantee the validity of RTC, the round terminates. All local models are pulled into `tff.SERVER`, and $\bar{w_t}$ is set to their average. Then, another round begins.

### Monitoring the RTC

FDA monitors the RTC by applying techniques from Functionary [Functional Geometric Averaging](http://users.softnet.tuc.gr/~minos/Papers/edbt19.pdf). We first restate the problem of monitoring RTC into the standard distributed stream monitoring formulation. Let

$$ S(t) =  \frac{1}{k} \sum_{i=1}^{k} S_i(t) $$

where $ S(t) \in \mathbb{R}^n $ be the "global state" of the system and $ S_i(t) \in \mathbb{R}^n $ the "local states". The goal is to monitor the threshold condition on the global state in the form $ F(S(t)) \leq \Theta $ where $ F : \mathbb{R}^n \to \mathbb{R} $ a non-linear function. Let

$$ \Delta_t^{(i)} = w_t^{(i)} - w_{t_0}^{(i)} $$

be the update at the $ i $-th learner, that is, the change to the local model at time $t$ since the beginning of the current round at time $t_0$. Let the average update be

$$ \overline{\Delta_t} = \frac{1}{k} \sum_{i=1}^{k} \Delta_t^{(i)} $$

it follows that the variance can be written as

$$ \frac{1}{k} \sum_{i=1}^{k} \lVert w_t^{(i)} - \overline{w_t} \rVert_2^2 = \Big( \frac{1}{k} \sum_{i=1}^{k} \lVert \Delta_t^{(i)} \rVert_2^2 \Big) - \lVert \overline{\Delta_t} \rVert_2^2 $$

So, conceptually, if we define
$$ S_i(t) = \begin{bmatrix}
           \lVert \Delta_t^{(i)} \rVert_2^2 \\
           \Delta_t^{(i)}
         \end{bmatrix} \quad \text{and} \quad
         F(\begin{bmatrix}
           v \\
           \bf{x}
         \end{bmatrix}) = v - \lVert \bf{x} \rVert_2^2 $$

The RTC is equivalent to condition $$ F(S(t)) \leq \Theta $$

In [17]:
import sys

## 1️⃣ Naive FDA

In the naive approach, we eliminate the update vector from the local state (i.e. recuce the dimension to 0). Define local state as

$$ S_i(t) = \lVert \Delta_t^{(i)} \rVert_2^2 \in \mathbb{R}$$ 

and the identity function

$$ F(v) = v $$

It is trivial that $ F(S(t)) \leq \Theta $ implies the RTC.

In [18]:
@tf.function
def steps_naive(last_sync_model, model, dataset, C):
    # number of steps depend on `.take()` from `dataset`
    client_train(model, dataset, C)
    
    Delta_i = model - last_sync_model
    
    Delta_i_euc_norm_squared = tf.reduce_sum(tf.square(Delta_i), axis=0) # ||D(t)_i||^2
    
    return Delta_i_euc_norm_squared

### Training Loop

In [19]:
def F_naive(S):
    return S

In [20]:
@tf.function
def run_federated_simulation_naive(server_model, client_models, federated_dataset, C,
                                   num_epochs, theta, epoch_fda_steps):
    
    print("retracing naive")
    
    total_rounds = 0
    total_fda_steps = 0
    
    round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
    epoch_count = tf.constant(0, shape=(), dtype=tf.int32)
    
    S = tf.constant(0., shape=(), dtype=tf.float32)
    
    while epoch_count < num_epochs:
        
        while F_naive(S) <= theta:
            S_i_clients = []

            # client steps (number depends on `federated_dataset`, i.e., `.take(num)`)
            for client_model, client_dataset in zip(client_models, federated_dataset):
                Delta_i_euc_norm_squared = steps_naive(server_model, client_model, client_dataset, C)
                S_i_clients.append(Delta_i_euc_norm_squared)
                
            S = tf.reduce_mean(S_i_clients)
            
            round_fda_steps += 1
            total_fda_steps += 1
            
            if round_fda_steps == epoch_fda_steps:
                epoch_count += 1
                round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
                
                if epoch_count == num_epochs:
                    break
        
        
        """------------------------------test--------------------------------------------"""
        #tf.print("\n sync : ", output_stream=sys.stdout)
        #Delta_i_clients = [tf.subtract(client_model, server_model) for client_model in client_models] #test
        #testing_approx_0 = tf.reduce_sum(tf.square(tf.reduce_mean(Delta_i_clients, axis=0)), axis=0) #test
        #for client_model in client_models:
        #        tf.print("acc : ", accuracy(client_model, test_dataset), output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        # server average
        server_model.assign(tf.reduce_mean(client_models, axis=0))
        
        """------------------------------test--------------------------------------------"""
        #tf.print("acc (after) : ", accuracy(server_model, test_dataset), output_stream=sys.stdout)
        #tf.print("Naive Epoch count: ", epoch_count, " Round fda steps: ", round_fda_steps, " Epoch fda steps:", epoch_fda_steps, output_stream=sys.stdout)
        #tf.print("Naive Epoch count: ", epoch_count, output_stream=sys.stdout)
        #actual_var = variance(client_models, [server_model]*len(client_models)) #test
        #tf.print("Est var: ", S, " Actual var: ", actual_var, " Total fda steps: ", total_fda_steps, output_stream=sys.stdout)
        #tf.print("\n", output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        # reset variance approx
        S = tf.constant(0., shape=(), dtype=tf.float32)

        # synchronize clients
        for client_model in client_models:
            client_model.assign(server_model)
            
        total_rounds += 1
    
    return total_rounds, total_fda_steps

## 2️⃣ Linear FDA

In the linear case, we reduce the update vector to a scalar, $ \xi \Delta_t^{(i)} \in \mathbb{R}$, where $ \xi $ is any unit vector.

Define the local state to be 

$$ S_i(t) = \begin{bmatrix}
           \lVert \Delta_t^{(i)} \rVert_2^2 \\
           \xi \Delta_t^{(i)}
         \end{bmatrix} \in \mathbb{R}^2 $$

Also, define 

$$ F(v, x) = v - x^2 $$

The RTC is equivalent to condition 

$$ F(S(t)) \leq \Theta $$

A random choice of $ \xi $ is likely to perform poorly (terminate round prematurely), as it wil likely be close to orthogonal to $ \overline{\Delta_t} $. A good choice would be a vector $ \xi $ correlated to $ \overline{\Delta_t} $. A heuristic choice is to take $ \overline{\Delta_{t_0}} $ (after scaling it to norm 1), i.e., the update vector right before the current round started. All nodes can estimate this without communication, as $ \overline{w_{t_0}} - \overline{w_{t_{-1}}} $, the difference of the last two models pushed by the Server. Hence, 

$$ \xi = \frac{\overline{w_{t_0}} - \overline{w_{t_{-1}}}}{\lVert \overline{w_{t_0}} - \overline{w_{t_{-1}}} \rVert_2} $$

In [21]:
@tf.function
def ksi_unit_fn(w_t0, w_tminus1):
    
    if tf.reduce_all(tf.equal(w_t0, w_tminus1)):
        # if equal then ksi becomes a random vector (will only happen in round 1)
        ksi = tf.random.normal(shape=w_t0.shape)
    else:
        ksi = w_t0 - w_tminus1

    # Normalize and return
    return tf.divide(ksi, tf.norm(ksi))

In [22]:
@tf.function
def steps_linear(model_tminus, model_t0, model, dataset, C):
    # number of steps depend on `.take()` from `dataset`
    client_train(model, dataset, C)
    
    Delta_i = model - model_t0
    
    #||D(t)_i||^2 , shape = (1,) 
    Delta_i_euc_norm_squared = tf.reduce_sum(tf.square(Delta_i), axis=0)
    
    # heuristic unit vector ksi
    ksi = ksi_unit_fn(model_t0, model_tminus)
    
    # ksi * Delta_i (* is dot) , shape = ()
    ksi_Delta_i = tf.reduce_sum(tf.multiply(ksi, Delta_i))
    
    return Delta_i_euc_norm_squared, ksi_Delta_i

### Training Loop

In [23]:
def F_linear(S_1, S_2):
    return S_1 - S_2**2

In [24]:
@tf.function
def run_federated_simulation_linear(previous_server_model, server_model,
                                    client_models, federated_dataset, C, 
                                    num_epochs, theta, epoch_fda_steps):
    
    print("retracing linear")
    
    total_rounds = 0
    total_fda_steps = 0
    
    round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
    epoch_count = tf.constant(0, shape=(), dtype=tf.int32)
    
    S_1 = tf.constant(0., shape=(), dtype=tf.float32)
    S_2 = tf.constant(0., shape=(), dtype=tf.float32)
    
    while epoch_count < num_epochs:
        
        while F_linear(S_1, S_2) <= theta:
            euc_norm_squared_clients = []
            ksi_delta_clients = []

            # client steps (number depends on `federated_dataset`, i.e., `.take(num)`)
            for client_model, client_dataset in zip(client_models, federated_dataset):
                
                Delta_i_euc_norm_squared, ksi_Delta_i  = steps_linear(
                    previous_server_model, server_model, client_model, client_dataset, C
                )
                
                euc_norm_squared_clients.append(Delta_i_euc_norm_squared)
                ksi_delta_clients.append(ksi_Delta_i)
                
            S_1 = tf.reduce_mean(euc_norm_squared_clients)
            S_2 = tf.reduce_mean(ksi_delta_clients)
            
            round_fda_steps += 1
            total_fda_steps += 1
            
            if round_fda_steps == epoch_fda_steps:
                epoch_count += 1
                round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
                
                if epoch_count == num_epochs:
                    break
        
        # last server model (previous sync)
        previous_server_model.assign(server_model)
        
        """------------------------------test--------------------------------------------"""
        #tf.print("\n sync : ", output_stream=sys.stdout)
        #Delta_i_clients = [tf.subtract(client_model, server_model) for client_model in client_models] #test
        #testing_approx_0 = tf.reduce_sum(tf.square(tf.reduce_mean(Delta_i_clients, axis=0)), axis=0) #test
        #for client_model in client_models:
        #        tf.print("acc : ", accuracy(client_model, test_dataset), output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        # server average
        server_model.assign(tf.reduce_mean(client_models, axis=0))
        
        """------------------------------test--------------------------------------------"""
        #tf.print("acc (after) : ", accuracy(server_model, test_dataset), output_stream=sys.stdout)
        #tf.print("Linear Epoch count: ", epoch_count, " Round fda steps: ", round_fda_steps, " Epoch fda steps:", epoch_fda_steps, output_stream=sys.stdout)
        #tf.print("Linear Epoch count: ", epoch_count, output_stream=sys.stdout)
        #actual_var = variance(client_models, [server_model]*len(client_models)) #test
        #tf.print("Est left: ", S_1, " Est S_2: ", S_2**2, "Est var: ", S_1-S_2**2, " Actual var: ", actual_var, " Total fda steps: ", total_fda_steps, output_stream=sys.stdout)
        #tf.print("\n", output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        
        # reset variance approx
        S_1 = tf.constant(0., shape=(), dtype=tf.float32)
        S_2 = tf.constant(0., shape=(), dtype=tf.float32)

        # synchronize clients
        for client_model in client_models:
            client_model.assign(server_model)
            
        total_rounds += 1
    
    return total_rounds, total_fda_steps

## 3️⃣ Sketch FDA

An optimal estimator for $ \lVert \overline{\Delta_t} \rVert_2^2  $ can be obtained by employing AMS sketches. An AMS sketch of a vector $ v \in \mathbb{R}^M $ is a $ d \times m $ real matrix

$$ \Xi = \text{sk}(v) = \begin{bmatrix}
           \Xi_1 \\
           \Xi_2 \\
           \vdots \\
           \Xi_d 
         \end{bmatrix} $$
         
where $ d \cdot m \ll M$. Operator sk($ \cdot $) is linear, i.e., let $a, b \in \mathbb{R}$ and $v_1, v_2 \in \mathbb{R}^N$ then 

$$ \text{sk}(a v_1 + b v_2) = a \; \text{sk}(v_1) + b \; \text{sk}(v_2)  $$

Also, sk($ v $) can be computed in $ \mathcal{O}(dN) $ steps.

The interesting property of AMS sketches is that the function 

$$ M(sk(\textbf{v})) = \underset{i=1,...,d}{\text{median}} \; \lVert \boldsymbol{\Xi}_i \rVert_2^2  $$ 

is an excellent estimator of the Euclidean norm of **v** (within relative $\epsilon$-error):

$$ M(sk(\textbf{v})) \; \in (1 \pm \epsilon) \lVert \textbf{v} \rVert_2^2 \; \; \text{with probability at least} \; (1-\delta) $$

where $m = \mathcal{O}(\frac{1}{\epsilon^2})$ and $d = \mathcal{O}(\log \frac{1}{\delta})$
            
Moreover, let $\boldsymbol{\Xi} \in \mathbb{R}^{d \times m}$ and $ k \in \mathbb{R}$. It can be proven that

$$ M( \frac{1}{k} \boldsymbol{\Xi}) = \frac{1}{k^2} M(\boldsymbol{\Xi}) $$

Let's investigate a little further on how this helps us. The $i$-th client computes $ sk(\Delta_t^{(i)}) $ and sends it to the server. Notice

$$ M\big(sk(\Delta_t^{(1)}) + sk(\Delta_t^{(2)}) + ... + sk(\Delta_t^{(k)}) \big) = M\Big( \text{sk}\big( \sum_{i=1}^{k} \Delta_t^{(i)} \big) \Big)$$

Remember that

$$ \overline{\boldsymbol{\Delta}}_t = \frac{1}{k} \sum_{i=1}^{k} \boldsymbol{\Delta}_t^{(i)} $$

Then
            
$$ M\Big( \text{sk}\big( \overline{\boldsymbol{\Delta}}_t \big) \Big) = M\Big( \text{sk}\big( \frac{1}{k} \sum_{i=1}^{k} \boldsymbol{\Delta}_t^{(i)} \big) \Big) = \frac{1}{k^2} M\Big( \text{sk}\big( \sum_{i=1}^{k} \boldsymbol{\Delta}_t^{(i)} \big) \Big) $$


Which means that 

$$ \frac{1}{k^2} M\Big( \text{sk}\big( \sum_{i=1}^{k} \boldsymbol{\Delta}_t^{(i)} \big) \Big) \in (1 \pm \epsilon) \lVert \overline{\boldsymbol{\Delta}}_t \rVert_2^2 \; \; \text{w.p. at least} \; (1-\delta) $$

In the monitoring process it is essential that we do not overestimate $ \lVert \overline{\Delta_t} \rVert_2^2 $ because we would then underestimate the variance which would potentially result in actual varience exceeding $ \Theta$ without us noticing it. With this in mind,

$$ \frac{1}{k^2} M\Big( \text{sk}\big( \sum_{i=1}^{k} \Delta_t^{(i)} \big) \Big) \leq (1+\epsilon) \lVert \overline{\Delta_t} \rVert_2^2 \quad \text{with probability at least} \; (1-\delta)$$

Which means

$$ \frac{1}{(1+\epsilon)} \frac{1}{k^2} M\Big( \text{sk}\big( \sum_{i=1}^{k} \Delta_t^{(i)} \big) \Big) \leq \lVert \overline{\Delta_t} \rVert_2^2 \quad \text{with probability at least} \; (1-\delta)$$

Hence, the Server's estimation of $ \lVert \overline{\Delta_t} \rVert_2^2 $ is

$$ \frac{1}{(1+\epsilon)} \frac{1}{k^2} M\Big( sk(\Delta_t^{(1)}) + sk(\Delta_t^{(2)}) + ... + sk(\Delta_t^{(k)}) \big) \Big) $$

Define the local state to be 

$$ S_i(t) = \begin{bmatrix}
           \lVert \Delta_t^{(i)} \rVert_2^2 \\
           sk(\Delta_t^{(i)})
         \end{bmatrix} \in \mathbb{R}^{1+d \times m} \quad \text{and} \quad
         F(\begin{bmatrix}
           v \\
           \Xi
         \end{bmatrix}) = v - \frac{1}{(1+\epsilon)} \frac{1}{k^2} M(\Xi) \quad \text{where} \quad \Xi = \sum_{i=1}^{k} sk(\Delta_t^{(i)}) $$

It follows that $ F(S(t)) \leq \Theta $ implies that the variance is less or equal to $ \Theta $ with probability at least $ 1-\delta $.


## AMS sketch

We use `ExtensionType` which is the way to go in order to avoid unecessary graph retracing when passing around `AmsSketch` type 'objects'.

In [25]:
from tensorflow.experimental import ExtensionType


class AmsSketch(ExtensionType):
    depth: int
    width: int
    F: tf.Tensor
        
    def __init__(self, depth=7, width=1500):
        self.depth = depth
        self.width = width
        self.F = tf.random.uniform(shape=(6, depth), minval=0, maxval=(1 << 31) - 1, dtype=tf.int32)

        
    @tf.function
    def hash31(self, x, a, b):

        r = a * x + b
        fold = tf.bitwise.bitwise_xor(tf.bitwise.right_shift(r, 31), r)
        return tf.bitwise.bitwise_and(fold, 2147483647)
    
    
    @tf.function
    def tensor_hash31(self, x, a, b): # GOOD
        """ Assumed that x is tensor shaped (d,) , i.e., a vector (for example, indices, i.e., tf.range(d)) """

        # Reshape x to have an extra dimension, resulting in a shape of (k, 1)
        x_reshaped = tf.expand_dims(x, axis=-1)

        # shape=(`v_dim`, 7)
        r = tf.multiply(a, x_reshaped) + b

        fold = tf.bitwise.bitwise_xor(tf.bitwise.right_shift(r, 31), r)
        
        return tf.bitwise.bitwise_and(fold, 2147483647)
    
    
    @tf.function
    def tensor_fourwise(self, x):
        """ Assumed that x is tensor shaped (d,) , i.e., a vector (for example, indices, i.e., tf.range(d)) """
        # 1st use the tensor hash31
        in1 = self.tensor_hash31(x, self.F[2], self.F[3])  # (`x_dim`, 7)
        
        # 2nd (notice we swap the first two params, no change really)
        in2 = self.tensor_hash31(x, in1, self.F[4])  # (`x_dim`, 7)
        
        in3 = self.tensor_hash31(x, in2, self.F[5])  # (`x_dim`, 7)
        
        in4 = tf.bitwise.bitwise_and(in3, 32768)  # (`x_dim`, 7)
        
        return 2 * (tf.bitwise.right_shift(in4, 15)) - 1  # (`x_dim`, 7)
        
        
    @tf.function
    def fourwise(self, x):

        result = 2 * (tf.bitwise.right_shift(tf.bitwise.bitwise_and(self.hash31(self.hash31(self.hash31(x, self.F[2], self.F[3]), x, self.F[4]), x, self.F[5]), 32768), 15)) - 1
        return result
    
    
    @tf.function
    def sketch_for_vector(self, v):
        """ Extremely efficient computation of sketch with only using tensors. """
        
        sketch = tf.zeros(shape=(self.depth, self.width), dtype=tf.float32)
        
        len_v = v.shape[0]
        
        pos_tensor = self.tensor_hash31(tf.range(len_v), self.F[0], self.F[1]) % self.width
        
        v_expand = tf.expand_dims(v, axis=-1)
        
        deltas_tensor = tf.multiply(tf.cast(self.tensor_fourwise(tf.range(len_v)), dtype=tf.float32), v_expand)
        
        range_tensor = tf.range(self.depth)
        
        # Expand dimensions to create a 2D tensor with shape (1, depth)
        range_tensor_expanded = tf.expand_dims(range_tensor, 0)

        # Use tf.tile to repeat the tensor `len_v` times
        repeated_range_tensor = tf.tile(range_tensor_expanded, [len_v, 1])
        
        # shape=(`len_v`, 7, 2)
        indices = tf.stack([repeated_range_tensor, pos_tensor], axis=-1)
        
        sketch = tf.tensor_scatter_nd_add(sketch, indices, deltas_tensor)
        
        return sketch
    
    
    @tf.function
    def sketch_for_vector2(self, v):
        """ Bad implementation for tensorflow. """

        sketch = tf.zeros(shape=(self.depth, self.width), dtype=tf.float32)

        for i in tf.range(tf.shape(v)[0], dtype=tf.int32):
            pos = self.hash31(i, self.F[0], self.F[1]) % self.width
            delta = tf.cast(self.fourwise(i), dtype=tf.float32) * v[i]
            indices_to_update = tf.stack([tf.range(self.depth, dtype=tf.int32), pos], axis=1)
            sketch = tf.tensor_scatter_nd_add(sketch, indices_to_update, delta)

        return sketch
        
    
    @staticmethod
    @tf.function
    def estimate_euc_norm_squared(sketch):

        @tf.function
        def _median(v):
            """ Median of tensor `v` with shape=(n,). Note: Suboptimal O(nlogn) but it's ok bcz n = `depth`"""
            length = tf.shape(v)[0]
            sorted_v = tf.sort(v)
            middle = length // 2

            return tf.cond(
                tf.equal(length % 2, 0),
                lambda: (sorted_v[middle - 1] + sorted_v[middle]) / 2.0,
                lambda: sorted_v[middle]
            )

        return _median(tf.reduce_sum(tf.square(sketch), axis=1))

In [26]:
@tf.function
def steps_sketch(last_sync_model, model, dataset, C, ams_sketch):
    # number of steps depend on `.take()` from `dataset`
    client_train(model, dataset, C)
    
    Delta_i = model - last_sync_model
    
    #||D(t)_i||^2 , shape = (1,) 
    Delta_i_euc_norm_squared = tf.reduce_sum(tf.square(Delta_i), axis=0)
    
    # sketch approx
    sketch = ams_sketch.sketch_for_vector(tf.reshape(Delta_i, shape=[-1]))
    
    return Delta_i_euc_norm_squared, sketch

### Training Loop

In [27]:
def F_sketch(S_1, S_2, epsilon, num_clients):
    """ `S_1` is mean || ||^2 as usual, S_2 is the `Ξ` as defined in the theoretical analysis above """
    one = tf.constant(1., shape=(), dtype=tf.float32)
    
    return S_1 - (one / (one + epsilon)) * (one / num_clients**2) * AmsSketch.estimate_euc_norm_squared(S_2)

In [28]:
@tf.function
def run_federated_simulation_sketch(server_model, client_models, federated_dataset, C,
                                    num_epochs, theta, epoch_fda_steps, 
                                    ams_sketch, epsilon, num_clients):
    
    print("retracing sketch")
    
    total_rounds = 0
    total_fda_steps = 0
    
    round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
    epoch_count = tf.constant(0, shape=(), dtype=tf.int32)
    
    S_1 = tf.constant(0., shape=(), dtype=tf.float32)
    S_2 = tf.zeros(shape=(ams_sketch.depth, ams_sketch.width), dtype=tf.float32)
    
    while epoch_count < num_epochs:
        
        while F_sketch(S_1, S_2, epsilon, num_clients) <= theta:
            euc_norm_squared_clients = []
            sketch_clients = []

            # client steps (number depends on `federated_dataset`, i.e., `.take(num)`)
            for client_model, client_dataset in zip(client_models, federated_dataset):
                
                Delta_i_euc_norm_squared, sketch  = steps_sketch(
                    server_model, client_model, client_dataset, C, ams_sketch
                )
                
                euc_norm_squared_clients.append(Delta_i_euc_norm_squared)
                sketch_clients.append(sketch)
                
            S_1 = tf.reduce_mean(euc_norm_squared_clients)
            S_2 = tf.reduce_sum(sketch_clients, axis=0)  # shape=(`depth`, width`). See `Ξ` in theoretical analysis
            
            round_fda_steps += 1
            total_fda_steps += 1
            
            if round_fda_steps == epoch_fda_steps:
                epoch_count += 1
                round_fda_steps = tf.constant(0, shape=(), dtype=tf.int32)
                
                if epoch_count == num_epochs:
                    break
        
        """------------------------------test--------------------------------------------"""
        #tf.print("\n sync : ", output_stream=sys.stdout)
        #Delta_i_clients = [tf.subtract(client_model, server_model) for client_model in client_models] #test
        #testing_approx_0 = tf.reduce_sum(tf.square(tf.reduce_mean(Delta_i_clients, axis=0)), axis=0) #test
        #for client_model in client_models:
        #        tf.print("acc : ", accuracy(client_model, test_dataset), output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        # server average
        server_model.assign(tf.reduce_mean(client_models, axis=0))
        
        """------------------------------test--------------------------------------------"""
        #tf.print("acc (after) : ", accuracy(server_model, test_dataset), output_stream=sys.stdout)
        #tf.print("Sketch Epoch count: ", epoch_count, " Round fda steps: ", round_fda_steps, " Epoch fda steps:", epoch_fda_steps, output_stream=sys.stdout)
        #tf.print("Sketch Epoch count: ", epoch_count, output_stream=sys.stdout)
        #actual_var = variance(client_models, [server_model]*len(client_models)) #test
        #tf.print("Est left: ", S_1, "Est var: ", F_sketch(S_1, S_2, epsilon, num_clients), " Actual var: ", actual_var, " Total fda steps: ", total_fda_steps, output_stream=sys.stdout)
        #tf.print("\n", output_stream=sys.stdout)
        """------------------------------test--------------------------------------------"""
        
        
        # reset variance approx
        S_1 = tf.constant(0., shape=(), dtype=tf.float32)
        S_2 = tf.zeros(shape=(ams_sketch.depth, ams_sketch.width), dtype=tf.float32)

        # synchronize clients
        for client_model in client_models:
            client_model.assign(server_model)
            
        total_rounds += 1
    
    return total_rounds, total_fda_steps

# Simulation tests

In [29]:
def basic_test(NUM_EPOCHS, C, NUM_STEPS_UNTIL_RTC_CHECK, NUM_CLIENTS,
               BATCH_SIZE, THETA, EPSILON, ams_sketch, client_slices_train):
    
    """ One test for Naive,Linear,Sketch. Returns metrics """
    
    c = tf.constant(C, shape=(), dtype=tf.float32)
    num_epochs = tf.constant(NUM_EPOCHS, shape=(), dtype=tf.int32)
    theta = tf.constant(THETA, shape=(), dtype=tf.float32)
    
    # for sketch
    epsilon = tf.constant(EPSILON, shape=(), dtype=tf.float32) # new
    num_clients = tf.constant(float(NUM_CLIENTS), shape=(), dtype=tf.float32) # new
    
    
    epoch_client_batches = ((1-test_size)*n / BATCH_SIZE) / NUM_CLIENTS
    epoch_max_fda_steps = epoch_client_batches / NUM_STEPS_UNTIL_RTC_CHECK
    epoch_max_fda_steps = tf.constant(int(epoch_max_fda_steps), shape=(), dtype=tf.int32)
    
    basic_test_metrics = []
    
    """ --------------- Naive ----------------------------------"""
        
    # 1. tf.data.Dataset (we create it again because we want determinism)

    federated_dataset = create_federated_data(
        client_slices_train=client_slices_train,
        batch_size=BATCH_SIZE,
        shuffle_buffer_size=int(n/NUM_CLIENTS),
        num_steps_until_rtc_check=NUM_STEPS_UNTIL_RTC_CHECK,
        seed=seed
    )

    # 2. Models init
    
    server_model = tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)

    client_models = [
        tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)
        for _ in range(NUM_CLIENTS)
    ]

    # 3. Run 
    
    total_rounds, total_fda_steps = run_federated_simulation_naive(
        server_model, 
        client_models, 
        federated_dataset, 
        c,
        num_epochs, 
        theta,
        epoch_max_fda_steps
    )

    # 4. Compute metrics 
    
    final_accuracy = accuracy(server_model, test_dataset)

    metrics = create_metrics_dict(
        fda_name="naive", 
        n=n, 
        d=d, 
        test_size=test_size,
        seed=seed,
        class_sep=class_sep,
        noise_factor=noise_factor,
        epochs=NUM_EPOCHS,
        num_clients=NUM_CLIENTS, 
        batch_size=BATCH_SIZE, 
        steps_in_one_fda_step=NUM_STEPS_UNTIL_RTC_CHECK, 
        theta=THETA, 
        c=C, 
        total_fda_steps=total_fda_steps.numpy(), 
        total_rounds=total_rounds.numpy(), 
        final_accuracy=final_accuracy.numpy(), 
        sketch_width=None, 
        sketch_depth=None
    )

    basic_test_metrics.append(metrics)

    del federated_dataset, server_model, client_models, total_rounds, total_fda_steps

    """ ----------------- Linear ----------------------------------"""

    # 1. tf.data.Dataset (we create it again because we want determinism)

    federated_dataset = create_federated_data(
        client_slices_train=client_slices_train,
        batch_size=BATCH_SIZE,
        shuffle_buffer_size=int(n/NUM_CLIENTS),
        num_steps_until_rtc_check=NUM_STEPS_UNTIL_RTC_CHECK,
        seed=seed
    )

    # 3. Model init
    
    server_model = tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)

    client_models = [
        tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)
        for _ in range(NUM_CLIENTS)
    ]

    # for `ξ` approximation.
    previous_server_model = tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)


    # 3. Run
    total_rounds, total_fda_steps = run_federated_simulation_linear(
        previous_server_model,
        server_model, 
        client_models, 
        federated_dataset, 
        c,
        num_epochs, 
        theta,
        epoch_max_fda_steps
    )
    
    # 4. Compute metrics 

    final_accuracy = accuracy(server_model, test_dataset)

    metrics = create_metrics_dict(
        fda_name="linear", 
        n=n, 
        d=d, 
        test_size=test_size, 
        seed=seed,
        class_sep=class_sep,
        noise_factor=noise_factor,
        epochs=NUM_EPOCHS,
        num_clients=NUM_CLIENTS, 
        batch_size=BATCH_SIZE, 
        steps_in_one_fda_step=NUM_STEPS_UNTIL_RTC_CHECK, 
        theta=THETA, 
        c=C, 
        total_fda_steps=total_fda_steps.numpy(), 
        total_rounds=total_rounds.numpy(), 
        final_accuracy=final_accuracy.numpy(), 
        sketch_width=None, 
        sketch_depth=None
    )

    basic_test_metrics.append(metrics)

    del federated_dataset, server_model, client_models, previous_server_model, total_rounds, total_fda_steps
    
    
    """ --------------- Sketch ----------------------------------"""
    
    # 1. tf.data.Dataset (we create it again because we want determinism)

    federated_dataset = create_federated_data(
        client_slices_train=client_slices_train,
        batch_size=BATCH_SIZE,
        shuffle_buffer_size=int(n/NUM_CLIENTS),
        num_steps_until_rtc_check=NUM_STEPS_UNTIL_RTC_CHECK,
        seed=seed
    )

    # 2. Models init

    server_model = tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)

    client_models = [
        tf.Variable(tf.zeros(shape=(d, 1)), trainable=True, name='weights', dtype=tf.float32)
        for _ in range(NUM_CLIENTS)
    ]

    # 3. Run 

    total_rounds, total_fda_steps = run_federated_simulation_sketch(
        server_model, 
        client_models, 
        federated_dataset, 
        c,
        num_epochs, 
        theta,
        epoch_max_fda_steps,
        ams_sketch, # new
        epsilon,  # new
        num_clients # new
    )

    # 4. Compute metrics 

    final_accuracy = accuracy(server_model, test_dataset)

    metrics = create_metrics_dict(
        fda_name="sketch", 
        n=n, 
        d=d, 
        test_size=test_size,
        seed=seed,
        class_sep=class_sep,
        noise_factor=noise_factor,
        epochs=NUM_EPOCHS,
        num_clients=NUM_CLIENTS, 
        batch_size=BATCH_SIZE, 
        steps_in_one_fda_step=NUM_STEPS_UNTIL_RTC_CHECK, 
        theta=THETA, 
        c=C, 
        total_fda_steps=total_fda_steps.numpy(), 
        total_rounds=total_rounds.numpy(), 
        final_accuracy=final_accuracy.numpy(), 
        sketch_width=ams_sketch.width, 
        sketch_depth=ams_sketch.depth
    )
    
    basic_test_metrics.append(metrics)
    
    del federated_dataset, server_model, client_models, total_rounds, total_fda_steps

    return basic_test_metrics
    

In [30]:
from math import sqrt # new

def run_tests(NUM_EPOCHS_FIXED, C_FIXED, NUM_STEPS_UNTIL_RTC_CHECK_FIXED, 
              SKETCH_DEPTH, SKETCH_WIDTH, NUM_CLIENTS_LIST, BATCH_SIZE_LIST, THETA_LIST,
              BATCH_SIZE_FIXED, THETA_FIXED, NUM_CLIENTS_FIXED):
    
    """ --------------- Fixed configurations -------------------"""

    ams_sketch = AmsSketch(
        depth=SKETCH_DEPTH,
        width=SKETCH_WIDTH
    )

    EPSILON = 1. / sqrt(SKETCH_WIDTH)
    
    
    """ --------------- Metrics list ----------------------"""
    
    all_metrics = []
    
    try:
        
        """ --------------- Run tests -------------------"""

        # Test /k nodes
        # FIX: BATCH_SIZE, THETA

        for NUM_CLIENTS in NUM_CLIENTS_LIST:
            print()
            print(f"CLIENTS testing. Current NUM_CLIENTS : {NUM_CLIENTS}")
            
            client_slices_train = create_data_for_clients(NUM_CLIENTS)  # new sliced dataset (diff NUM_CLIENTS)

            basic_test_metrics = basic_test(
                NUM_EPOCHS=NUM_EPOCHS_FIXED,
                C=C_FIXED,
                NUM_STEPS_UNTIL_RTC_CHECK=NUM_STEPS_UNTIL_RTC_CHECK_FIXED,
                NUM_CLIENTS=NUM_CLIENTS,
                BATCH_SIZE=BATCH_SIZE_FIXED,
                THETA=THETA_FIXED,
                EPSILON=EPSILON,
                ams_sketch=ams_sketch,
                client_slices_train=client_slices_train
            )

            all_metrics.extend(basic_test_metrics)
            
            del client_slices_train
            

        # Test /THETA : THETA tests will only retrance Graphs once!
        
        client_slices_train = create_data_for_clients(NUM_CLIENTS_FIXED)  # same sliced dataset (fixed NUM_CLIENTS)

        for THETA in THETA_LIST:
            print()
            print(f"THETA testing. Current THETA : {THETA}")

            basic_test_metrics = basic_test(
                NUM_EPOCHS=NUM_EPOCHS_FIXED,
                C=C_FIXED,
                NUM_STEPS_UNTIL_RTC_CHECK=NUM_STEPS_UNTIL_RTC_CHECK_FIXED,
                NUM_CLIENTS=NUM_CLIENTS_FIXED,
                BATCH_SIZE=BATCH_SIZE_FIXED,
                THETA=THETA,
                EPSILON=EPSILON,
                ams_sketch=ams_sketch,
                client_slices_train=client_slices_train
            )

            all_metrics.extend(basic_test_metrics)

        # Test /BATCHS_SIZE : THETA tests will only retrance Graphs once!
        
        client_slices_train = create_data_for_clients(NUM_CLIENTS_FIXED) # same sliced dataset (fixed NUM_CLIENTS)

        for BATCH_SIZE in BATCH_SIZE_LIST:
            print()
            print(f"BATCH_SIZE testing. Current BATCH_SIZE : {BATCH_SIZE}")

            basic_test_metrics = basic_test(
                NUM_EPOCHS=NUM_EPOCHS_FIXED,
                C=C_FIXED,
                NUM_STEPS_UNTIL_RTC_CHECK=NUM_STEPS_UNTIL_RTC_CHECK_FIXED,
                NUM_CLIENTS=NUM_CLIENTS_FIXED,
                BATCH_SIZE=BATCH_SIZE,
                THETA=THETA_FIXED,
                EPSILON=EPSILON,
                ams_sketch=ams_sketch,
                client_slices_train=client_slices_train
            )

            all_metrics.extend(basic_test_metrics)
    
    except Exception as e:
        print(e)
        print("shit")
    
    finally:
        return all_metrics

In [31]:
all_metrics = run_tests(
    NUM_EPOCHS_FIXED=3,
    C_FIXED=0.01, 
    NUM_STEPS_UNTIL_RTC_CHECK_FIXED=1,
    SKETCH_DEPTH=7, 
    SKETCH_WIDTH=1000, 
    NUM_CLIENTS_LIST=[], #[10, 20, 35, 50, 75, 100, 250, 500],
    BATCH_SIZE_LIST=[], #[16, 32, 64, 126, 256, 512], 
    THETA_LIST=[1., 5.], #[10., 25., 50., 75., 100., 250., 500., 1000.],
    BATCH_SIZE_FIXED=32,  # When testing THETA_LIST and NUM_CLIENTS_LIST
    THETA_FIXED=5.,  # When testing BATCH_SIZE_LIST and NUM_CLIENTS_LIST
    NUM_CLIENTS_FIXED=20  # When testing BATCH_SIZE_LIST and THETA_LIST
)


THETA testing. Current THETA : 1.0
retracing naive
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
retracing linear
retracing sketch

THETA testing. Current THETA : 5.0


In [32]:
import pandas as pd

In [33]:
df = pd.DataFrame(all_metrics)

In [34]:
df

Unnamed: 0,fda_name,theta,n,d,seed,class_sep,noise_factor,test_size,epochs,num_clients,...,model_bytes,local_state_bytes,final_accuracy,total_fda_steps,total_steps,total_rounds,model_bytes_exchanged,monitoring_bytes_exchanged,total_communication_bytes,trained_in_bytes
0,naive,1.0,10000,100,7,2,0.02,0.2,3,20,...,400,4,0.951885,36,36,4,64000,2880,66880,9308160
1,linear,1.0,10000,100,7,2,0.02,0.2,3,20,...,400,8,0.951885,36,36,2,32000,5760,37760,9308160
2,sketch,1.0,10000,100,7,2,0.02,0.2,3,20,...,400,28004,0.951885,36,36,2,32000,20162880,20194880,9308160
3,naive,5.0,10000,100,7,2,0.02,0.2,3,20,...,400,4,0.951885,36,36,2,32000,2880,34880,9308160
4,linear,5.0,10000,100,7,2,0.02,0.2,3,20,...,400,8,0.951885,36,36,2,32000,5760,37760,9308160
5,sketch,5.0,10000,100,7,2,0.02,0.2,3,20,...,400,28004,0.951885,36,36,1,16000,20162880,20178880,9308160


In [35]:
df.to_csv('test_results/results.csv', index=False)

TODO:

1. variance (follow NN approach) decouple `NUM_CLIENTS` fixed. Better approach plz
2. for in : for in : for in: approach in testing. REMOVE FIXED SHIT