## Prepare Data

In [1]:
import os
import glob 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the per-epoch metrics table from the parquet dataset next to this notebook.
df = pd.read_parquet(path="../metrics/epoch_metrics")

In [3]:
# One group per combination of these six configuration columns.
grouped = df.groupby(
    by=[
        'dataset_name',
        'strat_name',
        'num_clients',
        'batch_size',
        'theta',
        'nn_name',
    ]
)

In [4]:
# Running sum of `epoch_duration_sec` inside each group of `grouped`, stored as `total_time`.
# NOTE(review): cumsum follows the current row order of `df`, so this presumably
# relies on rows already being ordered by epoch within each group — confirm.
df['total_time'] = grouped['epoch_duration_sec'].cumsum()

In [5]:
def dataset_size(ds_name):
    """Return the sample count associated with a dataset name.

    Known names are 'CIFAR10' (50_000) and 'MNIST' (60_000); any other
    value yields None.
    """
    known_sizes = (
        ('CIFAR10', 50_000),
        ('MNIST', 60_000),
    )
    for known_name, num_samples in known_sizes:
        if ds_name == known_name:
            return num_samples
    return None

In [6]:
# Per-row dataset sample count, looked up from the dataset name.
df['ds_samples'] = df['dataset_name'].map(dataset_size)

In [7]:
def model_num_weights(nn_name):
    """Return the weight count recorded for a network name.

    Recognised names: 'LeNet-5', 'AdvancedCNN', 'DenseNet121', 'DenseNet201'.
    Any other value yields None.
    """
    weight_table = (
        ('LeNet-5', 61_706),
        ('AdvancedCNN', 2_592_202),
        ('DenseNet121', 6_964_106),
        ('DenseNet201', 18_112_138),
    )
    # First matching entry wins; fall back to None when nothing matches.
    return next(
        (count for known_name, count in weight_table if nn_name == known_name),
        None,
    )

In [8]:
# Per-row model weight count, looked up from the network name.
df['num_weights'] = df['nn_name'].map(model_num_weights)

In [9]:
def local_state_size(strat_name):
    """Return the local-state size assigned to a strategy name.

    'naive' -> 1, 'linear' -> 2, 'sketch' -> 1000; anything else -> None.

    NOTE(review): `monitoring_size` uses 5 * 250 for 'sketch' while this
    function uses 1000 — confirm the difference is intended.
    """
    return (
        1 if strat_name == 'naive'
        else 2 if strat_name == 'linear'
        else 1000 if strat_name == 'sketch'
        else None
    )

In [10]:
# Per-row local-state size, looked up from the strategy name.
df['local_state_size'] = df['strat_name'].map(local_state_size)

In [11]:
def sample_size(ds_name):
    """Return the per-sample size for a dataset name.

    'CIFAR10' -> 32 * 32 * 3, 'MNIST' -> 28 * 28 (presumably the number of
    pixel values per image); any other name -> None.
    """
    values_per_sample = None
    if ds_name == 'CIFAR10':
        values_per_sample = 32 * 32 * 3
    elif ds_name == 'MNIST':
        values_per_sample = 28 * 28
    return values_per_sample

In [12]:
# Per-row sample size, looked up from the dataset name.
df['sample_size'] = df['dataset_name'].map(sample_size)

In [13]:
def local_state_computation_cost(row):
    """Return the local-state computation cost term for one row.

    `row` must support `row['strat_name']` and, for known strategies,
    `row['num_weights']`.

    - 'naive' and 'linear': the weight count itself.
    - 'sketch': the weight count times 5 * 250.
    - anything else: None.

    NOTE(review): the previous inline comment described 'linear' as O(2*d),
    yet no factor of 2 is applied — confirm that is intended.
    """
    strategy = row['strat_name']
    if strategy == 'naive' or strategy == 'linear':
        return row['num_weights']
    if strategy == 'sketch':
        # 5 * 250 is presumably the sketch's dimensions — TODO confirm.
        return row['num_weights'] * 5 * 250
    return None

In [14]:
# Row-wise: the cost needs both the strategy name and the weight count.
df['local_state_computation_cost'] = df.apply(local_state_computation_cost, axis='columns')

In [15]:
def monitoring_size(row):
    """Return the monitoring message size for one row.

    `row` must support `row['strat_name']`. 'naive' -> 1, 'linear' -> 2,
    'sketch' -> 5 * 250; any other strategy -> None.
    """
    strategy = row['strat_name']
    size_table = (
        ('naive', 1),
        ('linear', 2),
        ('sketch', 5 * 250),
    )
    for known_strategy, size in size_table:
        if strategy == known_strategy:
            return size
    return None

In [16]:
# Row-wise lookup of the monitoring message size from the strategy name.
df['monitoring_size'] = df.apply(monitoring_size, axis='columns')

## Regression

In [17]:
def linear_regression(data_X_df, data_y_df, test_size=0.1, random_state=42):
    """Fit a non-negative, intercept-free linear model and print hold-out metrics.

    The data is split into train and test parts, a
    `LinearRegression(positive=True, fit_intercept=False)` is fitted on the
    training part, and the coefficients, RMS error and R^2 on the test part
    are printed.

    Parameters
    ----------
    data_X_df : feature matrix accepted by scikit-learn (e.g. a DataFrame).
    data_y_df : target values aligned with `data_X_df`.
    test_size : fraction of rows held out for evaluation (default 0.1,
        the value previously hard-coded).
    random_state : seed for the train/test split. A fixed default keeps the
        printed metrics and the fitted model reproducible across runs;
        pass None to get a fresh random split on every call.

    Returns
    -------
    The fitted `LinearRegression` model.
    """
    # Seeded split so that re-running the notebook reproduces the same fit.
    X_train, X_test, y_train, y_test = train_test_split(
        data_X_df,
        data_y_df,
        test_size=test_size,
        random_state=random_state,
    )

    # positive=True constrains coefficients to be >= 0; no intercept is fitted.
    regr_model = linear_model.LinearRegression(positive=True, fit_intercept=False)
    regr_model.fit(X_train, y_train)

    y_pred = regr_model.predict(X_test)

    # Evaluate the model on the held-out rows
    print("Linear Regression")
    print(f"Coefficients: {regr_model.coef_}\n")
    print("Linear Regression")
    print(f"RMS error: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
    print(f"Coefficient of determination: {r2_score(y_test, y_pred):.2f}")

    return regr_model

## NaiveFDA


In [28]:
# Rows of the 'naive' strategy only.
df_strat = df.loc[df['strat_name'] == 'naive']

data_X_df = pd.DataFrame(
    {
        # steps x model weights x batch size
        'train_computation_cost': (
            df_strat['total_fda_steps']
            .mul(df_strat['num_weights'])
            .mul(df_strat['batch_size'])
        ),
        # steps x per-sample size
        'dataset_computation_cost': df_strat['total_fda_steps'].mul(df_strat['sample_size']),
        #'local_state_computation_cost': df_strat['local_state_computation_cost'],

        # rounds x model weights x log2(number of clients)
        'sync_communication_cost': (
            df_strat['total_rounds']
            .mul(df_strat['num_weights'])
            .mul(np.log2(df_strat['num_clients']))
        ),
        #'monitor_communication_cost': df_strat['total_fda_steps'] * df_strat['monitoring_size'] * np.ceil(np.log2(df_strat['num_clients']))
    }
)

# Regression target: cumulative time per row.
data_y_df = df_strat['total_time']

In [29]:
# Fit and report on the 'naive' feature matrix built in the previous cell.
regr_model = linear_regression(data_X_df=data_X_df, data_y_df=data_y_df)

Linear Regression
Coefficients: [1.35921543e-09 1.55093648e-04 1.71254209e-07]

Linear Regression
RMS error: 395.47
Coefficient of determination: 1.00


## LinearFDA

In [32]:
# Rows of the 'linear' strategy only.
df_strat = df.loc[df['strat_name'] == 'linear']

data_X_df = pd.DataFrame(
    {
        # steps x model weights x batch size
        'train_computation_cost': (
            df_strat['total_fda_steps']
            .mul(df_strat['num_weights'])
            .mul(df_strat['batch_size'])
        ),
        # steps x per-sample size
        'dataset_computation_cost': df_strat['total_fda_steps'].mul(df_strat['sample_size']),
        #'local_state_computation_cost': df_strat['local_state_computation_cost'],

        # rounds x model weights x log2(number of clients)
        'sync_communication_cost': (
            df_strat['total_rounds']
            .mul(df_strat['num_weights'])
            .mul(np.log2(df_strat['num_clients']))
        ),
        #'monitor_communication_cost': df_strat['total_fda_steps'] * df_strat['monitoring_size'] * np.ceil(np.log2(df_strat['num_clients']))
    }
)

# Regression target: cumulative time per row.
data_y_df = df_strat['total_time']

In [33]:
# Fit and report on the 'linear' feature matrix built in the previous cell.
regr_model = linear_regression(data_X_df=data_X_df, data_y_df=data_y_df)

Linear Regression
Coefficients: [1.33928903e-09 1.55690658e-04 1.90845646e-07]

Linear Regression
RMS error: 420.76
Coefficient of determination: 1.00


## SketchFDA

In [34]:
# Rows of the 'sketch' strategy only.
df_strat = df.loc[df['strat_name'] == 'sketch']

data_X_df = pd.DataFrame(
    {
        # steps x model weights x batch size
        'train_computation_cost': (
            df_strat['total_fda_steps']
            .mul(df_strat['num_weights'])
            .mul(df_strat['batch_size'])
        ),
        # steps x per-sample size
        'dataset_computation_cost': df_strat['total_fda_steps'].mul(df_strat['sample_size']),
        #'local_state_computation_cost': df_strat['local_state_computation_cost'],

        # rounds x model weights x log2(number of clients)
        'sync_communication_cost': (
            df_strat['total_rounds']
            .mul(df_strat['num_weights'])
            .mul(np.log2(df_strat['num_clients']))
        ),
        #'monitor_communication_cost': df_strat['total_fda_steps'] * df_strat['monitoring_size'] * np.ceil(np.log2(df_strat['num_clients']))
    }
)

# Regression target: cumulative time per row.
data_y_df = df_strat['total_time']

In [35]:
# Fit and report on the 'sketch' feature matrix built in the previous cell.
regr_model = linear_regression(data_X_df=data_X_df, data_y_df=data_y_df)

Linear Regression
Coefficients: [1.63265350e-09 1.75293755e-04 1.57352800e-07]

Linear Regression
RMS error: 460.80
Coefficient of determination: 1.00


## All FDA Strategies together

In [51]:
# Feature matrix over every strategy at once (no row filter).
data_X_df = pd.DataFrame(
    {
        # steps x model weights x batch size
        'train_computation_cost': (
            df['total_fda_steps']
            .mul(df['num_weights'])
            .mul(df['batch_size'])
        ),
        # steps x per-sample size
        'dataset_computation_cost': df['total_fda_steps'].mul(df['sample_size']),
        # NOTE(review): unlike the other terms this one is not scaled by
        # total_fda_steps — confirm that is intended.
        'local_state_computation_cost': df['local_state_computation_cost'],

        # rounds x model weights x log2(number of clients)
        'sync_communication_cost': (
            df['total_rounds']
            .mul(df['num_weights'])
            .mul(np.log2(df['num_clients']))
        ),
        # steps x monitoring message size x log2(number of clients)
        'monitor_communication_cost': (
            df['total_fda_steps']
            .mul(df['monitoring_size'])
            .mul(np.log2(df['num_clients']))
        ),
    }
)

# Regression target: cumulative time per row.
data_y_df = df['total_time']

In [52]:
# Fit and report on the all-strategies feature matrix built in the previous cell.
regr_model = linear_regression(data_X_df=data_X_df, data_y_df=data_y_df)

Linear Regression
Coefficients: [1.49770362e-09 1.54578559e-04 2.34288908e-08 7.23417476e-08
 2.08852528e-05]

Linear Regression
RMS error: 558.08
Coefficient of determination: 0.99


In [46]:
# Re-fits on whatever `data_X_df` / `data_y_df` currently hold and rebinds `regr_model`.
# NOTE(review): the recorded output of this cell lists 3 coefficients, while the
# feature matrix defined above has 5 columns — the output looks stale; confirm
# whether this repeated fit is still needed.
regr_model = linear_regression(data_X_df=data_X_df, data_y_df=data_y_df)

Linear Regression
Coefficients: [1.50353396e-09 1.63845132e-04 7.49103166e-08]

Linear Regression
RMS error: 700.58
Coefficient of determination: 0.99


In [None]:
# Coefficients copied from the LinearFDA fit output earlier in this notebook,
# kept for reference. Written as a valid list (comma-separated) so the cell
# no longer raises a SyntaxError when the notebook is run top to bottom.
linear_fda_coefficients = [1.33928903e-09, 1.55690658e-04, 1.90845646e-07]
linear_fda_coefficients

In [377]:
import pickle

In [378]:
# Persist the most recently fitted model next to the notebook.
with open('regression_model.pkl', mode='wb') as model_file:
    pickle.dump(regr_model, model_file)