## Prepare Data

In [1]:
import os
import glob 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the epoch metrics table; the path is resolved relative to the notebook's working directory.
df = pd.read_parquet("../metrics/epoch_metrics")

In [3]:
# Group rows by the six configuration columns (presumably one group per training run — confirm).
grouped = df.groupby(['dataset_name', 'strat_name', 'num_clients', 'batch_size', 'theta', 'nn_name'])

In [4]:
# Running sum of epoch durations within each group, written back as a new column.
# NOTE(review): cumsum follows the current row order, so this assumes each group's rows
# are already stored in epoch order — confirm against how the parquet data is written.
df['total_time'] = grouped['epoch_duration_sec'].cumsum()

In [5]:
def dataset_size(ds_name):
    """Return the hard-coded sample count for a known dataset name.

    Args:
        ds_name: Dataset identifier; 'CIFAR10' and 'MNIST' are recognised.

    Returns:
        50_000 for 'CIFAR10', 60_000 for 'MNIST', otherwise None.
    """
    known_sizes = (
        ('CIFAR10', 50_000),
        ('MNIST', 60_000),
    )
    for known_name, n_samples in known_sizes:
        if ds_name == known_name:
            return n_samples
    return None

In [6]:
# Element-wise lookup; dataset names that dataset_size does not recognise produce a missing value.
df['ds_samples'] = df['dataset_name'].apply(dataset_size)

In [7]:
def model_num_weights(nn_name):
    """Return the hard-coded weight count for a known network name.

    Args:
        nn_name: Network identifier ('LeNet-5', 'AdvancedCNN', 'DenseNet121'
            or 'DenseNet201').

    Returns:
        The integer count registered for that name, or None when the name
        is not one of the four listed above.
    """
    weight_counts = (
        ('LeNet-5', 61_706),
        ('AdvancedCNN', 2_592_202),
        ('DenseNet121', 6_964_106),
        ('DenseNet201', 18_112_138),
    )
    for known_name, count in weight_counts:
        if nn_name == known_name:
            return count
    return None

In [8]:
# Element-wise lookup; network names that model_num_weights does not recognise produce a missing value.
df['num_weights'] = df['nn_name'].apply(model_num_weights)

In [9]:
def local_state_size(strat_name):
    """Return the hard-coded local-state size for a strategy name.

    Args:
        strat_name: Strategy identifier ('naive', 'linear' or 'sketch').

    Returns:
        1 for 'naive', 2 for 'linear', 1000 for 'sketch', otherwise None.
    """
    # NOTE(review): monitoring_size uses 5 * 250 (= 1250) for 'sketch' while this
    # table uses 1000 — confirm whether the two values are meant to differ.
    state_sizes = (
        ('naive', 1),
        ('linear', 2),
        ('sketch', 1000),
    )
    for known_strategy, size in state_sizes:
        if strat_name == known_strategy:
            return size
    return None

In [10]:
# Element-wise lookup; strategy names that local_state_size does not recognise produce a missing value.
df['local_state_size'] = df['strat_name'].apply(local_state_size)

In [11]:
def sample_size(ds_name):
    """Return the number of values in one sample of a known dataset.

    Args:
        ds_name: Dataset identifier; 'CIFAR10' and 'MNIST' are recognised.

    Returns:
        32 * 32 * 3 for 'CIFAR10', 28 * 28 for 'MNIST', otherwise None.
    """
    per_sample_values = (
        ('CIFAR10', 32 * 32 * 3),
        ('MNIST', 28 * 28),
    )
    for known_name, n_values in per_sample_values:
        if ds_name == known_name:
            return n_values
    return None

In [12]:
# Element-wise lookup; dataset names that sample_size does not recognise produce a missing value.
df['sample_size'] = df['dataset_name'].apply(sample_size)

In [13]:
def local_state_computation_cost(row):
    """Return the local-state computation cost term for one row.

    Args:
        row: Mapping-like record providing 'strat_name' and, for the three
            known strategies, 'num_weights'.

    Returns:
        num_weights for 'naive' and 'linear', num_weights * 5 * 250 for
        'sketch', otherwise None.
    """
    strategy = row['strat_name']

    # Both strategies return the raw weight count. The earlier inline
    # annotations read O(d) for 'naive' and O(2*d) for 'linear', yet no factor
    # of 2 is applied — NOTE(review): confirm that is intended.
    if strategy == 'naive' or strategy == 'linear':
        return row['num_weights']

    # O(width * d): scaled by the constants 5 and 250.
    if strategy == 'sketch':
        return row['num_weights'] * 5 * 250

    return None

In [14]:
# Row-wise (axis=1) because the cost depends on both 'strat_name' and 'num_weights' of each row.
df['local_state_computation_cost'] = df.apply(local_state_computation_cost, axis=1)

In [19]:
def monitoring_size(row):
    """Return the hard-coded monitoring size for the row's strategy.

    Args:
        row: Mapping-like record providing 'strat_name'.

    Returns:
        1 for 'naive', 2 for 'linear', 5 * 250 for 'sketch', otherwise None.
    """
    strategy = row['strat_name']
    sizes_by_strategy = (
        ('naive', 1),
        ('linear', 2),
        ('sketch', 5 * 250),
    )
    for known_strategy, size in sizes_by_strategy:
        if strategy == known_strategy:
            return size
    return None

In [20]:
# Row-wise (axis=1) call; monitoring_size reads only the row's 'strat_name'.
df['monitoring_size'] = df.apply(monitoring_size, axis=1)

## Regression

In [274]:
def linear_regression(data_X_df, data_y_df, test_size=0.1, random_state=42):
    """Fit a non-negative, intercept-free linear model and print hold-out metrics.

    The data is split into a training part and a held-out test part, the
    model is fitted on the training part, and the coefficients, RMS error and
    R^2 score on the test part are printed.

    Args:
        data_X_df: Feature table, one column per cost term.
        data_y_df: Target values aligned with ``data_X_df``.
        test_size: Fraction of rows held out for evaluation (default 0.1).
        random_state: Seed forwarded to ``train_test_split``. It defaults to
            a fixed value so that repeated runs give the same split, and
            therefore the same coefficients and metrics. Pass ``None`` for a
            different random split on every call.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # Seeded split: without a seed every run trains and evaluates on different rows.
    X_train, X_test, y_train, y_test = train_test_split(
        data_X_df, data_y_df, test_size=test_size, random_state=random_state
    )

    # positive=True constrains every coefficient to be >= 0;
    # fit_intercept=False forces the fit through the origin.
    regr_model = linear_model.LinearRegression(positive=True, fit_intercept=False)
    regr_model.fit(X_train, y_train)

    y_pred = regr_model.predict(X_test)

    # Report the fitted coefficients, then the hold-out evaluation.
    print("Linear Regression")
    print(f"Coefficients: {regr_model.coef_}\n")
    print("Linear Regression")
    print(f"RMS error: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
    print(f"Coefficient of determination: {r2_score(y_test, y_pred):.2f}")

    return regr_model

## NaiveFDA


In [275]:
# Restrict to the rows recorded for the 'naive' strategy.
df_strat = df.loc[df['strat_name'] == 'naive']

# Regression features; column order fixes the order of the fitted coefficients.
data_X_df = pd.DataFrame(
    {
        # steps * weights * batch size
        'train_computation_cost': df_strat['total_fda_steps'].mul(df_strat['num_weights']).mul(df_strat['batch_size']),
        # steps * values per sample
        'dataset_computation_cost': df_strat['total_fda_steps'].mul(df_strat['sample_size']),
        # Excluded from the per-strategy fit:
        #'local_state_computation_cost': df_strat['local_state_computation_cost'],

        # rounds * weights * log2(clients)
        'sync_communication_cost': df_strat['total_rounds'].mul(df_strat['num_weights']).mul(np.log2(df_strat['num_clients'])),
        # Excluded from the per-strategy fit:
        #'monitor_communication_cost': df_strat['total_fda_steps'] * df_strat['monitoring_size'] * np.ceil(np.log2(df_strat['num_clients']))
    }
)

# Regression target: cumulative time per row.
data_y_df = df_strat.loc[:, 'total_time']

In [276]:
# Fit and evaluate on the features/target prepared in the previous cell (rows with strat_name == 'naive').
regr_model = linear_regression(data_X_df, data_y_df)

Linear Regression
Coefficients: [1.36949766e-09 1.54921969e-04 1.55029912e-07]

Linear Regression
RMS error: 400.16
Coefficient of determination: 1.00


## LinearFDA

In [279]:
# Restrict to the rows recorded for the 'linear' strategy.
df_strat = df.loc[df['strat_name'] == 'linear']

# Regression features; column order fixes the order of the fitted coefficients.
data_X_df = pd.DataFrame(
    {
        # steps * weights * batch size
        'train_computation_cost': df_strat['total_fda_steps'].mul(df_strat['num_weights']).mul(df_strat['batch_size']),
        # steps * values per sample
        'dataset_computation_cost': df_strat['total_fda_steps'].mul(df_strat['sample_size']),
        # Excluded from the per-strategy fit:
        #'local_state_computation_cost': df_strat['local_state_computation_cost'],

        # rounds * weights * log2(clients)
        'sync_communication_cost': df_strat['total_rounds'].mul(df_strat['num_weights']).mul(np.log2(df_strat['num_clients'])),
        # Excluded from the per-strategy fit:
        #'monitor_communication_cost': df_strat['total_fda_steps'] * df_strat['monitoring_size'] * np.ceil(np.log2(df_strat['num_clients']))
    }
)

# Regression target: cumulative time per row.
data_y_df = df_strat.loc[:, 'total_time']

In [280]:
# Fit and evaluate on the features/target prepared in the previous cell (rows with strat_name == 'linear').
regr_model = linear_regression(data_X_df, data_y_df)

Linear Regression
Coefficients: [1.35193922e-09 1.55074384e-04 1.76021418e-07]

Linear Regression
RMS error: 399.29
Coefficient of determination: 1.00


## SketchFDA

In [281]:
# Restrict to the rows recorded for the 'sketch' strategy.
df_strat = df.loc[df['strat_name'] == 'sketch']

# Regression features; column order fixes the order of the fitted coefficients.
data_X_df = pd.DataFrame(
    {
        # steps * weights * batch size
        'train_computation_cost': df_strat['total_fda_steps'].mul(df_strat['num_weights']).mul(df_strat['batch_size']),
        # steps * values per sample
        'dataset_computation_cost': df_strat['total_fda_steps'].mul(df_strat['sample_size']),
        # Excluded from the per-strategy fit:
        #'local_state_computation_cost': df_strat['local_state_computation_cost'],

        # rounds * weights * log2(clients)
        'sync_communication_cost': df_strat['total_rounds'].mul(df_strat['num_weights']).mul(np.log2(df_strat['num_clients'])),
        # Excluded from the per-strategy fit:
        #'monitor_communication_cost': df_strat['total_fda_steps'] * df_strat['monitoring_size'] * np.ceil(np.log2(df_strat['num_clients']))
    }
)

# Regression target: cumulative time per row.
data_y_df = df_strat.loc[:, 'total_time']

In [282]:
# Fit and evaluate on the features/target prepared in the previous cell (rows with strat_name == 'sketch').
regr_model = linear_regression(data_X_df, data_y_df)

Linear Regression
Coefficients: [1.63786109e-09 1.74186295e-04 1.54022511e-07]

Linear Regression
RMS error: 461.61
Coefficient of determination: 1.00


## All FDA Strategies together

In [342]:
# Features over every row of df (all strategies); column order fixes the
# order of the fitted coefficients.
# NOTE(review): 'local_state_computation_cost' is the only term not multiplied
# by 'total_fda_steps', while the target 'total_time' is cumulative — confirm
# whether this term should also be scaled by the step count.
data_X_df = pd.DataFrame(
    {
        # steps * weights * batch size
        'train_computation_cost': df['total_fda_steps'].mul(df['num_weights']).mul(df['batch_size']),
        # steps * values per sample
        'dataset_computation_cost': df['total_fda_steps'].mul(df['sample_size']),
        # strategy-dependent cost column, used as stored
        'local_state_computation_cost': df.loc[:, 'local_state_computation_cost'],

        # rounds * weights * log2(clients)
        'sync_communication_cost': df['total_rounds'].mul(df['num_weights']).mul(np.log2(df['num_clients'])),
        # steps * monitoring size * log2(clients)
        'monitor_communication_cost': df['total_fda_steps'].mul(df['monitoring_size']).mul(np.log2(df['num_clients']))
    }
)

# Regression target: cumulative time per row.
data_y_df = df.loc[:, 'total_time']

In [343]:
# Fit and evaluate on the five-feature table built from every row of df; this is the model pickled below.
regr_model = linear_regression(data_X_df, data_y_df)

Linear Regression
Coefficients: [1.49941354e-09 1.54409442e-04 2.37021686e-08 7.12587645e-08
 2.03467380e-05]

Linear Regression
RMS error: 518.55
Coefficient of determination: 0.99


In [344]:
import pickle

In [345]:
# Serialize whatever regr_model currently holds (the all-strategies fit when cells run in order);
# the file is written relative to the notebook's working directory.
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted location.
with open('regression_model.pkl','wb') as f:
    pickle.dump(regr_model, f)