In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import glob

## Combine metrics

In [2]:
# Read the two parquet metric dumps, then stack them row-wise.
epoch_metrics_tmp = pd.read_parquet("../../metrics/epoch_metrics/tmp")
epoch_metrics_tmp_grid = pd.read_parquet("../../metrics/epoch_metrics/tmp_grid")
df_grid = pd.concat([epoch_metrics_tmp, epoch_metrics_tmp_grid])

In [3]:
# The CSV metrics are spread over the sibling directories
# `tmp_aris`, `tmp_aris2`, ..., `tmp_aris9`.
aris_dirs = ['tmp_aris'] + [f'tmp_aris{i}' for i in range(2, 10)]

csv_files = []
for aris_dir in aris_dirs:
    # glob returns matches in arbitrary order; sorting keeps the file order
    # (and therefore the row order of the combined frame) reproducible.
    csv_files.extend(sorted(glob.glob(f'../../metrics/epoch_metrics/{aris_dir}/*.csv')))

In [4]:
# Load every collected CSV and stack them into one frame with a fresh 0..n-1 index.
aris_frames = []
for csv_path in csv_files:
    aris_frames.append(pd.read_csv(csv_path))
df_aris = pd.concat(aris_frames, ignore_index=True)

In [5]:
# ignore_index=True gives every row a unique label. Without it the inputs' row
# labels are kept, so labels may repeat and label-based selection (e.g. `.loc`
# with the result of `idxmin`) can return more than one row per label.
df = pd.concat([df_grid, df_aris], ignore_index=True)

## Read from `~/metrics/epoch_metrics/` directory

In [6]:
# Load the previously combined metrics; this replaces any `df` assembled in the cells above.
combined_metrics_path = "../../metrics/epoch_metrics/combined_epoch_metrics.parquet"
df = pd.read_parquet(combined_metrics_path)

In [7]:
#df.to_parquet("../../metrics/epoch_metrics/combined_epoch_metrics.parquet")

# Helpful new DataFrame metrics

In [8]:
# Keep only rows with theta <= 2. `.copy()` makes `df` an independent frame, so the
# column assignments in the following cells do not raise SettingWithCopyWarning.
df = df[df.theta <= 2].copy()

### Add NN name

In [9]:
def nn_name(row):
    """Return the network name matching the row's `nn_num_weights`.

    61706 weights -> 'LeNet-5', 2592202 weights -> 'AdvancedCNN';
    any other value yields None.
    """
    known_networks = (
        (61706, 'LeNet-5'),
        (2592202, 'AdvancedCNN'),
    )
    for num_weights, name in known_networks:
        if row['nn_num_weights'] == num_weights:
            return name
    return None

In [10]:
# Label each row with its network's name by applying `nn_name` row by row.
df['nn_name'] = df.apply(nn_name, axis=1)

### Add Helpful Dataset Metrics 

In [11]:
def dataset_n_train(row):
    """Return the training-set size for the row's dataset: 60_000 for "EMNIST", otherwise -1."""
    return 60_000 if row['dataset_name'] == "EMNIST" else -1


def dataset_one_sample_bytes(row):
    """Return the size in bytes of one sample of the row's dataset, or -1 if it is not "EMNIST"."""
    if row['dataset_name'] != "EMNIST":
        return -1

    # An input image of 784 tf.float32 pixels plus one tf.int32 label, 4 bytes each.
    num_pixels, num_labels, bytes_per_value = 784, 1, 4
    return bytes_per_value * (num_pixels + num_labels)

In [12]:
# Row-wise lookups: training-set size and per-sample size (bytes) of each row's dataset.
df['n_train'] = df.apply(dataset_n_train, axis=1)
df['one_sample_bytes'] = df.apply(dataset_one_sample_bytes, axis=1)

### Add Helpful model metrics

In [13]:
# Model size in bytes, counting 4 bytes per weight.
bytes_per_weight = 4
df['model_bytes'] = df['nn_num_weights'].mul(bytes_per_weight)

### Add Helpful FDA method metrics

In [14]:
def fda_local_state_bytes(row):
    """Return the local-state size in bytes for the row's FDA method.

    "naive" -> 4, "linear" -> 8, "synchronous" -> 0, and
    "sketch" -> sketch_width * sketch_depth * 4 + 4.
    Any other method name yields None.
    """
    method = row['fda_name']

    if method == "sketch":
        size = row['sketch_width'] * row['sketch_depth'] * 4 + 4
    elif method == "naive":
        size = 4
    elif method == "linear":
        size = 8
    elif method == "synchronous":
        size = 0
    else:
        size = None

    return size

In [15]:
# Local-state size (bytes) of each row's FDA method, computed row by row.
df['local_state_bytes'] = df.apply(fda_local_state_bytes, axis=1)

### Add Total Steps

Total steps: a single FDA step may consist of several ordinary SGD (mini-batch) steps, so the total is the number of FDA steps multiplied by `num_steps_until_rtc_check`.

In [16]:
# FDA steps times the number of steps inside each FDA step.
df['total_steps'] = df['total_fda_steps'].mul(df['num_steps_until_rtc_check'])

### Add communication metrics

The communication bytes exchanged for model synchronization. Remember that the Clients send their models to the Server and the Server sends the global model back. This happens at the end of every round.

In [17]:
# rounds x model size x clients, doubled for the two directions of a round.
one_client_one_way_bytes = df['total_rounds'] * df['model_bytes']
all_clients_one_way_bytes = one_client_one_way_bytes * df['num_clients']
df['model_bytes_exchanged'] = all_clients_one_way_bytes * 2

The communication bytes exchanged for monitoring the variance. This happens at the end of every FDA step which consists of `num_steps_until_rtc_check` number of steps. 

In [18]:
# local state size x FDA steps x clients.
one_client_monitoring_bytes = df['local_state_bytes'] * df['total_fda_steps']
df['monitoring_bytes_exchanged'] = one_client_monitoring_bytes * df['num_clients']

The total communication bytes exchanged in the whole Federated Learning lifecycle.

In [19]:
# Model synchronization traffic plus monitoring traffic.
df['total_communication_bytes'] = df['model_bytes_exchanged'].add(df['monitoring_bytes_exchanged'])

In [20]:
# Bytes -> gigabytes (decimal: 10**9 bytes per GB).
df['total_communication_gb'] = df['total_communication_bytes'].div(10**9)

Add rounds in one epoch.

In [21]:
# Columns used to group rows into individual runs.
run_keys = ['dataset_name', 'fda_name', 'nn_num_weights', 'num_clients', 'batch_size', 'num_steps_until_rtc_check', 'theta']

# Order each group's rows by epoch so that consecutive rows are consecutive epochs.
df = df.sort_values(by=run_keys + ['epoch'])

# Rounds within an epoch = change of `total_rounds` relative to the previous epoch of the same group.
rounds_since_previous_epoch = df.groupby(run_keys)['total_rounds'].diff()

# The first epoch of a group has no predecessor (diff is NaN): fall back to `total_rounds` itself.
df['epoch_rounds'] = rounds_since_previous_epoch.fillna(df['total_rounds']).astype(int)

# HyperParameter ranking

### AdvancedCNN
On 8 CPUs, the step time:

1. *Batch Size* = 32 -> `307ms`
2. *Batch Size* = 64 -> `445ms`
3. *Batch Size* = 128 -> `815ms`
4. *Batch Size* = 256 -> `1401ms`

Best fit line:

step(ms) = 4.97092 * batch_size + 147.739

### LeNet-5
On 8 CPUs, the step time:

1. *Batch Size* = 32 -> `5.93ms`
2. *Batch Size* = 64 -> `9.16ms`
3. *Batch Size* = 128 -> `18.5ms`
4. *Batch Size* = 256 -> `30.6ms`

Best fit line:

step(ms) = 0.11124 * batch_size + 2.69913

In [22]:
def step_ms(batch_size, nn_name):
    """Estimated duration of one step in milliseconds, from the per-network best-fit line.

    Returns None when `nn_name` is neither 'AdvancedCNN' nor 'LeNet-5'.
    """
    if nn_name == 'AdvancedCNN':
        slope, intercept = 4.97092, 147.739
    elif nn_name == 'LeNet-5':
        slope, intercept = 0.11124, 2.69913
    else:
        return None

    return slope * batch_size + intercept

Time cost of training (CPU) and of reducing (communication)

In [23]:
import numpy as np

def cpu_time_cost(row):
    """Total CPU time cost of the row, in seconds.

    One `step` here means that every client performed one `step`.
    The per-step duration comes from `step_ms` (milliseconds), hence the division by 1000.
    """
    ms_per_step = step_ms(row['batch_size'], row['nn_name'])
    return row['total_steps'] * ms_per_step / 1000

def communication_time_cost(num_clients, total_communication_bytes, comm_model):
    """Communication time in seconds, assuming a 1 Gbps channel.

    Parameters
    ----------
    num_clients : number of clients (scalar or array-like).
    total_communication_bytes : total bytes exchanged (scalar or array-like).
    comm_model : 'linear' or 'logarithmic'.

    Returns
    -------
    The transfer time of the total traffic at 1 Gbps, scaled by
    ``1 + 0.5 * (n - 1) / n`` for 'linear' and by ``ceil(ln(n)) / n`` for 'logarithmic'.

    Raises
    ------
    ValueError
        If `comm_model` is not one of the two supported names (previously this
        silently returned None, which would end up as an all-None column).
    """
    # bytes -> gigabits; at 1 Gbps this number is also the transfer time in seconds
    total_communication_gbit = total_communication_bytes * 8e-9

    if comm_model == 'linear':
        a = 0.5

        return (1 + a * (num_clients - 1) / num_clients) * total_communication_gbit    # sec

    if comm_model == 'logarithmic':
        # NOTE(review): np.log is the natural logarithm -- confirm that base e (not 2) is intended.
        return (np.ceil(np.log(num_clients)) / num_clients) * total_communication_gbit   # sec

    raise ValueError(f"Unknown comm_model {comm_model!r}; expected 'linear' or 'logarithmic'")

In [24]:
# CPU time cost (sec) of each row, computed row by row with `cpu_time_cost`.
df['cpu_time_cost'] = df.apply(cpu_time_cost, axis=1)

In [25]:
# Communication time (sec) under the 'logarithmic' model, computed column-wise.
df['logarithmic_communication_time_cost'] = communication_time_cost(
    num_clients=df['num_clients'],
    total_communication_bytes=df['total_communication_bytes'],
    comm_model='logarithmic',
)

In [26]:
# Communication time (sec) under the 'linear' model, computed column-wise.
df['linear_communication_time_cost'] = communication_time_cost(
    num_clients=df['num_clients'],
    total_communication_bytes=df['total_communication_bytes'],
    comm_model='linear',
)

In [27]:
# Total time = CPU time + communication time (logarithmic model).
df['logarithmic_time_cost'] = df['cpu_time_cost'].add(df['logarithmic_communication_time_cost'])

In [28]:
# Total time = CPU time + communication time (linear model).
df['linear_time_cost'] = df['cpu_time_cost'].add(df['linear_communication_time_cost'])

In [29]:
# Communication time relative to CPU time (logarithmic model).
df['logarithmic_comm_cpu_time_ratio'] = df['logarithmic_communication_time_cost'].div(df['cpu_time_cost'])

In [30]:
# Communication time relative to CPU time (linear model).
df['linear_comm_cpu_time_ratio'] = df['linear_communication_time_cost'].div(df['cpu_time_cost'])

# Plots about cost

In [31]:
# One style string per FDA method (these look like matplotlib fmt strings: marker, line, colour).
fda_styles = dict(
    naive='o-r',
    linear='s-g',
    sketch='^-b',
    synchronous='x-c',
)

# Method names present in the data, in sorted order.
fda_names = list(df['fda_name'].unique())
fda_names.sort()

In [32]:
import matplotlib

# One colour per distinct client count, sampled at evenly spaced points of the 'tab20b' colormap.
num_clients_values = sorted(df['num_clients'].unique())
cmap = matplotlib.colormaps['tab20b']
palette = cmap(np.linspace(0, 1, len(num_clients_values)))
colors_dict = dict(zip(num_clients_values, palette))

## Total time cost with accuracy (scatter)

In [33]:
def scatter_time_cost(df, filename):
    """Scatter accuracy against total time cost per FDA method and save the figure as a one-page PDF.

    The left panel uses `linear_time_cost`, the right panel `logarithmic_time_cost`.
    Each panel carries a text box listing the mean time cost of every method.

    Parameters
    ----------
    df : DataFrame with `fda_name`, `accuracy`, `linear_time_cost` and
        `logarithmic_time_cost` columns.
    filename : path of the PDF file to write.

    Notes
    -----
    The methods drawn come from the notebook-level `fda_names` list, not from `df`,
    so the legend order is the same in every figure.
    """
    # (cost column, panel title) for the left and right panel
    panels = (
        ('linear_time_cost', "Linear Contention Communication Model"),
        ('logarithmic_time_cost', "Logarithmic Communication Model"),
    )

    fig, axs = plt.subplots(1, 2, figsize=(20, 6))
    try:
        for ax, (cost_column, panel_title) in zip(axs, panels):
            avg_info = []

            for fda_name in fda_names:
                fda_filtered_data = df[df['fda_name'] == fda_name]
                ax.scatter(fda_filtered_data[cost_column], fda_filtered_data['accuracy'], label=fda_name)

                avg_time = fda_filtered_data[cost_column].mean()
                avg_info.append(f'{fda_name}: {avg_time:.2f} sec')

            # Text box with the per-method averages, placed inside the panel
            ax.text(0.65, 0.97, "Average Time Cost:\n" + '\n'.join(avg_info), transform=ax.transAxes,
                    fontsize=9, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.4))

            ax.set_xlabel('Time Cost (sec)')
            ax.set_ylabel('Accuracy')
            ax.legend()
            ax.set_title(panel_title)
            ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)

        fig.tight_layout()

        # The context manager closes the PDF even if saving fails
        with PdfPages(filename) as pdf:
            pdf.savefig(fig)
    finally:
        # Always close the figure: frees it and keeps it from being displayed in the notebook
        plt.close(fig)

## Total Communication cost (in gb) with accuracy (scatter)

In [34]:
def scatter_communication_cost(df, filename):
    """Scatter accuracy against total communication (GB) per FDA method and save it as a one-page PDF.

    A text box inside the plot lists the mean communication of every method.

    Parameters
    ----------
    df : DataFrame with `fda_name`, `accuracy` and `total_communication_gb` columns.
    filename : path of the PDF file to write.

    Notes
    -----
    The methods drawn come from the notebook-level `fda_names` list.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        avg_info = []

        for fda_name in fda_names:
            fda_filtered_data = df[df['fda_name'] == fda_name]
            ax.scatter(fda_filtered_data['total_communication_gb'], fda_filtered_data['accuracy'], label=fda_name)

            avg_communication = fda_filtered_data['total_communication_gb'].mean()
            avg_info.append(f'{fda_name}: {avg_communication:.2f} GB')

        # Text box with the per-method averages, placed inside the plot
        text = "Average Communication:\n" + '\n'.join(avg_info)
        ax.text(0.65, 0.97, text, transform=ax.transAxes, fontsize=9, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.4))

        ax.set_xlabel('Communication (GB)')
        ax.set_ylabel('Accuracy')
        ax.legend()
        ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)

        fig.tight_layout()

        # The context manager closes the PDF even if saving fails
        with PdfPages(filename) as pdf:
            pdf.savefig(fig)
    finally:
        # Always close the figure: frees it and keeps it from being displayed in the notebook
        plt.close(fig)


## Total CPU time (in Seconds) with accuracy (scatter)

In [35]:
def scatter_cpu_time_cost(df, filename):
    """Scatter accuracy against CPU time cost (sec) per FDA method and save it as a one-page PDF.

    A text box inside the plot lists the mean CPU time of every method.

    Parameters
    ----------
    df : DataFrame with `fda_name`, `accuracy` and `cpu_time_cost` columns.
    filename : path of the PDF file to write.

    Notes
    -----
    The methods drawn come from the notebook-level `fda_names` list.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        avg_info = []

        for fda_name in fda_names:
            fda_filtered_data = df[df['fda_name'] == fda_name]
            ax.scatter(fda_filtered_data['cpu_time_cost'], fda_filtered_data['accuracy'], label=fda_name)

            avg_cpu_time = fda_filtered_data['cpu_time_cost'].mean()
            avg_info.append(f'{fda_name}: {avg_cpu_time:.2f} sec')

        # Text box with the per-method averages, placed inside the plot
        text = "Average CPU time:\n" + '\n'.join(avg_info)
        ax.text(0.65, 0.97, text, transform=ax.transAxes, fontsize=9, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.4))

        ax.set_xlabel('CPU time cost (sec)')
        ax.set_ylabel('Accuracy')
        ax.legend()
        ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)

        fig.tight_layout()

        # The context manager closes the PDF even if saving fails
        with PdfPages(filename) as pdf:
            pdf.savefig(fig)
    finally:
        # Always close the figure: frees it and keeps it from being displayed in the notebook
        plt.close(fig)

## Communication/CPU time - Accuracy (scatter)

In [36]:
def scatter_time_cost_cpu_ratio(df, filename):
    """Scatter accuracy against the communication/CPU time ratio per FDA method; save as a one-page PDF.

    The left panel uses `linear_comm_cpu_time_ratio`, the right panel
    `logarithmic_comm_cpu_time_ratio`.

    Parameters
    ----------
    df : DataFrame with `fda_name`, `accuracy` and the two ratio columns.
    filename : path of the PDF file to write.

    Notes
    -----
    The methods drawn come from the notebook-level `fda_names` list.
    """
    # (ratio column, panel title) for the left and right panel
    panels = (
        ('linear_comm_cpu_time_ratio', "Linear Contention Communication Model"),
        ('logarithmic_comm_cpu_time_ratio', "Logarithmic Communication Model"),
    )

    fig, axs = plt.subplots(1, 2, figsize=(20, 6))
    try:
        for ax, (ratio_column, panel_title) in zip(axs, panels):
            for fda_name in fda_names:
                fda_filtered_data = df[df['fda_name'] == fda_name]
                ax.scatter(fda_filtered_data[ratio_column], fda_filtered_data['accuracy'], label=fda_name)

            ax.set_xlabel('(Communication time) / (CPU time)')
            ax.set_ylabel('Accuracy')
            ax.legend()
            ax.set_title(panel_title)
            ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)

        fig.tight_layout()

        # The context manager closes the PDF even if saving fails
        with PdfPages(filename) as pdf:
            pdf.savefig(fig)
    finally:
        # Always close the figure: frees it and keeps it from being displayed in the notebook
        plt.close(fig)

## Different FDA-method runs visualization with lines (per clients)

In [63]:
import matplotlib
from matplotlib import cm

def fda_method_run_line_plot(df, fda_name, filename, limit_x_axis=False):
    """Plot accuracy over time cost for one FDA method, one PDF page per (batch size, theta).

    Every page has two panels (`linear_time_cost` left, `logarithmic_time_cost` right)
    with one line per number of clients, coloured through the notebook-level `colors_dict`.

    Parameters
    ----------
    df : DataFrame with `fda_name`, `batch_size`, `theta`, `num_clients`, `accuracy`,
        `linear_time_cost` and `logarithmic_time_cost` columns.
    fda_name : the method whose rows are plotted.
    filename : path of the PDF file to write.
    limit_x_axis : when True, cap each x-axis slightly beyond the largest time cost
        among rows whose `num_clients` is not 2.
    """
    method_df = df[df.fda_name == fda_name]

    batch_size_values = sorted(method_df['batch_size'].unique())
    theta_values = sorted(method_df['theta'].unique())

    # (cost column, panel title, margin added to the x-limit when `limit_x_axis` is set)
    panels = (
        ('linear_time_cost', "Linear Contention Communication Model", 700),
        ('logarithmic_time_cost', "Logarithmic Communication Model", 500),
    )

    # The context manager closes the PDF even if a page fails to render
    with PdfPages(filename) as pdf:
        for batch_size in batch_size_values:
            for theta in theta_values:
                filtered_data = method_df[(method_df['theta'] == theta) &
                                          (method_df['batch_size'] == batch_size)]

                if filtered_data.empty:
                    continue

                num_clients_values = sorted(filtered_data['num_clients'].unique())

                # Because num_clients = 2 reaches very good accuracy very early, its rows are
                # left out when choosing the x-limit so the rest of the data stays readable.
                without_two_clients = filtered_data[filtered_data['num_clients'] != 2]

                fig, axs = plt.subplots(1, 2, figsize=(20, 6))
                try:
                    for ax, (cost_column, panel_title, x_margin) in zip(axs, panels):
                        # One line per client count, each with its own colour
                        for num_clients in num_clients_values:
                            data = filtered_data[filtered_data['num_clients'] == num_clients]
                            ax.plot(data[cost_column], data['accuracy'], color=colors_dict[num_clients],
                                    label=num_clients, marker='o', markersize=3)

                        ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)
                        ax.set_xlabel('Time Cost')
                        ax.set_ylabel('Accuracy')
                        ax.set_title(panel_title)
                        ax.legend(title='Num Clients')

                        if limit_x_axis:
                            max_cost = without_two_clients[cost_column].max()
                            # max() is NaN when only num_clients == 2 rows exist; set_xlim
                            # rejects NaN limits, so keep the automatic limits in that case.
                            if np.isfinite(max_cost):
                                ax.set_xlim(0, max_cost + x_margin)

                    # Escaped backslash: '\T' is not a valid escape sequence in a normal string
                    fig.suptitle(f'Batch Size : {batch_size} , $\\Theta$ : {theta}')

                    fig.tight_layout()
                    pdf.savefig(fig)
                finally:
                    # Free the figure and keep it from being displayed in the notebook
                    plt.close(fig)

## Save all those time-cost plots

In [64]:
import os

def time_cost_plots(df, acc_threshold, nn_name, limit_x_axis=False):
    """Write all time-cost PDFs for one network and one accuracy threshold.

    The files go to ``../../metrics/plots/{nn_name}/hybrid/{threshold}``, where the
    threshold has its '.' replaced by '_'. The directory is created if missing.

    Parameters
    ----------
    df : DataFrame with the metric columns computed earlier in the notebook.
    acc_threshold : only rows with `accuracy` strictly above this value are used.
    nn_name : only rows with this `nn_name` are used.
    limit_x_axis : forwarded to `fda_method_run_line_plot` for every method
        except 'synchronous', which is always plotted without a limit.
    """
    # Filter out based on `acc_threshold`.
    # reset_index gives every row a unique label: `idxmin` below returns labels, and
    # `.loc` with a repeated label would return every row that shares it.
    acceptable_acc_df = df[(df.accuracy > acc_threshold) & (df.nn_name == nn_name)].reset_index(drop=True)

    str_thresh = str(acc_threshold).replace('.', '_')  # replace '.'

    out_dir = f"../../metrics/plots/{nn_name}/hybrid/{str_thresh}"
    os.makedirs(out_dir, exist_ok=True)

    # Same runs are included
    scatter_time_cost(acceptable_acc_df, f"{out_dir}/nonfiltered_scatter_time_cost.pdf")
    scatter_time_cost_cpu_ratio(acceptable_acc_df, f"{out_dir}/nonfiltered_scatter_time_cost_cpu_ratio.pdf")

    # Filter out same runs. We choose the instance which first hits the `acc_threshold`
    # NOTE(review): these keys omit `dataset_name` and `num_steps_until_rtc_check`, which the
    # `epoch_rounds` cell uses to tell runs apart -- confirm they are constant in this data.
    idx = acceptable_acc_df.groupby(['fda_name', 'num_clients', 'batch_size', 'theta'])['epoch'].idxmin()
    filtered_acceptable_acc_df = acceptable_acc_df.loc[idx]

    # Same runs are NOT included
    scatter_time_cost(filtered_acceptable_acc_df, f"{out_dir}/filtered_scatter_time_cost.pdf")
    scatter_time_cost_cpu_ratio(filtered_acceptable_acc_df, f"{out_dir}/filtered_scatter_time_cost_cpu_ratio.pdf")
    scatter_communication_cost(filtered_acceptable_acc_df, f"{out_dir}/filtered_scatter_communication_cost.pdf")
    scatter_cpu_time_cost(filtered_acceptable_acc_df, f"{out_dir}/filtered_scatter_cpu_time_cost.pdf")

    # Plot the runs of each method. x-axis : time cost, y-axis : accuracy, PER number of clients (lines)
    x_limit_by_method = {
        'sketch': limit_x_axis,
        'naive': limit_x_axis,
        'linear': limit_x_axis,
        'synchronous': False,
    }
    for method, method_limit_x_axis in x_limit_by_method.items():
        fda_method_run_line_plot(acceptable_acc_df, method, f"{out_dir}/{method}_run.pdf",
                                 limit_x_axis=method_limit_x_axis)
                             

In [58]:
time_cost_plots(df, acc_threshold=0.95, nn_name='LeNet-5')

In [42]:
time_cost_plots(df, acc_threshold=0.97, nn_name='LeNet-5')

In [43]:
time_cost_plots(df, acc_threshold=0.96, nn_name='LeNet-5')

In [44]:
time_cost_plots(df, acc_threshold=0.975, nn_name='LeNet-5')

In [45]:
time_cost_plots(df, acc_threshold=0.98, nn_name='LeNet-5')

In [46]:
time_cost_plots(df, acc_threshold=0.95, nn_name='AdvancedCNN')

In [65]:
time_cost_plots(df, acc_threshold=0.96, nn_name='AdvancedCNN', limit_x_axis=True)

In [48]:
time_cost_plots(df, acc_threshold=0.97, nn_name='AdvancedCNN')

In [49]:
time_cost_plots(df, acc_threshold=0.98, nn_name='AdvancedCNN')

In [50]:
time_cost_plots(df, acc_threshold=0.984, nn_name='AdvancedCNN')

In [51]:
time_cost_plots(df, acc_threshold=0.985, nn_name='AdvancedCNN')