In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
import os

# Apply seaborn's default theme, then switch the axes style to "ticks".
sns.set()
sns.set_style("ticks")
# Alternative axes style, left disabled:
# sns.axes_style("whitegrid")

In [2]:
# Every figure below is saved under plots/; create it if it is missing.
os.makedirs(name='plots', exist_ok=True)

In [14]:
# Load the multi-node scaling measurements, ordered by GPU count so the
# curves in the plots below are drawn left to right. sort_values returns a
# new frame, so re-running this cell always starts from the file contents.
df = pd.read_csv("scaling/scaling_multinode_realdata_torch.csv").sort_values('num_gpu')
epoch_size = 3584  # samples per epoch (the throughput axis is labelled "samples per sec")

# Measured throughput: samples processed per unit of train_time_per_epoch.
df['throughput'] = epoch_size / df['train_time_per_epoch']

# Ideal (linear) scaling extrapolates the single-GPU throughput, so a
# num_gpu == 1 row is required; fail with a clear message if it is missing.
single_gpu_throughput = df.loc[df['num_gpu'] == 1, 'throughput']
if single_gpu_throughput.empty:
    raise ValueError(
        "scaling CSV has no row with num_gpu == 1; "
        "cannot compute the ideal scaling baseline"
    )
df['ideal'] = df['num_gpu'] * single_gpu_throughput.iloc[0]

# Scaling efficiency: fraction of the ideal throughput actually achieved.
df['efficiency'] = df['throughput'] / df['ideal']

In [24]:
# Figure: measured vs. ideal throughput as a function of GPU count.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(
    df['num_gpu'],
    epoch_size / df['train_time_per_epoch'],
    '-o',
    label='measured',
)
ax.plot(df['num_gpu'], df['ideal'], '--', label='ideal')
ax.set_xlabel('Number of GPUs')
ax.set_ylabel('Throughput (samples per sec)')
# Fixed tick positions: 0, 500, 1000, 1500, 2000.
ax.set_yticks(np.linspace(0, 2000, 5))
ax.legend()
# Scientific notation on the y axis regardless of magnitude.
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
fig.tight_layout()
fig.savefig('plots/scaling_throughput_vs_gpu.pdf')
plt.close(fig)

# Figure: achieved fraction of ideal scaling as a function of GPU count.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(df['num_gpu'], df['efficiency'], '-o', label='measured')
# Reference line at 1.0 (perfect scaling), one point per row of df.
ax.plot(df['num_gpu'], np.ones(len(df)), '--', label='ideal')
ax.set_xlabel('Number of GPUs')
ax.set_ylabel('Fraction of ideal scaling')
ax.legend(loc='upper right')
fig.tight_layout()
fig.savefig('plots/scaling_ratio_vs_gpu.pdf')
plt.close(fig)

In [20]:
# Scratch check: the tick positions passed to plt.yticks in the throughput plot.
np.linspace(0, 2000, 5)

array([   0.,  500., 1000., 1500., 2000.])

In [25]:
def p10(x):
    """Return the 10th percentile of ``x``."""
    return np.percentile(x, q=10)
def p50(x):
    """Return the 50th percentile (median) of ``x``."""
    return np.percentile(x, q=50)
def p90(x):
    """Return the 90th percentile of ``x``."""
    return np.percentile(x, q=90)

def plot_df_with_interval(df, y, label=None):
    """Plot the median loss with a 10th-90th percentile band on the current axes.

    Rows are grouped into 50 quantile bins of the ``epoch`` column. Within
    each bin the 10th, 50th and 90th percentiles of ``loss`` and the mean of
    ``wall_time`` are computed; the median is drawn as a line and the
    10th-90th range as a translucent band.

    Parameters
    ----------
    df : pandas.DataFrame
        Training curve with ``epoch``, ``loss`` and ``wall_time`` columns.
        The frame is not modified.
    y : str
        Column of the aggregated frame used for the x axis: ``'epoch_bin'``
        or ``'wall_time_mean'``.
    label : str, optional
        Legend label for the median line. When omitted, the label is built
        from the notebook-level variable ``n`` (the GPU count of the
        enclosing loop), which keeps existing calls working unchanged.
    """
    if label is None:
        # Backward-compatible fallback: existing callers do not pass a label
        # and rely on the surrounding loop variable `n`.
        label = f'{n} gpu' + f"{'s' if n > 1 else ''}"

    # Bin on a copy so the caller's frame is left untouched. The bin labels
    # are 0.5, 2.5, ..., 98.5 -- presumably bin centres for a 100-epoch run;
    # confirm if the run length changes. Building the float column directly
    # avoids writing through `.loc[:, col]` into an existing categorical
    # column, which on recent pandas versions is set in place and may keep
    # the categorical dtype.
    epoch_bin = pd.qcut(
        df['epoch'], 50, labels=np.linspace(0.5, 98.5, 50)
    ).astype('float')
    binned = df.assign(epoch_bin=epoch_bin)

    df_group = binned.groupby(['epoch_bin']).agg(
        loss_p10=pd.NamedAgg(column='loss', aggfunc=p10),
        loss_p50=pd.NamedAgg(column='loss', aggfunc=p50),
        loss_p90=pd.NamedAgg(column='loss', aggfunc=p90),
        wall_time_mean=pd.NamedAgg(column='wall_time', aggfunc='mean'),
    ).reset_index()

    plt.plot(df_group[y], df_group['loss_p50'], label=label)
    plt.fill_between(
        df_group[y],
        df_group['loss_p10'],
        df_group['loss_p90'],
        alpha=0.3,
    )
    if y == 'wall_time_mean':
        # Scientific notation on the x axis for the wall-time plot.
        plt.ticklabel_format(axis="x", style="sci", scilimits=(0, 0))

# GPU counts to compare and the upper y-limit shared by both loss plots.
ngpus = [1, 2, 16, 128]
losslim = 0.10

# One training-curve CSV per GPU count. Every run but the last lives in an
# lr1e-2 directory; the 128-GPU run is read from the lr4e-2 directory.
csvs = [f'log/scaling{n}gpu_lr1e-2/train_curve.csv' for n in ngpus[:-1]]
csvs.append('log/scaling128gpu_lr4e-2/train_curve.csv')

# Figure: loss versus epoch, one median curve with percentile band per GPU count.
fig, axes = plt.subplots(figsize=(4, 4))
for path, n in zip(csvs, ngpus):
    # `n` must remain the loop variable name: plot_df_with_interval reads it
    # from the notebook namespace to build the legend label.
    df = pd.read_csv(path)
    plot_df_with_interval(df, 'epoch_bin')
axes.set(ylim=[0, losslim], xlabel='Epochs', ylabel='Loss')
axes.legend()
fig.tight_layout()
fig.savefig('plots/scaling_loss_vs_epoch.pdf')
plt.close(fig)

# Figure: loss versus mean wall time, one median curve with percentile band
# per GPU count.
fig = plt.figure(figsize=(4, 4))
for idx, n in enumerate(ngpus):
    # Read from `csvs` so this figure uses the same runs as the loss-vs-epoch
    # figure, including the lr4e-2 override for the 128-GPU run. The path was
    # previously rebuilt from `n` with lr1e-2 and `idx` was unused.
    # `n` must remain the loop variable name: plot_df_with_interval reads it
    # from the notebook namespace to build the legend label.
    df = pd.read_csv(csvs[idx])
    plot_df_with_interval(df, 'wall_time_mean')
axes = plt.gca()
axes.set_ylim([0, losslim])
axes.set_xlabel('Wall Time (s)')
axes.set_ylabel('Loss')
plt.legend()
plt.tight_layout()
plt.savefig('plots/scaling_loss_vs_time.pdf')
plt.close(fig)