In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
from IPython.display import display, HTML
from scipy.stats.mstats import gmean
from pprint import pprint
import matplotlib.pyplot as plt
import os, re, sys, glob

# Only warn about too many open figures once more than 50 exist.
plt.rcParams.update({'figure.max_open_warning': 50})
# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Base directory for every data path below: two levels above the current working directory.
PM_HOME = "{}/../../".format(os.getcwd())

In [None]:
# GPU type to analyse; it appears as a directory name under <PM_HOME>/data/.
GPU_NAME = "GV100"
gpus = [GPU_NAME]
# Number of GPUs; it is the leading field of every log file name.
GPU_COUNT = 4

# Batch sizes swept for the DLRM runs: 512, 1024, 2048, 4096.
batch_sizes = [512 << step for step in range(4)]
# Batch sizes swept for the NLP runs: 8, 16, 32, 64.
nlp_batch_sizes = [8 << step for step in range(4)]
# Suffix embedded in the prediction log file names.
trimmed_iters = 30

### Multi-GPU DLRM E2E Plot

In [None]:
# Automatically get errors from logs
#
# Results:
#   errors[gpu][task]      -> one record per batch size that has a measurement log:
#                             [shared_e2e, active, baseline, active_baseline, (e2e)]
#                             (the trailing e2e entry exists only when the non-shared
#                             prediction log is present)
#   actual_time[gpu][task] -> measured per-batch training times, in ms
errors = {}
actual_time = {}
for gpu in gpus:
    errors[gpu] = {}
    actual_time[gpu] = {}
    pattern = '{}/data/{}/e2e/DLRM_open_source/*/*/f/size_lookup_greedy/barrier_bucketed_allreduce/25'.format(PM_HOME, gpu)
    # glob returns entries in filesystem order; sort so the task order (and the
    # Task_<i> labels derived from it when plotting) is reproducible.
    run_dirs = sorted(d for d in glob.glob(pattern) if os.path.isdir(d))
    for prefix in run_dirs:
        parts = prefix.split('/')
        # The two wildcard directory levels of the pattern identify the task.
        task = "{},{}".format(parts[-6], parts[-5])
        task_errors = errors[gpu].setdefault(task, [])
        task_times = actual_time[gpu].setdefault(task, [])
        for batch_size in batch_sizes:
            log_file = '{}/{}_{}_distributed.log'.format(prefix, GPU_COUNT, batch_size)
            if not os.path.exists(log_file):
                continue  # no measurement log for this batch size
            with open(log_file, 'r') as log:
                for line in log:
                    if re.search("Overall per-batch training time", line):
                        task_times.append(float(line.split(' ')[4])) # In ms

            # A measured run must come with its shared prediction log.
            shared_prediction_file = '{}/{}_{}_distributed_prediction_{}_shared.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            assert os.path.exists(shared_prediction_file), "{} does not exist".format(shared_prediction_file)
            with open(shared_prediction_file, 'r') as log:
                for line in log:
                    tokens = line.split(' ')
                    if re.search("Prediction error:", line):
                        shared_e2e_error = float(tokens[2].rstrip(',%\n'))
                        active_error = float(tokens[3].rstrip(',%\n'))
                        task_errors.append([shared_e2e_error, active_error])
                    if re.search("Baseline error:", line):
                        baseline_shared_e2e_error = float(tokens[2].rstrip(',%\n'))
                        baseline_active_error = float(tokens[3].rstrip(',%\n'))
                        # Extends the record started by the preceding "Prediction error:" line.
                        task_errors[-1].extend([baseline_shared_e2e_error, baseline_active_error])

            # The non-shared prediction log is optional.
            prediction_file = '{}/{}_{}_distributed_prediction_{}.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            if os.path.exists(prediction_file):
                with open(prediction_file, 'r') as log:
                    for line in log:
                        if re.search("Prediction error:", line):
                            e2e_error = float(line.split(' ')[2].rstrip(',%\n'))
                            task_errors[-1].extend([e2e_error])

In [None]:
# Overall
def _abs_column(per_task_records, col):
    """Return abs(record[col]) for every record of every task, in dict order."""
    return [abs(rec[col]) for recs in per_task_records.values() for rec in recs]


def _summary(values):
    """Return (geometric mean, min, max) of `values`."""
    return gmean(values), min(values), max(values)


# Record columns used here: 0 = shared E2E error, 1 = active error,
# 2 = baseline shared E2E error.
error_shared_total = {device: _abs_column(errors[device], 0) for device in errors}
error_active = {device: _abs_column(errors[device], 1) for device in errors}
error_baseline_shared_total = {device: _abs_column(errors[device], 2) for device in errors}

# Same values flattened over all devices.
overall_shared_total = [v for device in errors for v in error_shared_total[device]]
overall_active = [v for device in errors for v in error_active[device]]
overall_baseline_shared_total = [v for device in errors for v in error_baseline_shared_total[device]]

# For Latex
# LaTeX needs a literal "\%". The backslash is doubled because "\%" is not a
# valid Python escape sequence; the printed text is unchanged.
_CELLS = "{:.2f}\\% & {:.2f}\\% & {:.2f}\\%"
s1 = [("Shared E2E & \\bf" + _CELLS).format(*_summary(overall_shared_total))]
s2 = [("Active & \\bf" + _CELLS).format(*_summary(overall_active))]
s3 = [("Baseline E2E & \\bf" + _CELLS).format(*_summary(overall_baseline_shared_total))]
# One extra (gmean, min, max) column group per device.
for device in error_active.keys():
    s1.append(_CELLS.format(*_summary(error_shared_total[device])))
    s2.append(_CELLS.format(*_summary(error_active[device])))
    s3.append(_CELLS.format(*_summary(error_baseline_shared_total[device])))
print(' & '.join(s1) + "\\\\")
print(' & '.join(s2) + "\\\\")
print(' & '.join(s3) + "\\\\")

In [None]:
from matplotlib.gridspec import GridSpec
w = 0.5        # bar width
task_gap = 6   # spacing unit between task groups along x
for device, error in errors.items():
    yticks1 = [-20, -10, 0, 10, 20]    # ax1: error axis, upper panel
    yticks2 = [-70, -50]               # ax2: error axis, lower panel
    yticks3 = [6, 8.5, 11, 13.5, 16]   # ax3: time axis (twin of ax1)
    yticks4 = [0, 2.5]                 # ax4: time axis (twin of ax2)

    # Broken-axis layout: a tall upper panel over a short lower panel, each
    # with a twin y-axis on the right for the measured iteration time.
    fig = plt.figure(figsize=(15, 6))
    gs = GridSpec(2, 2, height_ratios=[4, 1])
    ax1 = fig.add_subplot(gs.new_subplotspec((0, 0), colspan=2))
    ax2 = fig.add_subplot(gs.new_subplotspec((1, 0), colspan=2))
    ax3 = ax1.twinx()
    ax4 = ax2.twinx()

    # Hide the spines that face each other so the two panels read as one axis.
    for upper in (ax1, ax3):
        upper.spines['bottom'].set_visible(False)
        upper.set_xticks([])
    for lower in (ax2, ax4):
        lower.spines['top'].set_visible(False)
        # Must be a real boolean: the legacy string "off" is truthy and would
        # switch the top labels on in current matplotlib.
        lower.tick_params(labeltop=False)
        lower.xaxis.tick_bottom()

    e0 = [] # shared E2E prediction error per (task, batch size)
    eb = [] # baseline error per (task, batch size)
    x0 = [] # bar positions for e0
    xb = [] # bar positions for eb
    batch_width = 0
    tasks = []
    xticks = []
    for task, records in error.items():
        if not records:
            # Nothing was parsed for this task; skipping avoids a bogus tick
            # (or an IndexError when it is the first task).
            continue
        tasks.append(task)
        xtt = []  # x positions of the measured-time markers of this task
        batch_width += 2
        base = task_gap * batch_width - 2
        batch_gap = 0
        first = -1
        for idx, ee in enumerate(records):
            e0.append(ee[0])
            eb.append(ee[2])
            x0.append(base + batch_gap - w * 2)
            xb.append(base + batch_gap - w)
            if idx == 0:
                first = base + batch_gap
            batch_gap += 2
            xtt.append(base + batch_gap - w * 4)
        # Line2D handles of the measured-time curve; the last one feeds the legend.
        uhhh = ax3.plot(xtt, actual_time[device][task], color=plt.get_cmap("tab10")(1), marker="s")
        last = xb[-1]
        xticks.append((first + last) / 2)

    ax1.bar(x0, e0, width=w, color=plt.get_cmap("Blues")(180), label='Prediction')
    ax1.bar(xb, eb, width=w, color=plt.get_cmap("Greys")(120), label='Baseline')
    # Only the baseline bars are repeated in the lower panel.
    ax2.bar(xb, eb, width=w, color=plt.get_cmap("Greys")(120), label='Baseline')

    x_min = min(x0) - task_gap
    x_max = max(xb) + task_gap
    axis_break1 = yticks2[-1] + w*4   # top edge of the lower panel
    axis_break2 = yticks1[0] - w*16   # bottom edge of the upper panel
    for axis in (ax1, ax2, ax3, ax4):
        axis.set_xlim(x_min, x_max)
    ax1.set_ylim(axis_break2, yticks1[-1]+w*16)
    ax2.set_ylim(yticks2[0]-w*4, axis_break1)
    ax3.set_ylim(yticks3[0]-w*4, yticks3[-1]+w*4)
    ax4.set_ylim(yticks4[0]-w, yticks4[-1]+w)

    ax1.set_yticks(yticks1)
    ax2.set_yticks(yticks2)
    ax2.set_xticks(xticks)
    ax2.set_xticklabels(["Task_{}".format(x) for x in range(len(tasks))], fontsize=12, rotation=25)
    ax3.set_yticks(yticks3)
    ax4.set_yticks(yticks4)

    ax1.set_title(device, fontsize=15)
    ax1.grid(axis='y')

    # Bars whose value falls in the gap between the two panels are cut off in
    # both, so print their value; alternate the vertical offset between labels.
    sign = 1
    for bar_xs, bar_vals in ((x0, e0), (xb, eb)):
        for bar_x, bar_val in zip(bar_xs, bar_vals):
            if axis_break1 < bar_val < axis_break2:
                ax1.text(bar_x+0.3, yticks1[0]+3*sign, "{:.1f}%".format(bar_val), horizontalalignment='center', verticalalignment='center', size=12)
                sign = -sign

    # Short horizontal marks at the panel edges indicate the axis break.
    break_len = 2  # "break" line length
    kwargs = dict(color="k", clip_on=False, linewidth=1)
    ax1.plot((x_min - break_len, x_min + break_len), (axis_break2, axis_break2), **kwargs)# top-left
    ax1.plot((x_max - break_len, x_max + break_len), (axis_break2, axis_break2), **kwargs)# top-right
    ax2.plot((x_min - break_len, x_min + break_len), (axis_break1, axis_break1), **kwargs)# bottom-left
    ax2.plot((x_max - break_len, x_max + break_len), (axis_break1, axis_break1), **kwargs)# bottom-right

    handles, labels = ax1.get_legend_handles_labels()
    handles += uhhh
    labels += ["Actual Time"]
    fig.legend(handles, labels, loc=[0.7, 0.88], ncol=len(labels))
    fig.text(-0.01, 0.5, 'Prediction Error', va='center', rotation=90, fontsize=14)
    fig.text(0.99, 0.5, 'Iteration Time (ms)', va='center', rotation=-90, fontsize=14)
    plt.tight_layout()
    # NOTE(review): the file names do not include `device`, so with more than
    # one device each iteration overwrites the previous figure.
    plt.savefig('{}/data/multi_gpu_e2e.pdf'.format(PM_HOME), bbox_inches='tight')
    plt.savefig('{}/data/multi_gpu_e2e.png'.format(PM_HOME), bbox_inches='tight')

In [None]:
# [(idx, t) for idx, t in enumerate(tasks)]

### NLP

In [None]:
# Automatically get errors from logs
#
# Results:
#   other_errors[gpu][model] -> one record per batch size that has a measurement log:
#                               [shared_e2e, active, baseline, active_baseline, (e2e)]
#   other_time[gpu][model]   -> measured per-batch training times, in ms
other_errors = {}
other_time = {}
models = ["bert", "gpt2"]
for gpu in gpus:
    other_errors[gpu] = {}
    other_time[gpu] = {}
    for model in models:
        prefix = '{}/data/{}/e2e/{}/barrier_bucketed_allreduce/25'.format(PM_HOME, gpu, model)
        # Initialise this cell's own containers; the membership test must not
        # look at the DLRM dictionaries built by an earlier cell.
        model_errors = other_errors[gpu].setdefault(model, [])
        model_times = other_time[gpu].setdefault(model, [])
        for batch_size in nlp_batch_sizes:
            if (model == "bert" and batch_size == 64): # OOM
                continue
            log_file = '{}/{}_{}_distributed.log'.format(prefix, GPU_COUNT, batch_size)
            if not os.path.exists(log_file):
                continue  # no measurement log for this batch size
            with open(log_file, 'r') as log:
                for line in log:
                    if re.search("Overall per-batch training time", line):
                        model_times.append(float(line.split(' ')[4])) # In ms
                        break  # only the first reported time is used

            # A measured run must come with its shared prediction log.
            shared_prediction_file = '{}/{}_{}_distributed_prediction_{}_shared.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            assert os.path.exists(shared_prediction_file), "{} does not exist".format(shared_prediction_file)
            with open(shared_prediction_file, 'r') as log:
                for line in log:
                    tokens = line.split(' ')
                    if re.search("Prediction error:", line):
                        shared_e2e_error = float(tokens[2].rstrip(',%\n'))
                        active_error = float(tokens[3].rstrip(',%\n'))
                        model_errors.append([shared_e2e_error, active_error])
                    if re.search("Baseline error:", line):
                        baseline_shared_e2e_error = float(tokens[2].rstrip(',%\n'))
                        baseline_active_error = float(tokens[3].rstrip(',%\n'))
                        # Extends the record started by the preceding "Prediction error:" line.
                        model_errors[-1].extend([baseline_shared_e2e_error, baseline_active_error])

            # The non-shared prediction log is optional.
            prediction_file = '{}/{}_{}_distributed_prediction_{}.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            if os.path.exists(prediction_file):
                with open(prediction_file, 'r') as log:
                    for line in log:
                        if re.search("Prediction error:", line):
                            # NOTE(review): token 3 is read here, while the DLRM cell reads
                            # token 2 from the same kind of file - confirm which is intended.
                            e2e_error = float(line.split(' ')[3].rstrip(',%\n'))
                            model_errors[-1].append(e2e_error)

In [None]:
fig = plt.figure(figsize=(10, 4))
ax = fig.gca()
model_gap = 3.2
# One tick label per bar pair: all batch sizes but the last, then all batch sizes.
# NOTE(review): this hard-codes "bert without its largest batch size, then gpt2";
# it stops matching the ticks if any other log is missing - confirm.
nlp_xticklabels = nlp_batch_sizes[:-1] + nlp_batch_sizes
nlp_yticks1 = [-30, -20, -10, 0, 10]         # left axis: prediction error
nlp_yticks2 = [200, 400, 600, 800, 1000]     # right axis: iteration time
yticklabels = ["{:.0f}%".format(x) for x in nlp_yticks1]
# Legend handle for the measured-time curve of THIS figure (filled in below).
time_handles = []
for device, error in other_errors.items():
    ax2 = ax.twinx()
    e1 = []  # shared E2E prediction errors
    e2 = []  # baseline errors
    x1 = []  # bar positions for e1
    x2 = []  # bar positions for e2
    c = 0
    for model, e in error.items():
        xtt = []  # x positions of the measured-time markers of this model
        c += 0.5
        cc = 0
        for ee in e:
            e1.append(ee[0])
            e2.append(ee[2])
            x1.append(model_gap * c - 2 + cc - 0.1)
            x2.append(model_gap * c - 2 + cc)
            cc += 0.5
            xtt.append(model_gap * c - 2 + cc - 0.5)

        # Model name above the middle of its bar group.
        ax.text((xtt[0] + xtt[-1]) / 2, nlp_yticks1[-1]+2, model, fontsize=12, transform=ax.transData)
        time_handles = ax2.plot(xtt, other_time[device][model], color=plt.get_cmap("tab10")(1), marker="s")

    ax.bar(x1, e1, width=0.1, color=plt.get_cmap("Blues")(180), label="Prediction")
    ax.bar(x2, e2, width=0.1, color=plt.get_cmap("Greys")(120), label="Baseline")
    ax.grid(axis='y')
    ax.set_xticks([(x+y)/2 for x,y in zip(x1,x2)])
    ax.set_xticklabels(nlp_xticklabels, fontsize=12)
    ax.set_title(device, fontsize=15)
    ax.set_yticks(nlp_yticks1)
    ax.set_yticklabels(yticklabels)
    ax.set_ylim(nlp_yticks1[0]-5, nlp_yticks1[-1]+5)
    # Bars taller than the visible range are clipped, so print their value.
    for bar_xs, bar_vals in ((x1, e1), (x2, e2)):
        for bar_x, bar_val in zip(bar_xs, bar_vals):
            if bar_val > nlp_yticks1[-1]+5:
                ax.text(bar_x-0.1, nlp_yticks1[-1]-2, "{:.1f}%".format(bar_val), horizontalalignment='center', verticalalignment='center', size=12)
    ax2.set_ylim(nlp_yticks2[0]-100, nlp_yticks2[-1]+100)
    ax2.set_yticks(nlp_yticks2)

handles, labels = ax.get_legend_handles_labels()
# Use the line drawn in this cell, not a handle left over from another figure.
handles += time_handles
labels += ["Actual Time"]
fig.legend(handles, labels, loc=[0.08, 0.16], ncol=len(labels))
fig.text(-0.01, 0.5, 'Prediction Error', va='center', rotation=90, fontsize=14)
fig.text(0.99, 0.5, 'Iteration Time (ms)', va='center', rotation=-90, fontsize=14)
fig.text(0.45, -0.01, "Batch Size", fontsize=14)
plt.tight_layout()
plt.savefig('{}/data/multi_e2e_others.pdf'.format(PM_HOME), bbox_inches='tight')
plt.savefig('{}/data/multi_e2e_others.png'.format(PM_HOME), bbox_inches='tight')

### Sharding Config Selection

In [None]:
# Collect measured and predicted iteration times per sharder from logs
#
# Results (only sharders that have both log files are recorded):
#   actual_time[gpu][task][sharder]    -> measured per-batch training time, in ms
#   predicted_time[gpu][task][sharder] -> predicted time, in ms
#
# NOTE: this rebinds `actual_time`, replacing the per-batch-size lists built by
# the DLRM parsing cell; re-run that cell before re-drawing the DLRM figure.
batch_size = 4096
actual_time = {}
predicted_time = {}
for gpu in gpus:
    actual_time[gpu] = {}
    predicted_time[gpu] = {}
    pattern = '{}/data/{}/e2e/DLRM_open_source/*/*/f/*/barrier_bucketed_allreduce/25'.format(PM_HOME, gpu)
    # glob returns entries in filesystem order; sort so task and sharder order
    # is reproducible.
    run_dirs = sorted(d for d in glob.glob(pattern) if os.path.isdir(d))
    for prefix in run_dirs:
        parts = prefix.split('/')
        sharder = parts[-3]
        task = "{},{}".format(parts[-6], parts[-5])
        task_actual = actual_time[gpu].setdefault(task, {})
        task_predicted = predicted_time[gpu].setdefault(task, {})

        log_file = '{}/{}_{}_distributed.log'.format(prefix, GPU_COUNT, batch_size)
        shared_prediction_file = '{}/{}_{}_distributed_prediction_{}_shared.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
        if not (os.path.exists(log_file) and os.path.exists(shared_prediction_file)):
            continue  # both logs are needed to compare actual and predicted time

        with open(log_file, 'r') as log:
            for line in log:
                if re.search("Overall per-batch training time", line):
                    task_actual[sharder] = float(line.split(' ')[4]) # In ms
        with open(shared_prediction_file, 'r') as log:
            # Only the first line of the prediction log is read.
            first_line = next(log, None)
        if first_line is not None:
            # Divided by 1e3 to get ms (presumably logged in microseconds - confirm).
            task_predicted[sharder] = float(first_line.split(' ')[2].strip(',')) / 1e3 # In ms

In [None]:
fig, axes = plt.subplots(nrows=len(gpus), ncols=1, figsize=(20, 7))
w = 0.5
task_gap = 10
yticks2 = [5, 7.5, 10, 12.5, 15]
for idx, gpu in enumerate(gpus):
    # plt.subplots returns a bare Axes for a 1x1 grid and an ndarray otherwise.
    ax = axes if len(gpus) == 1 else axes[idx]
    batch_width = 0
    labeled = False  # label only the first pair of curves so the legend has one entry each
    for task, e in predicted_time[gpu].items():
        batch_width += 2
        sharders = list(e)
        # One x position per sharder, 1.5 apart within the task group.
        xtt = [task_gap * batch_width - 2 + 1.5 * (pos + 1) - w * 4 for pos in range(len(sharders))]
        # Look both series up by sharder so they stay aligned with xtt.
        actual = [actual_time[gpu][task][sharder] for sharder in sharders]
        predicted = [e[sharder] for sharder in sharders]
        # Draw on this figure's axes (not on an Axes left over from another cell).
        ax.plot(xtt, actual, color=plt.get_cmap("tab10")(1), marker="s",
                label="_nolegend_" if labeled else "Actual Time")
        ax.plot(xtt, predicted, color=plt.get_cmap("Blues")(180), marker="s",
                label="_nolegend_" if labeled else "Predicted Time")
        labeled = True
    ax.set_yticks(yticks2)

last_ax = axes if len(gpus) == 1 else axes[-1]
handles, labels = last_ax.get_legend_handles_labels()
fig.legend(handles, labels, loc=[0.7, 0.95], ncol=max(1, len(labels)))
# NOTE(review): the x positions are sharders at one fixed batch size, so this
# "Batch Size" caption looks copied from the previous figure - confirm.
fig.text(0.45, -0.01, "Batch Size", fontsize=14)
plt.tight_layout()
plt.savefig('{}/data/selection.pdf'.format(PM_HOME), bbox_inches='tight')
plt.savefig('{}/data/selection.png'.format(PM_HOME), bbox_inches='tight')