In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
from IPython.display import display, HTML
from scipy.stats.mstats import gmean
from pprint import pprint
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import os, re, sys, glob

# Raise the threshold at which matplotlib warns about too many open figures.
plt.rcParams.update({'figure.max_open_warning': 50})
# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Two directories above the current working directory; prefix for the saved figures.
PM_HOME = "{}/../../".format(os.getcwd())

In [None]:
# GPU count that appears in task/log file names and in the figure titles.
GPU_COUNT = 4
# GPU types analysed; every per-GPU dict below is keyed by these names.
gpus = ["GV100", "A100"]
# Per-GPU path component substituted into home_template.
nodes = dict(zip(gpus, ["bowser", "wario"]))
# Per-GPU task lists; populated by the cells that read the task files.
tasks = dict.fromkeys(gpus)
# Batch sizes swept for the DLRM runs and for the NLP models respectively.
batch_sizes = [512, 1024, 2048, 4096]
nlp_batch_sizes = [8, 16, 32, 64]
# Value embedded in the names of the prediction/summary log files.
trimmed_iters = 30
# Repository root on the benchmark machines; "{}" receives nodes[gpu].
home_template = "/home/m092926/{}/Documents/ml_perf_model"

### Multi-GPU DLRM E2E Plot

In [None]:
# Collect measured iteration times and prediction errors for every DLRM task from the logs.
#   errors[gpu][task_idx]      -> one entry per batch size that has a measurement log:
#                                 [shared_e2e, active, baseline, active_baseline, (e2e)]
#   actual_time[gpu][task_idx] -> one float per "Overall per-batch training time" log line
errors = {}
actual_time = {}
for gpu in gpus:
    home = home_template.format(nodes[gpu])
    with open("{}/benchmark/tasks_{}x{}_permanent.txt".format(home, GPU_COUNT, gpu), 'r') as f:
        tasks[gpu] = [line.strip() for line in f.readlines()]
    errors[gpu] = []
    actual_time[gpu] = []
    for task in tasks[gpu]:
        # The first two comma-separated fields of a task line select the data directory.
        task_fields = task.split(',')
        prefix = "{}/data/{}/e2e/DLRM_open_source/{}/{}/f/size_lookup_greedy/barrier_bucketed_allreduce/25".format(home, gpu, task_fields[0], task_fields[1])
        task_errors = []
        task_times = []
        for batch_size in batch_sizes:
            log_file = '{}/{}_{}_distributed.log'.format(prefix, GPU_COUNT, batch_size)
            # Batch sizes without a measurement log are skipped entirely.
            if not os.path.exists(log_file):
                continue
            with open(log_file, 'r') as measured_log:
                for line in measured_log:
                    if re.search("Overall per-batch training time", line):
                        task_times.append(float(line.split(' ')[4])) # In ms

            # A measured run must come with its shared prediction log.
            shared_prediction_file = '{}/{}_{}_distributed_prediction_{}_shared.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            assert os.path.exists(shared_prediction_file), "{} does not exist".format(shared_prediction_file)
            with open(shared_prediction_file, 'r') as shared_log:
                for line in shared_log:
                    if re.search("Prediction error:", line):
                        shared_e2e_error = float(line.split(' ')[2].rstrip(',%\n'))
                        active_error = float(line.split(' ')[3].rstrip(',%\n'))
                        task_errors.append([shared_e2e_error, active_error])
                    if re.search("Baseline error:", line):
                        # Extends the entry created by the preceding "Prediction error:" line.
                        baseline_shared_e2e_error = float(line.split(' ')[2].rstrip(',%\n'))
                        baseline_active_error = float(line.split(' ')[3].rstrip(',%\n'))
                        task_errors[-1].extend([baseline_shared_e2e_error, baseline_active_error])

            # The non-shared prediction log is optional; when present it adds a fifth value.
            prediction_file = '{}/{}_{}_distributed_prediction_{}.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            if os.path.exists(prediction_file):
                with open(prediction_file, 'r') as prediction_log:
                    for line in prediction_log:
                        if re.search("Prediction error:", line):
                            e2e_error = float(line.split(' ')[2].rstrip(',%\n'))
                            task_errors[-1].extend([e2e_error])
        errors[gpu].append(task_errors)
        actual_time[gpu].append(task_times)
# Entry layout: shared_e2e, active, baseline, active_baseline, (e2e)

In [None]:
# Summarise the absolute prediction errors (overall and per GPU) and print them as LaTeX table rows.
# Column 0 = shared E2E error, column 1 = active error, column 2 = baseline shared E2E error.
def _latex_stats(values):
    """Format geometric mean, min and max of `values` as three LaTeX cells ("x\\% & y\\% & z\\%")."""
    # "\\%" is an escaped backslash followed by "%"; a bare "\%" is an invalid escape sequence.
    return "{:.2f}\\% & {:.2f}\\% & {:.2f}\\%".format(gmean(values), min(values), max(values))

# Overall
overall_shared_total = [abs(c[0]) for gpu in errors.keys() for task in errors[gpu] for c in task]
error_shared_total = {
    gpu: [abs(c[0]) for task in errors[gpu] for c in task] for gpu in errors.keys()
}
overall_active = [abs(c[1]) for gpu in errors.keys() for task in errors[gpu] for c in task]
error_active = {
    gpu: [abs(c[1]) for task in errors[gpu] for c in task] for gpu in errors.keys()
}
overall_baseline_shared_total = [abs(c[2]) for gpu in errors.keys() for task in errors[gpu] for c in task]
error_baseline_shared_total = {
    gpu: [abs(c[2]) for task in errors[gpu] for c in task] for gpu in errors.keys()
}

# For Latex: the first group of each row is the overall statistic with a bold mean,
# followed by one (mean, min, max) group per GPU.
s1 = ["Shared E2E & \\bf" + _latex_stats(overall_shared_total)]
s2 = ["Active & \\bf" + _latex_stats(overall_active)]
s3 = ["Baseline E2E & \\bf" + _latex_stats(overall_baseline_shared_total)]
for gpu in error_active.keys():
    s1.append(_latex_stats(error_shared_total[gpu]))
    s2.append(_latex_stats(error_active[gpu]))
    s3.append(_latex_stats(error_baseline_shared_total[gpu]))
print(' & '.join(s1) + "\\\\")
print(' & '.join(s2) + "\\\\")
print(' & '.join(s3) + "\\\\")

In [None]:
# Broken-axis bar chart of DLRM prediction vs. baseline error for every task and batch size,
# with the measured iteration time overlaid on a secondary y axis. One figure per GPU type.
w = 0.5       # bar width
task_gap = 6  # horizontal spacing factor between task groups

# Tick positions: ax1 = upper error panel, ax2 = lower error panel,
# ax3 = iteration-time axis (twin of ax1), ax4 = twin of ax2.
yticks1 = [-20, -10, 0, 10, 20]
yticks2 = [-70, -50]
yticks3 = [6, 8.5, 11, 13.5, 16]
yticks4 = [0, 2.5]
yticklabels1 = ["{:.0f}%".format(x) for x in yticks1]  # defined as in the original; not applied to any axis here

# The y range between the two error panels is not drawn; these are its edges.
axis_break1 = yticks2[-1] + w*4   # top of the lower panel
axis_break2 = yticks1[0] - w*16   # bottom of the upper panel

for gpu, error in errors.items():
    fig = plt.figure(figsize=(18, 5))
    gs = GridSpec(2, 2, height_ratios=[4, 1])
    ax1 = fig.add_subplot(gs.new_subplotspec((0, 0), colspan=2))
    ax2 = fig.add_subplot(gs.new_subplotspec((1, 0), colspan=2))
    ax3 = ax1.twinx()
    ax4 = ax2.twinx()

    # Hide the facing spines so the two panels read as one axis with a break.
    for upper_ax in (ax1, ax3):
        upper_ax.spines['bottom'].set_visible(False)
        upper_ax.set_xticks([])
    for lower_ax in (ax2, ax4):
        lower_ax.spines['top'].set_visible(False)
        # tick_params expects a bool; the string "off" is truthy and would enable the top labels.
        lower_ax.tick_params(labeltop=False)
        lower_ax.xaxis.tick_bottom()

    e0 = [] # shared prediction error
    eb = [] # baseline error
    x0 = [] # bar positions for e0
    xb = [] # bar positions for eb
    xticks = []
    batch_width = 0
    for idy, task_errors in enumerate(error):
        xtt = []  # x positions of the measured-time markers for this task
        batch_width += 2
        batch_gap = 0
        first = -1
        for idx, entry in enumerate(task_errors):
            base = task_gap * batch_width - 2 + batch_gap
            e0.append(entry[0])
            eb.append(entry[2])
            x0.append(base - w * 2)
            xb.append(base - w)
            if idx == 0:
                first = base
            batch_gap += 2
            xtt.append(task_gap * batch_width - 2 + batch_gap - w * 4)
        uhhh = ax3.plot(xtt, actual_time[gpu][idy], color=plt.get_cmap("tab10")(1), marker="s")
        # Centre the task label between the first and the last batch-size group.
        last = xb[-1]
        xticks.append((first + last) / 2)

    ax1.bar(x0, e0, width=w, color=plt.get_cmap("Blues")(180), label='Prediction')
    ax1.bar(xb, eb, width=w, color=plt.get_cmap("Greys")(120), label='Baseline')
    ax2.bar(xb, eb, width=w, color=plt.get_cmap("Greys")(120), label='Baseline')

    x_min = min(x0) - task_gap
    x_max = max(xb) + task_gap
    for axis in (ax1, ax2, ax3, ax4):
        axis.set_xlim(x_min, x_max)
    ax1.set_ylim(yticks1[0]-w*16, yticks1[-1]+w*16)
    ax2.set_ylim(yticks2[0]-w*4, yticks2[-1]+w*4)
    ax3.set_ylim(yticks3[0]-w*4, yticks3[-1]+w*4)
    ax4.set_ylim(yticks4[0]-w, yticks4[-1]+w)

    ax1.set_yticks(yticks1)
    ax2.set_yticks(yticks2)
    ax2.set_xticks(xticks)
    ax2.set_xticklabels(["Task_{}".format(x) for x in range(len(tasks[gpu]))], fontsize=12)
    # Colour the labels of the first and the third quarter of the task list.
    quarter = len(tasks[gpu])//4
    heavy_label_idx = list(range(quarter)) + list(range(quarter*2, quarter*3))
    for i in heavy_label_idx:
        ax2.get_xticklabels()[i].set_color("Purple")
    ax3.set_yticks(yticks3)
    ax4.set_yticks(yticks4)

    ax1.set_title("{}x{}".format(GPU_COUNT, gpu), fontsize=15)
    ax1.grid(axis='y')

    # Bars ending inside the undrawn range are annotated with their value,
    # cycling through three vertical offsets around the lowest ax1 tick.
    count = 0
    for xe in [zip(x0, e0), zip(xb, eb)]:
        for x, e in xe:
            if axis_break1 < e < axis_break2:
                ax1.text(x+0.3, yticks1[0]+4*(count % 3 - 1), "{:.1f}%".format(e), horizontalalignment='center', verticalalignment='center', size=12)
                count += 1

    # Short horizontal marks at the panel edges indicate the axis break.
    l = 2  # "break" line length
    kwargs = dict(color="k", clip_on=False, linewidth=1)
    ax1.plot((x_min - l, x_min + l), (axis_break2, axis_break2), **kwargs) # top-left
    ax1.plot((x_max - l, x_max + l), (axis_break2, axis_break2), **kwargs) # top-right
    ax2.plot((x_min - l, x_min + l), (axis_break1, axis_break1), **kwargs) # bottom-left
    ax2.plot((x_max - l, x_max + l), (axis_break1, axis_break1), **kwargs) # bottom-right

    # Add the line of the last plotted task as the "Actual Time" legend entry.
    handles, labels = ax1.get_legend_handles_labels()
    handles += uhhh
    labels += ["Actual Time"]
    fig.legend(handles, labels, loc=[0.74, 0.88], ncol=len(labels))
    fig.text(-0.01, 0.5, 'Prediction Error', va='center', rotation=90, fontsize=14)
    fig.text(0.99, 0.5, 'Iteration Time (ms)', va='center', rotation=-90, fontsize=14)
    plt.tight_layout()
    plt.savefig('{}/data/multi_e2e_{}.pdf'.format(PM_HOME, gpu.lower()), bbox_inches='tight')
    plt.savefig('{}/data/multi_e2e_{}.png'.format(PM_HOME, gpu.lower()), bbox_inches='tight')

### NLP

In [None]:
# Collect measured iteration times and prediction errors for the NLP models from the logs.
#   other_errors[gpu][model] -> one entry per batch size that has a measurement log:
#                               [shared_e2e, active, baseline, active_baseline, (e2e)]
#   other_time[gpu][model]   -> first "Overall per-batch training time" value of each log
#   xticklabels              -> every non-skipped batch size, accumulated over all GPUs and models
other_errors = {}
other_time = {}
models = ["BERT", "GPT2", "XLNet"]
xticklabels = []
for gpu in gpus:
    home = home_template.format(nodes[gpu])
    other_errors[gpu] = {}
    other_time[gpu] = {}
    for model in models:
        prefix = '{}/data/{}/e2e/{}/barrier_bucketed_allreduce/25'.format(home, gpu, model.lower())
        other_errors[gpu][model] = []
        other_time[gpu][model] = []
        for batch_size in nlp_batch_sizes:
            # XLNet is evaluated at half of each listed batch size.
            if model == "XLNet":
                batch_size = batch_size // 2
            if (model == "BERT" and batch_size == 64) or (model == "XLNet" and batch_size == 32): # OOM
                continue
            # The label is recorded before the log check, so it is kept even if the log is missing.
            xticklabels.append(batch_size)
            log_file = '{}/{}_{}_distributed.log'.format(prefix, GPU_COUNT, batch_size)
            if not os.path.exists(log_file):
                continue
            with open(log_file, 'r') as measured_log:
                for line in measured_log:
                    if re.search("Overall per-batch training time", line):
                        other_time[gpu][model].append(float(line.split(' ')[4])) # In ms
                        break

            # A measured run must come with its shared prediction log.
            shared_prediction_file = '{}/{}_{}_distributed_prediction_{}_shared.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            assert os.path.exists(shared_prediction_file), "{} does not exist".format(shared_prediction_file)
            with open(shared_prediction_file, 'r') as shared_log:
                for line in shared_log:
                    if re.search("Prediction error:", line):
                        shared_e2e_error = float(line.split(' ')[2].rstrip(',%\n'))
                        active_error = float(line.split(' ')[3].rstrip(',%\n'))
                        other_errors[gpu][model].append([shared_e2e_error, active_error])
                    if re.search("Baseline error:", line):
                        # Extends the entry created by the preceding "Prediction error:" line.
                        baseline_shared_e2e_error = float(line.split(' ')[2].rstrip(',%\n'))
                        baseline_active_error = float(line.split(' ')[3].rstrip(',%\n'))
                        other_errors[gpu][model][-1].extend([baseline_shared_e2e_error, baseline_active_error])

            # The non-shared prediction log is optional; when present it adds a fifth value.
            prediction_file = '{}/{}_{}_distributed_prediction_{}.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            if os.path.exists(prediction_file):
                with open(prediction_file, 'r') as prediction_log:
                    for line in prediction_log:
                        if re.search("Prediction error:", line):
                            # NOTE(review): the DLRM parsing cell reads field [2] from this kind of
                            # file while this cell reads field [3] - confirm which one is intended.
                            e2e_error = float(line.split(' ')[3].rstrip(',%\n'))
                            other_errors[gpu][model][-1].append(e2e_error)
# Entry layout: shared_e2e, active, baseline, active_baseline, (e2e)

In [None]:
# Bar chart of prediction vs. baseline error for the NLP models, with the measured
# iteration time overlaid on a secondary y axis. One figure per GPU type.
batch_gap = 0.9
# xticklabels was accumulated over all GPUs; keep the first half.
# NOTE(review): assumes two GPU types with identical batch-size lists - confirm.
nlp_xticklabels = xticklabels[:(len(xticklabels)//2)]
nlp_yticks1 = [-40, -30, -20, -10, 0, 10]
nlp_yticks2 = [0, 350, 700, 1050, 1400, 1750]
yticklabels = ["{:.0f}%".format(x) for x in nlp_yticks1]
bar_width = 0.2
for gpu, error in other_errors.items():
    fig = plt.figure(figsize=(10, 4))
    ax = fig.gca()
    ax2 = ax.twinx()
    e1 = []  # prediction errors
    e2 = []  # baseline errors
    x1 = []  # bar positions for e1
    x2 = []  # bar positions for e2
    c = 0    # running x offset of the current model group
    for model, e in error.items():
        model_gap = batch_gap * len(e) + 0.5
        xtt = []  # x positions of the measured-time markers for this model
        cc = 0    # x offset of the current batch size within the model group
        for ee in e:
            e1.append(ee[0]) # Prediction
            e2.append(ee[2]) # Baseline
            x1.append(c - 2 + cc - bar_width)
            x2.append(c - 2 + cc)
            # Midpoint between the two bars (same position as the x tick). True division is
            # required: bar_width // 2 floors 0.2 to 0.0 and puts the marker on the baseline bar.
            xtt.append(c - 2 + cc - (bar_width / 2))
            cc += batch_gap
        c += model_gap

        ax.text((xtt[0] + xtt[-1] - batch_gap) / 2, nlp_yticks1[-1]+2, model, fontsize=12, transform=ax.transData)
        uhhh = ax2.plot(xtt, other_time[gpu][model], color=plt.get_cmap("tab10")(1), marker="s")

    ax.bar(x1, e1, width=bar_width, color=plt.get_cmap("Blues")(180), label="Prediction")
    ax.bar(x2, e2, width=bar_width, color=plt.get_cmap("Greys")(120), label="Baseline")
    ax.grid(axis='y')
    ax.set_xticks([(x+y)/2 for x,y in zip(x1,x2)])
    ax.set_xticklabels(nlp_xticklabels, fontsize=12)
    ax.set_title("{}x{}".format(GPU_COUNT, gpu), fontsize=15)
    ax.set_yticks(nlp_yticks1)
    ax.set_yticklabels(yticklabels)
    ax.set_ylim(nlp_yticks1[0]-5, nlp_yticks1[-1]+5)
    # Bars taller than the upper y limit are annotated with their value.
    for xe in [zip(x1, e1), zip(x2, e2)]:
        for x, e in xe:
            if e > nlp_yticks1[-1]+5:
                ax.text(x-0.1, nlp_yticks1[-1]-2, "{:.1f}%".format(e), horizontalalignment='center', verticalalignment='center', size=12)
    ax2.set_ylim(nlp_yticks2[0]-175, nlp_yticks2[-1]+175)
    ax2.set_yticks(nlp_yticks2)
    ax2.set_yticklabels([str(x) for x in nlp_yticks2])

    # Add the line of the last plotted model as the "Actual Time" legend entry.
    handles, labels = ax.get_legend_handles_labels()
    handles += uhhh
    labels += ["Actual Time"]
    fig.legend(handles, labels, loc=[0.08, 0.16], ncol=len(labels))
    fig.text(-0.01, 0.5, 'Prediction Error', va='center', rotation=90, fontsize=14)
    fig.text(0.99, 0.5, 'Iteration Time (ms)', va='center', rotation=-90, fontsize=14)
    fig.text(0.45, -0.01, "Batch Size", fontsize=14)
    plt.tight_layout()
    plt.savefig('{}/data/multi_e2e_others_{}.pdf'.format(PM_HOME, gpu.lower()), bbox_inches='tight')
    plt.savefig('{}/data/multi_e2e_others_{}.png'.format(PM_HOME, gpu.lower()), bbox_inches='tight')

### Sharding Config Selection

In [None]:
# Collect measured and predicted iteration time per sharding configuration, at one batch size.
#   selection_actual_time[gpu][i]    -> {sharder: measured per-batch training time}
#   selection_predicted_time[gpu][i] -> {sharder: predicted time}
# where i enumerates the tasks kept by the index filter below.
batch_size = 4096
selection_actual_time = {}
selection_predicted_time = {}
for gpu in gpus:
    home = home_template.format(nodes[gpu])
    selection_actual_time[gpu] = []
    selection_predicted_time[gpu] = []
    for idx, task in enumerate(tasks[gpu]):
        if idx % 10 >= 5: # Heavy tasks only
            continue
        task_fields = task.split(',')
        measured_by_sharder = {}
        predicted_by_sharder = {}
        # One directory per sharder: the wildcard path component is the sharder name.
        dirs = glob.glob('{}/data/{}/e2e/DLRM_open_source/{}/{}/f/*/barrier_bucketed_allreduce/25'.format(home, gpu, task_fields[0], task_fields[1]))
        for prefix in (d for d in dirs if os.path.isdir(d)):
            sharder = prefix.split('/')[-3]
            log_file = '{}/{}_{}_distributed.log'.format(prefix, GPU_COUNT, batch_size)
            shared_prediction_file = '{}/{}_{}_distributed_prediction_{}_shared.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            # A sharder is only recorded when both the measurement and the prediction exist.
            if not os.path.exists(log_file):
                continue
            if not os.path.exists(shared_prediction_file):
                continue
            with open(log_file, 'r') as measured_log:
                for line in measured_log:
                    if re.search("Overall per-batch training time", line):
                        measured_by_sharder[sharder] = float(line.split(' ')[4]) # In ms
            # Only the first line of the prediction log is read; an empty file adds no entry.
            with open(shared_prediction_file, 'r') as shared_log:
                first_line = shared_log.readline()
            if first_line:
                predicted_by_sharder[sharder] = float(first_line.split(' ')[2].strip(',')) / 1e3 # In ms
        selection_actual_time[gpu].append(measured_by_sharder)
        selection_predicted_time[gpu].append(predicted_by_sharder)

In [None]:
# Line plot of predicted vs. measured iteration time for each sharding configuration
# of the selected tasks. One figure per GPU type.
w = 0.5
task_gap = 10
yticks = [6, 9, 12, 15, 18, 21]
for gpu in gpus:
    fig = plt.figure(figsize=(18, 5))
    ax = fig.gca()
    batch_width = 0
    xticks = []
    for idx, predicted in enumerate(selection_predicted_time[gpu]):
        xtt = []  # one x position per sharder of this task
        batch_width += 2
        sharder_gap = 0
        for sharder in predicted:
            sharder_gap += 1.5
            xtt.append(task_gap * batch_width - 2 + sharder_gap - w * 4)
        # Both dicts are filled in the same sharder order by the parsing cell.
        h1, = ax.plot(xtt, list(predicted.values()), color=plt.get_cmap("Blues")(180), marker="s", label='Prediction')
        h2, = ax.plot(xtt, list(selection_actual_time[gpu][idx].values()), color=plt.get_cmap("tab10")(1), marker="s", label='Actual Time')
        # Centre the task label under its group of sharders.
        xticks.append((xtt[0] + xtt[-1]) / 2)
    ax.set_xticks(xticks)
    # Same index filter as the parsing cell, so the labels line up with the plotted tasks.
    ax.set_xticklabels(["Task_{}".format(x) for x in range(len(tasks[gpu])) if x % 10 < 5], fontsize=12)
    ax.set_yticks(yticks)
    ax.set_title("{}x{}".format(GPU_COUNT, gpu), fontsize=15)
    ax.grid(axis='y')
    # `ncol` matches the other legend calls in this notebook and is accepted by every matplotlib version.
    fig.legend(handles=[h1, h2], loc=[0.78, 0.85], ncol=2)
    fig.text(0.1, 0.5, 'Iteration Time (ms)', va='center', rotation=90, fontsize=14)
    plt.savefig('{}/data/config_selection_{}.pdf'.format(PM_HOME, gpu.lower()), bbox_inches='tight')
    plt.savefig('{}/data/config_selection_{}.png'.format(PM_HOME, gpu.lower()), bbox_inches='tight')

### eg_comm

In [None]:
# Collect the "eg_comm" values from the summary and prediction logs of every DLRM task.
#   eg_comm[gpu][task_idx]        -> one list per "eg_comm:" line of the shared prediction log,
#                                    extended by values from the optional non-shared prediction log
#   actual_eg_comm[gpu][task_idx] -> last field of every "eg_comm" line of the summary log
eg_comm = {}
actual_eg_comm = {}
for gpu in gpus:
    home = home_template.format(nodes[gpu])
    with open("{}/benchmark/tasks_{}x{}_permanent.txt".format(home, GPU_COUNT, gpu), 'r') as f:
        tasks[gpu] = [line.strip() for line in f.readlines()]
    eg_comm[gpu] = []
    actual_eg_comm[gpu] = []
    for task in tasks[gpu]:
        # The first two comma-separated fields of a task line select the data directory.
        task_fields = task.split(',')
        prefix = "{}/data/{}/e2e/DLRM_open_source/{}/{}/f/size_lookup_greedy/barrier_bucketed_allreduce/25".format(home, gpu, task_fields[0], task_fields[1])
        predicted_values = []
        summary_values = []
        for batch_size in batch_sizes:
            log_file = '{}/{}_{}_distributed_0_summary_{}.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            # Batch sizes without a summary log are skipped entirely.
            if not os.path.exists(log_file):
                continue
            with open(log_file, 'r') as summary_log:
                for line in summary_log:
                    if re.search("eg_comm", line):
                        summary_values.append(float(line.split(' ')[-1]))

            # A summarised run must come with its shared prediction log.
            shared_prediction_file = '{}/{}_{}_distributed_prediction_{}_shared.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            assert os.path.exists(shared_prediction_file), "{} does not exist".format(shared_prediction_file)
            with open(shared_prediction_file, 'r') as shared_log:
                for line in shared_log:
                    if re.search("eg_comm:", line):
                        predicted_values.append([float(line.split(' ')[-1])])

            # The non-shared prediction log is optional; its values extend the latest entry.
            prediction_file = '{}/{}_{}_distributed_prediction_{}.log'.format(prefix, GPU_COUNT, batch_size, trimmed_iters)
            if os.path.exists(prediction_file):
                with open(prediction_file, 'r') as prediction_log:
                    for line in prediction_log:
                        if re.search("eg_comm:", line):
                            predicted_values[-1].extend([float(line.split(' ')[-1])])
        eg_comm[gpu].append(predicted_values)
        actual_eg_comm[gpu].append(summary_values)

In [None]:
# Display the eg_comm values parsed from the prediction logs (per GPU, per task).
eg_comm

In [None]:
# Display the eg_comm values parsed from the summary logs (per GPU, per task).
actual_eg_comm