In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
from IPython.display import display, HTML
from scipy.stats.mstats import gmean
from pprint import pprint
import matplotlib.pyplot as plt
import os, re, sys, glob

# Notebook-wide setup: figure warning threshold, page width, and base data path.
# Set matplotlib's open-figure warning threshold to 50.
plt.rcParams.update({'figure.max_open_warning': 50})
# Inject CSS so the notebook's `.container` element uses the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Base directory for the data/ paths built below: two levels above the current working directory.
PM_HOME = "".join([os.getcwd(), "/../../"])

### Multi-GPU DLRM E2E Plot

In [None]:
# Automatically get errors from logs
#
# Walks every task directory under PM_HOME for each GPU and, per batch size, collects:
#   actual_time[gpu][task] -> list of per-batch training times read from the run log
#   errors[gpu][task]      -> list of [active, kernel_only, shared_e2e, (e2e)] records
# The optional fourth value (e2e) is only present when the non-shared prediction
# log exists for that batch size.
batch_sizes = [512, 1024, 2048, 4096]
num_gpus = 2
trimmed_iters = 30
gpus = ["A100"]
errors = {}
actual_time = {}
for gpu in gpus:
    errors[gpu] = {}
    actual_time[gpu] = {}
    dirs = glob.glob('{}/data/{}/e2e/DLRM_open_source/*/*/f/size_lookup_greedy/barrier_bucketed_allreduce/25'.format(PM_HOME, gpu))
    # glob returns entries in arbitrary filesystem order; sort so the task order
    # (and therefore everything derived from it) is the same on every run.
    for prefix in sorted(d for d in dirs if os.path.isdir(d)):
        # The task name is the path component five levels from the end of the matched path.
        task = prefix.split('/')[-5]
        task_errors = errors[gpu].setdefault(task, [])
        task_times = actual_time[gpu].setdefault(task, [])
        for batch_size in batch_sizes:
            log_file = '{}/{}_{}_distributed.log'.format(prefix, num_gpus, batch_size)
            if not os.path.exists(log_file):
                # This batch size was not run for this task.
                continue
            with open(log_file, 'r') as log:
                for line in log:
                    if re.search("Overall per-batch training time", line):
                        task_times.append(float(line.split(' ')[4])) # In ms

            # A run log without its shared prediction log is treated as an error.
            shared_prediction_file = '{}/{}_{}_distributed_prediction_{}_shared.log'.format(prefix, num_gpus, batch_size, trimmed_iters)
            assert os.path.exists(shared_prediction_file), "{} does not exist".format(shared_prediction_file)
            records_before = len(task_errors)
            with open(shared_prediction_file, 'r') as log:
                for line in log:
                    if re.search("Prediction error:", line):
                        fields = line.split(' ')
                        active_error = float(fields[2].rstrip(',%\n'))
                        kernel_only_error = float(fields[4].rstrip(',%\n'))
                        shared_e2e_error = float(fields[3].rstrip(',%\n'))
                        task_errors.append([active_error, kernel_only_error, shared_e2e_error])

            # Only extend a record that was created for *this* batch size; otherwise
            # the e2e value would be attached to an unrelated earlier record (or
            # raise IndexError when the list is still empty).
            added_record = len(task_errors) > records_before
            prediction_file = '{}/{}_{}_distributed_prediction_{}.log'.format(prefix, num_gpus, batch_size, trimmed_iters)
            if added_record and os.path.exists(prediction_file):
                with open(prediction_file, 'r') as log:
                    for line in log:
                        if re.search("Prediction error:", line):
                            e2e_error = float(line.split(' ')[3].rstrip(',%\n'))
                            task_errors[-1].append(e2e_error)
# active, kernel_only, shared_e2e, (e2e)

In [None]:
# Overall
#
# Prints two LaTeX table rows (active error and shared-E2E error). Each row holds
# geometric mean / min / max of the absolute errors, first over all devices and
# then once per device. Record layout: [active, kernel_only, shared_e2e, (e2e)].


def _abs_errors_by_device(column):
    """Return {device: [abs(record[column]), ...]} over every task's records in `errors`."""
    return {
        device: [abs(c[column]) for task in errors[device].keys() for c in errors[device][task]]
        for device in errors.keys()
    }


def _geo_min_max(values):
    """Return (geometric mean, min, max) of `values`.

    Raises ValueError with an explicit message when `values` is empty, instead of
    the opaque error `min()` would raise.
    """
    if not values:
        raise ValueError("No prediction errors collected; run the log-parsing cell and check that the logs exist.")
    return gmean(values), min(values), max(values)


error_active_devices = _abs_errors_by_device(0)
error_shared_total_devices = _abs_errors_by_device(2)
overall_active = [v for values in error_active_devices.values() for v in values]
overall_shared_total = [v for values in error_shared_total_devices.values() for v in values]

# For Latex
# The backslashes are doubled so that "\%" and "\bf" reach the output literally;
# a bare "\%" is an invalid escape sequence in a normal string literal.
stats_fmt = "{:.2f}\\% & {:.2f}\\% & {:.2f}\\%"
s1 = [("Active & \\bf" + stats_fmt).format(*_geo_min_max(overall_active))]
s3 = [("Shared E2E & \\bf" + stats_fmt).format(*_geo_min_max(overall_shared_total))]
for device in error_active_devices.keys():
    s1.append(stats_fmt.format(*_geo_min_max(error_active_devices[device])))
    s3.append(stats_fmt.format(*_geo_min_max(error_shared_total_devices[device])))
print(' & '.join(s1) + "\\\\")
print(' & '.join(s3) + "\\\\")

In [None]:
# Multi-GPU E2E figure: one subplot per device. Bars (left axis) show the
# prediction errors per task and batch size; the line (right axis) shows the
# measured per-batch time from `actual_time`.
batch_sizes = [512, 1024, 2048, 4096]
# squeeze=False always returns a 2-D array of Axes, so the same indexing works
# for a single GPU and for several.
fig, axes = plt.subplots(nrows=len(gpus), ncols=1, figsize=(15, 7), squeeze=False)
w = 0.5  # bar width
task_gap = 6
yticks = [-30, -20, -10, 0, 10, 20]
yticks2 = [2.5, 5, 7.5, 10, 12.5, 15]
yticklabels = ["{:.0f}%".format(x) for x in yticks]
for row, (device, error) in enumerate(errors.items()):
    ax = axes[row, 0]
    ax2 = ax.twinx()
    e1 = [] # active
    eb = [] # baseline
    e2 = [] # shared
    e3 = [] # standalone
    x1 = []
    x2 = []
    xb = []
    x3 = []
    batch_width = 0
    tasks = []
    xticks = []
    for task, records in error.items():
        if not records:
            # Nothing was parsed for this task; there is no bar group to centre a tick on.
            continue
        tasks.append(task)
        xtt = []
        batch_width += 2
        batch_gap = 0
        first = None
        for rec in records:
            e1.append(rec[0])
            eb.append(rec[1])
            e2.append(rec[2])
            if len(rec) == 4:
                # Four series for this batch size: shift left to make room for the extra bar.
                e3.append(rec[3])
                x1.append(task_gap * batch_width - 2 + batch_gap - w * 2)
                x2.append(task_gap * batch_width - 2 + batch_gap - w)
                x3.append(task_gap * batch_width - 2 + batch_gap)
                xb.append(task_gap * batch_width - 2 + batch_gap + w)
            else:
                x1.append(task_gap * batch_width - 2 + batch_gap - w)
                x2.append(task_gap * batch_width - 2 + batch_gap)
                xb.append(task_gap * batch_width - 2 + batch_gap + w)
            if first is None:
                first = x1[-1]
            batch_gap += 2
            xtt.append(task_gap * batch_width - 2 + batch_gap - w * 4)
        # Measured time per batch size; matplotlib raises if the number of timings
        # differs from the number of error records for this task.
        ax2.plot(xtt, actual_time[device][task], color=plt.get_cmap("tab10")(1), marker="s")
        last = xb[-1]
        # Centre the task's tick between its first and last bar positions.
        xticks.append((first + last) / 2)

    ax.bar(x2, e2, width=w, color=plt.get_cmap("Blues")(180), label='Prediction')
    ax.bar(xb, eb, width=w, color=plt.get_cmap("Greys")(120), label='Baseline')
    if e3:
        ax.bar(x3, e3, width=w, color=plt.get_cmap("Blues")(240), label='E2E')
    ax.grid(axis='y')
    ax.set_xticks(xticks)
    ax.set_xticklabels(["Task_{}".format(x) for x in range(len(tasks))], fontsize=12)
    ax.set_title(device, fontsize=15)
    ax.set_yticks(yticks)
    ax.set_yticklabels(yticklabels)
    ax.set_ylim(yticks[0]-5, yticks[-1]+5)
    # Annotate values that fall below the visible y-range with their numeric value.
    # NOTE(review): the "active" series (x1/e1) is annotated here although no bar
    # is drawn for it above - confirm that this is intended.
    series = [(x1, e1), (xb, eb), (x2, e2)]
    if e3:
        series.append((x3, e3))
    for xs, es in series:
        for x_pos, err in zip(xs, es):
            if err < yticks[0]-5:
                ax.text(x_pos+0.3, yticks[0]+5, "{:.1f}%".format(err), horizontalalignment='center', verticalalignment='center', size=12)
    ax2.set_yticks(yticks2)

# Build the shared legend from the last subplot.
handles, labels = axes[-1, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc=[0.8, 0.95], ncol=max(1, len(labels)))
fig.text(-0.01, 0.5, 'Prediction Error', va='center', rotation=90, fontsize=14)
fig.text(0.99, 0.5, 'Iteration Time (ms)', va='center', rotation=-90, fontsize=14)
plt.tight_layout()
plt.savefig('{}/data/multi_gpu_e2e.pdf'.format(PM_HOME), bbox_inches='tight')
plt.savefig('{}/data/multi_gpu_e2e.png'.format(PM_HOME), bbox_inches='tight')

### Sharding Config Selection

In [None]:
# Automatically get measured and predicted times from logs
#
# For every (task, sharder) directory under PM_HOME this fills:
#   actual_time[gpu][task][sharder]    -> per-batch training time read from the run log
#   predicted_time[gpu][task][sharder] -> value parsed from the first line of the shared prediction log
# A sharder is stored only when both values were found, so the two dicts always
# have the same keys in the same order.
batch_sizes = [4096]
num_gpus = 2
trimmed_iters = 30
gpus = ["A100"]
actual_time = {}
predicted_time = {}
for gpu in gpus:
    actual_time[gpu] = {}
    predicted_time[gpu] = {}
    dirs = glob.glob('{}/data/{}/e2e/DLRM_open_source/*/*/f/*/barrier_bucketed_allreduce/25'.format(PM_HOME, gpu))
    # glob returns entries in arbitrary filesystem order; sort so the task and
    # sharder order is the same on every run.
    for prefix in sorted(d for d in dirs if os.path.isdir(d)):
        parts = prefix.split('/')
        sharder = parts[-3]
        task = parts[-5]
        task_actual = actual_time[gpu].setdefault(task, {})
        task_predicted = predicted_time[gpu].setdefault(task, {})
        for batch_size in batch_sizes:
            log_file = '{}/{}_{}_distributed.log'.format(prefix, num_gpus, batch_size)
            shared_prediction_file = '{}/{}_{}_distributed_prediction_{}_shared.log'.format(prefix, num_gpus, batch_size, trimmed_iters)
            # Both logs are required; skip configurations that are missing either one.
            if not os.path.exists(log_file) or not os.path.exists(shared_prediction_file):
                continue

            measured = None
            with open(log_file, 'r') as log:
                for line in log:
                    if re.search("Overall per-batch training time", line):
                        measured = float(line.split(' ')[4]) # In ms

            predicted = None
            with open(shared_prediction_file, 'r') as log:
                # Only the first line of the shared prediction log is used.
                first_line = log.readline()
                if first_line:
                    predicted = float(first_line.split(' ')[2].strip(',')) / 1e3 # In ms

            # Store the pair together so every sharder has both a measured and a
            # predicted value.
            if measured is not None and predicted is not None:
                task_actual[sharder] = measured
                task_predicted[sharder] = predicted

In [None]:
# Sharder-selection figure: for every task, plot the measured and the predicted
# time of each sharder, one subplot per GPU.
# squeeze=False always returns a 2-D array of Axes, so the same indexing works
# for a single GPU and for several.
fig, axes = plt.subplots(nrows=len(gpus), ncols=1, figsize=(20, 7), squeeze=False)
w = 0.5
task_gap = 10
yticks2 = [5, 7.5, 10, 12.5, 15]
actual_color = plt.get_cmap("tab10")(1)
predicted_color = plt.get_cmap("Blues")(180)
for idx, gpu in enumerate(gpus):
    ax = axes[idx, 0]
    batch_width = 0
    labelled = False  # label only the first pair of lines so the legend has one entry per series
    for task, predicted in predicted_time[gpu].items():
        measured = actual_time[gpu].get(task, {})
        # Pair the two series by sharder name instead of relying on both dicts
        # having the same length and iteration order.
        sharders = [s for s in predicted if s in measured]
        xtt = []
        batch_width += 2
        sharder_gap = 0
        for _ in sharders:
            sharder_gap += 1.5
            xtt.append(task_gap * batch_width - 2 + sharder_gap - w * 4)
        ax.plot(xtt, [measured[s] for s in sharders], color=actual_color, marker="s",
                label="_nolegend_" if labelled else "Actual")
        ax.plot(xtt, [predicted[s] for s in sharders], color=predicted_color, marker="s",
                label="_nolegend_" if labelled else "Predicted")
        labelled = True
    ax.set_yticks(yticks2)

# Build the shared legend from the last subplot; ncol must be at least 1 even
# when nothing was plotted.
handles, labels = axes[-1, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc=[0.7, 0.95], ncol=max(1, len(labels)))
fig.text(0.45, -0.01, "Batch Size", fontsize=14)
plt.tight_layout()
plt.savefig('{}/data/selection.pdf'.format(PM_HOME), bbox_inches='tight')
plt.savefig('{}/data/selection.png'.format(PM_HOME), bbox_inches='tight')