In [None]:
import argparse
import csv
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import os
from scipy.interpolate import interp1d
from scipy.stats import pearsonr, spearmanr
import sys
import time
import torch
from torch.distributed.pipeline.sync import Pipe
import tqdm

In [None]:
# Make project packages importable from the notebook. Notebooks have no `__file__`
# variable, so the *string* "__file__" is resolved against the current working
# directory; the two dirname() calls therefore append the parent of the working
# directory to sys.path.
# NOTE(review): this presumes the notebook is launched from a directory directly
# beneath the repository root — confirm.
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath("__file__"))))
from examples.mlp import mlp
from dist_ir.ir import Topology
from dist_ir.executor import infer_types, Simulator
from dist_ir.executor.cost_model import CostModel

In [None]:
# Hardware parameters handed to the simulated topology, plus benchmark-wide settings.
DEVICE_THROUGHPUT = 6.7e12 # FLOPS
# Units were left unspecified by the original author ("???").
# TODO confirm what unit Topology.add_device expects for dram_bandwidth.
DRAM_BANDWIDTH = 2.7e12 # ???
# NOTE(review): labelled Gbps, but 15.76 looks like the PCIe 3.0 x16 figure in
# gigaBYTES per second — confirm the unit Topology.set_bandwidth expects.
PCIE_BANDWIDTH = 15.76 # Gbps
# Number of GPUs the model is split across (also used as the pipeline-parallel degree).
WORLD_SIZE = 2
# Number of samples per step; first dimension of the input and label tensors.
BATCH_SIZE = 1024

In [None]:
def mlp_pytorch(num_hidden_layers, hidden_dim):
    """Build a bias-free MLP whose blocks are spread over ``WORLD_SIZE`` GPUs.

    Every block is a ``hidden_dim x hidden_dim`` ``torch.nn.Linear`` (no bias)
    followed by a ``torch.nn.ReLU``. Blocks are placed on ``cuda:0`` through
    ``cuda:{WORLD_SIZE - 1}`` in contiguous runs, earlier blocks on lower
    device indices.

    Args:
        num_hidden_layers: Number of Linear+ReLU blocks to create.
        hidden_dim: Input and output width of every Linear layer.

    Returns:
        A ``torch.nn.Sequential`` holding ``2 * num_hidden_layers`` modules.
    """
    layers = []
    for i in range(num_hidden_layers):
        # Proportional assignment. For layer counts divisible by WORLD_SIZE this
        # equals ``i // (num_hidden_layers // WORLD_SIZE)``, but it can never
        # produce an index >= WORLD_SIZE when the count is uneven, and it does
        # not divide by zero when num_hidden_layers < WORLD_SIZE.
        device = f"cuda:{i * WORLD_SIZE // num_hidden_layers}"
        layers.append(
            torch.nn.Linear(hidden_dim, hidden_dim, bias=False).to(device)
        )
        layers.append(torch.nn.ReLU().to(device))
    return torch.nn.Sequential(*layers)

In [None]:
def run(
    num_hidden_layers,
    hidden_dim,
    num_microbatches,
    num_warmup_steps=10,
    num_profiling_steps=100,
):
    """Measure the median forward+backward time of the pipelined MLP on GPUs.

    The model from ``mlp_pytorch`` is wrapped in ``Pipe`` with
    ``num_microbatches`` chunks and driven with random inputs and labels of
    shape ``(BATCH_SIZE, hidden_dim)`` under an MSE loss.

    Args:
        num_hidden_layers: Number of Linear+ReLU blocks in the model.
        hidden_dim: Width of every layer and of the input/label tensors.
        num_microbatches: Number of chunks ``Pipe`` splits each batch into.
        num_warmup_steps: Leading iterations excluded from the statistic.
        num_profiling_steps: Iterations that contribute to the statistic.

    Returns:
        Median wall-clock duration, in seconds, of the profiled iterations.

    NOTE(review): released PyTorch versions require
    ``torch.distributed.rpc.init_rpc`` to have been called before ``Pipe`` is
    constructed; no visible cell does so — confirm against the installed version.
    """
    model = mlp_pytorch(num_hidden_layers, hidden_dim)

    # The pipeline output lives on the device of the final stage, so the labels
    # must be placed there; keeping them on cuda:0 makes the loss raise a
    # device-mismatch error as soon as the model spans more than one GPU.
    parameters = list(model.parameters())
    input_device = torch.device("cuda:0")
    output_device = parameters[-1].device if parameters else input_device
    used_devices = {input_device, output_device}
    used_devices.update(param.device for param in parameters)

    def synchronize():
        # CUDA kernels launch asynchronously; wait for every participating GPU
        # so the timer brackets the actual work rather than just the launches.
        for device in used_devices:
            torch.cuda.synchronize(device)

    model = Pipe(model, chunks=num_microbatches)
    loss_fn = torch.nn.MSELoss()
    x = torch.randn(size=(BATCH_SIZE, hidden_dim), device=input_device)
    labels = torch.randn(size=(BATCH_SIZE, hidden_dim), device=output_device)
    runtimes = []
    for _ in tqdm.tqdm(range(num_warmup_steps + num_profiling_steps)):
        synchronize()
        start = time.perf_counter()
        y = model(x)
        # Pipe.forward returns an RRef in released PyTorch versions; unwrap it
        # when present, otherwise use the tensor as returned.
        if hasattr(y, "local_value"):
            y = y.local_value()
        loss_fn(y, labels).backward()
        synchronize()
        runtimes.append(time.perf_counter() - start)
    return np.median(runtimes[num_warmup_steps:])

In [None]:
def simulate(num_hidden_layers, hidden_dim, num_microbatches):
    """Simulate one step of the pipeline-parallel MLP and return its runtime.

    A topology is built with one root device plus ``WORLD_SIZE`` worker
    devices. The root is linked to every worker with infinite bandwidth, and
    workers are linked pairwise with ``PCIE_BANDWIDTH``. The sequential MLP is
    transformed with ``mlp_dhp_transform`` (data- and horizontal-parallel
    degree 1, pipeline-parallel degree ``WORLD_SIZE``) and the transformed
    function is interpreted by the simulator.

    Args:
        num_hidden_layers: Number of hidden layers passed to ``mlp``.
        hidden_dim: Used for the input, hidden and output dimensions.
        num_microbatches: Number of microbatches given to the transform.

    Returns:
        The largest per-device timestamp reported by the simulation.
    """
    # No visible cell imports this name, so it is brought in here.
    # NOTE(review): module path assumed to be dist_ir.transforms — confirm.
    from dist_ir.transforms import mlp_dhp_transform

    topology = Topology()
    d0 = topology.add_device(
        "gpu", throughput=DEVICE_THROUGHPUT, dram_bandwidth=DRAM_BANDWIDTH
    )
    for i in range(WORLD_SIZE):
        di = topology.add_device(
            "gpu", throughput=DEVICE_THROUGHPUT, dram_bandwidth=DRAM_BANDWIDTH
        )
        topology.set_bandwidth(d0, di, float("inf"))
        # devices[0] is d0, so devices[1..i] are the workers added before di.
        for j in range(1, i + 1):
            dj = topology.devices[j]
            topology.set_bandwidth(di, dj, PCIE_BANDWIDTH)
    function = mlp(
        BATCH_SIZE, hidden_dim, hidden_dim, hidden_dim, num_hidden_layers, d0
    )
    function = infer_types(function, function.inputs)
    pp_function = mlp_dhp_transform(
        function, 1, 1, WORLD_SIZE, topology.devices, num_microbatches
    )
    pp_function = infer_types(pp_function, pp_function.inputs)
    simulator = Simulator(CostModel(topology))
    # Interpret the *transformed* function. Simulating the untransformed
    # `function` would ignore pipeline parallelism and num_microbatches.
    simulation = simulator.interpret(
        pp_function,
        tuple(v.type for v in pp_function.inputs),
    )
    return max(simulation.timestamps[d] for d in simulation.timestamps)

In [None]:
def sweep(
    all_num_hidden_layers,
    all_hidden_dims,
    all_num_microbatches,
    filename,
    func,
):
    """Evaluate ``func`` over a parameter grid and record the results as CSV.

    ``func`` is called once for every combination, iterating layers in the
    outer loop, hidden dimensions in the middle loop and microbatch counts in
    the inner loop. One row is written per call.

    Args:
        all_num_hidden_layers: Iterable of layer counts.
        all_hidden_dims: Iterable of hidden dimensions (must be re-iterable).
        all_num_microbatches: Iterable of microbatch counts (must be re-iterable).
        filename: Path of the CSV file to create or overwrite.
        func: Callable ``(num_hidden_layers, hidden_dim, num_microbatches)``
            whose return value is stored in the ``runtime`` column.
    """
    fieldnames = ["num_hidden_layers", "hidden_dim", "num_microbatches", "runtime"]
    # newline="" is what the csv module requires of files it writes to; without
    # it, platforms that translate "\n" emit an extra blank line per row.
    with open(filename, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for num_hidden_layers in all_num_hidden_layers:
            for hidden_dim in all_hidden_dims:
                for num_microbatches in all_num_microbatches:
                    runtime = func(num_hidden_layers, hidden_dim, num_microbatches)
                    writer.writerow(
                        {
                            "num_hidden_layers": num_hidden_layers,
                            "hidden_dim": hidden_dim,
                            "num_microbatches": num_microbatches,
                            "runtime": runtime,
                        }
                    )
                    # Each measurement can be expensive; push it to disk now so
                    # a failure later in the sweep does not lose finished rows.
                    f.flush()

In [None]:
def parse_csv(filename):
    """Load a sweep CSV into a list of 4-tuples of strings.

    Each tuple is ``(num_hidden_layers, num_microbatches, hidden_dim, runtime)``.
    Note that ``num_microbatches`` comes *before* ``hidden_dim`` here, which is
    not the column order of the CSV header. Values are returned exactly as
    read from the file; no numeric conversion is performed.

    Args:
        filename: Path of a CSV file with a header row containing the four
            column names above.

    Returns:
        One tuple per data row, in file order.
    """
    columns = ("num_hidden_layers", "num_microbatches", "hidden_dim", "runtime")
    with open(filename, "r") as csv_file:
        return [
            tuple(record[column] for column in columns)
            for record in csv.DictReader(csv_file)
        ]

In [None]:
# Parameter grid shared by the measured and the simulated sweep.
all_num_hidden_layers = [8, 16, 32]
all_num_microbatches = [2, 4, 8]
all_hidden_dims = [1024, 2048, 4096]

# Run the GPU measurement first, then the simulator, over the identical grid;
# each writes its own CSV file.
for csv_path, runtime_fn in (
    ("pipeline_parallel_runtimes.csv", run),
    ("pipeline_parallel_simulated_runtimes.csv", simulate),
):
    sweep(
        all_num_hidden_layers,
        all_hidden_dims,
        all_num_microbatches,
        csv_path,
        runtime_fn,
    )

In [None]:
# Read both sweeps back from disk as lists of string tuples.
real_results, simulated_results = map(
    parse_csv,
    (
        "pipeline_parallel_runtimes.csv",
        "pipeline_parallel_simulated_runtimes.csv",
    ),
)

In [None]:
# Pair up measured and simulated rows, print a comparison table, and collect
# throughputs (in thousands of samples per second) and model sizes for plotting.
real_throughputs = []
simulated_throughputs = []
model_sizes = []
print("# layers,hidden dim,num_microbatches,model size (MB),real,simulated,ratio")
for real, simulated in zip(real_results, simulated_results):
    # Both files must list the same configurations in the same order, otherwise
    # the ratio below would compare unrelated runs.
    if real[:3] != simulated[:3]:
        raise ValueError(
            f"Configuration mismatch between real and simulated results: "
            f"{real[:3]} vs {simulated[:3]}"
        )
    # parse_csv yields (num_hidden_layers, num_microbatches, hidden_dim, runtime):
    # num_microbatches precedes hidden_dim, so unpack by name, not CSV column order.
    num_hidden_layers, num_microbatches, hidden_dim = (int(v) for v in real[:3])
    real_runtime = float(real[3])
    simulated_runtime = float(simulated[-1])
    # One hidden_dim x hidden_dim weight matrix per layer at 4 bytes per element,
    # expressed in MiB.
    model_size = hidden_dim * hidden_dim * num_hidden_layers * 4 / (1024 * 1024)
    print(
        f"{num_hidden_layers},{hidden_dim},{num_microbatches},"
        f"{model_size},{real_runtime*1000:.2f},"
        f"{simulated_runtime*1000:.2f},"
        f"{(simulated_runtime/real_runtime):.2f}x"
    )
    real_throughputs.append(BATCH_SIZE / real_runtime / 1000)
    simulated_throughputs.append(BATCH_SIZE / simulated_runtime / 1000)
    model_sizes.append(model_size)

In [None]:
# Quantify how well simulated throughput tracks measured throughput, then plot
# real against simulated with a least-squares line and the y = x reference.
simulated_throughputs = np.array(simulated_throughputs)
real_throughputs = np.array(real_throughputs)
r, p = pearsonr(simulated_throughputs, real_throughputs)
print(f"Pearson's correlation: {r} (p={p})")
r, p = spearmanr(simulated_throughputs, real_throughputs)
print(f"Spearman's correlation: {r} (p={p})")

# Degree-1 least-squares fit: real ~= m * simulated + b.
m, b = np.polyfit(simulated_throughputs, real_throughputs, 1)
x_new = np.linspace(simulated_throughputs.min(), simulated_throughputs.max(), 500)
# The fit is already a straight line, so evaluate it directly. Routing it
# through an interpolator adds nothing and is fragile when several
# configurations share the same simulated throughput.
y_smooth = m * x_new + b
plt.plot(x_new, y_smooth, label="Best fit")
plt.plot(x_new, x_new, label="Ideal")

# Marker area encodes model size, mapped linearly onto [32, 256].
scaled_model_sizes = np.array(model_sizes, dtype=np.float32)
model_size_span = np.max(model_sizes) - np.min(model_sizes)
scaled_model_sizes -= np.min(model_sizes)
if model_size_span > 0:
    scaled_model_sizes *= 1.0 / model_size_span
# When every model has the same size the offsets are all zero, so the division
# is skipped and all markers get the minimum area instead of NaN.
scaled_model_sizes *= 224
scaled_model_sizes += 32
plt.scatter(
    simulated_throughputs,
    real_throughputs,
    s=scaled_model_sizes,
    facecolors="none",
    edgecolors="b",
)
plt.xlabel("Simulated throughputs (1000 samples / second)")
plt.ylabel("Real throughputs (1000 samples / second)")
plt.legend()
plt.savefig("pipeline_parallel_simulation_performance.pdf", dpi=600, bbox_inches="tight")