In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### DNNStall Figure 5

In [None]:
# Figure 5: throughput (samples/sec) versus number of pre-processing cores,
# one line per model, built from the tab-separated logs found in `folder`.
folder = "./rtx6000/"
batch_size = 256
epoch_size = 2560
batches_per_epoch = epoch_size // batch_size

# model name -> list of (cores, samples_per_sec) points
runs = {}

for filename in os.listdir(folder):
    path = os.path.join(folder, filename)
    # Skip sub-directories and hidden entries (e.g. .DS_Store); they are not
    # logs and would break the file-name parsing below.
    if filename.startswith('.') or not os.path.isfile(path):
        continue

    # File names are dash-separated: field 0 is the model, field 2 the core count.
    name = filename.split('-')
    model = name[0]
    cores = int(name[2])

    # `sep` must be given by keyword: pandas >= 2.0 no longer accepts it
    # positionally. The epoch time is the sum over every cell of the table.
    epoch_time = pd.read_csv(path, sep='\t').sum().sum()
    samples_per_sec = epoch_size / epoch_time

    runs.setdefault(model, []).append((cores, samples_per_sec))

for model, data in runs.items():
    # Sort by core count so each line is drawn left to right.
    x, y = zip(*sorted(data))
    plt.plot(x, y, label=model, marker='o')

plt.title("RTX6000 Pre-process Cores vs. Performance")
plt.ylabel("Samples/sec")
plt.xlabel("Cores")
plt.legend()
plt.show()

### DNNStall Figure 4

In [None]:
# Figure 4: stacked epoch-time breakdown (gpu / cpu2gpu / io) for each log in
# `folder`, ordered by the numeric value of the file name's "cached" field.
folder = "./data/p100-resnet18/"
n_epochs = 1

# One tuple per log: (sort key, bar label, io_time, cpu2gpu_time, gpu_time)
runs = []

for filename in os.listdir(folder):
    path = os.path.join(folder, filename)
    # Skip sub-directories and hidden entries (e.g. .DS_Store); they are not
    # logs and would break the file-name parsing below.
    if filename.startswith('.') or not os.path.isfile(path):
        continue

    # Dash-separated file name; the odd-numbered fields read here hold values.
    name = filename.split('-')
    model = name[0]
    batch_size = name[1]
    workers = name[3]
    memory = name[5]
    cached = name[7]

    # `sep` must be given by keyword: pandas >= 2.0 no longer accepts it
    # positionally.
    data = pd.read_csv(path, sep='\t')
    io_time = sum(data['io_time']) / n_epochs
    cpu2gpu_time = sum(data['cpu2gpu_time']) / n_epochs
    gpu_time = sum(data['gpu_time']) / n_epochs

    # The last character of `cached` is dropped before the numeric conversion
    # (presumably a trailing '%' -- confirm against the log file names).
    cached_value = float(cached[:-1])
    label = cached + '\n(' + memory + ')'
    runs.append((cached_value, label, io_time, cpu2gpu_time, gpu_time))

# Sorting the tuples orders the bars by ascending cached value.
_, cached, io, cpu2gpu, gpu = zip(*sorted(runs))

# Stack from the bottom up: gpu, then cpu2gpu, then io on top.
plt.bar(cached, io, bottom=np.array(cpu2gpu) + np.array(gpu), label='io')
plt.bar(cached, cpu2gpu, bottom=gpu, label='cpu2gpu')
plt.bar(cached, gpu, label='gpu')
plt.title("P100 Memory vs. Performance (resnet18)")
plt.ylabel("Epoch time (s)")
plt.xlabel("Cache %")
plt.legend()
plt.show()

### DNNStall Figure 6 (prep stall)

In [None]:
# Figure 6: share of epoch time spent in io_time for each model, shown as a
# bar chart sorted from the largest share to the smallest.
folder = "./data/p100-fig6/"
n_epochs = 1

# (percentage, model nickname) pairs for the bar chart.
prep_stall_pct = []
# model nickname -> absolute io_time; read again by the fetch-stall cell.
prep_stall_abs = {}

# Short labels for the x axis, keyed by the model name used in the file names.
nicknames = {
    "alexnet": "AN",
    "mobilenet_v2": "MN",
    "resnet18": "RN18",
    "shufflenet_v2_x0_5": "ShN",
    "squeezenet1_0": "SqN",
    "vgg11": "V11",
}

for filename in os.listdir(folder):
    path = os.path.join(folder, filename)
    # Skip sub-directories and hidden entries (e.g. .DS_Store); they are not
    # logs and would break the file-name parsing below.
    if filename.startswith('.') or not os.path.isfile(path):
        continue

    # Dash-separated file name; field 0 is the model (must be in `nicknames`).
    name = filename.split('-')
    model = nicknames[name[0]]
    batch_size = name[1]
    workers = name[3]
    memory = name[5]
    cached = name[7]

    # `sep` must be given by keyword: pandas >= 2.0 no longer accepts it
    # positionally.
    data = pd.read_csv(path, sep='\t')
    io_time = sum(data['io_time']) / n_epochs
    cpu2gpu_time = sum(data['cpu2gpu_time']) / n_epochs
    gpu_time = sum(data['gpu_time']) / n_epochs
    total_time = io_time + cpu2gpu_time + gpu_time

    prep_stall_pct.append((100 * io_time / total_time, model))
    prep_stall_abs[model] = io_time

# Largest stall first.
times, models = zip(*sorted(prep_stall_pct, reverse=True))

plt.bar(models, times)
plt.title("P100 Prep Stalls")
plt.ylabel("Prep Stall % of Epoch Time")
plt.xlabel("Model")
plt.show()

### DNNStall Figure 3 (fetch stall)

In [None]:
# Figure 3: fetch stall per model, i.e. the io_time measured here minus the
# io_time recorded for the same model in `prep_stall_abs`, as a share of this
# run's epoch time.
# NOTE: `prep_stall_abs` is filled by the Figure 6 (prep stall) cell, which
# must have been run before this one.
folder = "./data/p100-fig3/"
n_epochs = 1

# Short labels for the x axis, keyed by the model name used in the file names.
# Defined before the loop that uses it so this cell does not silently depend
# on another cell's copy of the mapping.
nicknames = {
    "alexnet": "AN",
    "mobilenet_v2": "MN",
    "resnet18": "RN18",
    "shufflenet_v2_x0_5": "ShN",
    "squeezenet1_0": "SqN",
    "vgg11": "V11",
}

# (percentage, model nickname) pairs for the bar chart.
fetch_stall_pct = []

for filename in os.listdir(folder):
    path = os.path.join(folder, filename)
    # Skip sub-directories and hidden entries (e.g. .DS_Store); they are not
    # logs and would break the file-name parsing below.
    if filename.startswith('.') or not os.path.isfile(path):
        continue

    # Dash-separated file name; field 0 is the model (must be in `nicknames`).
    name = filename.split('-')
    model = nicknames[name[0]]
    batch_size = name[1]
    workers = name[3]
    memory = name[5]
    cached = name[7]

    # `sep` must be given by keyword: pandas >= 2.0 no longer accepts it
    # positionally.
    data = pd.read_csv(path, sep='\t')
    io_time = sum(data['io_time']) / n_epochs
    cpu2gpu_time = sum(data['cpu2gpu_time']) / n_epochs
    gpu_time = sum(data['gpu_time']) / n_epochs
    total_time = io_time + cpu2gpu_time + gpu_time

    fetch_stall = io_time - prep_stall_abs[model]
    fetch_stall_pct.append((100 * fetch_stall / total_time, model))

# Largest stall first.
times, models = zip(*sorted(fetch_stall_pct, reverse=True))

plt.bar(models, times)
plt.title("P100 Fetch Stalls (~38% cached)")
plt.ylabel("Fetch Stall % of Epoch Time")
plt.xlabel("Model")
plt.show()

### Worker/Mem Heatmap

In [None]:
# Build the two (memory limit x worker count) grids used by the heatmap cells:
# `out_cached` holds the cached value parsed from each file name and
# `out_timing` the fraction of epoch time spent in io_time.
folder = "./data/p100-worker-mem-alexnet/"
n_epochs = 1

# Row (memory) and column (worker) labels, in plotting order. The values parsed
# from the file names must match these strings exactly.
mem_cfgs = ["4G", "6G", "8G", "10G", "12G", "14G", "16G", "18G", "20G", "22G", "24G"]
worker_cfgs = ["1", "2", "4", "6", "8", "10", "12"]

out_cached = np.zeros((len(mem_cfgs), len(worker_cfgs)), dtype=float)
out_timing = np.zeros((len(mem_cfgs), len(worker_cfgs)), dtype=float)

for filename in os.listdir(folder):
    path = os.path.join(folder, filename)
    # Skip sub-directories and hidden entries (e.g. .DS_Store); they are not
    # logs and would break the file-name parsing below.
    if filename.startswith('.') or not os.path.isfile(path):
        continue

    # Dash-separated file name; the fields read here hold the configuration.
    name = filename.split('-')
    model = name[0]
    batch_size = name[1]
    workers = name[3]
    memory = name[5]
    cached = name[7]

    # `sep` must be given by keyword: pandas >= 2.0 no longer accepts it
    # positionally.
    data = pd.read_csv(path, sep='\t')
    io_time = sum(data['io_time']) / n_epochs
    cpu2gpu_time = sum(data['cpu2gpu_time']) / n_epochs
    gpu_time = sum(data['gpu_time']) / n_epochs
    total_time = io_time + cpu2gpu_time + gpu_time

    # Look the grid position up once; .index() raises ValueError for a
    # configuration that is not listed above.
    row = mem_cfgs.index(memory)
    col = worker_cfgs.index(workers)

    # The last character of `cached` is dropped before the numeric conversion
    # (presumably a trailing '%' -- confirm against the log file names).
    out_cached[row, col] = float(cached[:-1])
    out_timing[row, col] = io_time / total_time

# Configurations without data, written as (worker index, memory index) -- the
# same (x, y) order the plotting cells test against. NaN cells are drawn with
# the colormap's "bad" colour.
missing_data = [(4, 0), (5, 0), (6, 0), (5, 1), (6, 1)]
for i, j in missing_data:
    out_cached[j, i] = float('nan')
    out_timing[j, i] = float('nan')

In [None]:
# Caching plot
# Heatmap of `out_cached` (rows: memory limits, columns: worker counts) with
# the value written in every cell; saved to cached.png.

fig, ax = plt.subplots(figsize=(8, 8))
# plt.cm.get_cmap was removed in matplotlib 3.9; plt.get_cmap is the supported
# spelling. Work on a copy so set_bad() never alters a shared colormap.
cmap = plt.get_cmap('RdYlGn').copy()
cmap.set_bad(color='gray')  # colour used for NaN (missing) cells
im = ax.imshow(out_cached, cmap=cmap)

ax.set_yticks(np.arange(len(mem_cfgs)))
ax.set_yticklabels(mem_cfgs)

ax.set_xticks(np.arange(len(worker_cfgs)))
ax.set_xticklabels(worker_cfgs)

# Annotate each cell. `missing_data` stores (worker index, memory index), hence
# the (j, i) lookup for row i / column j.
for i in range(len(mem_cfgs)):
    for j in range(len(worker_cfgs)):
        label = "N/A" if (j, i) in missing_data else out_cached[i, j]
        ax.text(j, i, label, ha="center", va="center", color="black")

ax.set_title("% Cached")
ax.set_xlabel("Workers")
ax.set_ylabel("Memory Limit")

fig.savefig("cached.png", dpi=100)

In [None]:
# Timing plot
# Heatmap of `out_timing` (rows: memory limits, columns: worker counts) with
# the percentage written in every cell; saved to data_stall.png.

fig, ax = plt.subplots(figsize=(8, 8))
# plt.cm.get_cmap was removed in matplotlib 3.9; plt.get_cmap is the supported
# spelling. reversed() returns a new colormap, so set_bad() only affects it.
cmap = plt.get_cmap('RdYlGn').reversed()
cmap.set_bad(color='gray')  # colour used for NaN (missing) cells
im = ax.imshow(out_timing, cmap=cmap)

ax.set_yticks(np.arange(len(mem_cfgs)))
ax.set_yticklabels(mem_cfgs)

ax.set_xticks(np.arange(len(worker_cfgs)))
ax.set_xticklabels(worker_cfgs)

# Annotate each cell. `missing_data` stores (worker index, memory index), hence
# the (j, i) lookup for row i / column j.
for i in range(len(mem_cfgs)):
    for j in range(len(worker_cfgs)):
        if (j, i) in missing_data:
            label = "N/A"
        else:
            # Fraction -> percentage with one decimal place.
            label = round(out_timing[i, j] * 100, 1)
        ax.text(j, i, label, ha="center", va="center", color="black")

ax.set_title("Data Stall % of Epoch Time")
ax.set_xlabel("Workers")
ax.set_ylabel("Memory Limit")

fig.savefig("data_stall.png", dpi=100)