In [6]:
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision.models import resnet50, densenet121, mobilenet_v2, convnext_tiny
from torch.utils.data import TensorDataset, DataLoader
import torchvision
import matplotlib.pyplot as plt
import torch._dynamo
from collections import defaultdict
from functools import reduce

In [7]:
def load_mnist_imgs_and_labels(imgs_path, labels_path) -> tuple[np.ndarray, np.ndarray]:
	i_hand = open(imgs_path, 'rb')
	l_hand = open(labels_path, 'rb')

	i_hand.seek(4, 0) # skipping "magic" numbers
	l_hand.seek(4, 0)

	n_imgs = int.from_bytes(i_hand.read(4), 'big')

	imgs = np.frombuffer(i_hand.read(), np.uint8, offset=8)
	imgs = (255 - imgs) / 255
	imgs = imgs.reshape(n_imgs, 28 * 28)

	labels = np.frombuffer(l_hand.read(), np.uint8, offset=4)

	i_hand.close()
	l_hand.close()

	return imgs, labels


def get_mnist_loaders(batch_size, test_batch_size=None, cutoff=1, flatten=True):
	if test_batch_size is None: test_batch_size = batch_size * 2

	x_train, y_train = load_mnist_imgs_and_labels(
		'./drive/MyDrive/colab/datasets/mnist-digits/train-images-idx3-ubyte',
		'./drive/MyDrive/colab/datasets/mnist-digits/train-labels-idx1-ubyte'
	)

	x_train, x_val = np.split(x_train, [int(len(x_train) * cutoff)])
	y_train, y_val = np.split(y_train, [int(len(y_train) * cutoff)])

	x_test, y_test = load_mnist_imgs_and_labels(
		'./drive/MyDrive/colab/datasets/mnist-digits/t10k-images-idx3-ubyte',
		'./drive/MyDrive/colab/datasets/mnist-digits/t10k-labels-idx1-ubyte'
	)

	if not flatten:
		x_train, x_test = map(
			lambda x: x.reshape(-1, 1, 28, 28),
			(x_train, x_test)
		)

	x_train, y_train, x_val, y_val, x_test, y_test = map(
		torch.tensor,
		(x_train, y_train, x_val, y_val, x_test, y_test)
	)

	x_train, x_val, x_test = map(
		lambda x: x.to(torch.float32),
		(x_train, x_val, x_test)
	)

	y_train, y_val, y_test = map(
			lambda y: y.to(torch.int64),
			(y_train, y_val, y_test)
		)

	train_ds = TensorDataset(x_train, y_train)
	val_ds = TensorDataset(x_val, y_val)
	test_ds = TensorDataset(x_test, y_test)

	train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
	val_dl = DataLoader(val_ds, batch_size=test_batch_size)
	test_dl = DataLoader(test_ds, batch_size=test_batch_size)

	return train_dl, val_dl, test_dl


def get_cifar10_loaders(batch_size, test_batch_size=None, image_size=32):
	if test_batch_size is None: test_batch_size = batch_size * 2

	transform = torchvision.transforms.Compose([
		torchvision.transforms.Resize(image_size),
		torchvision.transforms.ToTensor(),
		torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
	])

	train_ds = torchvision.datasets.CIFAR10(root='./drive/MyDrive/colab/datasets/cifar-10-py/', train=True, download=True, transform=transform)
	test_ds = torchvision.datasets.CIFAR10(root='./drive/MyDrive/colab/datasets/cifar-10-py/', train=False, download=True, transform=transform)

	train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2, drop_last=True)
	test_dl = torch.utils.data.DataLoader(test_ds, batch_size=test_batch_size, shuffle=False, num_workers=2, drop_last=True)

	return train_dl, test_dl


def fit_step(model, xb, yb, loss_func, optimizer):
	optimizer.zero_grad()
	pred = model(xb)
	loss = loss_func(pred, yb)
	loss.backward()
	optimizer.step()
	return loss.item()


def test_step(model, xb, yb, loss_func):
	pred = model(xb)
	test_loss = loss_func(pred, yb, reduction='sum').item()
	pred = pred.argmax(dim=1, keepdim=True)
	correct_pred = pred.eq(yb.view_as(pred)).sum().item()
	return test_loss, correct_pred


def test(model, device, loader, loss_func, start, end, test_step_func) -> tuple[float, float, float]:
	model.eval()
	test_loss = 0.0
	correct_pred = 0

	start.record()
	with torch.no_grad():
		for xb, yb in loader:
			xb, yb = xb.to(device), yb.to(device)
			tloss, cpred = test_step_func(model, xb, yb, loss_func)
			test_loss += tloss
			correct_pred += cpred
	end.record()
	torch.cuda.synchronize()

	test_loss /= len(loader.dataset)
	correct_pred /= len(loader.dataset)
	elapsed_time = start.elapsed_time(end)

	return test_loss, correct_pred, elapsed_time


class FullyConnectedNet(nn.Module):

	def __init__(self, layers=[784, 800, 10]):
		super(FullyConnectedNet, self).__init__()
		self.layers = nn.ModuleList([nn.Linear(a, b) for a, b in zip(layers[:-1], layers[1:])])

	def forward(self, x):
		for layer in self.layers[:-1]:
			x = F.relu(layer(x))
		x = self.layers[-1](x)
		return F.log_softmax(x, dim=1)


class SimpleConvNet(nn.Module):

	def __init__(self, num_classes=10):
		super().__init__()
		self.conv1 = nn.Sequential(
			nn.Conv2d(1, 16, 5, 1, 2),
			nn.ReLU(),
			nn.MaxPool2d(2)
		)
		self.conv2 = nn.Sequential(
			nn.Conv2d(16, 32, 5, 1, 2),
			nn.ReLU(),
			nn.MaxPool2d(2),
		)
		self.dense = nn.Linear(32 * 7 * 7, 500)
		self.classifier = nn.Linear(500, num_classes)

	def forward(self, x):
		x = self.conv1(x)
		x = self.conv2(x)
		x = torch.flatten(x, 1)
		x = F.relu(self.dense(x))
		return F.log_softmax(self.classifier(x), dim=1)


def env_builder(name: str, num_classes: int, batch_size: int, test_batch_size: int):
	if name == 'FullyConnectedNet':
		model = FullyConnectedNet()
	elif name == 'SimpleConvNet':
		model = SimpleConvNet()
	elif name == 'ResNet-50':
		model = resnet50()
		model.fc = nn.Linear(in_features=2048, out_features=num_classes, bias=True)
	elif name == 'DenseNet-121':
		model = densenet121()
		model.classifier = nn.Linear(in_features=1024, out_features=num_classes, bias=True)
	elif name == 'MobileNet-v2':
		model = mobilenet_v2()
		model.classifier[1] = nn.Linear(in_features=1280, out_features=num_classes, bias=True)
	elif name == 'ConvNeXt-Tiny':
		model = convnext_tiny()
		model.classifier[2] = nn.Linear(in_features=768, out_features=num_classes, bias=True)
	else:
		raise ValueError('Invalid model name')

	if name == 'FullyConnectedNet':
		train_dl, _, test_dl = get_mnist_loaders(batch_size, test_batch_size)
		loss_func = F.nll_loss
	elif name == 'SimpleConvNet':
		train_dl, _, test_dl = get_mnist_loaders(batch_size, test_batch_size, flatten=False)
		loss_func = F.nll_loss
	else:
		train_dl, test_dl = get_cifar10_loaders(batch_size, test_batch_size)
		loss_func = F.cross_entropy

	return model, train_dl, test_dl, loss_func

def latency(model, sample):
	start = torch.cuda.Event(enable_timing=True)
	end = torch.cuda.Event(enable_timing=True)
	start.record()
	_ = model(sample)
	end.record()
	torch.cuda.synchronize()
	return start.elapsed_time(end)

In [8]:
if torch.cuda.is_available():
    print('device count:', torch.cuda.device_count())
    device = torch.device(0)
    device_cap = torch.cuda.get_device_capability()
    print(f"GPU {torch.cuda.get_device_name(0)} available with compatibility {device_cap}")
    if device_cap not in ((7, 0), (8, 0), (9, 0)):
        print("GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.")
else:
    device = torch.device("cpu")
    print("GPU unavailable")

device count: 1
GPU Tesla T4 available with compatibility (7, 5)
GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.


In [9]:
batch_size = 96
test_batch_size = 128
epochs = 2
lr = 1e-2
momentum = 0.9
num_classes = 10
log_interval = 200
timestamp = time.time_ns()
results_eager_filepath = f'./drive/MyDrive/colab/results/pytorch-clf-eager-{timestamp}.csv'
results_compile_filepath = f'./drive/MyDrive/colab/results/pytorch-clf-compile-{timestamp}.csv'
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
# clfs = ['FullyConnectedNet', 'SimpleConvNet', 'ResNet-50', 'DenseNet-121', 'MobileNet-v2', 'ConvNeXt-Tiny']
clfs = ['FullyConnectedNet', 'ConvNeXt-Tiny']

## Training - eager mode

In [5]:
telemetry_eager = defaultdict(list)

# train all models - no comp
for model_name in clfs:
  print(f'Eager benchmarks for {model_name} begin')

  model, train_dl, test_dl, loss_func = env_builder(model_name, num_classes, batch_size, test_batch_size)
  model = model.to(device)
  opt = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

  for epoch in range(1, epochs + 1):
    running_loss = 0.0
    start.record()
    for xb, yb in train_dl:
      xb, yb = xb.to(device), yb.to(device)
      running_loss += fit_step(model, xb, yb, loss_func, opt)
    end.record()
    torch.cuda.synchronize()

    telemetry_eager['model_name'].append(model_name)
    telemetry_eager['phase'].append('training')
    telemetry_eager['epoch'].append(epoch)
    telemetry_eager['loss'].append(running_loss / len(train_dl))
    telemetry_eager['performance'].append(-1)
    telemetry_eager['elapsed_time'].append(start.elapsed_time(end) * 1e6)
    pd.DataFrame(telemetry_eager).to_csv(results_eager_filepath, index=False)
    print(f'Epoch {epoch} finished with execution time of {start.elapsed_time(end) / 1e3}s')

Eager benchmarks for FullyConnectedNet begin
Epoch 1 finished with execution time of 1.964793212890625s
Epoch 2 finished with execution time of 1.7931392822265626s
Eager benchmarks for ConvNeXt-Tiny begin
Files already downloaded and verified
Files already downloaded and verified


  self.pid = os.fork()
  self.pid = os.fork()


Epoch 1 finished with execution time of 37.22212890625s
Epoch 2 finished with execution time of 38.20326953125s


In [13]:
from pathlib import Path
Path('./drive/MyDrive/colab/datasets/celeba_tiny').absolute()
next(iter(Path('./drive/MyDrive/colab/datasets/celeba_tiny').glob('*')))

PosixPath('drive/MyDrive/colab/datasets/celeba_tiny/img_align_celeba')

## Latency - both modes

In [None]:
# setup for compile training but its needed for saving latency results too
telemetry_compile = defaultdict(list)
# telemetry_eager = defaultdict(list) # do usuniecia !!!!!!!!!!!!!

telemetry_compile_by_model = []

In [None]:
for k in telemetry_eager.keys():
  print(k, len(telemetry_eager[k]), telemetry_eager[k])

model_name 4 ['FullyConnectedNet', 'FullyConnectedNet', 'ConvNeXt-Tiny', 'ConvNeXt-Tiny']
phase 4 ['training', 'training', 'training', 'training']
epoch 4 [1, 2, 1, 2]
loss 4 [0.5556255227088929, 0.31790154234170914, 1.8624580731758704, 1.7259240872584856]
performance 4 [-1, -1, -1, -1]
elapsed_time 4 [2318406738.28125, 3975225585.9375, 33795152343.75, 33870371093.75]


In [None]:
for model_name in clfs:
  telemetry_eager_times = []
  telemetry_compile_times = []

  # prepare models
  model, _, test_dl, _ = env_builder(model_name, num_classes, batch_size, test_batch_size)
  model = model.to(device)
  torch._dynamo.reset()
  model_comp = torch.compile(model, mode='reduce-overhead')
  # model, model_comp = model.to(device), model_comp.to(device)

  # prepare samples
  batch = next(iter(test_dl))
  batch = batch[0].to(device)
  sample = batch[0].unsqueeze(dim=0)

  # measure compilation time
  with torch.no_grad():
    e = latency(model, sample)
    print('compilation - eager mode:', e)
    telemetry_eager_times.append(e)

    c = latency(model_comp, sample)
    print('compilation - compile mode:', c)
    telemetry_compile_times.append(c)

    # measure latency (with warm-up runs in case of recompilation)
    for i in range(epochs + 10):
      sample = batch[i].unsqueeze(dim=0)

      e = latency(model, sample)
      c = latency(model_comp, sample)

      if i >= 10:
        telemetry_eager_times.append(e)
        telemetry_compile_times.append(c)

    em = np.median(telemetry_eager_times)
    cm = np.median(telemetry_compile_times)
    print(f'median exec time (e/c): {em} / {cm}')
    print(f'compiled graph is on average {em / cm} times faster than eager execution')

  # after completing latency benchmarks, save gathered telemetry
  # lists -> aggregated list -> append to dicts

  telemetry_compile_by_model.append((model_name, telemetry_eager_times, telemetry_compile_times))

compilation - eager mode: 0.6730560064315796
compilation - compile mode: 684.1569213867188
median exec time (e/c): 0.36003199219703674 / 0.3521920144557953
compiled graph is on average 1.0222605210210565 times faster than eager execution
Files already downloaded and verified
Files already downloaded and verified
compilation - eager mode: 23.43926429748535
compilation - compile mode: 11417.072265625
median exec time (e/c): 6.752992153167725 / 2.512576103210449
compiled graph is on average 2.6876766616299004 times faster than eager execution


In [None]:
for model_name, telemetry_eager_times, telemetry_compile_times in telemetry_compile_by_model:

  for telemetry, telemetry_times, results_filepath in [
      (telemetry_eager, telemetry_eager_times, results_eager_filepath),
      (telemetry_compile, telemetry_compile_times, results_compile_filepath)
    ]:
    telemetry['model_name'].append(model_name)
    telemetry['phase'].append('graph_compilation')
    telemetry['epoch'].append(1)
    telemetry['loss'].append(-1)
    telemetry['performance'].append(-1)
    telemetry['elapsed_time'].append(telemetry_times[0])

    telemetry['model_name'].extend([model_name] * epochs)
    telemetry['phase'].extend(['latency'] * epochs)
    telemetry['epoch'].extend(list(range(1, epochs + 1)))
    telemetry['loss'].extend([-1] * epochs)
    telemetry['performance'].extend([-1] * epochs)
    telemetry['elapsed_time'].extend(telemetry_times[1:])

    pd.DataFrame(telemetry).to_csv(results_filepath, index=False)

### Latency on batch - both modes
bc on single sample compile sucks apparently

In [None]:
telemetry_compile_by_model = []

In [None]:
# clfs = ['FullyConnectedNet', 'SimpleConvNet', 'ResNet-50', 'DenseNet-121', 'MobileNet-v2', 'ConvNeXt-Tiny']
for model_name in clfs:
  telemetry_eager_times = []
  telemetry_compile_times = []

  # prepare models
  model, _, test_dl, _ = env_builder(model_name, num_classes, batch_size, test_batch_size)
  model = model.to(device)
  torch._dynamo.reset()
  model_comp = torch.compile(model, mode='reduce-overhead')
  # model, model_comp = model.to(device), model_comp.to(device)

  # prepare samples
  batch = next(iter(test_dl))
  batch = batch[0].to(device)

  # measure compilation time
  with torch.no_grad():
    e = latency(model, batch)
    print('compilation - eager mode:', e)
    telemetry_eager_times.append(e)

    c = latency(model_comp, batch)
    print('compilation - compile mode:', c)
    telemetry_compile_times.append(c)

    # measure latency (with warm-up runs in case of recompilation)
    test_dl_state = iter(test_dl)
    for i in range(epochs + 10):
      batch = next(test_dl_state)
      batch = batch[0].to(device)

      e = latency(model, batch)
      c = latency(model_comp, batch)

      if i >= 10:
        telemetry_eager_times.append(e)
        telemetry_compile_times.append(c)

    em = np.median(telemetry_eager_times)
    cm = np.median(telemetry_compile_times)
    print(f'median exec time (e/c): {em} / {cm}')
    print(f'compiled graph is on average {em / cm} times faster than eager execution')

  # after completing latency benchmarks, save gathered telemetry
  # lists -> append to dicts

telemetry_compile_by_model.append((model_name, telemetry_eager_times, telemetry_compile_times))

compilation - eager mode: 0.9705280065536499
compilation - compile mode: 490.14581298828125
median exec time (e/c): 0.4360960125923157 / 0.4285759925842285
compiled graph is on average 1.0175465264928698 times faster than eager execution
Files already downloaded and verified
Files already downloaded and verified
compilation - eager mode: 24.78780746459961
compilation - compile mode: 19594.263671875
median exec time (e/c): 36.413822174072266 / 8.255359649658203
compiled graph is on average 4.410931045939338 times faster than eager execution


In [None]:
for model_name, telemetry_eager_times, telemetry_compile_times in telemetry_compile_by_model:

  for telemetry, telemetry_times, results_filepath in [
      (telemetry_eager, telemetry_eager_times, results_eager_filepath),
      (telemetry_compile, telemetry_compile_times, results_compile_filepath)
    ]:
    telemetry['model_name'].append(model_name)
    telemetry['phase'].append('graph_compilation_batch')
    telemetry['epoch'].append(1)
    telemetry['loss'].append(-1)
    telemetry['performance'].append(-1)
    telemetry['elapsed_time'].append(telemetry_times[0])

    telemetry['model_name'].extend([model_name] * epochs)
    telemetry['phase'].extend(['latency_batch'] * epochs)
    telemetry['epoch'].extend(list(range(1, epochs + 1)))
    telemetry['loss'].extend([-1] * epochs)
    telemetry['performance'].extend([-1] * epochs)
    telemetry['elapsed_time'].extend(telemetry_times[1:])

    pd.DataFrame(telemetry).to_csv(results_filepath, index=False)

## Training - compile mode
with some special setup since it's rather unstable

In [None]:
telemetry_compile_by_model = []

In [None]:
# clfs = ['FullyConnectedNet', 'SimpleConvNet', 'ResNet-50', 'DenseNet-121', 'MobileNet-v2', 'ConvNeXt-Tiny']
model_name = clfs[1]

In [None]:
tc_part = defaultdict(list)

# train all models - compiled nets and funcs
print(f'Compiled benchmarks for {model_name} begin')

model, train_dl, test_dl, loss_func = env_builder(model_name, num_classes, batch_size, test_batch_size)
model = model.to(device)
torch._dynamo.reset()
model = torch.compile(model, mode='reduce-overhead')
opt = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
fit_step_compiled = torch.compile(fit_step, mode="reduce-overhead")

for epoch in range(1, epochs + 1):
  running_loss = 0.0
  start.record()
  for xb, yb in train_dl:
    xb, yb = xb.to(device), yb.to(device)
    running_loss += fit_step_compiled(model, xb, yb, loss_func, opt)
  end.record()
  torch.cuda.synchronize()

  tc_part['model_name'].append(model_name)
  tc_part['phase'].append('training')
  tc_part['epoch'].append(epoch)
  tc_part['loss'].append(running_loss / len(train_dl))
  tc_part['performance'].append(-1)
  tc_part['elapsed_time'].append(start.elapsed_time(end) * 1e6)
  print(f'Epoch {epoch} finished with execution time of {start.elapsed_time(end) / 1e3}s')
  telemetry_compile_by_model.append(tc_part)

Compiled benchmarks for ConvNeXt-Tiny begin
Files already downloaded and verified
Files already downloaded and verified
Epoch 1 finished with execution time of 66.1982421875s
Epoch 2 finished with execution time of 28.592625s


In [None]:
def reduce_telemetry_dicts(agg_t, update_t):
	for key in update_t.keys():
		agg_t[key].extend(update_t[key])
	return agg_t

telemetry_compile_full = reduce(reduce_telemetry_dicts, telemetry_compile_by_model, telemetry_compile)

pd.DataFrame(telemetry_compile_full).to_csv(results_compile_filepath, index=False)