In [7]:
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torchvision
import matplotlib.pyplot as plt
import torch._dynamo
from collections import defaultdict
from functools import reduce
from PIL import Image
from sklearn.model_selection import train_test_split

In [15]:
def load_adam_image(df, idx, root='./drive/MyDrive/colab/datasets/ADAM/Training1200'):
	image_name = df.loc[idx, 'imgName']
	data_type = 'AMD' if image_name.startswith('A') else 'Non-AMD'
	image_path = f'{root}/{data_type}/{image_name}'
	image = Image.open(image_path)
	bbox = (df.loc[idx, 'Fovea_X'], df.loc[idx, 'Fovea_Y'])
	return image, bbox


def build_adam_dataset(df, image_size=256):
  imgs, bboxes = [], []

  for idx in df.index:
    img, bbox = load_adam_image(df, idx)

    img_arr = np.array(img).transpose((2,0,1)).astype(np.float32) / 255 # uint8::max
    imgs.append(img_arr)

    bbox_arr = np.array(bbox).astype(np.float32) / image_size
    bboxes.append(bbox_arr)

  imgs = map(lambda x: torch.tensor(x, dtype=torch.float32), imgs)
  bboxes = map(lambda x: torch.tensor(x, dtype=torch.float32), bboxes)

  return [(i, b) for i, b in zip(imgs, bboxes)]


def get_adam_loaders_from_memory(batch_size, test_batch_size=None, cutoff=1, root='../../datasets/ADAM/Training1200'):
	if test_batch_size is None: test_batch_size = batch_size * 2

	fovea_df = pd.read_csv(f'{root}/fovea_location.csv').drop(['ID'], axis=1)
	train_df, test_df = train_test_split(fovea_df, test_size=1-cutoff, shuffle=True)

	train_ds = build_adam_dataset(train_df)
	test_ds = build_adam_dataset(test_df)

	train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
	test_dl = torch.utils.data.DataLoader(test_ds, batch_size=test_batch_size, shuffle=False, num_workers=2)

	return train_dl, test_dl


class ResBlock(nn.Module):
  def __init__(self, in_channels, out_channels):
    super().__init__()
    self.base1 = nn.Sequential(
      nn.Conv2d(in_channels, in_channels, kernel_size=3, padding='same'),
      nn.BatchNorm2d(in_channels),
      nn.ReLU(True)
    )
    self.base2 = nn.Sequential(
      nn.Conv2d(in_channels, out_channels, kernel_size=3, padding='same'),
      nn.BatchNorm2d(out_channels),
      nn.ReLU(True)
    )
    self.mpool = nn.MaxPool2d(2)

  def forward(self, x):
    x = self.base1(x) + x
    x = self.base2(x)
    x = self.mpool(x)
    return x


class SODNet(nn.Module):
  def __init__(self, in_channels, first_output_channels):
    super().__init__()
    self.main = nn.Sequential(
      ResBlock(in_channels, first_output_channels),
      ResBlock(first_output_channels, 2 * first_output_channels),
      ResBlock(2 * first_output_channels, 4 * first_output_channels),
      ResBlock(4 * first_output_channels, 8 * first_output_channels),

      nn.Conv2d(8 * first_output_channels, 16 * first_output_channels, kernel_size=3),
      nn.MaxPool2d(2),
      nn.Flatten(),
      nn.Linear(7 * 7 * 16 * first_output_channels, 2)
    )

  def forward(self, x):
    return self.main(x)


def fit_sodnet_step(model, image_batch, bbox_batch, loss_func, optimizer):
  optimizer.zero_grad()
  output = model(image_batch)
  loss = loss_func(output, bbox_batch)
  loss.backward()
  optimizer.step()
  return loss.item()


def latency(model, sample):
	start = torch.cuda.Event(enable_timing=True)
	end = torch.cuda.Event(enable_timing=True)
	start.record()
	_ = model(sample)
	end.record()
	torch.cuda.synchronize()
	return start.elapsed_time(end)

In [4]:
if torch.cuda.is_available():
    print('device count:', torch.cuda.device_count())
    device = torch.device(0)
    device_cap = torch.cuda.get_device_capability()
    print(f"GPU {torch.cuda.get_device_name(0)} available with compatibility {device_cap}")
    if device_cap not in ((7, 0), (8, 0), (9, 0)):
        print("GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.")
else:
    device = torch.device("cpu")
    print("GPU unavailable")

device count: 1
GPU Tesla T4 available with compatibility (7, 5)
GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.


In [5]:
epochs = 8
lr = 1e-2
timestamp = time.time_ns()
results_eager_filepath = f'./drive/MyDrive/colab/results/pytorch-sodnet-eager-{timestamp}.csv'
results_compile_filepath = f'./drive/MyDrive/colab/results/pytorch-sodnet-compile-{timestamp}.csv'
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

## Teaining - eager mode

In [8]:
telemetry_eager = defaultdict(list)

train_dl, test_dl = get_adam_loaders_from_memory(8, cutoff=0.8, root='./drive/MyDrive/colab/datasets/ADAM/Training1200')
model = SODNet(3, 16).to(device)
model.train()
loss_func = nn.SmoothL1Loss(reduction="sum")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

for epoch in range(1, epochs + 1):
  train_loss = 0.0
  start.record()
  for image_batch, bbox_batch in train_dl:
    image_batch, bbox_batch = image_batch.to(device), bbox_batch.to(device)
    train_loss += fit_sodnet_step(model, image_batch, bbox_batch, loss_func, optimizer)
  end.record()
  torch.cuda.synchronize()

  telemetry_eager['model_name'].append('SODNet')
  telemetry_eager['phase'].append('training')
  telemetry_eager['epoch'].append(epoch)
  telemetry_eager['loss'].append(train_loss / len(train_dl))
  telemetry_eager['performance'].append(-1)
  telemetry_eager['elapsed_time'].append(start.elapsed_time(end) * 1e6)
  pd.DataFrame(telemetry_eager).to_csv(results_eager_filepath, index=False)
  print(f'Epoch {epoch} finished with execution time of {start.elapsed_time(end) / 1e3}s')

Epoch 1 finished with execution time of 6.06935498046875s
Epoch 2 finished with execution time of 4.29643505859375s
Epoch 3 finished with execution time of 4.38639599609375s
Epoch 4 finished with execution time of 3.825910888671875s
Epoch 5 finished with execution time of 4.44194091796875s
Epoch 6 finished with execution time of 4.41042626953125s
Epoch 7 finished with execution time of 3.82102197265625s
Epoch 8 finished with execution time of 4.23613671875s


## Latency - both modes

In [11]:
telemetry_compile = defaultdict(list)

In [20]:
warmup = 10
adam, _ = get_adam_loaders_from_memory(epochs + warmup + 1, cutoff=0.8, root='./drive/MyDrive/colab/datasets/ADAM/Training1200')
adam = next(iter(adam))[0].to(device)
model = SODNet(3, 16).to(device)
torch._dynamo.reset()
model_comp = torch.compile(model, mode='reduce-overhead')
model.eval()

telemetry_eager_times = []
telemetry_compile_times = []

# compilation
with torch.no_grad():
  e = latency(model, adam[-1].unsqueeze(0))
  print('compilation - eager mode:', e)
  telemetry_eager_times.append(e)

  c = latency(model_comp, adam[-1].unsqueeze(0))
  print('compilation - compile mode:', c)
  telemetry_compile_times.append(c)

  for i in range(epochs + warmup):
    # warmup
    e = latency(model, adam[i].unsqueeze(0))
    c = latency(model_comp, adam[i].unsqueeze(0))

    # latency
    if i >= warmup:
        telemetry_eager_times.append(e)
        telemetry_compile_times.append(c)

em = np.median(telemetry_eager_times)
cm = np.median(telemetry_compile_times)
print(f'median exec time (e/c): {em} / {cm}')
print(f'compiled graph is on average {em / cm} times faster than eager execution')


# telemetry
for telemetry, telemetry_times in [(telemetry_eager, telemetry_eager_times),
                                   (telemetry_compile, telemetry_compile_times)]:
  telemetry['model_name'].extend(["SODNet"] * (epochs + 1))
  telemetry['phase'].extend(['graph_compilation'] + ['latency'] * epochs)
  telemetry['epoch'].extend([1] + list(range(1, epochs + 1)))
  telemetry['loss'].extend([-1] * (epochs + 1))
  telemetry['performance'].extend([-1] * (epochs + 1))
  telemetry['elapsed_time'].extend(telemetry_times)

pd.DataFrame(telemetry_compile).to_csv(results_compile_filepath, index=False)
pd.DataFrame(telemetry_eager).to_csv(results_eager_filepath, index=False)

compilation - eager mode: 29.676544189453125
compilation - compile mode: 16316.23046875
median exec time (e/c): 2.8970561027526855 / 0.8697919845581055
compiled graph is on average 3.3307459187779522 times faster than eager execution


## Latency on batch - both modes

In [23]:
warmup = 10
adam, _ = get_adam_loaders_from_memory(8, cutoff=0.8, root='./drive/MyDrive/colab/datasets/ADAM/Training1200')
adam = iter(adam)
model = SODNet(3, 16).to(device)
torch._dynamo.reset()
model_comp = torch.compile(model, mode='reduce-overhead')
model.eval()

telemetry_eager_times = []
telemetry_compile_times = []

# compilation
with torch.no_grad():
  batch = next(adam)[0].to(device)
  e = latency(model, batch)
  print('compilation - eager mode:', e)
  telemetry_eager_times.append(e)

  c = latency(model_comp, batch)
  print('compilation - compile mode:', c)
  telemetry_compile_times.append(c)

  for i in range(epochs + warmup):
    batch = next(adam)[0].to(device)
    # warmup
    e = latency(model, batch)
    c = latency(model_comp, batch)

    # latency
    if i >= warmup:
        telemetry_eager_times.append(e)
        telemetry_compile_times.append(c)

em = np.median(telemetry_eager_times)
cm = np.median(telemetry_compile_times)
print(f'median exec time (e/c): {em} / {cm}')
print(f'compiled graph is on average {em / cm} times faster than eager execution')


# telemetry
for telemetry, telemetry_times in [(telemetry_eager, telemetry_eager_times),
                                   (telemetry_compile, telemetry_compile_times)]:
  telemetry['model_name'].extend(["SODNet"] * (epochs + 1))
  telemetry['phase'].extend(['graph_compilation_batch'] + ['latency_batch'] * epochs)
  telemetry['epoch'].extend([1] + list(range(1, epochs + 1)))
  telemetry['loss'].extend([-1] * (epochs + 1))
  telemetry['performance'].extend([-1] * (epochs + 1))
  telemetry['elapsed_time'].extend(telemetry_times)

pd.DataFrame(telemetry_compile).to_csv(results_compile_filepath, index=False)
pd.DataFrame(telemetry_eager).to_csv(results_eager_filepath, index=False)

compilation - eager mode: 8.954272270202637
compilation - compile mode: 7802.685546875
median exec time (e/c): 4.854368209838867 / 4.329376220703125
compiled graph is on average 1.121262732174955 times faster than eager execution


## Training - compile mode

In [24]:
train_dl, test_dl = get_adam_loaders_from_memory(8, cutoff=0.8, root='./drive/MyDrive/colab/datasets/ADAM/Training1200')
model = SODNet(3, 16).to(device)
torch._dynamo.reset()
model_comp = torch.compile(model, mode='reduce-overhead')
model.train()
loss_func = nn.SmoothL1Loss(reduction="sum")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
fit_sodnet_step_comp = torch.compile(fit_sodnet_step, mode="reduce-overhead")

for epoch in range(1, epochs + 1):
  train_loss = 0.0
  start.record()
  for image_batch, bbox_batch in train_dl:
    image_batch, bbox_batch = image_batch.to(device), bbox_batch.to(device)
    train_loss += fit_sodnet_step_comp(model_comp, image_batch, bbox_batch, loss_func, optimizer)
  end.record()
  torch.cuda.synchronize()

  telemetry_compile['model_name'].append('SODNet')
  telemetry_compile['phase'].append('training')
  telemetry_compile['epoch'].append(epoch)
  telemetry_compile['loss'].append(train_loss / len(train_dl))
  telemetry_compile['performance'].append(-1)
  telemetry_compile['elapsed_time'].append(start.elapsed_time(end) * 1e6)
  pd.DataFrame(telemetry_compile).to_csv(results_compile_filepath, index=False)
  print(f'Epoch {epoch} finished with execution time of {start.elapsed_time(end) / 1e3}s')

Epoch 1 finished with execution time of 108.7909375s
Epoch 2 finished with execution time of 6.79148095703125s
Epoch 3 finished with execution time of 4.13099462890625s
Epoch 4 finished with execution time of 3.879884521484375s
Epoch 5 finished with execution time of 4.067753173828125s
Epoch 6 finished with execution time of 4.41709521484375s
Epoch 7 finished with execution time of 3.645645751953125s
Epoch 8 finished with execution time of 3.629512451171875s
