In [1]:
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision.models import resnet50, densenet121, mobilenet_v2, convnext_tiny
from torch.utils.data import TensorDataset, DataLoader
import torchvision
import matplotlib.pyplot as plt
import torch._dynamo
from collections import defaultdict
from functools import reduce

In [18]:
def get_celeba_loader(batch_size, image_size=64, root='./drive/MyDrive/colab/datasets/celeba'):
    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(image_size),
        torchvision.transforms.CenterCrop(image_size),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    ds = torchvision.datasets.ImageFolder(root=root, transform=transform)
    dl = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=2)

    return dl


def get_celeba_loader_from_memory(batch_size, image_size=64, root='./drive/MyDrive/colab/datasets/celeba'):
	dl = get_celeba_loader(batch_size, image_size=image_size, root=root)
	collected_batches = [batch for batch in dl]
	return collected_batches


def fit_dcgan_step(generator, discriminator, device, real, loss_func, optimizerG, optimizerD, latent_vec_size):
  real_label = 1.
  fake_label = 0.

  discriminator.zero_grad()
  b_size = real.size(0)
  label = torch.full((b_size,), real_label, dtype=torch.float, device=device)

  output = discriminator(real).view(-1)
  errD_real = loss_func(output, label)
  errD_real.backward()
  D_x = output.mean().item()

  noise = torch.randn(b_size, latent_vec_size, 1, 1, device=device)
  fake = generator(noise)
  label.fill_(fake_label)
  output = discriminator(fake.detach()).view(-1)
  errD_fake = loss_func(output, label)
  errD_fake.backward()
  D_G_z1 = output.mean().item()
  errD = errD_real + errD_fake
  optimizerD.step()

  generator.zero_grad()
  label.fill_(real_label)
  output = discriminator(fake).view(-1)
  errG = loss_func(output, label)
  errG.backward()
  D_G_z2 = output.mean().item()
  optimizerG.step()

  return [errG.item(), errD.item(), D_x, D_G_z1, D_G_z2]


def dcgan_weights_init(model):
  classname = model.__class__.__name__
  if classname.find('Conv') != -1:
      nn.init.normal_(model.weight.data, 0.0, 0.02)
  elif classname.find('BatchNorm') != -1:
      nn.init.normal_(model.weight.data, 1.0, 0.02)
      nn.init.constant_(model.bias.data, 0)


class Generator(nn.Module):
    def __init__(self, n_channels, latent_vec_size, feat_map_size):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.ConvTranspose2d(latent_vec_size, feat_map_size * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(feat_map_size * 8),
            nn.ReLU(True),

            nn.ConvTranspose2d(feat_map_size * 8, feat_map_size * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(feat_map_size * 4),
            nn.ReLU(True),

            nn.ConvTranspose2d(feat_map_size * 4, feat_map_size * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(feat_map_size * 2),
            nn.ReLU(True),

            nn.ConvTranspose2d(feat_map_size * 2, feat_map_size, 4, 2, 1, bias=False),
            nn.BatchNorm2d(feat_map_size),
            nn.ReLU(True),

            nn.ConvTranspose2d(feat_map_size, n_channels, 4, 2, 1, bias=False),
            nn.Tanh()
        )

    def forward(self, input):
        return self.main(input)


class Discriminator(nn.Module):
    def __init__(self, n_channels, feat_map_size):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Conv2d(n_channels, feat_map_size, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(feat_map_size, feat_map_size * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(feat_map_size * 2),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(feat_map_size * 2, feat_map_size * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(feat_map_size * 4),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(feat_map_size * 4, feat_map_size * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(feat_map_size * 8),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(feat_map_size * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.main(input)

def latency(model, sample):
	start = torch.cuda.Event(enable_timing=True)
	end = torch.cuda.Event(enable_timing=True)
	start.record()
	_ = model(sample)
	end.record()
	torch.cuda.synchronize()
	return start.elapsed_time(end)

In [3]:
if torch.cuda.is_available():
    print('device count:', torch.cuda.device_count())
    device = torch.device(0)
    device_cap = torch.cuda.get_device_capability()
    print(f"GPU {torch.cuda.get_device_name(0)} available with compatibility {device_cap}")
    if device_cap not in ((7, 0), (8, 0), (9, 0)):
        print("GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.")
else:
    device = torch.device("cpu")
    print("GPU unavailable")

device count: 1
GPU Tesla T4 available with compatibility (7, 5)
GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.


In [4]:
nc = 3
nz = 100
ngf = 64
ndf = 64
latent_vec_size = 100

batch_size = 96
lr = 1e-4
gen_batch_size = 1024
epochs = 2
log_interval = 200
timestamp = time.time_ns()
results_eager_filepath = f'./drive/MyDrive/colab/results/pytorch-dcgan-eager-{timestamp}.csv'
results_compile_filepath = f'./drive/MyDrive/colab/results/pytorch-dcgan-compile-{timestamp}.csv'
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

In [13]:
celeba_dl = get_celeba_loader_from_memory(batch_size=batch_size, root='./drive/MyDrive/colab/datasets/celeba_tiny')
# takes a loooot of time thus separate code block

  self.pid = os.fork()


## Training - eager mode

In [28]:
telemetry_eager = defaultdict(list)

netG = Generator(nc, nz, ngf).to(device)
netD = Discriminator(nc, ndf).to(device)
netG.apply(dcgan_weights_init)
netD.apply(dcgan_weights_init)
netG.train()
netD.train()

loss_func = nn.BCELoss()
optG = optim.Adam(netG.parameters(), lr=lr, betas=(0.5, 0.999))
optD = optim.Adam(netD.parameters(), lr=lr, betas=(0.5, 0.999))

for epoch in range(1, epochs + 1):
  running_stats = [0 for _ in range(5)]
  start.record()
  for xb in celeba_dl:
    xb = xb[0].to(device)
    stats = fit_dcgan_step(netG, netD, device, xb, loss_func, optG, optD, latent_vec_size)

    for i, stat in enumerate(stats):
      running_stats[i] += stat
  end.record()
  torch.cuda.synchronize()

  for i, stat in enumerate(running_stats):
    running_stats[i] = stat / len(celeba_dl)

  telemetry_eager['model_name'].append('DCGAN')
  telemetry_eager['phase'].append('training')
  telemetry_eager['epoch'].append(epoch)
  telemetry_eager['loss'].append(f'{running_stats[0]}|{running_stats[1]}')
  telemetry_eager['performance'].append(f'{running_stats[2]}|{running_stats[3]}|{running_stats[4]}')
  telemetry_eager['elapsed_time'].append(start.elapsed_time(end) * 1e6)
  pd.DataFrame(telemetry_eager).to_csv(results_eager_filepath, index=False)
  print(f'Epoch {epoch} finished with execution time of {start.elapsed_time(end) / 1e3}s')

Epoch 1 finished with execution time of 22.94624609375s
Epoch 2 finished with execution time of 23.364908203125s


## Latency - both modes

In [29]:
telemetry_compile = defaultdict(list)

In [30]:
warmup = 10
netG = Generator(nc, nz, ngf).to(device)
netG.apply(dcgan_weights_init)
torch._dynamo.reset()
netG_comp = torch.compile(netG, mode='reduce-overhead')
netG.eval()

latent_vecs = torch.randn(epochs + warmup + 1, latent_vec_size, 1, 1, device=device)
# latent_vecs[0].unsqueeze(0).size()

telemetry_eager_times = []
telemetry_compile_times = []

# compilation
with torch.no_grad():
  e = latency(netG, latent_vecs[-1].unsqueeze(0))
  print('compilation - eager mode:', e)
  telemetry_eager_times.append(e)

  c = latency(netG_comp, latent_vecs[-1].unsqueeze(0))
  print('compilation - compile mode:', c)
  telemetry_compile_times.append(c)

  for i in range(epochs + warmup):
    # warmup
    e = latency(netG, latent_vecs[1].unsqueeze(0))
    c = latency(netG_comp, latent_vecs[i].unsqueeze(0))

    # latency
    if i >= warmup:
        telemetry_eager_times.append(e)
        telemetry_compile_times.append(c)

em = np.median(telemetry_eager_times)
cm = np.median(telemetry_compile_times)
print(f'median exec time (e/c): {em} / {cm}')
print(f'compiled graph is on average {em / cm} times faster than eager execution')


# telemetry
for telemetry, telemetry_times in [(telemetry_eager, telemetry_eager_times),
                                   (telemetry_compile, telemetry_compile_times)]:
  telemetry['model_name'].extend(["DCGAN"] * (epochs + 1))
  telemetry['phase'].extend(['graph_compilation'] + ['latency'] * epochs)
  telemetry['epoch'].extend([1] + list(range(1, epochs + 1)))
  telemetry['loss'].extend([-1] * (epochs + 1))
  telemetry['performance'].extend([-1] * (epochs + 1))
  telemetry['elapsed_time'].extend(telemetry_times)

pd.DataFrame(telemetry_compile).to_csv(results_compile_filepath, index=False)
pd.DataFrame(telemetry_eager).to_csv(results_eager_filepath, index=False)

compilation - eager mode: 0.8944960236549377
compilation - compile mode: 950.4771728515625
median exec time (e/c): 0.6674559712409973 / 1.007423996925354
compiled graph is on average 0.6625372963896681 times faster than eager execution
model_name 3
phase 3
epoch 3
loss 3
performance 3
elapsed_time 3


## Latency on batch - both modes

In [32]:
warmup = 50
netG = Generator(nc, nz, ngf).to(device)
netG.apply(dcgan_weights_init)
torch._dynamo.reset()
netG_comp = torch.compile(netG, mode='reduce-overhead')
netG.eval()

telemetry_eager_times = []
telemetry_compile_times = []

# compilation
with torch.no_grad():
  latent_vec = torch.randn(1024, latent_vec_size, 1, 1, device=device)
  e = latency(netG, latent_vecs)
  print('compilation - eager mode:', e)
  telemetry_eager_times.append(e)

  c = latency(netG_comp, latent_vec)
  print('compilation - compile mode:', c)
  telemetry_compile_times.append(c)

  for i in range(epochs + warmup):
    latent_vec = torch.randn(1024, latent_vec_size, 1, 1, device=device)
    # warmup
    e = latency(netG, latent_vec)
    c = latency(netG_comp, latent_vec)

    # latency
    if i >= warmup:
        telemetry_eager_times.append(e)
        telemetry_compile_times.append(c)

em = np.median(telemetry_eager_times)
cm = np.median(telemetry_compile_times)
print(f'median exec time (e/c): {em} / {cm}')
print(f'compiled graph is on average {em / cm} times faster than eager execution')


# telemetry
for telemetry, telemetry_times in [(telemetry_eager, telemetry_eager_times),
                                   (telemetry_compile, telemetry_compile_times)]:
  telemetry['model_name'].extend(["DCGAN"] * (epochs + 1))
  telemetry['phase'].extend(['graph_compilation_batch'] + ['latency_batch'] * epochs)
  telemetry['epoch'].extend([1] + list(range(1, epochs + 1)))
  telemetry['loss'].extend([-1] * (epochs + 1))
  telemetry['performance'].extend([-1] * (epochs + 1))
  telemetry['elapsed_time'].extend(telemetry_times)

pd.DataFrame(telemetry_compile).to_csv(results_compile_filepath, index=False)
pd.DataFrame(telemetry_eager).to_csv(results_eager_filepath, index=False)

compilation - eager mode: 5.798272132873535
compilation - compile mode: 1588.83837890625
median exec time (e/c): 159.6262969970703 / 163.96524047851562
compiled graph is on average 0.9735374188530291 times faster than eager execution


## Training - compile mode

In [None]:
netG = Generator(nc, nz, ngf).to(device)
netD = Discriminator(nc, ndf).to(device)
netG.apply(dcgan_weights_init)
netD.apply(dcgan_weights_init)
torch._dynamo.reset()
netG = torch.compile(netG, mode='reduce-overhead')
netD = torch.compile(netD, mode='reduce-overhead')
netG.train()
netD.train()

loss_func = nn.BCELoss()
optG = optim.Adam(netG.parameters(), lr=lr, betas=(0.5, 0.999))
optD = optim.Adam(netD.parameters(), lr=lr, betas=(0.5, 0.999))

fit_dcgan_step_compiled = torch.compile(fit_dcgan_step, mode="reduce-overhead")

for epoch in range(1, epochs + 1):
  running_stats = [0 for _ in range(5)]
  start.record()
  for xb in celeba_dl:
    xb = xb[0].to(device)
    stats = fit_dcgan_step_compiled(netG, netD, device, xb, loss_func, optG, optD, latent_vec_size)

    for i, stat in enumerate(stats):
      running_stats[i] += stat
  end.record()
  torch.cuda.synchronize()

  for i, stat in enumerate(running_stats):
    running_stats[i] = stat / len(celeba_dl)

  telemetry_compile['model_name'].append('DCGAN')
  telemetry_compile['phase'].append('training')
  telemetry_compile['epoch'].append(epoch)
  telemetry_compile['loss'].append(f'{running_stats[0]}|{running_stats[1]}')
  telemetry_compile['performance'].append(f'{running_stats[2]}|{running_stats[3]}|{running_stats[4]}')
  telemetry_compile['elapsed_time'].append(start.elapsed_time(end) * 1e6)
  pd.DataFrame(telemetry_compile).to_csv(results_compile_filepath, index=False)
  print(f'Epoch {epoch} finished with execution time of {start.elapsed_time(end) / 1e3}s')