In [2]:
from clf_funcs import fit, get_mnist_loaders, SimpleConvNet

import time
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [3]:
batch_size = 96
test_batch_size = 128
epochs = 3
lr = 1e-2
momentum = 0.9
num_classes = 10
log_interval = 300
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f'CUDA enabled: {use_cuda}')

CUDA enabled: True


In [4]:
model = SimpleConvNet()
model = model.to(device)

opt = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
loss_func = F.nll_loss

train_dl, _, test_dl = get_mnist_loaders(batch_size, test_batch_size, flatten=False)

In [5]:
time_elapsed = []

for epoch in range(1, epochs + 1):
	start = time.time_ns()
	train_history = fit(model, device, train_dl, loss_func, epoch, optimizer=opt, log_interval=log_interval, silent=False)
	end = time.time_ns()

	time_elapsed.append((end - start) / 10e9)

print('times without CUDA event sync: ', time_elapsed)

times without CUDA event sync:  [6.8949208757, 6.6692102669, 6.6709405586]


In [9]:
time_elapsed = []
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

for epoch in range(1, epochs + 1):
	start.record()
	train_history = fit(model, device, train_dl, loss_func, epoch, optimizer=opt, log_interval=log_interval, silent=False)
	end.record()
	torch.cuda.synchronize()

	time_elapsed.append(start.elapsed_time(end))

print('times with CUDA event sync: ', time_elapsed)

times with CUDA event sync:  [66461.9140625, 66703.78125, 66742.078125]


for whatever reason using cuda events for profiling makes the training take significantly more time than it should for this one model only?? turning sync off results in the following error on `Event::elapsed_time`  

```
---> 11 	time_elapsed.append(start.elapsed_time(end))
...
RuntimeError: CUDA error: device not ready
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
```

Event sync also makes DCGAN borderline impossible (as shown in some other notebook idc) and generally makes training loops behave unpredictable. Considering refactoring pytorch code to use `time::time_ns`