In [9]:
import sys
sys.path.append('..')
from load_datasets import load_mnist_imgs_and_labels
from clf_funcs import fit, get_mnist_loaders, SimpleConvNet

import time
import datetime
import numpy as np
import pandas as pd
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# --- training hyper-parameters (read by the cells below) ---
batch_size, test_batch_size = 96, 128
epochs = 3
lr, momentum = 1e-2, 0.9
num_classes = 10
log_interval = 300  # forwarded to fit() as its log_interval argument

# --- device selection: use the GPU whenever one is visible to torch ---
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f'CUDA enabled: {use_cuda}')

CUDA enabled: True


In [6]:
def telemetry_fit(model, device, train_dl, opt, n_epochs=None, log_every=None, sync_cuda=False) -> list:
	"""Train `model` one epoch at a time and time every epoch on the host clock.

	Each epoch is a single call to `fit(...)` with `F.nll_loss` as the loss;
	start/end wall-clock stamps and the elapsed seconds are printed per epoch.

	Args:
		model: network forwarded to `fit`.
		device: device forwarded to `fit`; also the device that is synchronized
			when `sync_cuda` is enabled.
		train_dl: iterable of training batches forwarded to `fit`.
		opt: optimizer forwarded to `fit` as `optimizer`.
		n_epochs: number of epochs to run. `None` falls back to the
			notebook-level `epochs` constant.
		log_every: value forwarded to `fit` as `log_interval`. `None` falls back
			to the notebook-level `log_interval` constant.
		sync_cuda: when True and `device` is a CUDA device, wait for all queued
			GPU work before each clock read so the host timer also covers
			kernels that are still in flight. Off by default, which keeps the
			plain host-clock measurement.

	Returns:
		list of int: elapsed time of each epoch in nanoseconds
		(`time.perf_counter_ns` difference), in epoch order.
	"""
	# Resolve the defaults at call time so edits to the config cell are picked up.
	n_epochs = epochs if n_epochs is None else n_epochs
	log_every = log_interval if log_every is None else log_every
	wait_for_gpu = sync_cuda and torch.device(device).type == 'cuda'

	time_elapsed = []

	for epoch in range(1, n_epochs + 1):
		print(f'epoch {epoch} start: {datetime.datetime.now().strftime("%H:%M:%S")}')

		if wait_for_gpu:
			torch.cuda.synchronize(device)
		start = time.perf_counter_ns()
		fit(model, device, train_dl, F.nll_loss, epoch, optimizer=opt, log_interval=log_every, silent=False)
		if wait_for_gpu:
			torch.cuda.synchronize(device)
		elapsed_ns = time.perf_counter_ns() - start

		print(f'epoch {epoch} end: {datetime.datetime.now().strftime("%H:%M:%S")}')
		print(f'seconds elapsed: {elapsed_ns / 1e9}')

		time_elapsed.append(elapsed_ns)

	return time_elapsed

In [4]:
# Baseline run: SimpleConvNet trained with SGD, timed on the host clock only.
model = SimpleConvNet().to(device)
opt = optim.SGD(model.parameters(), momentum=momentum, lr=lr)

# Only the training loader is consumed in this cell.
train_dl, _, test_dl = get_mnist_loaders(
	batch_size,
	test_batch_size,
	flatten=False,
)

time_elapsed = telemetry_fit(model, device, train_dl, opt)
print('times without CUDA event sync: ', time_elapsed)

epoch 1 start: 19:31:40
epoch 1 end: 19:32:47
seconds elapsed: 66.685224503
epoch 2 start: 19:32:47
epoch 2 end: 19:33:53
seconds elapsed: 66.033280051
epoch 3 start: 19:33:53
epoch 3 end: 19:34:59
seconds elapsed: 66.163545291
times without CUDA event sync:  [66685224503, 66033280051, 66163545291]


In [5]:
# Same per-epoch loop as telemetry_fit, but timed with a pair of CUDA events
# instead of the host clock.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
time_elapsed_sync = []

for epoch in range(1, epochs + 1):
	stamp = datetime.datetime.now().strftime("%H:%M:%S")
	print(f'epoch {epoch} start: {stamp}')

	start.record()
	train_history = fit(
		model, device, train_dl, F.nll_loss, epoch,
		optimizer=opt, log_interval=log_interval, silent=False,
	)
	end.record()
	# Both events must have completed before elapsed_time() may be queried.
	torch.cuda.synchronize()

	stamp = datetime.datetime.now().strftime("%H:%M:%S")
	print(f'epoch {epoch} end: {stamp}')

	elapsed_ms = start.elapsed_time(end)  # milliseconds between the two events
	print(f'seconds elapsed: {elapsed_ms / 1e3}')
	time_elapsed_sync.append(elapsed_ms)

print('times with CUDA event sync: ', time_elapsed_sync)

epoch 1 start: 19:34:59
epoch 1 end: 19:36:05
seconds elapsed: 66.1485
epoch 2 start: 19:36:05
epoch 2 end: 19:37:11
seconds elapsed: 66.1470859375
epoch 3 start: 19:37:11
epoch 3 end: 19:38:18
seconds elapsed: 66.12184375
times with CUDA event sync:  [66148.5, 66147.0859375, 66121.84375]


SCVNet (`SimpleConvNet`) in PyTorch trains about 10 times slower than the same model on the same dataset in TensorFlow or even libtorch, which is surprising. My first guess was some issue with CUDA events, but apparently that's not it. Turning the sync off results in the following error on `Event::elapsed_time`:  

```
---> 16 	time_elapsed_sync.append(start.elapsed_time(end))
...
RuntimeError: CUDA error: device not ready
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
```

However, event sync still makes DCGAN training borderline impossible (as shown in some other notebook, idc) and generally makes training loops behave unpredictably. Considering refactoring the PyTorch code to use `time.perf_counter_ns` instead.

The next best guess would be lazy dataset loading, but that is also not the case, since the MNIST dataset is read eagerly from binary files into in-memory tensors before the dataloader is created. Below, a `list[tuple[Tensor, Tensor]]` is used to create the `DataLoader` (in lieu of a `TensorDataset`), but it obviously changes nothing.

In [7]:
# Rebuild the loaders with pt_ds=False and rerun the host-clock benchmark
# on the model and optimizer created earlier.
train_dl_list, _, _ = get_mnist_loaders(
	batch_size,
	test_batch_size,
	flatten=False,
	pt_ds=False,
)
time_elapsed_listds = telemetry_fit(model, device, train_dl_list, opt)

print('times with list based dataset: ', time_elapsed_listds)

epoch 1 start: 20:21:16
epoch 1 end: 20:22:23
seconds elapsed: 66.268160705
epoch 2 start: 20:22:23
epoch 2 end: 20:23:29
seconds elapsed: 66.41212843
epoch 3 start: 20:23:29
epoch 3 end: 20:24:36
seconds elapsed: 66.605874152
times with list based dataset:  [66268160705, 66412128430, 66605874152]


How about completely omitting `torch.utils.data.DataLoader` and passing in a plain list of pre-batched `(x, y)` tuples instead? Maybe the DataLoader does some memory shenanigans I'm not aware of.

In [20]:
# Bypass DataLoader entirely: read MNIST, cut it into fixed-size batches by hand
# and feed the resulting list of (images, labels) tuples straight to fit().
x_train, y_train = load_mnist_imgs_and_labels(
	'../../datasets/mnist-digits/train-images-idx3-ubyte',
	'../../datasets/mnist-digits/train-labels-idx1-ubyte'
)

# NCHW layout with a single channel, as expected by the conv layers.
x_train = x_train.reshape(-1, 1, 28, 28)

# split() chunks along dim 0; the last batch may be smaller than batch_size.
x_train, y_train = (
	torch.tensor(arr).split(batch_size, 0)
	for arr in (x_train, y_train)
)

tuple_loader = list(zip(x_train, y_train))

time_elapsed_tupledl = telemetry_fit(model, device, tuple_loader, opt)

# Label fixed: this run uses the hand-made tuple loader, not the list-based dataset.
print('times with tuple loader (no DataLoader): ', time_elapsed_tupledl)

epoch 1 start: 20:42:09
epoch 1 end: 20:43:15
seconds elapsed: 65.997850772
epoch 2 start: 20:43:15
epoch 2 end: 20:44:22
seconds elapsed: 66.169891201
epoch 3 start: 20:44:22
epoch 3 end: 20:45:28
seconds elapsed: 66.415712678
times with list based dataset:  [65997850772, 66169891201, 66415712678]


Mkay, so it's not the data loading. Now let's check the model architecture (although the internet knows nothing about `Conv2d` in PyTorch having worse performance).

In [21]:
class NewSCVNet(nn.Module):
	"""Small two-stage convolutional classifier that emits log-probabilities.

	Every layer is created without an explicit dtype, i.e. with the framework
	default. Each stage is a 5x5 convolution (stride 1, padding 2) followed by
	ReLU and a 2x2 max-pool; for 1x28x28 inputs the second stage yields
	32x7x7 features, which is the input width of the dense layer.

	Args:
		num_classes: width of the final classification layer.
	"""

	def __init__(self, num_classes=10):
		super().__init__()
		# Construction order (conv1, conv2, dense, classifier) is kept so that
		# parameter initialisation consumes the RNG in the same sequence.
		self.conv1 = self._conv_stage(1, 16)
		self.conv2 = self._conv_stage(16, 32)
		self.dense = nn.Linear(in_features=32 * 7 * 7, out_features=500)
		self.classifier = nn.Linear(in_features=500, out_features=num_classes)

	@staticmethod
	def _conv_stage(in_channels, out_channels):
		"""Build one conv -> ReLU -> max-pool stage."""
		return nn.Sequential(
			nn.Conv2d(in_channels, out_channels, kernel_size=5, stride=1, padding=2),
			nn.ReLU(),
			nn.MaxPool2d(kernel_size=2),
		)

	def forward(self, x):
		"""Return per-class log-probabilities (log-softmax over dim 1) for batch `x`."""
		features = self.conv2(self.conv1(x))
		hidden = F.relu(self.dense(features.flatten(start_dim=1)))
		return F.log_softmax(self.classifier(hidden), dim=1)

In [29]:
# Re-run the benchmark with float32 inputs and a model whose layers use the
# default dtype (NewSCVNet).
x_train, y_train = load_mnist_imgs_and_labels(
	'../../datasets/mnist-digits/train-images-idx3-ubyte',
	'../../datasets/mnist-digits/train-labels-idx1-ubyte'
)

x_train = x_train.reshape(-1, 1, 28, 28)

x_train, y_train = map(
	lambda x: torch.tensor(x).split(batch_size, 0),
	(x_train, y_train)
)
# np.float64 to torch.float32; built as a list (not a lazy map) so the batches
# are not exhausted after a single pass.
x_train = [x.to(torch.float32) for x in x_train]

tuple_loader = list(zip(x_train, y_train))

model = NewSCVNet()
model = model.to(device)

# The optimizer has to be rebuilt for the new network: the earlier `opt` still
# holds the parameters of the previous SimpleConvNet, so stepping it would
# leave NewSCVNet's weights untouched.
opt = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

time_elapsed_dtype_f32 = telemetry_fit(model, device, tuple_loader, opt)

print('times without dtype mapping: ', time_elapsed_dtype_f32)

epoch 1 start: 21:22:55
epoch 1 end: 21:23:00
seconds elapsed: 5.401223702
epoch 2 start: 21:23:00
epoch 2 end: 21:23:03
seconds elapsed: 3.334495395
epoch 3 start: 21:23:03
epoch 3 end: 21:23:07
seconds elapsed: 3.270358371
times without dtype mapping:  [5401223702, 3334495395, 3270358371]


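For fuck's sake — it was the dtype all along: with the inputs cast from float64 to float32 and no dtype mapping on the layers, an epoch drops from ~66 s to ~3.3 s.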