In [None]:
import datetime
# Wall-clock timestamp (ISO 8601 string from the local clock) captured when the
# notebook starts running; the training cell records a matching `end` value.
start = datetime.datetime.now().isoformat()

<a href="https://colab.research.google.com/github/omarsar/pytorch_notebooks/blob/master/pytorch_quick_start.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Let's start with some imports...

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.cuda.amp.grad_scaler import GradScaler
from torch import autocast
import megaclite

- Now we load the megaclite extension to provide remote GPU access

In [None]:
# (Re)load the megaclite IPython extension; per the note above, it is what
# provides the remote GPU access used by the later cells.
%reload_ext megaclite.client
# Disabled line magic kept for reference.
# NOTE(review): presumably tags this run as benchmark "mnist_remote" — confirm against megaclite.
#%tag_benchmark mnist_remote

In [None]:
# Number of samples per mini-batch for both the training and the test loader.
BATCH_SIZE = 32

# The only preprocessing step: convert each image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

# Arguments shared by both MNIST splits and by both loaders.
_dataset_kwargs = dict(root="./data", download=True, transform=transform)
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2)

# Training split (downloaded into ./data if missing), reshuffled on every pass.
trainset = torchvision.datasets.MNIST(train=True, **_dataset_kwargs)
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **_loader_kwargs)

# Test split (downloaded into ./data if missing), iterated in a fixed order.
testset = torchvision.datasets.MNIST(train=False, **_dataset_kwargs)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, **_loader_kwargs)

- All of this still runs locally.

In [None]:
class MyModel(nn.Module):
    """Small CNN classifier: one 3x3 convolution followed by two linear layers.

    The layer sizes are fixed for single-channel 28x28 inputs and 10 output
    classes.

    ``forward`` returns raw, unnormalized class scores (logits). The training
    cell feeds the output to ``nn.CrossEntropyLoss``, which applies
    log-softmax internally, so applying softmax here as well would normalize
    twice and flatten the gradients. Class predictions taken with ``argmax``
    are identical for logits and for softmax probabilities.
    """

    def __init__(self):
        super().__init__()

        # 1x28x28 => 32x26x26 (3x3 kernel, no padding)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)
        # flattened conv features (26*26*32) => 128 hidden units
        self.d1 = nn.Linear(26 * 26 * 32, 128)
        # 128 hidden units => 10 class scores
        self.d2 = nn.Linear(128, 10)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: tensor of shape (N, 1, 28, 28).

        Returns:
            Tensor of shape (N, 10) holding unnormalized class scores.
        """
        # N x 1 x 28 x 28 => N x 32 x 26 x 26
        x = F.relu(self.conv1(x))

        # flatten everything except the batch dimension => N x (32*26*26)
        x = x.flatten(start_dim=1)

        # N x (32*26*26) => N x 128
        x = F.relu(self.d1(x))

        # N x 128 => N x 10; deliberately no softmax (see class docstring)
        return self.d2(x)

In [None]:
## smoke test: push a single training batch through a fresh model
model = MyModel()
first_batch = next(iter(trainloader), None)
if first_batch is not None:
    images, labels = first_batch
    print("batch size:", images.shape)
    out = model(images)
    print(out.shape)

In [None]:
def get_accuracy(logit, target, batch_size=None):
    """Return the percentage of correct top-1 predictions for one batch.

    Args:
        logit: tensor of per-class scores; the predicted class of each sample
            is the index of the largest score along dimension 1.
        target: tensor of ground-truth class indices, one per sample.
        batch_size: denominator used for the percentage. Defaults to the
            number of elements in ``target`` so that a smaller final batch is
            scored against its real size.

    Returns:
        ``100 * correct / batch_size`` as a Python float.
    """
    if batch_size is None:
        batch_size = target.numel()
    # Comparing predictions with labels needs no autograd, so the deprecated
    # `.data` accessor is not required.
    predictions = logit.argmax(dim=1).view(target.size())
    corrects = (predictions == target).sum()
    accuracy = 100.0 * corrects / batch_size
    return accuracy.item()

In [None]:
# Optimizer step size and number of passes over the training set.
learning_rate, num_epochs = 0.001, 3

# Fresh, untrained model for the training cell below.
model = MyModel()

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

- But we can use the `train_remote` magic to run the following cell on the remote GPU server.
- Don't forget to forward port 6001 from the GPU server you want to use.

In [None]:
%%train_remote
global result
global end

try: 

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scaler = GradScaler()
    for epoch in range(num_epochs):
        train_running_loss = 0.0
        train_acc = 0.0

        model = model.train()

        ## training step
        for i, (images, labels) in enumerate(trainloader):

            images = images.to(device)
            labels = labels.to(device)

            ## forward + backprop + loss
            with autocast(device_type='cuda', dtype=torch.float16):
                logits = model(images)
                loss = criterion(logits, labels)

            optimizer.zero_grad()
            scaler.scale(loss).backward()

            ## update model params
            scaler.step(optimizer)
            scaler.update()

            train_running_loss += loss.detach().item()
            train_acc += get_accuracy(logits, labels, BATCH_SIZE)

        model.eval()
        print('Epoch: %d | Loss: %.4f | Train Accuracy: %.2f' \
                %(epoch, train_running_loss / i, train_acc/i))
        
        test_acc = 0.0
        for i, (images, labels) in enumerate(testloader, 0):
            outputs = model(images.to(device))
            test_acc += get_accuracy(outputs, labels.to(device), BATCH_SIZE)

        test_acc/=i   
        print('Test Accuracy: %.2f'%( test_acc))

except torch.cuda.OutOfMemoryError:
    result = "out of memory"
    print("oom")
except RuntimeError as error:
    error_message = str(error)
    if error_message.startswith("CUDA error: out of memory"):
        result = "out of memory"
    print(error_message)
else:
    result = float(test_acc)
    print(result)
end = datetime.datetime.now().isoformat()

In [None]:
%%time
%%without_cuda
model = model.to(device)
model.eval()
test_acc = 0.0
for i, (images, labels) in enumerate(testloader, 0):
    outputs = model(images.to(device))
    test_acc += get_accuracy(outputs, labels.to(device), BATCH_SIZE)
        
print('Test Accuracy: %.2f'%( test_acc/i))