In [1]:
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
import time

import torch.profiler

In [2]:
model = models.resnet101(pretrained=True)
model.cuda()
cudnn.benchmark = True

In [3]:
num_of_gpus = torch.cuda.device_count()
print(num_of_gpus)

2


In [4]:
device = torch.device("cuda:0")

In [5]:
def get_data(batch_size):

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.RandomCrop((32,32), padding=4),
         transforms.RandomHorizontalFlip(p=0.5),
         transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994,0.2010))])

    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                              shuffle=True, num_workers=1)
    
    return trainloader

In [6]:
def train(net, trainloader):
    
    losses = []
    
    optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, dampening=0, weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss()
    
    net.train()
    
    for epoch in range(1): 
        
        start = time.time()

        running_loss = 0.0
        
        with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=2),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./result_gpu2_w1_rtx8000'),
    record_shapes=True,
    profile_memory=True,  # This will take 1 to 2 minutes. Setting it to False could greatly speedup.
    with_stack=False
) as p:
        
            for i, data in enumerate(trainloader, 0):

                inputs, labels = data[0].to(device), data[1].to(device)

                optimizer.zero_grad()

                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                
                p.step()

        print('Epoch {} : Loss : {}'.format(str(epoch+1),str(running_loss/(i+1))))

        losses.append(running_loss/(i+1))
        
    end = time.time()
    compute_time = end - start
    print('Finished Training')
    
    return compute_time

In [7]:
if device.type == 'cuda':
    model = torch.nn.DataParallel(model) # make parallel
    cudnn.benchmark = True

In [8]:
start = time.time()
trainloader = get_data(128)
dataload_time = time.time() - start
compute_time = train(model, trainloader)
total_time = dataload_time + compute_time
print("\n\nCompute Time : {}\n Total Time : {}".format(compute_time,total_time))

Files already downloaded and verified
Epoch 1 : Loss : 2.7167921471778693
Finished Training


Compute Time : 482.570369720459
 Total Time : 485.54662823677063
