In [20]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import torchvision
from torch.optim.lr_scheduler import StepLR

import time


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, 3, 1)
        self.conv2 = nn.Conv2d(16, 32, 3, 1)
        self.conv3 = nn.Conv2d(32, 32, 3, 1)
        self.conv4 = nn.Conv2d(32, 64, 3, 1)
        self.conv5 = nn.Conv2d(64, 64, 3, 1)
        self.conv6 = nn.Conv2d(64, 128, 3, 1)
        
        self.fc1 = nn.Linear(4608, 256)
        self.fc2 = nn.Linear(256, 10)
        
        self.bn1 = nn.BatchNorm2d(16)
        self.bn2 = nn.BatchNorm2d(32)
        self.bn3 = nn.BatchNorm2d(32)
        self.bn4 = nn.BatchNorm2d(64)
        self.bn5 = nn.BatchNorm2d(64)
        self.bn6 = nn.BatchNorm2d(128)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
        x = F.relu(x)
        
        x = self.conv4(x)
        x = self.bn4(x)
        x = F.relu(x)
        
        x = F.max_pool2d(x, 2)
        
        x = self.conv5(x)
        x = self.bn5(x)
        x = F.relu(x)
        
        x = self.conv6(x)
        x = self.bn6(x)
        x = F.relu(x)

        
        x = torch.flatten(x, 1)
        
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        
        output = F.softmax(x, dim=1)
        return output


def train(model, train_loader, optimizer, epoch, dry=False):
    NUM_ACCUMULATION_STEPS = 8

    model.train()
    criterion = nn.CrossEntropyLoss(reduction='mean')
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        
        
        # with torch.autocast(device_type='cuda', dtype=torch.float16):
        output = model(data)
        loss = criterion(output, target)
        loss = loss / NUM_ACCUMULATION_STEPS
        
        loss.backward()
        # if ((batch_idx + 1) % NUM_ACCUMULATION_STEPS == 0) or (batch_idx + 1 == len(train_loader)):
        optimizer.step()
        optimizer.zero_grad()

        # optimizer.step()
        # optimizer.zero_grad(set_to_none=True)
        
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

        if dry:
            break


def test(model, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    criterion = nn.CrossEntropyLoss(reduction='mean')
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.cuda(), target.cuda()
            output = model(data)
            # test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            test_loss = criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


In [21]:
class VGG16(nn.Module):
    def __init__(self, num_classes=102):
        super(VGG16, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU())
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(), 
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU())
        self.layer4 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer5 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU())
        self.layer6 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU())
        self.layer7 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer8 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())
        self.layer9 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())
        self.layer10 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer11 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer12 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())
        self.layer13 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(7*7*512, 1024),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(1024, 1024),
            nn.ReLU())
        self.fc2= nn.Sequential(
            nn.Linear(1024, num_classes))
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        # out = self.layer6(out)
        out = self.layer7(out)
        out = self.layer8(out)
        # out = self.layer9(out)
        out = self.layer10(out)
        out = self.layer11(out)
        # out = self.layer12(out)
        # out = self.layer13(out)
        # out = out.reshape(out.size(0), -1)
        out = torch.flatten(out, 1)
        
        out = self.fc(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

In [9]:
train_kwargs = {'batch_size': 4}
test_kwargs = {'batch_size': 4}

accel_kwargs = {'num_workers': 1,
                'persistent_workers': True,
               # 'pin_memory': True,
               'shuffle': True}

train_kwargs.update(accel_kwargs)
test_kwargs.update(accel_kwargs)

transform=transforms.Compose([
    transforms.ToTensor(),
    torchvision.models.VGG16_BN_Weights.IMAGENET1K_V1.transforms(),
    # transforms.Normalize((0.1307,), (0.3081,))
    ])

dataset1 = datasets.Flowers102('./data', split="train", download=True,
                   transform=transform)
dataset2 = datasets.Flowers102('./data', split="val", download=True,
                   transform=transform)

train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, drop_last =True, **test_kwargs)

model = VGG16().cuda()
# optimizer = optim.Adadelta(model.parameters(), lr=0.1)

optimizer = optim.SGD(model.parameters(), lr=0.01)

# scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

# train(model, train_loader, optimizer, epoch, True)
epoch = 0

train(model, train_loader, optimizer, epoch, dry=True)
times = []

for epoch in range(5):
    torch.cuda.synchronize()
    start_epoch = time.time()
    
    train(model, train_loader, optimizer, epoch)
        
    test(model, test_loader)
    end_epoch = time.time()
    elapsed = end_epoch - start_epoch
    times.append(elapsed)

avg_time = sum(times)/len(times)
print(avg_time)


Test set: Average loss: 0.0044, Accuracy: 20/1020 (2%)


Test set: Average loss: 0.0043, Accuracy: 40/1020 (4%)


Test set: Average loss: 0.0040, Accuracy: 57/1020 (6%)


Test set: Average loss: 0.0037, Accuracy: 57/1020 (6%)


Test set: Average loss: 0.0042, Accuracy: 95/1020 (9%)

73.59332995414734


In [12]:
train_kwargs = {'batch_size': 4}
test_kwargs = {'batch_size': 4}

accel_kwargs = {'num_workers': 1,
                'persistent_workers': True,
               # 'pin_memory': True,
               'shuffle': True}

train_kwargs.update(accel_kwargs)
test_kwargs.update(accel_kwargs)

transform=transforms.Compose([
    transforms.ToTensor(),
    torchvision.models.VGG16_BN_Weights.IMAGENET1K_V1.transforms(),
    # transforms.Normalize((0.1307,), (0.3081,))
    ])

dataset1 = datasets.Flowers102('./data', split="train", download=True,
                   transform=transform)
dataset2 = datasets.Flowers102('./data', split="val", download=True,
                   transform=transform)

train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, drop_last =True, **test_kwargs)

model = VGG16().cuda()
# optimizer = optim.Adadelta(model.parameters(), lr=0.1)

optimizer = optim.SGD(model.parameters(), lr=0.01)

# scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

# train(model, train_loader, optimizer, epoch, True)
epoch = 0

train(model, train_loader, optimizer, epoch, dry=True)
times = []

for epoch in range(5):
    torch.cuda.synchronize()
    start_epoch = time.time()
    
    train(model, train_loader, optimizer, epoch)
        
    test(model, test_loader)
    end_epoch = time.time()
    elapsed = end_epoch - start_epoch
    times.append(elapsed)

avg_time = sum(times)/len(times)
print(avg_time)


Test set: Average loss: 0.0043, Accuracy: 31/1020 (3%)


Test set: Average loss: 0.0044, Accuracy: 29/1020 (3%)


Test set: Average loss: 0.0035, Accuracy: 36/1020 (4%)


Test set: Average loss: 0.0039, Accuracy: 50/1020 (5%)


Test set: Average loss: 0.0033, Accuracy: 86/1020 (8%)

58.088413286209104


In [19]:
train_kwargs = {'batch_size': 4}
test_kwargs = {'batch_size': 4}

accel_kwargs = {'num_workers': 1,
                'persistent_workers': True,
               # 'pin_memory': True,
               'shuffle': True}

train_kwargs.update(accel_kwargs)
test_kwargs.update(accel_kwargs)

transform=transforms.Compose([
    transforms.ToTensor(),
    torchvision.models.VGG16_BN_Weights.IMAGENET1K_V1.transforms(),
    # transforms.Normalize((0.1307,), (0.3081,))
    ])

dataset1 = datasets.Flowers102('./data', split="train", download=True,
                   transform=transform)
dataset2 = datasets.Flowers102('./data', split="val", download=True,
                   transform=transform)

train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, drop_last =True, **test_kwargs)

model = VGG16().cuda()
# optimizer = optim.Adadelta(model.parameters(), lr=0.1)

optimizer = optim.SGD(model.parameters(), lr=0.01)

# scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

# train(model, train_loader, optimizer, epoch, True)
epoch = 0

train(model, train_loader, optimizer, epoch, dry=True)
times = []

for epoch in range(100):
    torch.cuda.synchronize()
    start_epoch = time.time()
    
    train(model, train_loader, optimizer, epoch)
        
    test(model, test_loader)
    end_epoch = time.time()
    elapsed = end_epoch - start_epoch
    times.append(elapsed)

avg_time = sum(times)/len(times)
print(avg_time)


Test set: Average loss: 0.0045, Accuracy: 25/1020 (2%)


Test set: Average loss: 0.0046, Accuracy: 42/1020 (4%)


Test set: Average loss: 0.0044, Accuracy: 51/1020 (5%)


Test set: Average loss: 0.0043, Accuracy: 55/1020 (5%)


Test set: Average loss: 0.0040, Accuracy: 72/1020 (7%)


Test set: Average loss: 0.0036, Accuracy: 99/1020 (10%)


Test set: Average loss: 0.0037, Accuracy: 104/1020 (10%)


Test set: Average loss: 0.0037, Accuracy: 123/1020 (12%)


Test set: Average loss: 0.0030, Accuracy: 124/1020 (12%)


Test set: Average loss: 0.0032, Accuracy: 173/1020 (17%)


Test set: Average loss: 0.0048, Accuracy: 165/1020 (16%)


Test set: Average loss: 0.0029, Accuracy: 199/1020 (20%)


Test set: Average loss: 0.0041, Accuracy: 215/1020 (21%)


Test set: Average loss: 0.0048, Accuracy: 238/1020 (23%)


Test set: Average loss: 0.0042, Accuracy: 199/1020 (20%)


Test set: Average loss: 0.0024, Accuracy: 255/1020 (25%)


Test set: Average loss: 0.0024, Accuracy: 258/1020 (25%)


Test se

KeyboardInterrupt: 

In [None]:
train_kwargs = {'batch_size': 4}
test_kwargs = {'batch_size': 4}

accel_kwargs = {'num_workers': 1,
                'persistent_workers': True,
               # 'pin_memory': True,
               'shuffle': True}

train_kwargs.update(accel_kwargs)
test_kwargs.update(accel_kwargs)

transform=transforms.Compose([
    transforms.ToTensor(),
    torchvision.models.VGG16_BN_Weights.IMAGENET1K_V1.transforms(),
    # transforms.Normalize((0.1307,), (0.3081,))
    ])

dataset1 = datasets.Flowers102('./data', split="train", download=True,
                   transform=transform)
dataset2 = datasets.Flowers102('./data', split="val", download=True,
                   transform=transform)

train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, drop_last =True, **test_kwargs)

model = VGG16().cuda()
# optimizer = optim.Adadelta(model.parameters(), lr=0.1)

optimizer = optim.SGD(model.parameters(), lr=0.01)

# scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

# train(model, train_loader, optimizer, epoch, True)
epoch = 0

train(model, train_loader, optimizer, epoch, dry=True)
times = []

for epoch in range(100):
    torch.cuda.synchronize()
    start_epoch = time.time()
    
    train(model, train_loader, optimizer, epoch)
        
    test(model, test_loader)
    end_epoch = time.time()
    elapsed = end_epoch - start_epoch
    times.append(elapsed)

avg_time = sum(times)/len(times)
print(avg_time)

NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0
NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0
NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0
NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0
NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0
NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0




NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0
NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0



Test set: Average loss: 0.0046, Accuracy: 27/1020 (3%)

