In [1]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

import time


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(x)
        
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        
        output = F.log_softmax(x, dim=1)
        return output


def train(model, train_loader, optimizer, epoch, dry=False):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

        if dry:
            break


def test(model, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.cuda(), target.cuda()
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


In [2]:
train_kwargs = {'batch_size': 128}
test_kwargs = {'batch_size': 128}

accel_kwargs = {'num_workers': 1,
                'persistent_workers': True,
               'pin_memory': True,
               'shuffle': True}
train_kwargs.update(accel_kwargs)
test_kwargs.update(accel_kwargs)

transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
    ])

dataset1 = datasets.MNIST('./data', train=True, download=True,
                   transform=transform)
dataset2 = datasets.MNIST('./data', train=False,
                   transform=transform)

train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, drop_last =True, **test_kwargs)

model = Net().cuda()
optimizer = optim.Adadelta(model.parameters(), lr=0.1)
epoch = 0

train(model, train_loader, optimizer, epoch)
times = []

for epoch in range(10):
    torch.cuda.synchronize()
    start_epoch = time.time()
    test(model, test_loader)
    end_epoch = time.time()
    elapsed = end_epoch - start_epoch
    times.append(elapsed)

avg_time = sum(times)/len(times)
print(avg_time)


Test set: Average loss: 0.0551, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0551, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0551, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0551, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0551, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0551, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0551, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0551, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0550, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0550, Accuracy: 9801/10000 (98%)

3.2278727293014526


<h1>Fused Part</h1>

In [3]:
from torch.autograd.function import once_differentiable

def convolution_backward(grad_out, X, weight):
    grad_input = F.conv2d(X.transpose(0, 1), grad_out.transpose(0, 1)).transpose(0, 1)
    grad_X = F.conv_transpose2d(grad_out, weight)
    return grad_X, grad_input

class Conv2D(torch.autograd.Function):
    @staticmethod
    def forward(ctx, X, weight):
        ctx.save_for_backward(X, weight)
        return F.conv2d(X, weight)

    # Use @once_differentiable by default unless we intend to double backward
    @staticmethod
    @once_differentiable
    def backward(ctx, grad_out):
        X, weight = ctx.saved_tensors
        return convolution_backward(grad_out, X, weight)

def unsqueeze_all(t):
    # Helper function to ``unsqueeze`` all the dimensions that we reduce over
    return t[None, :, None, None]

def batch_norm_backward(grad_out, X, sum, sqrt_var, N, eps):
    # We use the formula: ``out = (X - mean(X)) / (sqrt(var(X)) + eps)``
    # in batch norm 2D forward. To simplify our derivation, we follow the
    # chain rule and compute the gradients as follows before accumulating
    # them all into a final grad_input.
    #  1) ``grad of out wrt var(X)`` * ``grad of var(X) wrt X``
    #  2) ``grad of out wrt mean(X)`` * ``grad of mean(X) wrt X``
    #  3) ``grad of out wrt X in the numerator`` * ``grad of X wrt X``
    # We then rewrite the formulas to use as few extra buffers as possible
    tmp = ((X - unsqueeze_all(sum) / N) * grad_out).sum(dim=(0, 2, 3))
    tmp *= -1
    d_denom = tmp / (sqrt_var + eps)**2  # ``d_denom = -num / denom**2``
    # It is useful to delete tensors when you no longer need them with ``del``
    # For example, we could've done ``del tmp`` here because we won't use it later
    # In this case, it's not a big difference because ``tmp`` only has size of (C,)
    # The important thing is avoid allocating NCHW-sized tensors unnecessarily
    d_var = d_denom / (2 * sqrt_var)  # ``denom = torch.sqrt(var) + eps``
    # Compute ``d_mean_dx`` before allocating the final NCHW-sized grad_input buffer
    d_mean_dx = grad_out / unsqueeze_all(sqrt_var + eps)
    d_mean_dx = unsqueeze_all(-d_mean_dx.sum(dim=(0, 2, 3)) / N)
    # ``d_mean_dx`` has already been reassigned to a C-sized buffer so no need to worry

    # ``(1) unbiased_var(x) = ((X - unsqueeze_all(mean))**2).sum(dim=(0, 2, 3)) / (N - 1)``
    grad_input = X * unsqueeze_all(d_var * N)
    grad_input += unsqueeze_all(-d_var * sum)
    grad_input *= 2 / ((N - 1) * N)
    # (2) mean (see above)
    grad_input += d_mean_dx
    # (3) Add 'grad_out / <factor>' without allocating an extra buffer
    grad_input *= unsqueeze_all(sqrt_var + eps)
    grad_input += grad_out
    grad_input /= unsqueeze_all(sqrt_var + eps)  # ``sqrt_var + eps > 0!``
    return grad_input

class BatchNorm(torch.autograd.Function):
    @staticmethod
    def forward(ctx, X, eps=1e-3):
        # Don't save ``keepdim`` values for backward
        sum = X.sum(dim=(0, 2, 3))
        var = X.var(unbiased=True, dim=(0, 2, 3))
        N = X.numel() / X.size(1)
        sqrt_var = torch.sqrt(var)
        ctx.save_for_backward(X)
        ctx.eps = eps
        ctx.sum = sum
        ctx.N = N
        ctx.sqrt_var = sqrt_var
        mean = sum / N
        denom = sqrt_var + eps
        out = X - unsqueeze_all(mean)
        out /= unsqueeze_all(denom)
        return out

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_out):
        X, = ctx.saved_tensors
        return batch_norm_backward(grad_out, X, ctx.sum, ctx.sqrt_var, ctx.N, ctx.eps)

class FusedConvBN2DFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, X, conv_weight, eps=1e-3):
        assert X.ndim == 4  # N, C, H, W
        # (1) Only need to save this single buffer for backward!
        ctx.save_for_backward(X, conv_weight)

        # (2) Exact same Conv2D forward from example above
        X = F.conv2d(X, conv_weight)
        # (3) Exact same BatchNorm2D forward from example above
        sum = X.sum(dim=(0, 2, 3))
        var = X.var(unbiased=True, dim=(0, 2, 3))
        N = X.numel() / X.size(1)
        sqrt_var = torch.sqrt(var)
        ctx.eps = eps
        ctx.sum = sum
        ctx.N = N
        ctx.sqrt_var = sqrt_var
        mean = sum / N
        denom = sqrt_var + eps
        # Try to do as many things in-place as possible
        # Instead of `out = (X - a) / b`, doing `out = X - a; out /= b`
        # avoids allocating one extra NCHW-sized buffer here
        out = X - unsqueeze_all(mean)
        out /= unsqueeze_all(denom)
        return out

    @staticmethod
    def backward(ctx, grad_out):
        X, conv_weight, = ctx.saved_tensors
        # (4) Batch norm backward
        # (5) We need to recompute conv
        X_conv_out = F.conv2d(X, conv_weight)
        grad_out = batch_norm_backward(grad_out, X_conv_out, ctx.sum, ctx.sqrt_var,
                                       ctx.N, ctx.eps)
        # (6) Conv2d backward
        grad_X, grad_input = convolution_backward(grad_out, X, conv_weight)
        return grad_X, grad_input, None, None, None, None, None

import torch.nn as nn
import math

class FusedConvBN(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, exp_avg_factor=0.1,
                 eps=1e-3, device=None, dtype=None):
        super(FusedConvBN, self).__init__()
        factory_kwargs = {'device': device, 'dtype': dtype}
        # Conv parameters
        weight_shape = (out_channels, in_channels, kernel_size, kernel_size)
        self.conv_weight = nn.Parameter(torch.empty(*weight_shape, **factory_kwargs))
        # Batch norm parameters
        num_features = out_channels
        self.num_features = num_features
        self.eps = eps
        # Initialize
        self.reset_parameters()

    def forward(self, X):
        return FusedConvBN2DFunction.apply(X, self.conv_weight, self.eps)

    def reset_parameters(self) -> None:
        nn.init.kaiming_uniform_(self.conv_weight, a=math.sqrt(5))

weight = torch.rand(5, 3, 3, 3, requires_grad=True, dtype=torch.double)
X = torch.rand(2, 3, 4, 4, requires_grad=True, dtype=torch.double)
print('Check if Fussed layers are valid:', torch.autograd.gradcheck(FusedConvBN2DFunction.apply, (X, weight)))

Check if Fussed layers are valid: True


In [4]:
class NetFused(nn.Module):
    def __init__(self):
        super(NetFused, self).__init__()
        self.convbn1 = FusedConvBN(1, 32, 3)
        self.convbn2 = FusedConvBN(32, 64, 3)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)

    def forward(self, x):
        x = self.convbn1(x)
        x = F.relu(x)
        
        x = self.convbn2(x)
        x = F.relu(x)
        
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        
        output = F.log_softmax(x, dim=1)
        return output

In [5]:
model = NetFused().cuda()
optimizer = optim.Adadelta(model.parameters(), lr=0.1)

epoch = 0

train(model, train_loader, optimizer, epoch)
times = []

for epoch in range(10):
    torch.cuda.synchronize()
    start_epoch = time.time()
    test(model, test_loader)
    end_epoch = time.time()
    elapsed = end_epoch - start_epoch
    times.append(elapsed)

avg_time = sum(times)/len(times)
print(avg_time)


Test set: Average loss: 0.0591, Accuracy: 9792/10000 (98%)


Test set: Average loss: 0.0589, Accuracy: 9790/10000 (98%)


Test set: Average loss: 0.0588, Accuracy: 9795/10000 (98%)


Test set: Average loss: 0.0586, Accuracy: 9795/10000 (98%)


Test set: Average loss: 0.0591, Accuracy: 9798/10000 (98%)


Test set: Average loss: 0.0582, Accuracy: 9796/10000 (98%)


Test set: Average loss: 0.0584, Accuracy: 9800/10000 (98%)


Test set: Average loss: 0.0582, Accuracy: 9798/10000 (98%)


Test set: Average loss: 0.0587, Accuracy: 9791/10000 (98%)


Test set: Average loss: 0.0590, Accuracy: 9793/10000 (98%)

2.5884713888168336


<h1>Onnx Part</h1>

In [6]:
with torch.device("cuda"):
    example_inputs = (torch.randn(128, 1, 28, 28),)
    onnx_program = torch.onnx.export(model, example_inputs, "model.onnx", input_names=["x"],)

In [7]:
import onnxruntime
import numpy as np
import tensorrt

session = onnxruntime.InferenceSession("model.onnx", providers=['TensorrtExecutionProvider','CUDAExecutionProvider'])
X = torch.stack([d[0] for d in dataset2]).numpy()

times = []

for epoch in range(5):
    start_epoch = time.time()
    test_dataset = iter(test_loader)
    for i in range((len(X)//128)-1):
        # data = np.float32(next(test_dataset)[0])
        outputs = session.run([], {'x':X[128*i:128*(i+1)]})[0]
        
    end_epoch = time.time()
    times += [end_epoch - start_epoch]
    print(times[-1])

print('Average Time', sum(times)/len(times))

1.554471492767334
1.4571020603179932
1.455916166305542
1.4561893939971924
1.4564645290374756
Average Time 1.4760287284851075


<h1>Prunning Part without Fused Layer</h1>

In [14]:
import torch.nn.utils.prune as prune

model = Net().cuda()

parameters_to_prune = (
    (model.conv1, 'weight'),
    (model.conv2, 'weight'),
    (model.fc1, 'weight'),
    (model.fc2, 'weight')
)

prune.ln_structured(model.conv1, name='weight', amount=0.2, n=2, dim=0)
prune.ln_structured(model.conv2, name='weight', amount=0.2, n=2, dim=0)
prune.ln_structured(model.fc1, name='weight', amount=0.2, n=2, dim=0)
prune.ln_structured(model.fc2, name='weight', amount=0.2, n=2, dim=0)

for module, name in parameters_to_prune:
    prune.remove(module, name)
    

optimizer = optim.Adadelta(model.parameters(), lr=0.1)
epoch = 0

train(model, train_loader, optimizer, epoch)
times = []

for epoch in range(10):
    torch.cuda.synchronize()
    start_epoch = time.time()
    test(model, test_loader)
    end_epoch = time.time()
    elapsed = end_epoch - start_epoch
    times.append(elapsed)

avg_time = sum(times)/len(times)
print(avg_time)


Test set: Average loss: 0.0574, Accuracy: 9805/10000 (98%)


Test set: Average loss: 0.0571, Accuracy: 9807/10000 (98%)


Test set: Average loss: 0.0574, Accuracy: 9805/10000 (98%)


Test set: Average loss: 0.0572, Accuracy: 9806/10000 (98%)


Test set: Average loss: 0.0573, Accuracy: 9805/10000 (98%)


Test set: Average loss: 0.0573, Accuracy: 9806/10000 (98%)


Test set: Average loss: 0.0573, Accuracy: 9805/10000 (98%)


Test set: Average loss: 0.0574, Accuracy: 9805/10000 (98%)


Test set: Average loss: 0.0573, Accuracy: 9805/10000 (98%)


Test set: Average loss: 0.0574, Accuracy: 9805/10000 (98%)

2.670937514305115


In [12]:
with torch.device("cuda"):
    example_inputs = (torch.randn(128, 1, 28, 28),)
    onnx_program = torch.onnx.export(model, example_inputs, "model.onnx", input_names=["x"],)

In [13]:
session = onnxruntime.InferenceSession("model.onnx", providers=['TensorrtExecutionProvider','CUDAExecutionProvider'])
X = torch.stack([d[0] for d in dataset2]).numpy()

times = []

for epoch in range(5):
    start_epoch = time.time()
    test_dataset = iter(test_loader)
    for i in range((len(X)//128)-1):
        # data = np.float32(next(test_dataset)[0])
        outputs = session.run([], {'x':X[128*i:128*(i+1)]})[0]
        
    end_epoch = time.time()
    times += [end_epoch - start_epoch]
    print(times[-1])

print(sum(times)/len(times))

0.8211748600006104
0.6404070854187012
0.6317508220672607
0.631899356842041
0.6345264911651611
0.6719517230987548
