In [1]:
!pip install mxnet-cu100
!pip install torch torchvision



In [2]:
import mxnet as mx
import mxnet.gluon.nn as mxnn
from mxnet import nd
from mxnet import gluon, autograd
from mxnet.gluon.data.vision import transforms as mxT
import time, os, shutil

import torch
import torch.nn as pytorchnn
import math
import torch.nn.functional as F
import torchvision
import torchvision.transforms as pytorchT
from torch import optim as pytorchoptim
from torch.backends import cudnn

cudnn.benchmark = True
ctx = mx.gpu()

print(mx.__version__)
print(torch.__version__)

1.4.1
1.1.0


#Builiding MXNet model

In [0]:
def mx_conv3x3(out_planes, stride=1):
    " 3x3 convolution with padding "
    return mxnn.Conv2D(out_planes, kernel_size=3, strides=stride, padding=1)


class mxBasicBlock(mxnn.HybridBlock):
    expansion = 1

    def __init__(self, planes, stride=1, downsample=None, **kwargs):
        super(mxBasicBlock, self).__init__(**kwargs)
        self.conv1 = mx_conv3x3(planes, stride)
        self.bn1 = mxnn.BatchNorm()
        self.conv2 = mx_conv3x3(planes)
        self.bn2 = mxnn.BatchNorm()
        self.downsample = downsample
        self.stride = stride

    def hybrid_forward(self, F, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = F.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out = F.relu(residual + out)

        return out


class mxResNet_Cifar(mxnn.HybridBlock):

    def __init__(self, block, layers, num_classes=10, **kwargs):
        super(mxResNet_Cifar, self).__init__(**kwargs)

        self.inplanes = 16
        self.conv1 = mxnn.Conv2D(
            16, kernel_size=3, strides=1, padding=1)
        self.bn1 = mxnn.BatchNorm()
        self.layer1 = self._make_layer(block, 16, layers[0])
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
        self.avgpool = mxnn.AvgPool2D(8, strides=1)
        self.fc = mxnn.Dense(num_classes)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = mxnn.HybridSequential()
            downsample.add(
                mxnn.Conv2D(planes * block.expansion,
                          kernel_size=1, strides=stride)
            )

        layers = mxnn.HybridSequential()
        layers.add(block(planes, stride, downsample))
        for _ in range(1, blocks):
            layers.add(block(planes))

        return layers

    def hybrid_forward(self, F, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.avgpool(x)
        x = self.fc(x)

        return x


def mx_resnet20_cifar(**kwargs):
    model = mxResNet_Cifar(mxBasicBlock, [3, 3, 3], **kwargs)
    return model

In [4]:
%%time
mxnet = mx_resnet20_cifar()
mxnet.initialize(ctx = ctx)

CPU times: user 1.24 s, sys: 575 ms, total: 1.82 s
Wall time: 1.81 s


In [5]:
x = nd.ones((1000, 3, 32, 32))
x.shape, x.dtype

((1000, 3, 32, 32), numpy.float32)

In [6]:
%%time
#before hybridization
mxnet(x.as_in_context(mx.gpu()))

CPU times: user 110 ms, sys: 51.7 ms, total: 161 ms
Wall time: 102 ms



[[ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 ...
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]]
<NDArray 1000x10 @gpu(0)>

###The above cell took 88.7ms to run, that's quit a lot of time run such a simple program. This is happening because mxnet model first initialize its parameters at the first run and then do the further computaion. But if you run the same cell again as below you'll see better performance becuse now mxnet model has already initialized its parameters.

In [7]:
%%time
#before hybridization
mxnet(x.as_in_context(mx.gpu()))

CPU times: user 13.2 ms, sys: 4.68 ms, total: 17.9 ms
Wall time: 13 ms



[[ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 ...
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]]
<NDArray 1000x10 @gpu(0)>

###So after initialization the final time taken by unhybridized mxnet model is 17.9 ms

###Hybridizing converts mxnet dynamic model to static.

In [0]:
mxnet.hybridize()

In [9]:
%%time
#after hybridization
mxnet(x.as_in_context(mx.gpu()))

CPU times: user 15.4 ms, sys: 2.05 ms, total: 17.4 ms
Wall time: 18 ms



[[ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 ...
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]]
<NDArray 1000x10 @gpu(0)>

###The above cell took alomst 20.2 ms to run. But if you run the same cell again as below you'll see better performance becuse now mxnet model has already initialized its parameters.

In [11]:
%%time
#after hybridization
mxnet(x.as_in_context(mx.gpu()))

CPU times: user 6.23 ms, sys: 124 µs, total: 6.35 ms
Wall time: 5.1 ms



[[ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 ...
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]
 [ 0.00054889 -0.00041725 -0.00197463 ... -0.00700838 -0.00223936
  -0.00076538]]
<NDArray 1000x10 @gpu(0)>

###So you see after hydridizing the model took 3.2 ms while before the model took 17.9 ms. That's a massive 5x performance boost.

#Building Pytorch model

In [0]:
def torch_conv3x3(in_planes, out_planes, stride=1):
    " 3x3 convolution with padding "
    return pytorchnn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1)


class pytorchBasicBlock(pytorchnn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(pytorchBasicBlock, self).__init__()
        self.conv1 = torch_conv3x3(inplanes, planes, stride)
        self.bn1 = pytorchnn.BatchNorm2d(planes)
        self.relu = pytorchnn.ReLU(inplace=True)
        self.conv2 = torch_conv3x3(planes, planes)
        self.bn2 = pytorchnn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class pytorchResNet_Cifar(pytorchnn.Module):

    def __init__(self, block, layers, num_classes=10):
        super(pytorchResNet_Cifar, self).__init__()
        self.inplanes = 16
        self.conv1 = pytorchnn.Conv2d(3, 16, kernel_size=3,
                               stride=1, padding=1)
        self.bn1 = pytorchnn.BatchNorm2d(16)
        self.relu = pytorchnn.ReLU(inplace=True)
        self.layer1 = self._make_layer(block, 16, layers[0])
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
        self.avgpool = pytorchnn.AvgPool2d(8, stride=1)
        self.fc = pytorchnn.Linear(64 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, pytorchnn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, pytorchnn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = pytorchnn.Sequential(
                pytorchnn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride),
                pytorchnn.BatchNorm2d(planes * block.expansion)
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return pytorchnn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def pytorch_resnet20_cifar(**kwargs):
    model = pytorchResNet_Cifar(pytorchBasicBlock, [3, 3, 3], **kwargs)
    return model

In [13]:
%%time
pytorch = pytorch_resnet20_cifar()
pytorch.cuda()

CPU times: user 1.87 s, sys: 632 ms, total: 2.5 s
Wall time: 2.51 s


In [0]:
x = torch.ones((1000, 3, 32, 32), dtype = torch.float32)

In [15]:
%%time
pytorch(x.cuda())

CPU times: user 30.2 ms, sys: 313 ms, total: 343 ms
Wall time: 703 ms


tensor([[ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        ...,
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421]],
       device='cuda:0', grad_fn=<AddmmBackward>)

###As you can see above pytorch took 832 ms. That is again beacuse of some internal initialization that occurs when you pass a value to your model first time. So I am gonna run above cell again.

In [16]:
%%time
pytorch(x.cuda())

CPU times: user 15.5 ms, sys: 8.28 ms, total: 23.8 ms
Wall time: 25.6 ms


tensor([[ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        ...,
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421],
        [ 0.3616, -0.3197, -1.3828,  ...,  0.2917,  0.0053, -0.8421]],
       device='cuda:0', grad_fn=<AddmmBackward>)

###So we can see that pytorch performance has improved and now its 28 ms.

###We can already see that MXNet's model(3.2 ms) is faster than Pytorch's model(28 ms)

#Training MXNet model

##Getting Dataset and Data loader 

In [0]:
normalize = mxT.Normalize(mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])

batch_size = 128

train_transfrom = mxT.Compose([
    mxT.RandomFlipLeftRight(),
    mxT.ToTensor(),
    normalize
])

val_transform = mxT.Compose([
    mxT.ToTensor(),
    normalize
])

mxtrainset = gluon.data.vision.datasets.CIFAR10(
    './data', train=True).transform_first(train_transfrom)
mxtrainloader = gluon.data.DataLoader(
    mxtrainset, batch_size=batch_size, shuffle=True, num_workers=2)

mxtestset = gluon.data.vision.datasets.CIFAR10(
    './data', train=False).transform_first(val_transform)
mxtestloader = gluon.data.DataLoader(
    mxtestset, batch_size=batch_size, shuffle=False, num_workers=2)

In [18]:
%%time
mxobjective = gluon.loss.SoftmaxCrossEntropyLoss()
mxoptimizer = gluon.Trainer(mxnet.collect_params(), 'adam', {'learning_rate': 0.001})

CPU times: user 3.7 ms, sys: 841 µs, total: 4.54 ms
Wall time: 4.78 ms


In [19]:
%%time
for epoch in range(10):
    for features, labels in mxtrainloader:
        with autograd.record():
            output = mxnet(features.as_in_context(ctx))
            loss = mxobjective(output, labels.as_in_context(ctx))
        loss.backward()
        mxoptimizer.step(batch_size)
    print('Epoch:', epoch, 'done.')

Epoch: 0 done.
Epoch: 1 done.
Epoch: 2 done.
Epoch: 3 done.
Epoch: 4 done.
Epoch: 5 done.
Epoch: 6 done.
Epoch: 7 done.
Epoch: 8 done.
Epoch: 9 done.
CPU times: user 2min 30s, sys: 46.4 s, total: 3min 16s
Wall time: 2min 43s


In [0]:
metric = mx.metric.Accuracy()
metric.reset()

In [22]:
%%time
for features, labels in mxtestloader:
    output = mxnet(features.as_in_context(ctx))
    metric.update(labels.as_in_context(ctx), output)

CPU times: user 1.46 s, sys: 824 ms, total: 2.29 s
Wall time: 2.87 s


In [23]:
metric.get()

('accuracy', 0.8122)

#Training Pytorchh Model

##Getting Dataset and Data loader

In [24]:
normalize = pytorchT.Normalize(mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])

train_transform = pytorchT.Compose([
    pytorchT.RandomHorizontalFlip(),
    pytorchT.ToTensor(),
    normalize
])

val_transform = pytorchT.Compose([
    pytorchT.ToTensor(),
    normalize
])

pytorch_trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform)
pytorch_trainloader = torch.utils.data.DataLoader(
    pytorch_trainset, batch_size=128, shuffle=True, num_workers=2)

pytorch_testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=val_transform)
pytorch_testloader = torch.utils.data.DataLoader(
    pytorch_testset, batch_size=128, shuffle=False, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


In [25]:
%%time
pytorch_criterion = pytorchnn.CrossEntropyLoss()
pytorch_optimizer = pytorchoptim.Adam(pytorch.parameters(), 0.001)

CPU times: user 952 µs, sys: 0 ns, total: 952 µs
Wall time: 960 µs


In [26]:
%%time
pytorch.train()
for epoch in range(10):
    for features, labels in pytorch_trainloader:
        output = pytorch(features.cuda())
        loss = pytorch_criterion(output, labels.cuda())
        pytorch_optimizer.zero_grad()
        loss.backward()
        pytorch_optimizer.step()        
    print('Epoch:', epoch, 'done.')

Epoch: 0 done.
Epoch: 1 done.
Epoch: 2 done.
Epoch: 3 done.
Epoch: 4 done.
Epoch: 5 done.
Epoch: 6 done.
Epoch: 7 done.
Epoch: 8 done.
Epoch: 9 done.
CPU times: user 3min 31s, sys: 29.8 s, total: 4min 1s
Wall time: 4min 9s


In [27]:
%%time
pytorch.eval()
correct = 0
total = 0
with torch.no_grad():
    for features, labels in pytorch_testloader:
        outputs = pytorch(features.cuda())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.cuda()).sum().item()

CPU times: user 1.02 s, sys: 373 ms, total: 1.39 s
Wall time: 2.76 s


###I've taken the above code from the official tutorials by pytorch from thier website.

In [28]:
print('Test Accuracy:', correct / total)

Test Accuracy: 0.796


#Conclusion:

*   MXNet's model took 2mins 43 secs!
*   Pytorch's model took 4mins 9 secs!
*   MXNet's model Test Accuracy: 81.2%
*   Pytorch's model Test Accuracy: 79.6%
----
##So according to this benchmark, it looks like MXNet is over 1.5x faster than Pytorch.

I think test accuracies of both models may vary with different training runs, because training depends on one important facter which is shuffling the data.... which happens randomly. So the point is that MXNet's model test accuracy would be always greater than Pytorch's model is clearly a worng statement.