In [None]:
import mxnet as mx
from mxnet.gluon import nn
from mxnet.gluon.contrib import nn as nn_contrib
from mxnet import nd
from mxnet import gluon
import numpy as np
# Device context shared by the teacher, the student and every batch below.
# This is a GPU context, so the notebook expects a GPU to be available.
ctx = mx.gpu()

## Temperature Softmax

In [None]:
def softmax(y, temperature=1.0):
    """Temperature-scaled softmax along axis 1.

    Parameters
    ----------
    y : NDArray
        Raw scores; axis 1 is the axis that gets normalised.
    temperature : float, optional
        Divisor applied to the scores before exponentiation (default 1.0).
        Larger values flatten the resulting distribution.

    Returns
    -------
    NDArray
        Array with the same shape as ``y`` whose entries along axis 1 sum to 1.
    """
    scaled = y / temperature
    # Shift by the per-row maximum before exponentiating. The result is
    # mathematically unchanged, but exp() can no longer overflow to inf,
    # which would otherwise turn the normalised output into NaN.
    shifted = nd.broadcast_sub(scaled, nd.max(scaled, axis=1, keepdims=True))
    exp = nd.exp(shifted)
    # keepdims keeps the reduced axis so the division broadcasts for any rank,
    # not only for 2-D inputs.
    partition = nd.sum(exp, axis=1, keepdims=True)
    return nd.broadcast_div(exp, partition)

## Pascal VOC Dataset

In [None]:
from source.NACDVOCDetection import NACDDetection

# Build the training and evaluation datasets from lists of (name, split) pairs.
# NOTE(review): the 2007 / 2012 entries look like Pascal VOC years and
# 'NACDwNegswAugCropped' like a project-specific set - confirm against NACDDetection.
train_dataset = NACDDetection(splits=[('NACDwNegswAugCropped', 'train'),(2007, 'trainval'), (2012, 'trainval')])
val_dataset = NACDDetection(splits=[('NACDwNegswAugCropped', 'test'),(2007, 'test')])

# Report how many samples each dataset exposes.
print('Training images:', len(train_dataset))
print('Test images:', len(val_dataset))

In [None]:
from gluoncv.data.transforms import presets
from gluoncv import utils
from mxnet import nd

In [None]:
# Resolution handed to both SSD preset transforms below.
width, height = 512, 512  # 512 is used as the base training size
# Separate GluonCV SSD presets for the training and the validation pipeline.
train_transform = presets.ssd.SSDDefaultTrainTransform(width, height)
val_transform = presets.ssd.SSDDefaultValTransform(width, height)

In [None]:
from gluoncv.data.batchify import Tuple, Stack, Pad
from mxnet.gluon.data import DataLoader

# Batch size is also passed to trainer.step() in the training loop.
batch_size = 24
# Number of worker processes used by each DataLoader.
num_workers = 4

# Field 0 of every sample is stacked into one array; field 1 is padded with -1
# so samples of different length fit in the same batch.
batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
# Training loader: shuffled; last_batch='rollover' carries an incomplete final
# batch over to the next epoch (see the Gluon DataLoader documentation).
train_loader = DataLoader(train_dataset.transform(train_transform), batch_size, shuffle=True,
                          batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
# Validation loader: fixed order; last_batch='keep' keeps the smaller final batch.
val_loader = DataLoader(val_dataset.transform(val_transform), batch_size, shuffle=False,
                        batchify_fn=batchify_fn, last_batch='keep', num_workers=num_workers)

# Sanity check: print the shapes of the first three validation batches.
for ib, batch in enumerate(val_loader):
    if ib > 2:
        break
    print('data:', batch[0].shape, 'label:', batch[1].shape)

## Teacher Network

In [None]:
from gluoncv import model_zoo
# Teacher network: GluonCV 'resnet50_v2' with pretrained weights, created on ctx.
# Its outputs are turned into soft labels in the training loop.
resnet50 = model_zoo.get_model('resnet50_v2', pretrained=True, ctx=ctx)

## Student Network

### Scale units

In [None]:
# Width multiplier of the student network. (A `global` statement is a no-op at
# module level, so the name is simply assigned here.)
alpha = 0.5
# Base channel count from which every student layer width is derived
# (32 at alpha == 1.0); int() keeps it usable as a layer size.
num_filter = int(32*alpha)

### Down-sampling Layers

In [None]:
def dp_layer(nfilters, stride, expension_constant):
    """Build a grouped 3x3 conv followed by a 1x1 conv, each with BatchNorm + ReLU.

    Parameters
    ----------
    nfilters : int
        Output channels and group count of the 3x3 convolution. The convolution
        is depthwise when the incoming tensor also has ``nfilters`` channels,
        which holds for every call in this notebook.
    stride : int
        Stride of the 3x3 convolution.
    expension_constant : int
        Multiplier applied to ``nfilters`` to get the 1x1 convolution's
        output channels.

    Returns
    -------
    nn.HybridSequential
        Six children in order: Conv2D, BatchNorm, Activation, Conv2D,
        BatchNorm, Activation.
    """
    # Callers derive widths arithmetically (e.g. num_filter/2). Under Python 3
    # that is a float such as 8.0, which is not a valid channel/group count,
    # so coerce to int before building the layers.
    nfilters = int(nfilters)
    expanded = int(nfilters * expension_constant)

    out = nn.HybridSequential()
    # 3x3 grouped convolution, one group per output channel.
    out.add(nn.Conv2D(nfilters, 3, strides=stride, padding=1, groups=nfilters, use_bias=False))
    out.add(nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1))
    out.add(nn.Activation('relu'))
    # 1x1 convolution that changes the channel count by expension_constant.
    out.add(nn.Conv2D(expanded, 1, strides=1, padding=0, use_bias=False))
    out.add(nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1))
    out.add(nn.Activation('relu'))
    return out

### Body

In [None]:
def s16():
    """Build the first stage of the student network and load its weights.

    The stage has 15 children: a strided 3x3 stem convolution,
    LinearBottleneck (3), and the first part of LinearBottleneck (4), ending
    on a stride-2 grouped 3x3 convolution. Weights are read from
    ``weights/mobilenet_v2_0_5_s16.params``, so the order and shapes of the
    children must stay exactly as they are.

    Returns
    -------
    nn.HybridSequential
        The stage with pretrained parameters loaded.
    """
    def batch_norm():
        return nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1)

    def relu():
        return nn.Activation('relu')

    # Floor division keeps the channel counts integral; `/` would produce
    # floats under Python 3, which are not valid layer sizes.
    half = num_filter // 2
    expanded = num_filter * 3

    out = nn.HybridSequential()
    with out.name_scope():
        # Stem: strided 3x3 convolution.
        out.add(nn.Conv2D(num_filter, kernel_size=3, strides=2, padding=1, use_bias=False))
        out.add(batch_norm())
        out.add(relu())

        # (3) LinearBottleneck
        out.add(dp_layer(num_filter, 1, 1))
        out.add(batch_norm())
        out.add(relu())
        out.add(nn.Conv2D(num_filter, kernel_size=3, strides=1, padding=1, groups=num_filter, use_bias=False))
        out.add(batch_norm())
        out.add(relu())
        out.add(nn.Conv2D(half, kernel_size=1, strides=1, padding=0, use_bias=False))
        out.add(batch_norm())

        # (4) LinearBottleneck, first part (continued in s32)
        out.add(dp_layer(half, 1, 6))
        out.add(batch_norm())
        out.add(relu())
        out.add(nn.Conv2D(expanded, kernel_size=3, strides=2, padding=1, groups=expanded, use_bias=False))

        out.load_parameters("weights/mobilenet_v2_0_5_s16.params")
        return out

In [None]:
def s32():
    """Build the second stage of the student network and load its weights.

    The stage has 16 children: the rest of LinearBottleneck (4),
    LinearBottleneck (5), and the first part of LinearBottleneck (6), ending
    on a stride-2 grouped 3x3 convolution. ``forward_mobile`` addresses the
    children by position (child 3 is the shortcut source, child 11 is where
    it is added back), and weights are read from
    ``weights/mobilenet_v2_0_5_s32.params``, so the order and shapes of the
    children must stay exactly as they are.

    Returns
    -------
    nn.HybridSequential
        The stage with pretrained parameters loaded.
    """
    def batch_norm():
        return nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1)

    def relu():
        return nn.Activation('relu')

    # Floor division keeps the channel counts integral; `/` would produce
    # floats under Python 3, which are not valid layer sizes.
    narrow = num_filter * 3 // 4
    wide = num_filter * 9 // 2
    projected = num_filter * 6 // 8

    out = nn.HybridSequential()
    with out.name_scope():
        # (4) LinearBottleneck, continued from s16
        out.add(batch_norm())
        out.add(relu())
        out.add(nn.Conv2D(narrow, kernel_size=1, strides=1, padding=0, use_bias=False))
        out.add(batch_norm())  # child 3: shortcut source (conv2_2_linear_scale)

        # (5) LinearBottleneck
        out.add(dp_layer(narrow, 1, 6))
        out.add(batch_norm())
        out.add(relu())
        out.add(nn.Conv2D(wide, kernel_size=3, strides=1, padding=1, groups=wide, use_bias=False))
        out.add(batch_norm())
        out.add(relu())
        out.add(nn.Conv2D(projected, kernel_size=1, strides=1, padding=0, use_bias=False))
        out.add(batch_norm())  # child 11: the shortcut is added after this layer

        # (6) LinearBottleneck, first part (continued in fc)
        out.add(dp_layer(projected, 1, 6))
        out.add(batch_norm())
        out.add(relu())
        out.add(nn.Conv2D(wide, kernel_size=3, strides=2, padding=1, groups=wide, use_bias=False))

        out.load_parameters("weights/mobilenet_v2_0_5_s32.params")
        return out

In [None]:
def fc():
    """Build the last stage of the student network and load its weights.

    Layout (114 children, in this order):

    * 4 children finishing LinearBottleneck (6): BatchNorm, ReLU, 1x1 conv, BatchNorm;
    * LinearBottlenecks (7) to (19), 8 children each: ``dp_layer``, BatchNorm,
      ReLU, grouped 3x3 conv, BatchNorm, ReLU, 1x1 conv, BatchNorm;
    * 6 head children: 1x1 conv to ``num_filter*80`` channels, BatchNorm, ReLU,
      global average pooling, 1x1 conv to 1000 channels, Flatten.

    ``forward_mobile`` addresses the children by position for its residual
    connections, and weights are read from ``weights/mobilenet_v2_0_5_fc.params``,
    so the order and shapes of the children must not change.

    Returns
    -------
    nn.HybridSequential
        The stage with pretrained parameters loaded.
    """
    def batch_norm():
        return nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1)

    def relu():
        return nn.Activation('relu')

    def pointwise(channels):
        return nn.Conv2D(channels, kernel_size=1, strides=1, padding=0, use_bias=False)

    def grouped_3x3(channels, stride):
        return nn.Conv2D(channels, kernel_size=3, strides=stride, padding=1, groups=channels, use_bias=False)

    # One row per bottleneck: (input width, 3x3 stride, output width),
    # widths expressed as multiples of num_filter.
    bottlenecks = (
        (1, 1, 1),    # (7)
        (1, 1, 1),    # (8)
        (1, 1, 2),    # (9)
        (2, 1, 2),    # (10)
        (2, 1, 2),    # (11)
        (2, 1, 2),    # (12)
        (2, 2, 3),    # (13)
        (3, 1, 3),    # (14)
        (3, 1, 3),    # (15)
        (3, 2, 5),    # (16)
        (5, 1, 5),    # (17)
        (5, 1, 5),    # (18)
        (5, 1, 10),   # (19)
    )

    out = nn.HybridSequential()
    with out.name_scope():
        # (6) LinearBottleneck, continued from s32
        out.add(batch_norm())
        out.add(relu())
        out.add(pointwise(num_filter))
        out.add(batch_norm())

        # (7) .. (19) LinearBottleneck
        for in_mult, stride, out_mult in bottlenecks:
            width = num_filter * in_mult
            out.add(dp_layer(width, 1, 6))
            out.add(batch_norm())
            out.add(relu())
            out.add(grouped_3x3(width * 6, stride))
            out.add(batch_norm())
            out.add(relu())
            out.add(pointwise(num_filter * out_mult))
            out.add(batch_norm())

        # Head: widen, pool globally, project to 1000 outputs.
        out.add(pointwise(num_filter * 80))
        out.add(batch_norm())
        out.add(relu())
        out.add(nn.GlobalAvgPool2D())

        out.add(pointwise(1000))
        out.add(nn.Flatten())
        out.load_parameters("weights/mobilenet_v2_0_5_fc.params")
        return out

In [None]:
def forward_mobile(x, s16, s32, fc, temperature):
    """Run the student forward pass, wiring the residual connections by child index.

    Parameters
    ----------
    x : NDArray
        Input batch.
    s16 : callable
        First stage; applied to ``x`` as a whole.
    s32 : indexable of 16 callables
        Second stage; children are applied one by one.
    fc : indexable of 114 callables
        Last stage; children are applied one by one.
    temperature : number
        Accepted for interface compatibility; not used here (the caller
        applies the temperature softmax to the returned scores).

    Returns
    -------
    NDArray
        Output of the last child of ``fc``.
    """
    x = s16(x)

    # s32 has a single residual connection: the output of child 3 is added
    # back after child 11.
    for i in range(16):
        x = s32[i](x)
        if i == 3:
            shortcut = x
        elif i == 11:
            x = mx.nd.broadcast_add(shortcut, x)

    # fc: a new shortcut chain starts at the children listed in
    # `shortcut_starts` (where the channel count or resolution changes).
    # At every child in `residual_adds` the current shortcut is added to the
    # output, and that sum becomes the shortcut for the next add.
    shortcut_starts = (3, 27, 59, 83)
    residual_adds = (11, 19, 35, 43, 51, 67, 75, 91, 99)
    for i in range(114):
        x = fc[i](x)
        if i in shortcut_starts:
            shortcut = x
        elif i in residual_adds:
            x = mx.nd.broadcast_add(shortcut, x)
            shortcut = x

    return x

In [None]:
class mnet(gluon.Block):
    """Student network: the s16, s32 and fc stages chained by ``forward_mobile``."""

    def __init__(self, temperature, **kwargs):
        """Create the three stages and remember ``temperature``.

        ``temperature`` is stored and handed to ``forward_mobile`` on every
        call; extra keyword arguments go to ``gluon.Block``.
        """
        super(mnet, self).__init__(**kwargs)
        # Plain attribute, not a child block.
        self.temperature = temperature
        with self.name_scope():
            # Stage builders, registered as children in this order.
            self.s16 = s16()
            self.s32 = s32()
            self.fc = fc()

    def forward(self, x):
        """Return the student's output for the batch ``x``."""
        output = forward_mobile(x, self.s16, self.s32, self.fc, self.temperature)
        return output

## Training

In [None]:
# The training loop iterates over range(start_epoch, epochs).
start_epoch = 0
epochs = 350

In [None]:
# Distillation loss between student and teacher distributions.
# from_logits=True: predictions are given as log-probabilities (the training
# loop passes nd.log(p)); sparse_label=False: labels are full probability
# distributions rather than class indices.
sce = mx.gluon.loss.SoftmaxCrossEntropyLoss(from_logits=True, sparse_label=False)
#l2 = mx.gluon.loss.L2Loss()

In [None]:
import os
import time
import numpy as np
from mxnet import autograd as ag

# Softmax temperature applied to both teacher and student outputs.
temperature = 16

# A checkpoint is written to 'process/' after every epoch. Create the folder
# up front so a missing directory cannot abort the run after a full epoch.
if not os.path.isdir('process'):
    os.makedirs('process')

# The stage builders load their own pretrained weights; move them to ctx.
# To resume from a checkpoint instead, load it here, e.g.
# net_mobile.load_parameters("process/net_mobile_epoch_99.params").
net_mobile = mnet(temperature)
net_mobile.collect_params().reset_ctx(ctx)
trainer = gluon.Trainer(net_mobile.collect_params(), 'sgd', {'learning_rate': 1e-1, 'wd': 4e-5})

for epoch in range(start_epoch, epochs):
    # ---- training pass ----
    tic = time.time()
    train_loss = 0
    train_mae = mx.metric.MAE()
    for batch in train_loader:
        x = batch[0].as_in_context(ctx)
        # Teacher soft labels: computed outside the recorded scope and
        # detached, so no gradient flows into the teacher.
        slbl = softmax(resnet50(x),temperature=temperature).detach()
        # Only the student forward pass and the loss need to be recorded.
        with ag.record():
            p = softmax(net_mobile(x),temperature=temperature)
            rloss = sce(nd.log(p), slbl)
        rloss.backward()
        trainer.step(batch_size)
        # Bookkeeping stays outside the recorded scope so it is not traced.
        train_loss += nd.sum(rloss).asscalar()
        train_mae.update(preds=p, labels=slbl)
    btic = time.time()

    # ---- validation pass (no gradient recording) ----
    val_loss = 0
    val_mae = mx.metric.MAE()
    for batch in val_loader:
        x = batch[0].as_in_context(ctx)
        slbl = softmax(resnet50(x),temperature=temperature)
        p = softmax(net_mobile(x),temperature=temperature)
        rloss = sce(nd.log(p), slbl)
        val_loss += nd.sum(rloss).asscalar()
        val_mae.update(preds=p, labels=slbl)

    # Speed is training samples per second (validation time is excluded).
    print("%3d;Loss:%f;Val_loss:%f;Speed:%s;Train_mae:%.6e;Val_mae:%.6e" % (epoch, train_loss/len(train_dataset), val_loss/len(val_dataset), round(len(train_dataset)/(btic-tic)), train_mae.get()[1], val_mae.get()[1]))
    # Save the student's parameters after every epoch.
    net_mobile.save_parameters('process/net_mobile_epoch_%d.params' % (epoch))