In [None]:
import mxnet as mx
from mxnet.gluon import nn
from mxnet.gluon.contrib import nn as nn_contrib
from mxnet import nd
from mxnet import gluon
import numpy as np
ctx = mx.gpu()

## Temperature Softmax

In [None]:
def softmax(y, temperature=1.0):
    """Temperature-scaled softmax over axis 1.

    Parameters
    ----------
    y : NDArray
        Scores (logits); normalisation is performed along axis 1.
    temperature : float, default 1.0
        Divisor applied to ``y`` before exponentiation. Values above 1
        flatten the resulting distribution, values below 1 sharpen it.

    Returns
    -------
    NDArray
        Same shape as ``y``; entries along axis 1 sum to 1.
    """
    scaled = y / temperature
    # Shifting every row by its own maximum does not change the softmax value
    # (the factor cancels in the ratio) but keeps exp() from overflowing to
    # inf, which would otherwise turn the whole row into NaN.
    shifted = scaled - nd.max(scaled, axis=1, keepdims=True)
    exp = nd.exp(shifted)
    partition = nd.sum(exp, axis=1, keepdims=True)
    return exp / partition

## Pascal VOC Dataset

In [None]:
from source.NACDVOCDetection import NACDDetection

# Every dataset is assembled from a list of (source, subset) pairs that is
# handed to NACDDetection unchanged.
train_dataset = NACDDetection(
    splits=[
        ('NACDwNegswAugCropped', 'train'),
        (2007, 'trainval'),
        (2012, 'trainval'),
    ]
)
val_dataset = NACDDetection(
    splits=[
        ('NACDwNegswAugCropped', 'val'),
        (2007, 'val'),
    ]
)
test_dataset = NACDDetection(
    splits=[
        ('NACDwNegswAugCropped', 'test'),
        (2007, 'test'),
    ]
)

# Report how many samples each split contains.
for _title, _split in (
    ('Training images:', train_dataset),
    ('Val images:', val_dataset),
    ('Test images:', test_dataset),
):
    print(_title, len(_split))

In [None]:
from gluoncv.data.transforms import presets
from gluoncv import utils
from mxnet import nd

In [None]:
# Input size handed to both SSD preset transforms below.
width, height = 640, 640  # 640x640 is used as the base training size
# Training-time preset transform built for the size above.
train_transform = presets.ssd.SSDDefaultTrainTransform(width, height)
# Validation-time preset transform; also reused for the test loader later on.
val_transform = presets.ssd.SSDDefaultValTransform(width, height)

In [None]:
from gluoncv.data.batchify import Tuple, Stack, Pad
from mxnet.gluon.data import DataLoader

batch_size = 32
num_workers = 4

# Each sample is a pair: the first field is stacked into one array, the second
# field is padded with -1 so that samples of different length fit one batch.
batchify_fn = Tuple(Stack(), Pad(pad_val=-1))

# Training: shuffled; an incomplete final batch is rolled over to the next epoch.
train_loader = DataLoader(
    train_dataset.transform(train_transform),
    batch_size,
    shuffle=True,
    batchify_fn=batchify_fn,
    last_batch='rollover',
    num_workers=num_workers,
)

# Validation / test: fixed order, the final partial batch is kept.
_eval_loader_kwargs = dict(
    shuffle=False,
    batchify_fn=batchify_fn,
    last_batch='keep',
    num_workers=num_workers,
)
val_loader = DataLoader(val_dataset.transform(val_transform), batch_size, **_eval_loader_kwargs)
test_loader = DataLoader(test_dataset.transform(val_transform), batch_size, **_eval_loader_kwargs)

# Sanity check: show the shapes of the first three test batches only.
for ib, batch in enumerate(test_loader):
    if ib >= 3:
        break
    print('data:', batch[0].shape, 'label:', batch[1].shape)

## Teacher Network

In [None]:
from gluoncv import model_zoo
# Teacher: 'resnet50_v2' from the GluonCV model zoo, loaded with its pretrained
# weights directly onto `ctx`. It is only ever run forward in this notebook.
resnet50 = model_zoo.get_model('resnet50_v2', pretrained=True, ctx=ctx)

## Student Network

### Scale units

In [None]:
# Width multiplier applied to the student's channel counts.
# (A `global` statement is meaningless at module/cell level, so none is used;
# plain assignment already creates the module-level name.)
alpha = 0.25
# Base channel count: 32 scaled by alpha, truncated to an integer.
num_filters = int(32*alpha)

### Down-sampling Layers

In [None]:
def dp_layer(nfilters, stride, expension_constant):
    """Build a two-stage conv unit: grouped 3x3 conv followed by a 1x1 conv.

    Parameters
    ----------
    nfilters : int
        Output channels (and group count) of the 3x3 convolution.
    stride : int
        Stride of the 3x3 convolution.
    expension_constant : int
        Multiplier for the 1x1 convolution's output channels
        (``nfilters * expension_constant``).

    Returns
    -------
    nn.HybridSequential
        conv3x3 -> BN -> ReLU -> conv1x1 -> BN -> ReLU (not hybridized here).
    """
    unit = nn.HybridSequential()
    bn_settings = dict(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1)
    stages = [
        # 3x3 conv with one group per output channel.
        nn.Conv2D(channels=nfilters, kernel_size=3, strides=stride, padding=1,
                  groups=nfilters, use_bias=False),
        nn.BatchNorm(**bn_settings),
        nn.Activation('relu'),
        # 1x1 conv that mixes channels and sets the unit's output width.
        nn.Conv2D(channels=nfilters * expension_constant, kernel_size=1, strides=1,
                  padding=0, use_bias=False),
        nn.BatchNorm(**bn_settings),
        nn.Activation('relu'),
    ]
    for stage in stages:
        unit.add(stage)
    return unit

### Body

In [None]:
from mxnet import gluon
def s16():
    """Build the first stage of the student network.

    Layout: a stride-2 3x3 conv with BN/ReLU, three ``dp_layer`` units
    (conv_1..conv_3) and a final stride-2 grouped 3x3 conv whose BN/ReLU is
    applied at the start of the next stage.

    Returns
    -------
    nn.HybridSequential
        The hybridized stage; channel widths derive from ``num_filters``.
    """
    stage = nn.HybridSequential()
    layers = [
        # conv_0 layer
        nn.Conv2D(channels=num_filters, kernel_size=3, strides=2, padding=1, use_bias=False),
        nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1),
        nn.Activation('relu'),
        # conv_1 layer
        dp_layer(num_filters, 1, 2),
        # conv_2 layer
        dp_layer(num_filters * 2, 2, 2),
        # conv_3 layer, followed by the grouped conv that opens the next unit
        dp_layer(num_filters * 4, 1, 1),
        nn.Conv2D(channels=num_filters * 4, kernel_size=3, strides=2, padding=1,
                  groups=num_filters * 4, use_bias=False),
    ]
    for layer in layers:
        stage.add(layer)
    # Optional pretrained weights (disabled):
    # stage.load_parameters("weights/mobilenet_0_25_s16_org.params")
    stage.hybridize()
    return stage

def s32():
    """Build the second stage of the student network.

    Starts with the BN/ReLU and 1x1 conv that complete the grouped conv ending
    the previous stage, adds one ``dp_layer`` unit (conv_4) and finishes with a
    stride-2 grouped 3x3 conv.

    Returns
    -------
    nn.HybridSequential
        The hybridized stage; channel widths derive from ``num_filters``.
    """
    width = num_filters * 8
    stage = nn.HybridSequential()
    layers = [
        # from last layer
        nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1),
        nn.Activation('relu'),
        nn.Conv2D(channels=width, kernel_size=1, strides=1, padding=0, use_bias=False),
        nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1),
        nn.Activation('relu'),
        # conv_4 layer, followed by the grouped conv that opens the next unit
        dp_layer(width, 1, 1),
        nn.Conv2D(channels=width, kernel_size=3, strides=2, padding=1,
                  groups=width, use_bias=False),
    ]
    for layer in layers:
        stage.add(layer)
    # Optional pretrained weights (disabled):
    # stage.load_parameters("weights/mobilenet_0_25_s32_org.params")
    stage.hybridize()
    return stage

def fc():
    """Build the final stage and classifier head of the student network.

    Starts with the BN/ReLU and 1x1 conv that complete the grouped conv ending
    the previous stage, stacks ``dp_layer`` units conv_5..conv_11, then applies
    global average pooling, flattening and a 1000-unit dense layer.

    Returns
    -------
    nn.HybridSequential
        The hybridized stage producing 1000 raw scores per sample.
    """
    width = num_filters * 16
    head = nn.HybridSequential()

    # from last layer
    head.add(nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1))
    head.add(nn.Activation('relu'))
    head.add(nn.Conv2D(channels=width, kernel_size=1, strides=1, padding=0, use_bias=False))
    head.add(nn.BatchNorm(use_global_stats=False, epsilon=1e-05, momentum=0.9, axis=1))
    head.add(nn.Activation('relu'))

    # conv_5 .. conv_9: five identical stride-1 units at constant width
    for _ in range(5):
        head.add(dp_layer(width, 1, 1))
    # conv_10: stride 2, doubles the channel count
    head.add(dp_layer(width, 2, 2))
    # conv_11
    head.add(dp_layer(num_filters * 32, 1, 1))

    # classifier
    head.add(nn.GlobalAvgPool2D())
    head.add(nn.Flatten())
    head.add(nn.Dense(1000))

    # Optional pretrained weights (disabled):
    # head.load_parameters("weights/mobilenet_0_25_fc_org.params")
    head.hybridize()
    return head

In [None]:
def forward_mobile(x, s16, s32, fc):
    """Run ``x`` through the three stages in order and return the result.

    Parameters
    ----------
    x : input passed to the first stage.
    s16, s32, fc : callables applied one after another, each receiving the
        previous stage's output.

    Returns
    -------
    The output of ``fc``.
    """
    return fc(s32(s16(x)))

In [None]:
class mnet(gluon.Block):
    """Student network: the ``s16``, ``s32`` and ``fc`` stages chained together."""

    def __init__(self, **kwargs):
        """Create the three stages as child blocks inside this block's name scope."""
        super(mnet, self).__init__(**kwargs)
        with self.name_scope():
            self.s16 = s16()
            self.s32 = s32()
            self.fc = fc()

    def forward(self, x):
        """Feed ``x`` through s16 -> s32 -> fc and return the final output."""
        features = self.s16(x)
        features = self.s32(features)
        return self.fc(features)

## Training

In [None]:
# Epoch range for every training run: range(start_epoch, epochs).
start_epoch, epochs = 0, 100

In [None]:
# Cross-entropy between two full distributions:
#   from_logits=True   -> predictions are passed in as log-probabilities
#   sparse_label=False -> labels are dense per-class vectors, not class indices
sce = gluon.loss.SoftmaxCrossEntropyLoss(
    sparse_label=False,
    from_logits=True,
)
# Alternative that was tried and disabled: mx.gluon.loss.L2Loss()

In [None]:
import time
import numpy as np
from mxnet import autograd as ag
net_mobile = mnet()
# NOTE(review): 'process/net_mobile_init.params' is loaded below but is not
# written anywhere in the visible notebook -- it must already exist on disk.


def _evaluate_distillation(loader):
    """Score the student against the teacher over ``loader`` at temperature 1.

    Parameters
    ----------
    loader : iterable of batches whose first element is the image tensor.

    Returns
    -------
    (float, mx.metric.MAE)
        Cross-entropy summed over all samples, and the MAE metric between
        student and teacher probabilities.
    """
    total_loss = 0
    mae = mx.metric.MAE()
    for batch in loader:
        x = batch[0].as_in_context(ctx)
        slbl = softmax(resnet50(x), temperature=1)
        # log_softmax is computed in one step, so a probability that
        # underflows to 0 no longer produces log(0) = -inf (and NaN losses).
        log_p = nd.log_softmax(net_mobile(x), axis=1)
        rloss = sce(log_p, slbl)
        total_loss += nd.sum(rloss).asscalar()
        mae.update(preds=nd.exp(log_p), labels=slbl)
    return total_loss, mae


for temperature in range(12,24,4):
    # Other optimizers that were tried: ("SGD","RMSProp","Adam")
    for optimizer in ("SGD",):
        for lr in range(1,2,1):
            # Every configuration starts from the same initial weights.
            net_mobile.load_parameters('process/net_mobile_init.params')
            print('temperature=%d #### optimizer=%s #### lr=%.3f ####\n' % (temperature, optimizer, 0.1**lr))
            net_mobile.collect_params().reset_ctx(ctx)
            trainer = gluon.Trainer(net_mobile.collect_params(), optimizer, {'learning_rate': 0.1**lr, 'wd': 4e-5})
            for epoch in range(start_epoch, epochs):
                tic = time.time()

                # ---- training pass ----
                train_loss = 0
                train_mae = mx.metric.MAE()
                for batch in train_loader:
                    x = batch[0].as_in_context(ctx)
                    # Soft targets from the teacher; computed outside the
                    # recorded scope so no gradient flows into the teacher.
                    slbl = softmax(resnet50(x),temperature=temperature).detach()
                    # Only the student's forward pass and the loss are recorded.
                    with ag.record():
                        log_p = nd.log_softmax(net_mobile(x) / temperature, axis=1)
                        rloss = sce(log_p, slbl)
                    # Backpropagate and update once recording has ended.
                    rloss.backward()
                    trainer.step(batch_size)
                    # Bookkeeping forces a device sync, so it is kept out of
                    # the recorded scope.
                    train_loss += nd.sum(rloss).asscalar()
                    train_mae.update(preds=nd.exp(log_p), labels=slbl)
                btic = time.time()

                # ---- evaluation passes (temperature 1) ----
                val_loss, val_mae = _evaluate_distillation(val_loader)
                test_loss, test_mae = _evaluate_distillation(test_loader)

                print("%3d;Loss:%f;Val_loss:%f;Test_loss:%f;Speed:%s;Train_mae:%.6e;Val_mae:%.6e;Test_mae:%.6e" % (epoch, train_loss/len(train_dataset), val_loss/len(val_dataset), test_loss/len(test_dataset), round(len(train_dataset)/(btic-tic)), train_mae.get()[1], val_mae.get()[1], test_mae.get()[1]))
                # Checkpoint the student after every epoch.
                net_mobile.save_parameters('process/net_mobile_temp_%d_opt_%s_lr_%d_epoch_%d.params' % (temperature, optimizer, lr, epoch))
            # The metrics are re-created at the start of each epoch, so no
            # explicit reset is needed here.