In [1]:
import os
import math
import mxnet as mx
from mxnet import image
from mxnet import nd, gluon, autograd, init
from mxnet.gluon.data.vision import ImageFolderDataset
from mxnet.gluon.data import DataLoader
from mxnet.gluon import nn
from tensorboardX import SummaryWriter
import numpy as np
import shutil
import _pickle as cPickle
from sklearn import preprocessing
from mxnet.gluon.parameter import Parameter, ParameterDict
from common.util import download_file
import subprocess
import time

from IPython.core.debugger import Tracer

In [2]:
def unpickle(file):
    fo = open(file, 'rb')
    dict = cPickle.load(fo, encoding='bytes')
    fo.close()
    return dict

In [3]:
class Options:
    def __init__(self):
        self.seed_val = 0
        self.nlabels = 128000
        self.batch_size = 16
        self.data_dir = '/tanData/datasets/imagenet/data'
        self.log_dir = '/tanData/logs'
        self.model_dir ='/tanData/models'
        self.exp_name = 'imagenet_resnext_lddrm_mm_pathnorm_nlabels_%i_seed_%i'%(self.nlabels, self.seed_val)
        self.gpus = 2
        self.first_gpu = 1
        
        self.alpha_drm = 0.5
        self.alpha_pn = 1.0
        self.alpha_kl = 0.5
        self.alpha_mm = 0.5
        
opt = Options()

In [4]:
batch_size = opt.batch_size
batch_size *= max(1, opt.gpus)
ctx = [mx.gpu(i+opt.first_gpu) for i in range(opt.gpus)] if opt.gpus > 0 else [mx.cpu()]
# ctx = [mx.gpu(1), mx.gpu(4)]
kv = mx.kvstore.create('device')

In [5]:
def gpu_device(ctx=mx.gpu(0)):
    try:
        _ = mx.nd.array([1, 2, 3], ctx=ctx)
    except mx.MXNetError:
        return None
    return ctx

for i, gpu in enumerate(ctx):
    assert gpu_device(gpu), 'GPU device %i is not available!'%(i + self.first_gpu)

In [6]:
log_dir = os.path.join(opt.log_dir, opt.exp_name)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

In [7]:
train_data_unsup = mx.io.ImageRecordIter(
    path_imgrec=os.path.join(opt.data_dir, 'train_unsup_480.rec'),
    label_width=1,
    data_name='data',
    label_name='softmax_label',
    data_shape=(3, 224, 224),
    batch_size=batch_size // 2,
    mean_r=123.68,
    mean_g=116.779,
    mean_b=103.939,
    std_r=58.395,
    std_g=57.12,
    std_b=57.375,
    pad=0,
    fill_value=0,
    shuffle=True,
    rand_crop=True,
    rand_mirror=True,
    max_random_scale=1.0,
    min_random_scale=0.533,
    max_aspect_ratio=4.0/3.0,
    min_aspect_ratio=3.0/4.0,
    max_rotate_angle=10,
    brightness=0.4,
    contrast=0.4,
    saturation=0.4,
    random_h=9,
    num_parts=kv.num_workers,
    part_index=kv.rank,
    preprocess_threads=32)

train_data_sup = mx.io.ImageRecordIter(
    path_imgrec=os.path.join(opt.data_dir, 'train_sup_480.rec'),
    label_width=1,
    data_name='data',
    label_name='softmax_label',
    data_shape=(3, 224, 224),
    batch_size=batch_size // 2,
    mean_r=123.68,
    mean_g=116.779,
    mean_b=103.939,
    std_r=58.395,
    std_g=57.12,
    std_b=57.375,
    pad=0,
    fill_value=0,
    shuffle=True,
    rand_crop=True,
    rand_mirror=True,
    max_random_scale=1.0,
    min_random_scale=0.533,
    max_aspect_ratio=4.0/3.0,
    min_aspect_ratio=3.0/4.0,
    max_rotate_angle=10,
    brightness=0.4,
    contrast=0.4,
    saturation=0.4,
    random_h=9,
    num_parts=kv.num_workers,
    part_index=kv.rank,
    preprocess_threads=32)

valid_data = mx.io.ImageRecordIter(
    path_imgrec=os.path.join(opt.data_dir,'val_256.rec'),
    label_width=1,
    data_name='data',
    label_name='softmax_label',
    batch_size=batch_size,
    data_shape=(3, 224, 224),
    rand_crop=False,
    rand_mirror=False,
    fill_value=0,
    mean_r=123.68,
    mean_g=116.779,
    mean_b=103.939,
    std_r=58.395,
    std_g=57.12,
    std_b=57.375,
    num_parts=kv.num_workers,
    part_index=kv.rank,
    preprocess_threads=32)

In [8]:
criterion = gluon.loss.SoftmaxCrossEntropyLoss()
L2_loss = gluon.loss.L2Loss()
L1_loss = gluon.loss.L1Loss()

In [9]:
class Normal(mx.init.Initializer):
    """Initializes weights with random values sampled from a normal distribution
    with a mean and standard deviation of `sigma`.
    """
    def __init__(self, mean=0, sigma=0.01):
        super(Normal, self).__init__(sigma=sigma)
        self.sigma = sigma
        self.mean = mean

    def _init_weight(self, _, arr):
        mx.random.normal(self.mean, self.sigma, out=arr)

In [10]:
# from resnet import ResNet164_v2
from mxnet.gluon.model_zoo import vision as models
from resnext_ld_opt import ResNext

In [11]:
acc_top1 = mx.metric.Accuracy()
acc_top5 = mx.metric.TopKAccuracy(5)

In [12]:
import datetime
writer = SummaryWriter(os.path.join(opt.log_dir, opt.exp_name))

def test(net, val_data, ctx):
    acc_top1_val = mx.metric.Accuracy()
    acc_top5_val = mx.metric.TopKAccuracy(5)
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
        outputs = []
        for x, y in zip(data, label):
            output, _, _, _ = net(x, y)
            outputs.append(output)
            
        acc_top1_val.update(label, outputs)
        acc_top5_val.update(label, outputs)

    _, top1 = acc_top1_val.get()
    _, top5 = acc_top5_val.get()
    return (top1, top5)

In [13]:
def linear_rampup(current, rampup_length):
    """Linear rampup"""
    assert current >= 0 and rampup_length >= 0
    if current >= rampup_length:
        return 1.0
    else:
        return current / rampup_length

def cosine_rampdown(current, rampdown_length):
    """Cosine rampdown from https://arxiv.org/abs/1608.03983"""
    assert 0 <= current <= rampdown_length
    return float(.5 * (np.cos(np.pi * current / rampdown_length) + 1))

def adjust_learning_rate(optimizer, epoch, step_in_epoch, total_steps_in_epoch):
    lr = opt.lr
    epoch = epoch + step_in_epoch / total_steps_in_epoch

    # LR warm-up to handle large minibatch sizes from https://arxiv.org/abs/1706.02677
    lr = linear_rampup(epoch, opt.lr_rampup) * (opt.lr - opt.initial_lr) + opt.initial_lr

    # Cosine LR rampdown from https://arxiv.org/abs/1608.03983 (but one cycle only)
    if opt.lr_rampdown_epochs:
        lr *= cosine_rampdown(epoch, opt.lr_rampdown_epochs)

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [14]:
def train(net, train_data_sup, train_data_unsup, valid_data, num_epochs, lr, wd, ctx, lr_decay):
    trainer = gluon.Trainer(
        net.collect_params(), 'adam', {'learning_rate': 0.001, 'wd': wd})
    
    prev_time = datetime.datetime.now()
    best_top1_val = 0.
    best_top5_val = 0.
    log_interval = 1000
    
#     # Learning rate decay factor
#     lr_decay = 0.1
#     # Epochs where learning rate decays
#     lr_decay_epoch = [30, 60, 90, np.inf]
    
    for epoch in range(num_epochs):
        train_data_sup.reset()
        train_data_unsup.reset()
        
        tic = time.time()
        btic = time.time()
        acc_top1.reset()
        acc_top5.reset()
        train_loss = 0
        num_batch = 0
        
        if epoch == 2:
            sgd_lr = 0.15
            decay_val = np.exp(np.log(sgd_lr / 0.0001) / (num_epochs - 2 - 2))
            sgd_lr = sgd_lr * decay_val
            
            # Nesterov accelerated gradient descent
            optimizer = 'nag'
            # Set parameters
            optimizer_params = {'learning_rate': sgd_lr , 'wd': wd, 'momentum': 0.9}

            # Define our trainer for net
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
            
        if epoch >= 2:
            trainer.set_learning_rate(trainer.learning_rate / decay_val)
        
        for i, (batch_unsup, batch_sup) in enumerate(zip(train_data_unsup, train_data_sup)):
            bs = batch_unsup.data[0].shape[0]
            
            data_unsup = gluon.utils.split_and_load(batch_unsup.data[0], ctx_list=ctx, batch_axis=0)
            label_unsup = gluon.utils.split_and_load(batch_unsup.label[0], ctx_list=ctx, batch_axis=0)
            data_sup = gluon.utils.split_and_load(batch_sup.data[0], ctx_list=ctx, batch_axis=0)
            label_sup = gluon.utils.split_and_load(batch_sup.label[0], ctx_list=ctx, batch_axis=0)
            
            loss = []
            outputs_unsup = []
            outputs_sup = []
            
            with autograd.record():
                for xuns, yuns, xsup, ysup in zip(data_unsup, label_unsup, data_sup, label_sup):
                    output_unsup, xhat_unsup, loss_mm_unsup, rpn_unsup = net(xuns)
                    loss_drm_unsup = L2_loss(xhat_unsup, xuns)
                    softmax_unsup = nd.softmax(output_unsup)
                    loss_kl_unsup = -nd.sum(nd.log(1000.0*softmax_unsup + 1e-8) * softmax_unsup, axis=1)
                    loss_unsup = opt.alpha_drm * loss_drm_unsup + opt.alpha_kl * loss_kl_unsup + opt.alpha_mm * loss_mm_unsup + opt.alpha_pn * rpn_unsup

                    output_sup, xhat_sup, loss_mm_sup, rpn_sup = net(xsup, ysup)
                    loss_xentropy_sup = criterion(output_sup, ysup)
                    loss_drm_sup = L2_loss(xhat_sup, xsup)
                    softmax_sup = nd.softmax(output_sup)
                    loss_kl_sup = -nd.sum(nd.log(1000.0*softmax_sup + 1e-8) * softmax_sup, axis=1)
                    loss_sup = loss_xentropy_sup + opt.alpha_drm * loss_drm_sup + opt.alpha_kl * loss_kl_sup + opt.alpha_mm * loss_mm_sup + opt.alpha_pn * rpn_sup
    
                    loss.append(loss_unsup + loss_sup)
                    outputs_sup.append(output_sup)
                    outputs_unsup.append(output_unsup)
                    
            for l in loss:
                l.backward()
                
            trainer.step(bs)
            
            Tracer()()
            
            acc_top1.update(label_sup, outputs_sup)
            acc_top5.update(label_sup, outputs_sup)
            train_loss += sum([l.sum().asscalar() for l in loss])
            num_batch += 1
            if log_interval and not (i + 1) % log_interval:
                _, top1 = acc_top1.get()
                _, top5 = acc_top5.get()
                print('Epoch[%d] Batch [%d]     Speed: %f samples/sec   top1-acc=%f     top5-acc=%f'%(
                          epoch, i, batch_size*log_interval/(time.time()-btic), top1, top5))
                btic = time.time()
        
        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        train_loss /= num_batch * batch_size
        writer.add_scalars('acc', {'train_top1': top1}, epoch)
        writer.add_scalars('acc', {'train_top5': top5}, epoch)
        
        top1_val, top5_val = test(ctx, valid_data)
        
        if top1_val > best_top1_val:
            best_top1_val = top1_val
            net.collect_params().save('%s/%s_best_top1.params'%(opt.model_dir, opt.exp_name))
        
        if top5_val > best_top5_val:
            best_top5_val = top5_val
            net.collect_params().save('%s/%s_best_top5.params'%(opt.model_dir, opt.exp_name))
        
        print('[Epoch %d] training: acc-top1=%f acc-top5=%f loss=%f'%(epoch, top1, top5, train_loss))
        print('[Epoch %d] time cost: %f'%(epoch, time.time()-tic))
        print('[Epoch %d] validation: acc-top1=%f acc-top5=%f'%(epoch, top1_val, top5_val))
        
        writer.add_scalars('acc', {'valid_top1': top1_val}, epoch)
        writer.add_scalars('acc', {'valid_top5': top5_val}, epoch)
        
        net.collect_params().save('%s/%s_epoch_%i.params'%(opt.model_dir, opt.exp_name, epoch))
    
    return best_top1_val, best_top5_val

In [15]:
num_epochs = 75
learning_rate = 0.15
weight_decay = 5e-5
lr_decay = 0.1

In [16]:
def run_train(num_exp, ctx):
    valid_acc = 0
    for i in range(num_exp):
        ### CIFAR VGG_DRM    
        model = ResNext('resnext152', cardinality=32, bottleneck_width=4, classes=1000)
        for param in model.collect_params().values():
            if param.name.find('conv') != -1 or param.name.find('dense') != -1:
                if param.name.find('weight') != -1:
                    param.initialize(init=mx.initializer.Xavier(), ctx=ctx)
                else:
                    param.initialize(init=mx.init.Zero(), ctx=ctx)
            elif param.name.find('batchnorm') != -1 or param.name.find('instancenorm') != -1:
                if param.name.find('gamma') != -1:
                    param.initialize(init=Normal(mean=1, sigma=0.02), ctx=ctx)
                else:
                    param.initialize(init=mx.init.Zero(), ctx=ctx)
            elif param.name.find('biasadder') != -1:
                param.initialize(init=mx.init.Zero(), ctx=ctx)
            else:
                param.initialize(init=mx.initializer.Xavier(), ctx=ctx)
                
        # model.hybridize()
        
        best_top1_val, best_top5_val = train(model, train_data_sup, train_data_unsup, valid_data, num_epochs, learning_rate, weight_decay, ctx, lr_decay)

In [None]:
run_train(1, ctx=ctx)

  self.debugger = Pdb(colors)


> [0;32m<ipython-input-14-87bee530da4d>[0m(80)[0;36mtrain[0;34m()[0m
[0;32m     78 [0;31m            [0mTracer[0m[0;34m([0m[0;34m)[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     79 [0;31m[0;34m[0m[0m
[0m[0;32m---> 80 [0;31m            [0macc_top1[0m[0;34m.[0m[0mupdate[0m[0;34m([0m[0mlabel_sup[0m[0;34m,[0m [0moutputs_sup[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     81 [0;31m            [0macc_top5[0m[0;34m.[0m[0mupdate[0m[0;34m([0m[0mlabel_sup[0m[0;34m,[0m [0moutputs_sup[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     82 [0;31m            [0mtrain_loss[0m [0;34m+=[0m [0msum[0m[0;34m([0m[0;34m[[0m[0ml[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0masscalar[0m[0;34m([0m[0;34m)[0m [0;32mfor[0m [0ml[0m [0;32min[0m [0mloss[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0m
[0m
