In [0]:
from fastai.vision import *

In [0]:
# Fetch the Imagewoof dataset through fastai's `untar_data`; `path` is the
# root folder the image list below is built from.
path = untar_data(URLs.IMAGEWOOF)

In [0]:
# Build the data bunch: images are read from `path`, the 'val' folder is used
# as the validation split, and labels come from the folder names.
# The transform tuple has a horizontal flip (p=0.5) in its first list and an
# empty second list (presumably train / valid transforms -- confirm against
# the fastai version in use); images are sized to 128.
# NOTE(review): `.presize(...)` is called on the object returned by
# `.databunch(...)`; confirm the installed fastai version supports that order.
data = (ImageList.from_folder(path).split_by_folder(valid='val')
            .label_from_folder().transform(([flip_lr(p=0.5)], []), size=128)
            .databunch(bs=64, num_workers=2)
            .presize(128, scale=(0.35,1))
            .normalize(imagenet_stats))

In [0]:
import torch, math
from torch.optim.optimizer import Optimizer

# RAdam + LARS
class Ralamb(Optimizer):
    """RAdam update scaled by a per-parameter, LARS-style trust ratio.

    Args:
        params: iterable of parameters or dicts defining parameter groups.
        lr: learning rate (may differ per parameter group).
        betas: coefficients for the running averages of the gradient and
            of its square.
        eps: term added to the denominator for numerical stability.
        weight_decay: decoupled weight decay factor (scaled by ``lr``).

    Per-parameter state holds ``step``, ``exp_avg``, ``exp_avg_sq`` and, for
    inspection, ``weight_norm``, ``adam_norm`` and ``trust_ratio``.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # Ring buffer caching the rectification terms so they are computed once
        # per step rather than once per parameter. Each entry is
        # [cache_key, N_sma, step_size]; step_size deliberately excludes the
        # learning rate so the cache is valid across parameter groups.
        self.buffer = [[None, None, None] for ind in range(10)]
        super(Ralamb, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Ralamb, self).__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss.

        Returns:
            The value returned by ``closure``, or None if no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('Ralamb does not support sparse gradients')

                # All arithmetic is done on an fp32 view/copy of the weights and
                # copied back into `p` at the end.
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # Decay the first and second moment running averages.
                # Keyword alpha/value replace the deprecated positional-scalar
                # overloads of add_/addcmul_.
                # m_t
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                # v_t
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                state['step'] += 1
                step = state['step']
                buffered = self.buffer[int(step % 10)]

                # The cached terms depend on the step count and the betas only.
                # The learning rate is applied below, so groups with different
                # lr no longer inherit the lr of whichever group filled the cache.
                cache_key = (step, beta1, beta2)
                if buffered[0] == cache_key:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = cache_key
                    beta2_t = beta2 ** step
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * step * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** step)
                    else:
                        step_size = 1.0 / (1 - beta1 ** step)
                    buffered[2] = step_size

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * lr)

                # NOTE(review): both norms are taken over the weights (the
                # second after weight decay), not over the Adam update, so the
                # ratio is ~1 unless the weight norm exceeds the clamp of 10.
                # Kept as-is to preserve the behaviour the recorded runs used.
                weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10)
                radam_norm = p_data_fp32.pow(2).sum().sqrt()
                if weight_norm == 0 or radam_norm == 0:
                    trust_ratio = 1
                else:
                    trust_ratio = weight_norm / radam_norm

                state['weight_norm'] = weight_norm
                state['adam_norm'] = radam_norm
                state['trust_ratio'] = trust_ratio

                # trust_ratio is either the int 1 or a 0-dim tensor; float()
                # handles both and yields a plain Python scalar multiplier.
                step_scale = -step_size * lr * float(trust_ratio)

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=step_scale)
                else:
                    # Variance rectification not yet tractable: plain
                    # momentum step without the adaptive denominator.
                    p_data_fp32.add_(exp_avg, alpha=step_scale)

                p.data.copy_(p_data_fp32)

        return loss

In [0]:
def Over9000(params, alpha=0.5, k=6, *args, **kwargs):
    """Build a Ralamb optimizer over `params` and wrap it in Lookahead.

    `alpha` and `k` are passed to Lookahead; any remaining positional and
    keyword arguments are passed to Ralamb.
    """
    inner_opt = Ralamb(params, *args, **kwargs)
    wrapped = Lookahead(inner_opt, alpha, k)
    return wrapped

In [0]:
# Optimizer factory handed to each Learner below: Over9000 with betas and eps
# pre-bound; remaining arguments are supplied by the caller.
opt_func=partial(Over9000, betas = (0.9,0.99), eps=1e-6)

In [0]:
from fastai.script import *
from fastai.vision import *
from fastai.callbacks import *
from fastai.distributed import *

In [0]:
# Learning rate used by both phases of the LR schedule in the training cells.
lr = 1e-2

In [10]:
import torch.nn as nn
import torch,math,sys
import torch.utils.model_zoo as model_zoo
from functools import partial
#from ...torch_core import Module
from fastai.torch_core import Module

import torch.nn.functional as F  #(uncomment if needed,but you likely already have it)


class Mish(nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""

    def __init__(self):
        super().__init__()
        # Announce construction once per instance.
        print("Mish activation loaded...")

    def forward(self, x):
        gate = torch.tanh(F.softplus(x))
        return x * gate

# Single shared activation instance used by conv_layer and ResBlock.
# It was previously assigned twice, which built a throwaway Mish module and
# printed the "loaded" message twice; one assignment is enough.
# or: ELU+init (a=0.54; gain=1.55)
act_fn = Mish()#nn.ReLU(inplace=True)

__all__ = ['MXResNet', 'mxresnet18', 'mxresnet34', 'mxresnet50', 'mxresnet101', 'mxresnet152']

class Flatten(Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        batch = x.size(0)
        return x.view(batch, -1)

def init_cnn(m):
    """Recursively initialise `m` and all of its descendants.

    Any module with a non-None `bias` gets that bias set to 0; Conv2d and
    Linear weights are re-drawn with `kaiming_normal_`. Other weights
    (e.g. BatchNorm scale) are left untouched.
    """
    bias = getattr(m, 'bias', None)
    if bias is not None:
        nn.init.constant_(bias, 0)
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight)
    for child in m.children():
        init_cnn(child)

def conv(ni, nf, ks=3, stride=1, bias=False):
    """Return a Conv2d from `ni` to `nf` channels with padding of `ks//2`."""
    half_kernel = ks // 2
    return nn.Conv2d(ni, nf,
                     kernel_size=ks,
                     stride=stride,
                     padding=half_kernel,
                     bias=bias)

def noop(x):
    """Identity: hand back the argument unchanged."""
    return x

def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
    """Conv2d -> BatchNorm2d, optionally followed by the shared `act_fn`.

    The BatchNorm weight is initialised to 0 when `zero_bn` is set and to 1
    otherwise.
    """
    norm = nn.BatchNorm2d(nf)
    gamma = 0. if zero_bn else 1.
    nn.init.constant_(norm.weight, gamma)
    modules = [conv(ni, nf, ks, stride=stride), norm]
    if act:
        modules += [act_fn]
    return nn.Sequential(*modules)

class ResBlock(Module):
    """Residual block: a conv path added to a shortcut, then `act_fn`.

    With `expansion == 1` the conv path is two 3x3 conv layers; otherwise it
    is a 1x1 -> 3x3 -> 1x1 stack. The last conv layer of either path has a
    zero-initialised BatchNorm weight and no activation.
    """

    # No explicit super().__init__() call here -- presumably fastai's Module
    # takes care of it; confirm before swapping the base class.
    def __init__(self, expansion, ni, nh, stride=1):
        nf,ni = nh*expansion,ni*expansion
        if expansion == 1:
            layers = [
                conv_layer(ni, nh, 3, stride=stride),
                conv_layer(nh, nf, 3, zero_bn=True, act=False),
            ]
        else:
            layers = [
                conv_layer(ni, nh, 1),
                conv_layer(nh, nh, 3, stride=stride),
                conv_layer(nh, nf, 1, zero_bn=True, act=False),
            ]
        self.convs = nn.Sequential(*layers)
        # TODO: check whether act=True works better
        # Shortcut: identity when channel counts match, else a 1x1 conv layer.
        if ni == nf:
            self.idconv = noop
        else:
            self.idconv = conv_layer(ni, nf, 1, act=False)
        # Shortcut is average-pooled only when the block uses a stride != 1.
        if stride == 1:
            self.pool = noop
        else:
            self.pool = nn.AvgPool2d(2, ceil_mode=True)

    def forward(self, x):
        main_path = self.convs(x)
        shortcut = self.idconv(self.pool(x))
        return act_fn(main_path + shortcut)

def filt_sz(recep):
    """Largest power of two not exceeding 0.75 * `recep`, capped at 64."""
    exponent = math.floor(math.log2(recep * 0.75))
    power_of_two = 2 ** exponent
    return min(64, power_of_two)

class MXResNet(nn.Sequential):
    """Sequential ResNet-style network: 3-conv stem, max-pool, residual stages
    and a pooled linear head.

    Args:
        expansion: channel expansion factor passed to every ResBlock.
        layers: number of ResBlocks in each stage (at most four stages, given
            the five entries of the channel table).
        c_in: number of input channels.
        c_out: number of outputs of the final Linear layer.
    """

    def __init__(self, expansion, layers, c_in=3, c_out=1000):
        sizes = [c_in,32,64,64]  #modified per Grankin
        # Three conv layers; only the first one uses stride 2.
        stem = [conv_layer(sizes[i], sizes[i+1], stride=(2 if i==0 else 1))
                for i in range(3)]

        block_szs = [64//expansion,64,128,256,512]
        blocks = []
        for i, n_blocks in enumerate(layers):
            # The first stage keeps resolution; later stages use stride 2.
            stage_stride = 1 if i==0 else 2
            blocks.append(self._make_layer(expansion, block_szs[i], block_szs[i+1],
                                           n_blocks, stage_stride))

        head = [nn.AdaptiveAvgPool2d(1), Flatten(),
                nn.Linear(block_szs[-1]*expansion, c_out)]
        super().__init__(
            *stem,
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            *blocks,
            *head,
        )
        init_cnn(self)

    def _make_layer(self, expansion, ni, nf, blocks, stride):
        """Stack `blocks` ResBlocks; only the first gets `ni` and `stride`."""
        stage = []
        for i in range(blocks):
            is_first = (i == 0)
            stage.append(ResBlock(expansion,
                                  ni if is_first else nf,
                                  nf,
                                  stride if is_first else 1))
        return nn.Sequential(*stage)

def mxresnet(expansion, n_layers, name, pretrained=False, **kwargs):
    """Construct an MXResNet; extra keyword arguments go to its constructor.

    `name` is accepted but not used by the current body. No pretrained weights
    are loaded: when `pretrained` is truthy a notice is printed and the freshly
    initialised model is returned anyway.
    """
    net = MXResNet(expansion, n_layers, **kwargs)
    if not pretrained:
        return net
    #model.load_state_dict(model_zoo.load_url(model_urls[name]))
    print("No pretrained yet for MXResNet")
    return net

# Register one constructor per depth on the current module, so that names such
# as `mxresnet50` become callable. Each table row is
# [depth, expansion, blocks-per-stage]; the constructor is `mxresnet` with
# those values (and the generated name) pre-bound via `partial`.
me = sys.modules[__name__]
for n,e,l in [
    [ 18 , 1, [2,2,2 ,2] ],
    [ 34 , 1, [3,4,6 ,3] ],
    [ 50 , 4, [3,4,6 ,3] ],
    [ 101, 4, [3,4,23,3] ],
    [ 152, 4, [3,8,36,3] ],
]:
    name = f'mxresnet{n}'
    setattr(me, name, partial(mxresnet, expansion=e, n_layers=l, name=name))

Mish activation loaded...
Mish activation loaded...


In [0]:
  
import torch.nn as nn
import torch.nn.functional as F  #(uncomment if needed,but you likely already have it)

#Mish - "Mish: A Self Regularized Non-Monotonic Neural Activation Function"
#https://arxiv.org/abs/1908.08681v1
#implemented for PyTorch / FastAI by lessw2020 
#github: https://github.com/lessw2020/mish

class Mish(nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        gate = torch.tanh(F.softplus(x))
        return x * gate

In [0]:
import itertools as it
from torch.optim import Optimizer, Adam

class Lookahead(Optimizer):
    """Lookahead wrapper around another optimizer.

    The wrapped optimizer updates the parameters on every step. Every `k`
    steps, a separately kept copy of each parameter (the "slow" weights) is
    moved a fraction `alpha` towards the current value, and the parameter is
    reset to that slow weight.

    Args:
        base_optimizer: the optimizer performing the per-step updates; its
            `param_groups` are shared with this wrapper.
        alpha: slow-weight interpolation factor, in [0, 1].
        k: number of base-optimizer steps between slow-weight updates (>= 1).

    Raises:
        ValueError: if `alpha` or `k` is out of range.
    """

    def __init__(self, base_optimizer,alpha=0.5, k=6):
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f'Invalid slow update rate: {alpha}')
        if not 1 <= k:
            raise ValueError(f'Invalid lookahead steps: {k}')
        self.optimizer = base_optimizer
        # Share (not copy) the groups so hyper-parameter changes made through
        # this wrapper reach the base optimizer.
        self.param_groups = self.optimizer.param_groups
        self.alpha = alpha
        self.k = k
        for group in self.param_groups:
            group["step_counter"] = 0
        # Detached snapshot of every parameter, laid out like param_groups.
        self.slow_weights = [[p.clone().detach() for p in group['params']]
                                for group in self.param_groups]

        for w in it.chain(*self.slow_weights):
            w.requires_grad = False

    def step(self, closure=None):
        """Run one base-optimizer step and, every `k` steps, sync slow weights.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is forwarded to the base optimizer.

        Returns:
            Whatever the base optimizer's `step` returns.
        """
        # Forward the closure instead of calling it here: the base optimizer
        # then evaluates it exactly once and its loss is actually returned
        # (previously the loss was overwritten by a closure-less step()).
        loss = self.optimizer.step(closure)
        for group,slow_weights in zip(self.param_groups,self.slow_weights):
            group['step_counter'] += 1
            if group['step_counter'] % self.k != 0:
                continue
            for p,q in zip(group['params'],slow_weights):
                if p.grad is None:
                    continue
                # slow += alpha * (fast - slow); keyword alpha replaces the
                # deprecated add_(scalar, tensor) overload.
                q.data.add_(p.data - q.data, alpha=self.alpha)
                p.data.copy_(q.data)
        return loss

In [14]:
res = []  # accuracy of each run, appended by the validate cells below
num_epoch=20
learn = Learner(data, mxresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
# Two-phase LR schedule: hold `lr` for the first 70% of all iterations, then
# anneal it with `annealing_cos` over the remainder.
n = len(learn.data.train_dl)
# Derive the schedule length from num_epoch (previously a hard-coded 20) so
# the schedule always spans exactly the iterations that fit() will run.
total_iters = n*num_epoch
anneal_start = int(total_iters*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(total_iters - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.023704,2.017615,0.382,0.826,02:29
1,1.770728,1.832983,0.398,0.882,02:30
2,1.610263,1.530681,0.558,0.918,02:30
3,1.463128,1.397101,0.612,0.956,02:30
4,1.357713,1.426165,0.604,0.928,02:30
5,1.272313,1.200831,0.702,0.956,02:30
6,1.189963,1.251887,0.692,0.946,02:30
7,1.142661,1.39886,0.65,0.936,02:30
8,1.098895,1.058892,0.77,0.966,02:30
9,1.059684,1.146692,0.732,0.974,02:30


In [0]:

# Validate the model just trained and record its accuracy (as a numpy value)
# in `res` for the summary at the end.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [16]:
# Run 2: fresh model, same configuration as the first run.
learn = Learner(data, mxresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
# Two-phase LR schedule: hold `lr` for the first 70% of all iterations, then
# anneal it with `annealing_cos` over the remainder.
n = len(learn.data.train_dl)
# Derive the schedule length from num_epoch (previously a hard-coded 20) so
# the schedule always spans exactly the iterations that fit() will run.
total_iters = n*num_epoch
anneal_start = int(total_iters*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(total_iters - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.059485,2.109135,0.304,0.784,02:30
1,1.804411,3.343117,0.198,0.714,02:30
2,1.620854,1.460293,0.57,0.936,02:30
3,1.463698,1.473934,0.58,0.914,02:30
4,1.366542,1.385709,0.62,0.954,02:30
5,1.257933,1.177579,0.726,0.966,02:30
6,1.213137,1.279688,0.658,0.966,02:30
7,1.158054,1.38202,0.632,0.942,02:30
8,1.103974,1.10371,0.744,0.974,02:31
9,1.08037,1.067681,0.76,0.97,02:30


In [0]:

# Validate the model just trained and record its accuracy (as a numpy value)
# in `res` for the summary at the end.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [18]:
# Run 3: fresh model, same configuration as the first run.
learn = Learner(data, mxresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
# Two-phase LR schedule: hold `lr` for the first 70% of all iterations, then
# anneal it with `annealing_cos` over the remainder.
n = len(learn.data.train_dl)
# Derive the schedule length from num_epoch (previously a hard-coded 20) so
# the schedule always spans exactly the iterations that fit() will run.
total_iters = n*num_epoch
anneal_start = int(total_iters*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(total_iters - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.081718,3.171293,0.142,0.75,02:30
1,1.800678,2.803348,0.298,0.77,02:30
2,1.655875,1.48086,0.566,0.934,02:30
3,1.48125,1.415093,0.592,0.946,02:30
4,1.381016,1.551865,0.56,0.894,02:30
5,1.285699,1.203709,0.72,0.96,02:30
6,1.221502,1.224392,0.712,0.958,02:30
7,1.161773,1.528642,0.572,0.93,02:31
8,1.119028,1.093334,0.764,0.98,02:30
9,1.057888,1.106549,0.748,0.972,02:31


In [0]:
# Validate the model just trained and record its accuracy (as a numpy value)
# in `res` for the summary at the end.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [21]:
# Run 4: fresh model, same configuration as the first run.
learn = Learner(data, mxresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
# Two-phase LR schedule: hold `lr` for the first 70% of all iterations, then
# anneal it with `annealing_cos` over the remainder.
n = len(learn.data.train_dl)
# Derive the schedule length from num_epoch (previously a hard-coded 20) so
# the schedule always spans exactly the iterations that fit() will run.
total_iters = n*num_epoch
anneal_start = int(total_iters*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(total_iters - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.05541,1.986028,0.308,0.798,02:30
1,1.810345,2.042499,0.316,0.846,02:30
2,1.642179,1.60298,0.512,0.916,02:30
3,1.474641,1.607989,0.502,0.922,02:30
4,1.349392,1.691946,0.526,0.908,02:30
5,1.260427,1.255649,0.688,0.954,02:30
6,1.208528,1.331222,0.638,0.954,02:30
7,1.147038,1.274948,0.672,0.966,02:30
8,1.112309,1.022265,0.768,0.978,02:31
9,1.078243,1.126799,0.744,0.972,02:30


In [0]:
# Validate the model just trained and record its accuracy (as a numpy value)
# in `res` for the summary at the end.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [23]:
# Run 5: fresh model, same configuration as the first run.
learn = Learner(data, mxresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
# Two-phase LR schedule: hold `lr` for the first 70% of all iterations, then
# anneal it with `annealing_cos` over the remainder.
n = len(learn.data.train_dl)
# Derive the schedule length from num_epoch (previously a hard-coded 20) so
# the schedule always spans exactly the iterations that fit() will run.
total_iters = n*num_epoch
anneal_start = int(total_iters*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(total_iters - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.021558,2.11593,0.33,0.834,02:30
1,1.745855,2.678405,0.262,0.846,02:30
2,1.588238,1.493388,0.562,0.93,02:30
3,1.447646,1.649811,0.536,0.896,02:30
4,1.338014,1.447098,0.56,0.942,02:30
5,1.258696,1.189628,0.724,0.96,02:30
6,1.187186,1.490146,0.612,0.924,02:30
7,1.129792,1.245394,0.694,0.966,02:30
8,1.119588,1.04079,0.75,0.974,02:30
9,1.072259,1.108689,0.736,0.978,02:30


In [0]:
# Validate the model just trained and record its accuracy (as a numpy value)
# in `res` for the summary at the end.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [25]:
# Mean of the accuracies collected in `res` across the runs.
np.mean(res)

0.84639996

In [26]:
# Standard deviation of the accuracies collected in `res` across the runs.
np.std(res)

0.006621162

In [27]:
# Show the individual per-run accuracies.
res

[array(0.852, dtype=float32),
 array(0.842, dtype=float32),
 array(0.854, dtype=float32),
 array(0.836, dtype=float32),
 array(0.848, dtype=float32)]