In [0]:
from fastai.vision import *

In [0]:
# Local root folder of the Imagewoof dataset, as returned by fastai's `untar_data` for that URL.
path = untar_data(URLs.IMAGEWOOF)

In [0]:
# Build the DataBunch used by every run below:
#   - images are read from the folders under `path`, validation set = the 'val' folder
#   - labels come from the folder names
#   - training transform: left-right flip with p=0.5; no validation transforms; size 128
#   - batches of 64 loaded with 2 workers
#   - presize to 128 with scale range (0.35, 1), then normalise with `imagenet_stats`
data = (ImageList.from_folder(path).split_by_folder(valid='val')
            .label_from_folder().transform(([flip_lr(p=0.5)], []), size=128)
            .databunch(bs=64, num_workers=2)
            .presize(128, scale=(0.35,1))
            .normalize(imagenet_stats))

In [0]:
import torch, math
from torch.optim.optimizer import Optimizer

# RAdam + LARS
class Ralamb(Optimizer):
    """RAdam-style optimizer with a per-parameter trust ratio ("RAdam + LARS").

    Args:
        params: iterable of parameters or of param-group dicts.
        lr: learning rate (may differ per param group).
        betas: ``(beta1, beta2)`` decay rates of the first and second moment
            running averages.
        eps: constant added to ``sqrt(exp_avg_sq)`` in the rectified branch.
        weight_decay: when non-zero, weights are scaled by
            ``1 - weight_decay * lr`` before the update.

    Per-parameter state holds ``step``, ``exp_avg`` and ``exp_avg_sq``, plus
    the most recent ``weight_norm``, ``adam_norm`` and ``trust_ratio``.

    Raises:
        RuntimeError: from ``step`` if a parameter has a sparse gradient.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # Ring of 10 cache slots: [key, N_sma, lr-free step scale].
        # Must exist before Optimizer.__init__ runs, as in the original code.
        self.buffer = [[None, None, None] for ind in range(10)]
        super(Ralamb, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Ralamb, self).__setstate__(state)

    def step(self, closure=None):
        """Perform one optimisation step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss; it is called once, before any update.

        Returns:
            The value returned by ``closure``, or ``None`` without one.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('Ralamb does not support sparse gradients')

                # All arithmetic is done on an fp32 copy and copied back at the end.
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # m_t: running average of the gradient.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                # v_t: running average of the squared gradient.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                state['step'] += 1
                buffered = self.buffer[int(state['step'] % 10)]

                # The cached values depend on the step count and the betas only.
                # The learning rate is deliberately NOT cached: the buffer is
                # shared by all param groups, so caching an lr-scaled step would
                # apply the first group's lr to every other group.
                cache_key = (state['step'], beta1, beta2)
                if buffered[0] == cache_key:
                    N_sma, step_scale = buffered[1], buffered[2]
                else:
                    buffered[0] = cache_key
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_scale = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        step_scale = 1.0 / (1 - beta1 ** state['step'])
                    buffered[2] = step_scale

                radam_step = group['lr'] * step_scale

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                # NOTE(review): `radam_norm` is the norm of the (decayed) weights,
                # not of the pending update, so the ratio below is effectively
                # clamp(||w||, 0, 10) / ||w||. Left unchanged on purpose because
                # altering it changes the optimiser's behaviour - confirm intent.
                weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10)
                radam_norm = p_data_fp32.pow(2).sum().sqrt()
                if weight_norm == 0 or radam_norm == 0:
                    trust_ratio = 1
                else:
                    trust_ratio = weight_norm / radam_norm

                state['weight_norm'] = weight_norm
                state['adam_norm'] = radam_norm
                state['trust_ratio'] = trust_ratio

                # Collapse to a Python float so the keyword overloads accept it
                # whether `trust_ratio` is the int 1 or a 0-dim tensor.
                scaled_step = float(-radam_step * trust_ratio)

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=scaled_step)
                else:
                    p_data_fp32.add_(exp_avg, alpha=scaled_step)

                p.data.copy_(p_data_fp32)

        return loss

In [0]:
def Over9000(params, alpha=0.5, k=6, *args, **kwargs):
    """Build a Ralamb optimizer over `params` and wrap it in Lookahead.

    `alpha` and `k` are handed to Lookahead; every remaining positional and
    keyword argument is forwarded to Ralamb.
    """
    return Lookahead(Ralamb(params, *args, **kwargs), alpha, k)

In [0]:
# Optimizer factory passed to each Learner: Over9000 with betas=(0.9, 0.99) and eps=1e-6 pre-bound.
opt_func=partial(Over9000, betas = (0.9,0.99), eps=1e-6)

In [0]:
from fastai.script import *
from fastai.vision import *
from fastai.callbacks import *
from fastai.distributed import *

In [0]:
# Learning rate fed to both phases of the 'lr' schedule in the training cells.
lr = 1e-2

In [0]:
import torch.nn as nn
import torch,math,sys
import torch.utils.model_zoo as model_zoo
from functools import partial
#from ...torch_core import Module
from fastai.torch_core import Module

import torch.nn.functional as F  #(uncomment if needed,but you likely already have it)


class Mish(nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise.

    Constructing an instance prints a one-line notice to stdout.
    """

    def __init__(self):
        super().__init__()
        print("Mish activation loaded...")

    def forward(self, x):
        """Return `x` multiplied by ``tanh(softplus(x))``."""
        gate = torch.tanh(F.softplus(x))
        return x * gate

# or: ELU+init (a=0.54; gain=1.55)
# Single activation module instance; conv_layer and ResBlock in this cell both reference it.
act_fn = Mish()#nn.ReLU(inplace=True)

# Names this cell is meant to export (the mxresnetNN entries are created by the loop at the end of the cell).
__all__ = ['MXResNet', 'mxresnet18', 'mxresnet34', 'mxresnet50', 'mxresnet101', 'mxresnet152']

# or: ELU+init (a=0.54; gain=1.55)
# NOTE(review): duplicate of the assignment above - it builds a second Mish, which is why
# "Mish activation loaded..." appears twice in this cell's output.
act_fn = Mish() #nn.ReLU(inplace=True)

class Flatten(Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        """Return a view of `x` shaped ``(x.size(0), -1)``."""
        batch = x.size(0)
        return x.view(batch, -1)

def init_cnn(m):
    """Initialise `m` and, recursively, all of its child modules in place.

    Any module with a non-None `bias` gets that bias set to 0; Conv2d and
    Linear modules additionally get Kaiming-normal weights.
    """
    bias = getattr(m, 'bias', None)
    if bias is not None:
        nn.init.constant_(bias, 0)
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight)
    for child in m.children():
        init_cnn(child)

def conv(ni, nf, ks=3, stride=1, bias=False):
    """Return an nn.Conv2d from `ni` to `nf` channels with padding ``ks // 2``."""
    half_kernel = ks // 2
    return nn.Conv2d(
        ni,
        nf,
        kernel_size=ks,
        stride=stride,
        padding=half_kernel,
        bias=bias,
    )

def noop(x):
    """Identity function: hand back the argument untouched."""
    return x

def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
    """Return ``Sequential(conv, BatchNorm2d[, act_fn])``.

    The batch-norm weight is set to 0 when `zero_bn` is true and to 1
    otherwise; the shared `act_fn` module is appended only when `act` is true.
    """
    bn = nn.BatchNorm2d(nf)
    bn_weight = 0. if zero_bn else 1.
    nn.init.constant_(bn.weight, bn_weight)
    modules = [conv(ni, nf, ks, stride=stride), bn]
    if act:
        modules += [act_fn]
    return nn.Sequential(*modules)

class ResBlock(Module):
    """Residual block: ``act_fn(convs(x) + idconv(pool(x)))``.

    With ``expansion == 1`` the main path is two 3x3 conv layers; otherwise
    it is a 1x1 -> 3x3 -> 1x1 stack. The last conv layer of either path has a
    zero-initialised batch-norm weight and no activation.
    """

    def __init__(self, expansion, ni, nh, stride=1):
        # Both the input and output widths are scaled by the expansion factor.
        nf, ni = nh * expansion, ni * expansion
        if expansion == 1:
            layers = [
                conv_layer(ni, nh, 3, stride=stride),
                conv_layer(nh, nf, 3, zero_bn=True, act=False),
            ]
        else:
            layers = [
                conv_layer(ni, nh, 1),
                conv_layer(nh, nh, 3, stride=stride),
                conv_layer(nh, nf, 1, zero_bn=True, act=False),
            ]
        self.convs = nn.Sequential(*layers)

        # Shortcut path: a 1x1 conv layer only when the channel count changes.
        # TODO: check whether act=True works better
        if ni == nf:
            self.idconv = noop
        else:
            self.idconv = conv_layer(ni, nf, 1, act=False)

        # Average-pool the shortcut only when the block is strided.
        if stride == 1:
            self.pool = noop
        else:
            self.pool = nn.AvgPool2d(2, ceil_mode=True)

    def forward(self, x):
        """Sum the main path and the shortcut, then apply `act_fn`."""
        shortcut = self.idconv(self.pool(x))
        return act_fn(self.convs(x) + shortcut)

def filt_sz(recep):
    """Largest power of two not exceeding ``0.75 * recep``, capped at 64."""
    exponent = math.floor(math.log2(recep * 0.75))
    return min(64, 2 ** exponent)

class MXResNet(nn.Sequential):
    """Sequential network: 3-layer conv stem, max-pool, residual stages, pooled linear head.

    Args:
        expansion: channel expansion factor passed to every ResBlock.
        layers: number of ResBlocks in each stage.
        c_in: number of input channels.
        c_out: number of outputs of the final linear layer.
    """

    def __init__(self, expansion, layers, c_in=3, c_out=1000):
        sizes = [c_in, 32, 64, 64]  # modified per Grankin
        # Only the first stem layer is strided.
        stem = [
            conv_layer(n_in, n_out, stride=2 if idx == 0 else 1)
            for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:]))
        ]

        block_szs = [64 // expansion, 64, 128, 256, 512]
        blocks = []
        for idx, n_blocks in enumerate(layers):
            # The first stage keeps the resolution; every later stage uses stride 2.
            stage_stride = 1 if idx == 0 else 2
            blocks.append(
                self._make_layer(expansion, block_szs[idx], block_szs[idx + 1],
                                 n_blocks, stage_stride)
            )

        head_in = block_szs[-1] * expansion
        super().__init__(
            *stem,
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            *blocks,
            nn.AdaptiveAvgPool2d(1), Flatten(),
            nn.Linear(head_in, c_out),
        )
        init_cnn(self)

    def _make_layer(self, expansion, ni, nf, blocks, stride):
        """Return `blocks` ResBlocks in sequence; only the first takes `ni` and `stride`."""
        stage = []
        for idx in range(blocks):
            is_first = idx == 0
            stage.append(
                ResBlock(expansion, ni if is_first else nf, nf, stride if is_first else 1)
            )
        return nn.Sequential(*stage)

def mxresnet(expansion, n_layers, name, pretrained=False, **kwargs):
    """Construct an MXResNet; extra keyword arguments go to its constructor.

    `name` is accepted but not used by the current body. No weights are
    loaded: asking for `pretrained` only prints a notice.
    """
    net = MXResNet(expansion, n_layers, **kwargs)
    if not pretrained:
        return net
    # Loading from model_zoo is disabled; there is nothing to load yet.
    print("No pretrained yet for MXResNet")
    return net

# Register one constructor per depth on the current module: each row is
# (depth, expansion, blocks-per-stage), and `mxresnet{depth}` becomes a partial
# of `mxresnet` with those values bound.
# NOTE: `n`, `e`, `l` and `name` stay behind as notebook globals after the loop;
# `n` is reassigned later by the training cells.
me = sys.modules[__name__]
for n,e,l in [
    [ 18 , 1, [2,2,2 ,2] ],
    [ 34 , 1, [3,4,6 ,3] ],
    [ 50 , 4, [3,4,6 ,3] ],
    [ 101, 4, [3,4,23,3] ],
    [ 152, 4, [3,8,36,3] ],
]:
    name = f'mxresnet{n}'
    setattr(me, name, partial(mxresnet, expansion=e, n_layers=l, name=name))

Mish activation loaded...
Mish activation loaded...


In [0]:
  
import torch.nn as nn
import torch.nn.functional as F  #(uncomment if needed,but you likely already have it)

#Mish - "Mish: A Self Regularized Non-Monotonic Neural Activation Function"
#https://arxiv.org/abs/1908.08681v1
#implemented for PyTorch / FastAI by lessw2020 
#github: https://github.com/lessw2020/mish

class Mish(nn.Module):
    """Element-wise Mish activation, ``x * tanh(softplus(x))``."""

    def forward(self, x):
        """Return `x` multiplied by ``tanh(softplus(x))``."""
        gate = torch.tanh(F.softplus(x))
        return x * gate

In [0]:
import itertools as it
from torch.optim import Optimizer, Adam

class Lookahead(Optimizer):
    """Lookahead wrapper around another optimizer.

    The wrapped optimizer updates the parameters on every step. Every `k`
    steps (counted per param group), a stored copy of each parameter is moved
    toward the current value by a fraction `alpha`, and the parameter is then
    overwritten with that copy.

    Args:
        base_optimizer: the optimizer performing the per-step updates.
        alpha: interpolation factor in [0, 1] for the stored copies.
        k: number of inner steps between synchronisations (>= 1).

    Raises:
        ValueError: if `alpha` or `k` is out of range.
    """

    def __init__(self, base_optimizer,alpha=0.5, k=6):
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f'Invalid slow update rate: {alpha}')
        if not 1 <= k:
            raise ValueError(f'Invalid lookahead steps: {k}')
        self.optimizer = base_optimizer
        # Optimizer.__init__ is not called; the wrapper shares the wrapped
        # optimizer's containers so attribute access such as `.state`
        # (needed by `state_dict()`) and `.defaults` keeps working.
        self.param_groups = self.optimizer.param_groups
        self.state = self.optimizer.state
        self.defaults = self.optimizer.defaults
        self.alpha = alpha
        self.k = k
        for group in self.param_groups:
            group["step_counter"] = 0
        self.slow_weights = [[p.clone().detach() for p in group['params']]
                                for group in self.param_groups]

        for w in it.chain(*self.slow_weights):
            w.requires_grad = False

    def step(self, closure=None):
        """Run one inner step, then synchronise any group that has reached `k` steps.

        Args:
            closure: optional loss closure; it is forwarded to the wrapped
                optimizer so that it is evaluated exactly once.

        Returns:
            Whatever the wrapped optimizer's ``step`` returns.
        """
        # Previously the closure was evaluated here and its loss then
        # overwritten by the inner step() result, so callers always got None.
        loss = self.optimizer.step(closure)
        for group,slow_weights in zip(self.param_groups,self.slow_weights):
            group['step_counter'] += 1
            if group['step_counter'] % self.k != 0:
                continue
            for p,q in zip(group['params'],slow_weights):
                if p.grad is None:
                    continue
                # q <- q + alpha * (p - q), then p <- q
                q.data.add_(p.data - q.data, alpha=self.alpha)
                p.data.copy_(q.data)
        return loss

    def zero_grad(self, *args, **kwargs):
        """Clear gradients via the wrapped optimizer (this wrapper is never Optimizer-initialised)."""
        return self.optimizer.zero_grad(*args, **kwargs)

In [13]:
# Run 1 of 5. `res` collects the final validation accuracy of every run.
res = []
num_epoch=20
# NOTE(review): the model is fastai's `models.xresnet50`, not the MXResNet defined earlier - confirm intent.
learn = Learner(data, models.xresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
n = len(learn.data.train_dl)
# Flat lr for the first 70% of all iterations, then cosine annealing for the rest.
# The schedule length is derived from `num_epoch` (previously a hard-coded 20)
# so it cannot drift from the number of epochs actually trained.
anneal_start = int(n*num_epoch*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(n*num_epoch - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.146302,2.223222,0.28,0.752,02:06
1,1.842076,2.114983,0.35,0.768,02:06
2,1.685652,1.592352,0.512,0.922,02:05
3,1.538648,1.827383,0.444,0.924,02:06
4,1.456481,2.143965,0.344,0.846,02:06
5,1.347925,1.288381,0.644,0.956,02:06
6,1.28733,1.329307,0.622,0.964,02:06
7,1.212908,1.628311,0.524,0.934,02:06
8,1.172403,1.094053,0.758,0.976,02:06
9,1.132418,1.180146,0.738,0.96,02:06


In [0]:

# Validate the model trained in the previous cell and record its accuracy for the summary.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [15]:
# Run 2 of 5: fresh model, same configuration (`num_epoch` comes from the first training cell).
learn = Learner(data, models.xresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
n = len(learn.data.train_dl)
# Flat lr for the first 70% of all iterations, then cosine annealing; the length
# follows `num_epoch` (previously a hard-coded 20) so it matches `learn.fit`.
anneal_start = int(n*num_epoch*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(n*num_epoch - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.147192,2.248813,0.28,0.774,02:06
1,1.884788,2.852487,0.288,0.756,02:07
2,1.703208,1.634853,0.48,0.924,02:07
3,1.556314,1.556094,0.508,0.922,02:07
4,1.451079,1.776497,0.472,0.864,02:07
5,1.330307,1.257899,0.684,0.954,02:06
6,1.274105,1.281263,0.642,0.974,02:06
7,1.20501,1.348686,0.616,0.95,02:06
8,1.173257,1.111886,0.738,0.976,02:06
9,1.130309,1.118891,0.724,0.968,02:06


In [0]:

# Validate the model trained in the previous cell and record its accuracy for the summary.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [17]:
# Run 3 of 5: fresh model, same configuration (`num_epoch` comes from the first training cell).
learn = Learner(data, models.xresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
n = len(learn.data.train_dl)
# Flat lr for the first 70% of all iterations, then cosine annealing; the length
# follows `num_epoch` (previously a hard-coded 20) so it matches `learn.fit`.
anneal_start = int(n*num_epoch*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(n*num_epoch - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.114615,2.323089,0.254,0.766,02:06
1,1.867955,1.952459,0.396,0.786,02:06
2,1.661879,1.598179,0.498,0.932,02:06
3,1.565409,1.740468,0.456,0.91,02:05
4,1.451349,1.821241,0.43,0.916,02:06
5,1.358532,1.269572,0.688,0.952,02:06
6,1.287325,1.226601,0.664,0.966,02:06
7,1.218368,1.374674,0.63,0.962,02:06
8,1.178592,1.069834,0.752,0.972,02:05
9,1.118158,1.140674,0.736,0.97,02:06


In [0]:
# Validate the model trained in the previous cell and record its accuracy for the summary.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [19]:
# Run 4 of 5: fresh model, same configuration (`num_epoch` comes from the first training cell).
learn = Learner(data, models.xresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
n = len(learn.data.train_dl)
# Flat lr for the first 70% of all iterations, then cosine annealing; the length
# follows `num_epoch` (previously a hard-coded 20) so it matches `learn.fit`.
anneal_start = int(n*num_epoch*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(n*num_epoch - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.210266,2.254127,0.262,0.76,02:06
1,1.86158,2.128787,0.328,0.838,02:07
2,1.700174,1.567581,0.508,0.924,02:07
3,1.579693,1.799343,0.432,0.892,02:07
4,1.440089,1.878111,0.41,0.876,02:06
5,1.359647,1.322391,0.628,0.962,02:07
6,1.29127,1.447528,0.586,0.934,02:07
7,1.240058,1.316909,0.658,0.956,02:07
8,1.194371,1.163841,0.72,0.97,02:07
9,1.127161,1.400065,0.6,0.944,02:07


In [0]:
# Validate the model trained in the previous cell and record its accuracy for the summary.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [21]:
# Run 5 of 5: fresh model, same configuration (`num_epoch` comes from the first training cell).
learn = Learner(data, models.xresnet50(c_out=10), wd=1e-2, opt_func=opt_func,
               metrics=[accuracy, top_k_accuracy],
               bn_wd=False, true_wd=True,
               loss_func=LabelSmoothingCrossEntropy())
n = len(learn.data.train_dl)
# Flat lr for the first 70% of all iterations, then cosine annealing; the length
# follows `num_epoch` (previously a hard-coded 20) so it matches `learn.fit`.
anneal_start = int(n*num_epoch*0.7)
phase0 = TrainingPhase(anneal_start).schedule_hp('lr', lr)
phase1 = TrainingPhase(n*num_epoch - anneal_start).schedule_hp('lr', lr, anneal=annealing_cos)
phases = [phase0, phase1]
sched = GeneralScheduler(learn, phases)
learn.callbacks.append(sched)
learn.fit(num_epoch)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.217478,2.04185,0.27,0.786,02:06
1,1.904245,2.071724,0.342,0.83,02:07
2,1.723249,1.616565,0.468,0.932,02:07
3,1.583716,1.627288,0.526,0.918,02:06
4,1.479077,1.729208,0.464,0.892,02:07
5,1.377752,1.302867,0.63,0.964,02:07
6,1.319221,1.24734,0.684,0.956,02:07
7,1.242403,1.434739,0.636,0.94,02:07
8,1.205399,1.159465,0.746,0.964,02:07
9,1.148004,1.211858,0.706,0.962,02:07


In [0]:
# Validate the model trained in the previous cell and record its accuracy for the summary.
loss, acc, topk = learn.validate()
res.append(acc.numpy())

In [23]:
# Mean validation accuracy over the recorded runs.
np.mean(res)

0.82559997

In [24]:
# Standard deviation of the validation accuracy over the recorded runs.
np.std(res)

0.0044541988

In [25]:
# Per-run validation accuracies, in the order the runs were executed.
res

[array(0.822, dtype=float32),
 array(0.834, dtype=float32),
 array(0.826, dtype=float32),
 array(0.822, dtype=float32),
 array(0.824, dtype=float32)]