In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import numpy

In [3]:
from fastai.script import *
from fastai.vision import *
from fastai.callbacks import *
from fastai.distributed import *
from fastprogress import fastprogress
from torchvision.models import *
from fastai.vision.models.xresnet import *
from fastai.vision.models.xresnet2 import *
from fastai.vision.models.presnet import *

In [4]:
torch.backends.cudnn.benchmark = True

# XResNet baseline

In [5]:
#https://github.com/fastai/fastai_docs/blob/master/dev_course/dl2/11_train_imagenette.ipynb

In [6]:
def noop(x):
    "Identity function: hand `x` back untouched."
    return x

class Flatten(nn.Module):
    "Collapse every dimension after the first into a single one (via `view`)."
    def forward(self, x):
        batch = x.size(0)
        return x.view(batch, -1)

def conv(ni, nf, ks=3, stride=1, bias=False):
    "Build an `nn.Conv2d` from `ni` to `nf` channels with padding `ks//2`."
    pad = ks // 2
    return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=pad, bias=bias)

In [7]:
# Single in-place ReLU module, shared by `conv_layer` (appended to its layers) and `ResBlock.forward`.
act_fn = nn.ReLU(inplace=True)

def init_cnn(m):
    "Recursively zero every non-`None` `bias` and Kaiming-normal initialise `Conv2d`/`Linear` weights."
    bias = getattr(m, 'bias', None)
    if bias is not None:
        nn.init.constant_(bias, 0)
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight)
    # Descend into every direct child so the whole module tree is visited.
    for child in m.children():
        init_cnn(child)

def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
    "Conv -> BatchNorm (-> shared `act_fn` when `act`); `zero_bn` initialises the BatchNorm weight to 0."
    bn = nn.BatchNorm2d(nf)
    bn_init = 0. if zero_bn else 1.
    nn.init.constant_(bn.weight, bn_init)
    modules = [conv(ni, nf, ks, stride=stride), bn]
    if act:
        modules.append(act_fn)
    return nn.Sequential(*modules)

In [8]:
class ResBlock(nn.Module):
    "Residual block: two 3x3 convs when `expansion == 1`, otherwise a 1x1 / 3x3 / 1x1 bottleneck."
    def __init__(self, expansion, ni, nh, stride=1):
        super().__init__()
        # Input and output channel counts are both scaled by `expansion`.
        nf, ni = nh*expansion, ni*expansion
        if expansion == 1:
            branch = [
                conv_layer(ni, nh, 3, stride=stride),
                conv_layer(nh, nf, 3, zero_bn=True, act=False),
            ]
        else:
            branch = [
                conv_layer(ni, nh, 1),
                conv_layer(nh, nh, 3, stride=stride),
                conv_layer(nh, nf, 1, zero_bn=True, act=False),
            ]
        self.convs = nn.Sequential(*branch)
        # Shortcut path: 1x1 conv only when the channel count changes, average pooling only when strided.
        self.idconv = conv_layer(ni, nf, 1, act=False) if ni != nf else noop
        self.pool = nn.AvgPool2d(2, ceil_mode=True) if stride != 1 else noop

    def forward(self, x):
        main = self.convs(x)
        return act_fn(main + self.idconv(self.pool(x)))

In [9]:
class XResNet(nn.Sequential):
    "Sequential XResNet: three-conv stem, max-pool, one residual stage per entry of `layers`, pooled linear head."
    @classmethod
    def create(cls, expansion, layers, c_in=3, c_out=1000):
        "Build and initialise the network; `layers[i]` is the number of `ResBlock`s in stage `i`."
        stem_nfs = [c_in, (c_in+1)*8, 64, 64]
        stem = []
        for i in range(3):
            # Only the first stem conv is strided.
            stem.append(conv_layer(stem_nfs[i], stem_nfs[i+1], stride=2 if i == 0 else 1))

        nfs = [64//expansion, 64, 128, 256, 512]
        stages = []
        for i, n_blocks in enumerate(layers):
            # Every stage except the first downsamples by 2.
            stage_stride = 1 if i == 0 else 2
            stages.append(cls._make_layer(expansion, nfs[i], nfs[i+1],
                                          n_blocks=n_blocks, stride=stage_stride))

        model = cls(
            *stem,
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            *stages,
            nn.AdaptiveAvgPool2d(1), Flatten(),
            nn.Linear(nfs[-1]*expansion, c_out),
        )
        init_cnn(model)
        return model

    @staticmethod
    def _make_layer(expansion, ni, nf, n_blocks, stride):
        "Stack `n_blocks` `ResBlock`s; only the first one takes `ni` inputs and applies `stride`."
        blocks = []
        for i in range(n_blocks):
            is_first = i == 0
            blocks.append(ResBlock(expansion, ni if is_first else nf, nf, stride if is_first else 1))
        return nn.Sequential(*blocks)

In [10]:
def xresnet18(**kwargs):
    "XResNet with expansion 1 and stage depths [2, 2, 2, 2]."
    return XResNet.create(1, [2, 2, 2, 2], **kwargs)

def xresnet34(**kwargs):
    "XResNet with expansion 1 and stage depths [3, 4, 6, 3]."
    return XResNet.create(1, [3, 4, 6, 3], **kwargs)

def xresnet50(**kwargs):
    "XResNet with expansion 4 and stage depths [3, 4, 6, 3]."
    return XResNet.create(4, [3, 4, 6, 3], **kwargs)

def xresnet101(**kwargs):
    "XResNet with expansion 4 and stage depths [3, 4, 23, 3]."
    return XResNet.create(4, [3, 4, 23, 3], **kwargs)

def xresnet152(**kwargs):
    "XResNet with expansion 4 and stage depths [3, 8, 36, 3]."
    return XResNet.create(4, [3, 8, 36, 3], **kwargs)

# XResNet with Self Attention

In [23]:
# Restyled (behavior unchanged) from https://github.com/fastai/fastai/blob/5c51f9eabf76853a89a9bc5741804d2ed4407e49/fastai/layers.py
def conv1d(ni:int, no:int, ks:int=1, stride:int=1, padding:int=0, bias:bool=False):
    "Build a Kaiming-normal initialised `nn.Conv1d` (bias zeroed when present) wrapped in `spectral_norm`."
    layer = nn.Conv1d(ni, no, ks, stride=stride, padding=padding, bias=bias)
    nn.init.kaiming_normal_(layer.weight)
    if bias:
        layer.bias.data.zero_()
    return spectral_norm(layer)



# Adapted from SelfAttention layer at https://github.com/fastai/fastai/blob/5c51f9eabf76853a89a9bc5741804d2ed4407e49/fastai/layers.py
# Inspired by https://arxiv.org/pdf/1805.08318.pdf
class SimpleSelfAttention(nn.Module):
    "Attention-style layer computing `gamma * (x @ x^T) @ conv(x) + x` over the flattened trailing dimensions."

    def __init__(self, n_in:int, ks=1):#, n_out:int):
        super().__init__()
        self.n_in = n_in
        # Bias-free, spectrally normalised 1d conv that maps `n_in` channels to `n_in` channels.
        self.conv = conv1d(n_in, n_in, ks, padding=ks//2, bias=False)
        # Learnable scale starting at 0, so the layer initially returns its input unchanged.
        self.gamma = nn.Parameter(tensor([0.]))

    def forward(self,x):
        """Apply the layer to `x` and return a tensor with the same size as `x`.

        `x` is reshaped to (size[0], size[1], -1); the conv requires size[1] == `n_in`.
        """
        # symmetry hack: replace the conv weight by the average of itself and its transpose.
        # The views require the weight to hold exactly n_in*n_in elements, i.e. `ks == 1`.
        # NOTE(review): `spectral_norm` appears to recompute `weight` from `weight_orig` in a
        # forward pre-hook, which would overwrite this assignment before the conv runs —
        # confirm whether the hack has any effect.
        c = self.conv.weight.view(self.n_in,self.n_in)
        c = (c + c.t())/2
        self.conv.weight = c.view(self.n_in,self.n_in,1)

        size = x.size()
        x = x.view(*size[:2],-1)                              # (B, C, N)

        # Matrix products are associative, so x @ (x^T @ conv(x)) == (x @ x^T) @ conv(x).
        # The second form keeps the intermediate at (B, C, C) instead of (B, N, N),
        # which matters when N (number of positions) is much larger than C.
        convx = self.conv(x)                                  # (B, C, N)
        xxT = torch.bmm(x, x.permute(0,2,1).contiguous())     # (B, C, C)
        o = self.gamma * torch.bmm(xxT, convx) + x            # (B, C, N)

        return o.view(*size).contiguous()
        

In [24]:
#unmodified from https://github.com/fastai/fastai/blob/9b9014b8967186dc70c65ca7dcddca1a1232d99d/fastai/vision/models/xresnet.py

# Redefinition of `conv` (same behavior as the one in the XResNet baseline section).
def conv(ni, nf, ks=3, stride=1, bias=False):
    "Return an `nn.Conv2d` mapping `ni` to `nf` channels, padded by `ks//2`."
    half_kernel = ks // 2
    return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=half_kernel, bias=bias)

# Redefinition of `noop` (same behavior as the one in the XResNet baseline section).
def noop(x):
    "Return the argument as is."
    return x

# Redefinition of `conv_layer` (same behavior as the one in the XResNet baseline section).
def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
    "Sequential of conv and BatchNorm, plus the shared `act_fn` when `act`; `zero_bn` starts the BatchNorm weight at 0."
    bn = nn.BatchNorm2d(nf)
    nn.init.constant_(bn.weight, 0. if zero_bn else 1.)
    parts = [conv(ni, nf, ks, stride=stride), bn]
    if act:
        parts.append(act_fn)
    return nn.Sequential(*parts)

In [25]:
# Modified from https://github.com/fastai/fastai/blob/9b9014b8967186dc70c65ca7dcddca1a1232d99d/fastai/vision/models/xresnet.py
# Added self attention
class ResBlock(nn.Module):
    "Residual block (two 3x3 convs when `expansion == 1`, else bottleneck) with optional `SimpleSelfAttention` on the conv branch."
    def __init__(self, expansion, ni, nh, stride=1, sa=False):
        super().__init__()
        # Input and output channel counts are both scaled by `expansion`.
        nf, ni = nh*expansion, ni*expansion
        if expansion == 1:
            branch = [
                conv_layer(ni, nh, 3, stride=stride),
                conv_layer(nh, nf, 3, zero_bn=True, act=False),
            ]
        else:
            branch = [
                conv_layer(ni, nh, 1),
                conv_layer(nh, nh, 3, stride=stride),
                conv_layer(nh, nf, 1, zero_bn=True, act=False),
            ]

        # Attention layer (or identity) applied to the conv branch output before the residual sum.
        self.sa = SimpleSelfAttention(nf, ks=1) if sa else noop

        self.convs = nn.Sequential(*branch)
        # Shortcut path: 1x1 conv only when the channel count changes, average pooling only when strided.
        self.idconv = conv_layer(ni, nf, 1, act=False) if ni != nf else noop
        self.pool = nn.AvgPool2d(2, ceil_mode=True) if stride != 1 else noop

    def forward(self, x):
        attended = self.sa(self.convs(x))
        return act_fn(attended + self.idconv(self.pool(x)))
        

In [26]:
# Modified from https://github.com/fastai/fastai/blob/9b9014b8967186dc70c65ca7dcddca1a1232d99d/fastai/vision/models/xresnet.py
# Added self attention

class XResNet_sa(nn.Sequential):
    "XResNet variant: the last block of stage `len(layers) - 4` is built with self-attention enabled."
    @classmethod
    def create(cls, expansion, layers, c_in=3, c_out=1000):
        "Build and initialise the network; `layers[i]` is the number of `ResBlock`s in stage `i`."
        stem_nfs = [c_in, (c_in+1)*8, 64, 64]
        stem = []
        for i in range(3):
            # Only the first stem conv is strided.
            stem.append(conv_layer(stem_nfs[i], stem_nfs[i+1], stride=2 if i == 0 else 1))

        nfs = [64//expansion, 64, 128, 256, 512]
        sa_stage = len(layers) - 4
        stages = []
        for i, n_blocks in enumerate(layers):
            stages.append(cls._make_layer(expansion, nfs[i], nfs[i+1],
                                          n_blocks=n_blocks, stride=1 if i == 0 else 2,
                                          sa=(i == sa_stage)))

        model = cls(
            *stem,
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            *stages,
            nn.AdaptiveAvgPool2d(1), Flatten(),
            nn.Linear(nfs[-1]*expansion, c_out),
        )
        init_cnn(model)
        return model

    @staticmethod
    def _make_layer(expansion, ni, nf, n_blocks, stride, sa = False):
        "Stack `n_blocks` `ResBlock`s; the first takes `ni` inputs and `stride`, only the last receives `sa`."
        last = n_blocks - 1
        blocks = []
        for i in range(n_blocks):
            is_first = i == 0
            blocks.append(ResBlock(expansion, ni if is_first else nf, nf,
                                   stride if is_first else 1,
                                   sa if i == last else False))
        return nn.Sequential(*blocks)

In [27]:
def xresnet50_sa(**kwargs):
    "`XResNet_sa` with expansion 4 and stage depths [3, 4, 6, 3]."
    return XResNet_sa.create(4, [3, 4, 6, 3], **kwargs)

# Data loading

In [28]:
#https://github.com/fastai/fastai/blob/master/examples/train_imagenette.py

def get_data(size, woof, bs, workers=None):
    "Fetch Imagewoof (`woof` truthy) or Imagenette and return a normalised DataBunch at image size `size`."
    # Archive variant by requested size: `_160` up to 128, `_320` up to 224, otherwise the unsuffixed one.
    dataset = 'IMAGEWOOF' if woof else 'IMAGENETTE'
    if size <= 128:
        variant = dataset + '_160'
    elif size <= 224:
        variant = dataset + '_320'
    else:
        variant = dataset
    path = untar_data(getattr(URLs, variant))

    n_gpus = num_distrib() or 1
    if workers is None:
        # Default: at most 8 workers, sharing the CPUs between the distributed processes.
        workers = min(8, num_cpus()//n_gpus)

    labelled = ImageList.from_folder(path).split_by_folder(valid='val').label_from_folder()
    # Random horizontal flip on the training set, no transform on the validation set.
    transformed = labelled.transform(([flip_lr(p=0.5)], []), size=size)
    bunch = transformed.databunch(bs=bs, num_workers=workers)
    return bunch.presize(size, scale=(0.35,1)).normalize(imagenet_stats)

# Train

In [29]:
# Adam factory with betas=(0.9, 0.99) and eps=1e-6, passed as `opt_func` to every `Learner` below.
opt_func = partial(optim.Adam, betas=(0.9,0.99), eps=1e-6)

## Imagewoof

### Image size = 256

In [30]:
# 256 is above both size thresholds in `get_data`, so its final `else` branch (unsuffixed archive) is used.
image_size = 256
# Batch size is given here as a literal; the `bs` variable is only assigned in a later cell.
data = get_data(image_size,woof =True,bs=64)

#### Epochs = 5

In [31]:
# we use the same parameters for baseline and new model
epochs = 5
lr = 3e-3
# NOTE(review): `bs` is not read by any cell shown here; the DataBunch above was built with the literal bs=64.
bs = 64
# 0 is falsy, so the `if mixup:` guards below leave mixup disabled.
mixup = 0

##### Baseline

In [32]:
m = xresnet50(c_out=10)

In [21]:
# Baseline learner: same optimiser, metrics and loss settings as the learner built in `do_cycle`.
learn = Learner(
    data, m,
    wd=1e-2, opt_func=opt_func,
    metrics=[accuracy, top_k_accuracy],
    bn_wd=False, true_wd=True,
    loss_func=LabelSmoothingCrossEntropy(),
)

In [22]:
if mixup: learn = learn.mixup(alpha=mixup)

In [23]:
learn = learn.to_fp16(dynamic=True)

In [24]:
# NOTE(review): this identical cell appears five times against the same `learn` name, yet each log restarts
# at a similarly low first-epoch accuracy, so the model/learner cells were presumably re-run in between —
# confirm, or wrap the baseline in a helper like `do_cycle` so every run visibly starts from a fresh model.
learn.fit_one_cycle(epochs, lr, div_factor=10, pct_start=0.3)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.187593,2.313087,0.23,0.682,00:54
1,1.935577,2.094558,0.308,0.832,00:49
2,1.759941,1.733822,0.436,0.902,00:49
3,1.531103,1.475231,0.572,0.95,00:50
4,1.368547,1.361501,0.622,0.964,00:49


In [29]:
learn.fit_one_cycle(epochs, lr, div_factor=10, pct_start=0.3)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.199389,2.135389,0.28,0.77,00:48
1,1.99785,1.96902,0.344,0.844,00:48
2,1.823346,1.901781,0.374,0.864,00:48
3,1.603943,1.608806,0.502,0.93,00:48
4,1.465439,1.456332,0.576,0.96,00:48


In [41]:
learn.fit_one_cycle(epochs, lr, div_factor=10, pct_start=0.3)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.154308,2.246162,0.302,0.764,00:48
1,1.919446,1.925215,0.32,0.832,00:48
2,1.733867,1.723924,0.462,0.892,00:48
3,1.526224,1.61998,0.542,0.922,00:48
4,1.38823,1.369903,0.63,0.956,00:48


In [48]:
learn.fit_one_cycle(epochs, lr, div_factor=10, pct_start=0.3)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.142307,2.298228,0.258,0.73,00:48
1,1.91846,1.881806,0.406,0.864,00:48
2,1.720017,1.803558,0.386,0.884,00:48
3,1.50266,1.593556,0.516,0.94,00:48
4,1.359015,1.35341,0.626,0.954,00:48


In [53]:
learn.fit_one_cycle(epochs, lr, div_factor=10, pct_start=0.3)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.12797,2.116206,0.284,0.79,00:48
1,1.917599,1.971122,0.394,0.852,00:48
2,1.721118,2.203537,0.32,0.826,00:48
3,1.509157,1.545533,0.544,0.942,00:48
4,1.351729,1.338784,0.648,0.958,00:48


In [42]:
# Final-epoch accuracy (in %) of the baseline runs.
# NOTE(review): the last four entries match the final-epoch accuracies logged above (57.6, 63.0, 62.6, 64.8),
# but the first logged run ended at 62.2 and no 62.2 appears in this list — confirm whether the sixth
# entry (61.8) should be 62.2 or whether that run was re-executed after the list was written.
results = [61.8,64.8,57.4,62.4,63,61.8, 57.6,63,62.6, 64.8]    #included some from previous notebook iteration

In [43]:
# Mean, standard deviation, min and max of the baseline accuracies.
# NOTE(review): the import cells only contain `import numpy`; `np` presumably comes from one of the
# fastai star imports — confirm, or import it explicitly.
np.mean(results), np.std(results), np.min(results), np.max(results)

(61.919999999999995, 2.4235511135521772, 57.4, 64.8)

##### New model

In [40]:
def do_cycle():
    "Train a freshly created `xresnet50_sa` for `epochs` epochs with the same settings as the baseline cells."
    # Run a garbage collection pass before allocating the new model.
    gc.collect()
    m = xresnet50_sa(c_out=10)
    learn = Learner(data, m, wd=1e-2, opt_func=opt_func,
                    metrics=[accuracy,top_k_accuracy],
                    bn_wd=False, true_wd=True,
                    loss_func = LabelSmoothingCrossEntropy())
    if mixup: learn = learn.mixup(alpha=mixup)
    learn = learn.to_fp16(dynamic=True)
    # Use the shared `epochs` setting (previously a hard-coded 5) so baseline and new model stay in sync.
    learn.fit_one_cycle(epochs, lr, div_factor=10, pct_start=0.3)

In [41]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.128679,2.462333,0.288,0.748,01:17
1,1.827118,2.270838,0.32,0.792,01:18
2,1.622725,1.699327,0.464,0.908,01:19
3,1.405163,1.480093,0.578,0.942,01:18
4,1.264791,1.241489,0.7,0.968,01:19


In [42]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.148315,2.282931,0.25,0.72,01:18
1,1.922272,1.88446,0.352,0.872,01:19
2,1.705846,1.714496,0.458,0.906,01:18
3,1.465296,1.552459,0.568,0.934,01:18
4,1.315377,1.314569,0.67,0.964,01:18


In [43]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.14409,2.249005,0.272,0.788,01:18
1,1.865969,2.265085,0.3,0.818,01:18
2,1.641836,1.625244,0.51,0.91,01:19
3,1.430338,1.341905,0.638,0.968,01:18
4,1.26867,1.251989,0.68,0.974,01:18


In [44]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.127926,2.14728,0.26,0.754,01:18
1,1.863186,2.599874,0.3,0.794,01:18
2,1.625333,1.650801,0.458,0.932,01:18
3,1.416107,1.374374,0.626,0.956,01:18
4,1.261172,1.23523,0.706,0.97,01:19


In [45]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.156996,2.558034,0.262,0.71,01:19
1,1.872568,1.97439,0.35,0.84,01:18
2,1.665857,1.737017,0.438,0.894,01:19
3,1.427912,1.46179,0.6,0.962,01:18
4,1.290767,1.24904,0.69,0.974,01:19


In [46]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.170368,2.135584,0.262,0.776,01:18
1,1.878662,2.434813,0.27,0.766,01:18
2,1.657103,1.876308,0.386,0.892,01:19
3,1.451584,1.406772,0.596,0.958,01:18
4,1.301576,1.296676,0.66,0.968,01:19


In [47]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.108457,3.455038,0.178,0.716,01:18
1,1.822983,1.925351,0.364,0.87,01:19
2,1.631994,1.618349,0.476,0.936,01:19
3,1.405918,1.416708,0.578,0.954,01:20
4,1.281371,1.260469,0.692,0.966,01:19


In [48]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.147905,2.182101,0.286,0.758,01:20
1,1.879727,1.944445,0.312,0.826,01:19
2,1.677079,1.646459,0.478,0.926,01:20
3,1.471212,1.457453,0.586,0.944,01:19
4,1.305898,1.28115,0.678,0.966,01:19


In [49]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.13512,2.576595,0.19,0.716,01:19
1,1.887473,1.856529,0.36,0.886,01:19
2,1.651269,1.676584,0.48,0.896,01:19
3,1.458867,1.364787,0.636,0.958,01:20
4,1.293055,1.271542,0.664,0.974,01:19


In [50]:
do_cycle()

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.160044,2.183353,0.284,0.772,01:19
1,1.868882,1.98873,0.318,0.812,01:18
2,1.64828,1.690857,0.506,0.914,01:18
3,1.430153,1.371021,0.62,0.958,01:18
4,1.292977,1.264984,0.672,0.964,01:19


In [61]:
# Final-epoch accuracies of the ten `do_cycle()` runs above, converted from fractions to percent.
results = [
    fraction * 100
    for fraction in [0.7, 0.67, 0.68, 0.706, 0.69, 0.66, 0.692, 0.678, 0.664, 0.672]
]

In [62]:
np.mean(results), np.std(results), np.min(results), np.max(results)

(68.12, 1.4593149077563732, 66.0, 70.6)

In [46]:
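# with no symmetry (presumably the symmetry hack in `SimpleSelfAttention.forward` disabled); the call below is commented out, so the output shown is from an earlier execution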
#np.mean(results), np.std(results), np.min(results), np.max(results)

(67.64, 1.724644890984808, 65.6, 70.6)