# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/04_acceleration.ipynb.
# %% auto 0
__all__ = ['SGD', 'Momentum', 'RMSProp', 'Adam', 'SchedulerS', 'conv_conn', 'ResBlock', 'resnet', 'ModelMonitorS', 'AugmentS']
# %% ../nbs/04_acceleration.ipynb 2
import torchvision.transforms.functional as TF
import torch
import torch.nn as nn
import torch.nn.functional as F
from operator import attrgetter
from functools import partial
import fastcore.all as fc
import math
import torcheval.metrics as tem
import matplotlib.pyplot as plt
import random
import numpy as np
from .learner import Subscriber, CancelFitException  # CancelFitException assumed to live in .learner, next to Subscriber
from .activations import conv_block, Hook  # Hook assumed to live in .activations, next to conv_block
# %% ../nbs/04_acceleration.ipynb 3
class SGD:
    def __init__(self, params, lr, wd=0.):
        self.params = list(params)
        self.lr = lr
        self.wd = wd
        self.i = 0  # step counter, used by Adam's bias correction

    def step(self): # this is the method that gets called by the Learner
        with torch.no_grad():
            for p in self.params:
                self.reg_step(p) # first apply regularization
                self.opt_step(p) # then do the actual step
        self.i += 1 # one increment per optimizer step, not per parameter

    def opt_step(self, p):
        p -= p.grad * self.lr # regular step

    def reg_step(self, p):
        if self.wd != 0: # only regularize when the weight decay parameter is set
            p *= 1 - self.lr*self.wd # decoupled weight decay: shrink the weights directly

    def zero_grad(self):
        for p in self.params:
            p.grad.data.zero_()
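# A minimal sketch of driving one of these optimizers by hand, outside any
# Learner. The model, data, and loss below are stand-ins, not part of this
# module; only torch is assumed.
def _sgd_usage_sketch():
    model = nn.Linear(4, 2)
    opt = SGD(model.parameters(), lr=0.1, wd=1e-2)
    x, y = torch.randn(8, 4), torch.randn(8, 2)
    for _ in range(3):
        loss = F.mse_loss(model(x), y)
        loss.backward()
        opt.step()       # reg_step (weight decay), then opt_step, per parameter
        opt.zero_grad()
    return model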
# %% ../nbs/04_acceleration.ipynb 4
class Momentum(SGD):
    def __init__(self, params, lr, wd=0., mom=0.9):
        super().__init__(params, lr=lr, wd=wd)
        self.mom = mom

    def opt_step(self, p):
        if not hasattr(p, 'grad_avg'): p.grad_avg = torch.zeros_like(p.grad)
        p.grad_avg = p.grad_avg*self.mom + p.grad*(1-self.mom)
        p -= self.lr * p.grad_avg
# %% ../nbs/04_acceleration.ipynb 5
class RMSProp(SGD):
    def __init__(self, params, lr, wd=0., sqr_mom=0.99, eps=1e-5):
        super().__init__(params, lr=lr, wd=wd)
        self.sqr_mom = sqr_mom
        self.eps = eps

    def opt_step(self, p):
        if not hasattr(p, 'sqr_avg'):
            p.sqr_avg = p.grad**2
        p.sqr_avg = p.sqr_avg*self.sqr_mom + (1-self.sqr_mom)*p.grad**2
        p -= self.lr * p.grad/(p.sqr_avg.sqrt() + self.eps)
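# Both subclasses keep exponential moving averages stored on the parameter
# itself. A small numeric sketch of the warm-up difference: Momentum starts
# its average at zero, so early steps are damped, while RMSProp seeds
# sqr_avg with the first grad**2, so its first step already has magnitude
# close to lr. Torch only; the tensors below are stand-ins.
def _ema_warmup_sketch():
    a = torch.zeros(1, requires_grad=True); a.grad = torch.tensor([0.5])
    b = torch.zeros(1, requires_grad=True); b.grad = torch.tensor([0.5])
    Momentum([a], lr=0.1).step()  # grad_avg starts at 0: step is lr*(1-mom)*g = 0.005
    RMSProp([b], lr=0.1).step()   # sqr_avg seeded with g**2: step is ~lr = 0.1
    return a.item(), b.item()     # ~(-0.005, -0.1)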
# %% ../nbs/04_acceleration.ipynb 6
class Adam(SGD):
    def __init__(self, params, lr, wd=0., beta1=0.9, beta2=0.99, eps=1e-5):
        super().__init__(params, lr=lr, wd=wd)
        self.beta1,self.beta2,self.eps = beta1,beta2,eps

    def opt_step(self, p):
        if not hasattr(p, 'avg'):
            p.avg = torch.zeros_like(p.grad.data)
            p.sqr_avg = torch.zeros_like(p.grad.data)
        p.avg = self.beta1*p.avg + (1-self.beta1)*p.grad
        unbias_avg = p.avg / (1 - (self.beta1**(self.i+1)))
        p.sqr_avg = self.beta2*p.sqr_avg + (1-self.beta2)*(p.grad**2)
        unbias_sqr_avg = p.sqr_avg / (1 - (self.beta2**(self.i+1)))
        p -= self.lr * unbias_avg / (unbias_sqr_avg + self.eps).sqrt()
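# A quick numeric check of the bias correction: with a constant gradient g,
# unbias_avg == g and unbias_sqr_avg == g**2 at every step, so each update
# has magnitude ~lr regardless of g's scale. A minimal sketch, torch only.
def _adam_bias_correction_sketch():
    p = torch.zeros(1, requires_grad=True)
    p.grad = torch.tensor([0.5])
    opt = Adam([p], lr=0.1)
    for _ in range(3): opt.step()  # each step moves p by ~ -lr = -0.1
    return p.item()                # ~ -0.3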
# %% ../nbs/04_acceleration.ipynb 7
class SchedulerS(Subscriber):
    def __init__(self, scheduler_class):
        self.scheduler_class = scheduler_class

    # initialize the scheduler instance after the optimizer has been initialized
    def before_fit(self, learn):
        self.scheduler = self.scheduler_class(learn.opt)

    # step the scheduler after the optimizer has stepped
    def after_step(self, learn):
        self.scheduler.step()
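# A runnable sketch of the contract SchedulerS relies on: scheduler_class is
# any callable that takes the optimizer and returns an object with .step().
# Note that torch's schedulers require a torch.optim.Optimizer (they use
# optimizer.param_groups), so one is used here rather than the SGD above;
# the SimpleNamespace is a stand-in for the real Learner.
def _scheduler_sketch():
    from types import SimpleNamespace
    model = nn.Linear(4, 2)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
    learn = SimpleNamespace(opt=opt)  # stand-in exposing only .opt
    sub = SchedulerS(partial(torch.optim.lr_scheduler.OneCycleLR, max_lr=6e-2, total_steps=100))
    sub.before_fit(learn)
    for _ in range(100):
        opt.step()
        sub.after_step(learn)  # learning rate follows the one-cycle shape
    return opt.param_groups[0]['lr']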
# %% ../nbs/04_acceleration.ipynb 8
def conv_conn(in_c, out_c, kernel_size=3, stride=2):
    return nn.Sequential(
        conv_block(in_c, out_c, kernel_size=kernel_size, stride=1, act=True, norm=True),
        conv_block(out_c, out_c, kernel_size=kernel_size, stride=stride, act=False, norm=True)
    )
# %% ../nbs/04_acceleration.ipynb 9
class ResBlock(nn.Module):
    def __init__(self, in_c, out_c, stride=2):
        super().__init__()
        self.in_c = in_c
        self.out_c = out_c
        self.stride = stride
        self.conv_conn = conv_conn(in_c, out_c, stride=stride)
        self.identity_conn = conv_block(in_c, out_c, kernel_size=1, stride=1, act=False, norm=False)
        self.pooling = nn.AvgPool2d(2, ceil_mode=True)
        self.relu = nn.ReLU()

    def forward(self, x):
        y_conv = self.conv_conn(x)
        # 1x1 conv only when the channel count changes; pool whenever the conv
        # path is strided, so the two branches always agree in shape
        y_id = x if self.in_c == self.out_c else self.identity_conn(x)
        if self.stride != 1: y_id = self.pooling(y_id)
        return self.relu(y_conv + y_id)
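# Shape sketch: the residual add requires both branches to agree. With the
# default stride of 2 the conv path halves the grid (with ceil), which the
# pooled identity path matches. Assumes conv_block pads kernel_size//2, as
# the grid sizes annotated in resnet() below imply.
def _resblock_shape_sketch():
    x = torch.randn(16, 8, 28, 28)
    block = ResBlock(8, 16)            # stride=2: 28x28 -> 14x14
    assert block(x).shape == (16, 16, 14, 14)
    same = ResBlock(8, 8, stride=1)    # identity branch is x itself
    assert same(x).shape == x.shape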
# %% ../nbs/04_acceleration.ipynb 10
def resnet():
    return nn.Sequential(               # pixel grid input: 28x28
        ResBlock(1,   8, stride=1),     # 28x28
        ResBlock(8,  16),               # 14x14
        ResBlock(16, 32),               # 7x7
        ResBlock(32, 64),               # 4x4
        ResBlock(64, 128),              # 2x2
        ResBlock(128, 256),             # 1x1
        nn.Flatten(),                   # flatten to 256 features
        nn.Linear(256, 10, bias=False), # linear layer mapping to the 10 classes
        nn.BatchNorm1d(10)              # final batchnorm layer
    )
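# A quick smoke test of the whole stack on MNIST-shaped input; torch only,
# the random batch is a stand-in for real data.
def _resnet_shape_sketch():
    model = resnet()
    x = torch.randn(32, 1, 28, 28)     # batch of single-channel 28x28 images
    assert model(x).shape == (32, 10)  # one score per class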
# %% ../nbs/04_acceleration.ipynb 11
class ModelMonitorS(Subscriber):
    def __init__(self, modules): self.modules = modules

    def before_fit(self, learn):
        self.hooks = [Hook(i, module, partial(self.record_stats, learn)) for i, module in enumerate(self.modules)]

    def record_stats(self, learn, hook, layer, inp, outp):
        if learn.model.training:
            hook.nparams = sum(submodule.numel() for submodule in layer.parameters())
            if isinstance(layer, ResBlock):
                # MACs per conv: K × K × Cin × Hout × Wout × Cout
                # source: https://machinethink.net/blog/how-fast-is-my-model/
                mac_conv1 = 9 * layer.in_c * inp[0].shape[2] * inp[0].shape[3] * layer.out_c
                mac_conv2 = 9 * layer.out_c * outp.shape[2] * outp.shape[3] * layer.out_c
                hook.mac = (mac_conv1 + mac_conv2) / 1e6
                if layer.stride != 1:
                    # add the 1x1 identity conv
                    hook.mac += (layer.in_c * outp.shape[2] * outp.shape[3] * layer.out_c / 1e6)
            else:
                hook.mac = hook.nparams / 1e6
            hook.batch_size = inp[0].shape[0]
            hook.in_shape = list(inp[0].shape[1:])
            hook.out_shape = list(outp.shape[1:])

    def after_batch(self, learn):
        for h in self.hooks: h.remove()
        raise CancelFitException # only run this for a single batch, then cancel the fit

    def __repr__(self):
        out = f'{"layer":<20} : {"input":<20} : {"output":<20} : {"# params":>10} : {"# MACs":>10}\n'
        total_params = 0
        total_mac = 0
        for h in self.hooks:
            out += f'{h.layer_name:<20} : {str(h.in_shape):<20} : {str(h.out_shape):<20} : {h.nparams:>10d} : {h.mac:10.1f}\n'
            total_params += h.nparams
            total_mac += h.mac
        return f'{"Total parameters:":<20}{total_params:>10d} \n{"Total MACs:":<20}{total_mac:10.1f} \n\n' + out
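# Usage sketch, left commented out because it leans on the Learner from
# .learner, whose exact constructor signature is an assumption here: attach
# the monitor to the blocks of interest, fit for one batch (after_batch
# cancels the rest), then print the subscriber for the per-layer table.
# model = resnet()
# monitor = ModelMonitorS(model[:6])   # the six ResBlocks
# learn = Learner(model, dls, F.cross_entropy, SGD, subs=[monitor])  # hypothetical signature
# learn.fit(1)                         # cancelled after one batch
# print(monitor)                       # shapes, parameter counts, MACs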
# %% ../nbs/04_acceleration.ipynb 12
class AugmentS(Subscriber):
    def __init__(self, transform):
        self.transform = transform

    def before_batch(self, learn):
        if learn.model.training: # augmentations are only applied to the training data
            learn.batch[0] = self.transform(learn.batch[0])
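# Usage sketch: any callable over a batched tensor works as the transform.
# torchvision's tensor transforms accept (N, C, H, W) input, so a random
# crop with padding can be applied to the whole training batch at once.
# The SimpleNamespace is a stand-in for the real Learner.
def _augment_sketch():
    import torchvision.transforms as T
    from types import SimpleNamespace
    sub = AugmentS(T.RandomCrop(28, padding=1))
    batch = [torch.randn(32, 1, 28, 28), torch.zeros(32, dtype=torch.long)]
    learn = SimpleNamespace(model=resnet().train(), batch=batch)
    sub.before_batch(learn)  # replaces the inputs with their augmented version
    assert learn.batch[0].shape == (32, 1, 28, 28)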