In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from dltools.callback import AvgStatsCallback, sched_lin, sched_cos, Recorder, AvgStatsCallback
from dltools.callback import ParamScheduler, combine_scheds, Callback, LearningrateFinder
from dltools.data import get_data
from dltools.databunch import DataBunch
from dltools.functions import create_learner, get_dls, get_model, get_model_func, listify
from dltools.metrics import accuracy
from dltools.runner import Runner
from torch.utils.data import TensorDataset
import torch.nn.functional as F
import torch

In [3]:
from functools import partial

In [None]:
# from fastaimanual import *

In [None]:
# !type fastaimanual.py

# Fully connected network (nb02)

## Get data

In [None]:
x_train, y_train, x_valid, y_valid = get_data()
x_train, x_train.shape, y_train, y_train.shape, y_train.min(), y_train.max()

In [None]:
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

In [None]:
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [None]:
n, m = x_train.shape
c = y_train.max()+1
n,m,c

### Check mean, std

In [None]:
x_train.mean(), x_train.std()

In [None]:
x_train.shape

In [None]:
n, m = x_train.shape
c = y_train.max()+1
n,m,c

In [None]:
assert n==y_train.shape[0]==50000
test_eq(m, 28*28)
test_eq(y_train.min(),0)
test_eq(y_train.max(),9)

In [None]:
img = x_train[0]

In [None]:
img.view(28,28).type()

In [None]:
plt.imshow(img.view((28,28)))

In [None]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

## Initial python model

### Matmul

In [None]:
weights = torch.randn(784, 10)
bias = torch.zeros(10)

In [None]:
weights, weights.shape, bias

In [None]:
def matmul(a,b):
    """Matrix product of two 2-D tensors using three nested Python loops.

    Reference (slow) implementation: every output element is accumulated
    one scalar product at a time.

    Args:
        a: tensor whose `.shape` unpacks to (rows, inner).
        b: tensor whose `.shape` unpacks to (inner, cols).

    Returns:
        A new `torch.zeros`-initialised tensor of shape (rows, cols).

    Raises:
        AssertionError: if the inner dimensions do not agree.
    """
    n_rows, inner = a.shape
    b_rows, n_cols = b.shape
    assert inner==b_rows
    out = torch.zeros(n_rows, n_cols)
    for row in range(n_rows):
        for col in range(n_cols):
            # accumulate the dot product of row `row` of a and column `col` of b
            for k in range(inner):
                out[row,col] += a[row,k] * b[k,col]
    return out

In [None]:
m1 = x_valid[:5]
m2 = weights

In [None]:
m1.shape, m2.shape

In [None]:
%time t1 = matmul(m1,m2)

In [None]:
t1.shape

In [None]:
len(x_train)

In [None]:
def matmul(a,b):
    """Matrix product where the innermost loop is replaced by an elementwise op.

    Each output element is the sum of the elementwise product of one row of
    `a` with one column of `b`.

    Args:
        a: tensor whose `.shape` unpacks to (rows, inner).
        b: tensor whose `.shape` unpacks to (inner, cols).

    Returns:
        A new tensor of shape (rows, cols).

    Raises:
        AssertionError: if the inner dimensions do not agree.
    """
    n_rows, inner = a.shape
    b_rows, n_cols = b.shape
    assert inner==b_rows
    out = torch.zeros(n_rows, n_cols)

    for row in range(n_rows):
        for col in range(n_cols):
            products = a[row,:] * b[:,col]
            out[row,col] = products.sum()
    return out

In [None]:
%timeit -n 10 _=matmul(m1, m2)

In [None]:
t1.dtype

In [None]:
def near(a,b): return torch.allclose(a, b, rtol=1e-3, atol=1e-5)
def test_near(a,b): test(a,b,near)

In [None]:
test_near(t1, matmul(m1, m2))

#### Broadcasting

In [None]:
mm = tensor([[1., 2, 3], [4,5,6], [7,8,9]]); mm

In [None]:
cc = tensor([10., 20, 30]); cc

In [None]:
2*mm

In [None]:
# Expand the 3-element vector `cc` to the shape of `mm`. The original used `c`
# (the class count computed earlier) instead of the vector defined just above.
tt = cc.expand_as(mm); tt

In [None]:
mm+tt

In [None]:
tt.storage()

In [None]:
cc, cc.shape, cc.unsqueeze(0)

#### Matmul with broadcasting

In [None]:
def matmul(a,b):
    """Matrix product computed one output row at a time using broadcasting.

    For every row of `a`, the row is turned into a column (`unsqueeze(-1)`),
    broadcast against all of `b`, and summed over dim 0.

    Args:
        a: tensor whose `.shape` unpacks to (rows, inner).
        b: tensor whose `.shape` unpacks to (inner, cols).

    Returns:
        A new tensor of shape (rows, cols).

    Raises:
        AssertionError: if the inner dimensions do not agree.
    """
    n_rows, inner = a.shape
    b_rows, n_cols = b.shape
    assert inner==b_rows
    out = torch.zeros(n_rows, n_cols)
    for row in range(n_rows):
        as_column = a[row].unsqueeze(-1)
        out[row] = (as_column * b).sum(dim=0)
    return out

In [None]:
%timeit -n 10 _=matmul(m1,m2)

In [None]:
test_near(t1, matmul(m1,m2))

#### Matmul with Pytorch op

In [None]:
%timeit -n 10 t2 = m1.matmul(m2)

In [None]:
t2 = m1@m2

In [None]:
test_near(t1, t2)

In [None]:
m1.shape, m2.shape

### Basic architecture - weight init

#### Manual kaiming init

In [None]:
# num hidden
nh = 50

In [None]:
# simplified kaiming init
w1 = torch.randn(m, nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)/math.sqrt(nh)
b2 = torch.zeros(1)

In [None]:
test_near_zero(w1.mean())
test_near_zero(1/math.sqrt(m)-w1.std())

In [None]:
# Approx. ~(0,1) from the normalization with training mean and std
x_valid.mean(), x_valid.std()

In [None]:
def lin(x, w, b):
    """Affine layer: matrix-multiply `x` by `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [None]:
t = lin(x_train, w1, b1)

In [None]:
# This should also be ~ (0,1) because kaiming init is designed to do this
t.mean(), t.std()

In [None]:
def relu(x):
    """Rectified linear unit: values below zero are clamped to zero."""
    clipped = x.clamp_min(0.)
    return clipped

In [None]:
t = relu(lin(x_train, w1, b1))

In [None]:
# This is not ~(0,1) because of the ReLU
t.mean(), t.std()

In [None]:
# Kaiming init for relu
w1 = torch.randn(m, nh) * math.sqrt(2/m)

In [None]:
w1.mean(), w1.std(), w1.std()/math.sqrt(2/m)

In [None]:
t = relu(lin(x_train, w1, b1))

In [None]:
t.mean(), t.std()

#### PyTorch kaiming for w1

In [None]:
# Torch kaiming init
w1 = torch.zeros(m, nh)
init.kaiming_normal_(w1, mode='fan_out')

In [None]:
w1.mean(), w1.std(), w1.std()/math.sqrt(2/m)

In [None]:
t = relu(lin(x_train, w1, b1))
t.mean(), t.std()

In [None]:
??init.kaiming_normal_

In [None]:
import torch.nn

In [None]:
torch.nn.Linear(m, nh).weight.shape

In [None]:
torch.nn.Linear.forward??

In [None]:
torch.nn.functional.linear??

In [None]:
torch.nn.Conv2d??

In [None]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [None]:
# Adjusted ReLU
def relu(x):
    """ReLU shifted down by 0.5: clamp at zero, then subtract 0.5."""
    rectified = x.clamp_min(0.)
    return rectified - 0.5

In [None]:
# Using kaiming init and ReLU with adjusted mean
t = relu(lin(x_train, w1, b1))
t.mean(), t.std()

In [None]:
def model(xb):
    """Two-layer network: linear -> relu -> linear.

    Uses the notebook-level weights `w1`, `b1`, `w2`, `b2` and the
    notebook-level `lin` and `relu` functions.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [None]:
%timeit -n 10 _=model(x_valid)

In [None]:
assert model(x_valid).shape == torch.Size([x_valid.shape[0], 1])

#### Full kaiming init pytorch

In [None]:
# num hidden
nh = 50

In [None]:
w1 = torch.zeros(m, nh)
init.kaiming_normal_(w1, mode='fan_out')
b1 = torch.zeros(nh)
w2 = torch.zeros(nh, 1)
init.kaiming_normal_(w2, mode='fan_out')
b2 = torch.zeros(1)

### Loss function: MSE

In [None]:
model(x_valid).shape

In [None]:
mse(model(x_valid), y_valid)

### Gradients and backward pass

In [None]:
y_train.shape

In [None]:
def mse_grad(inp, targ):
    """Store the gradient of the MSE loss w.r.t. `inp` on `inp.g`.

    The residual `inp.squeeze() - targ` is scaled by 2 / inp.shape[0] and
    given a trailing unit dimension.
    """
    residual = inp.squeeze() - targ
    inp.g = 2. * residual.unsqueeze(-1) / inp.shape[0]

In [None]:
def relu_grad(inp, out):
    """Back-propagate through ReLU: pass `out.g` only where `inp` was positive.

    The result is stored on `inp.g`.
    """
    positive_mask = (inp>0).float()
    inp.g = positive_mask * out.g

In [None]:
def lin_grad(inp, out, w, b):
    """Back-propagate through `out = inp @ w + b`.

    Reads the upstream gradient from `out.g` and stores gradients on
    `inp.g`, `w.g` and `b.g`.
    """
    upstream = out.g
    inp.g = upstream @ w.t()
    # per-sample outer product of input and upstream gradient, summed over dim 0
    per_sample = inp.unsqueeze(-1) * upstream.unsqueeze(1)
    w.g = per_sample.sum(0)
    b.g = upstream.sum(0)

In [None]:
def forward_and_backward(inp, targ):
    """Run the two-layer network forward, then fill in every `.g` gradient.

    Uses the notebook-level weights `w1`, `b1`, `w2`, `b2` and the helper
    functions `relu`, `mse`, `mse_grad`, `lin_grad`, `relu_grad`.
    Returns nothing; results are the `.g` attributes set by the helpers.
    """
    # ---- forward ----
    pre_act = inp @ w1 + b1
    act = relu(pre_act)
    out = act @ w2 + b2
    # the loss value itself is not needed for the backward pass
    loss = mse(out, targ)

    # ---- backward: same layers, reverse order ----
    mse_grad(out, targ)
    lin_grad(act, out, w2, b2)
    relu_grad(pre_act, act)
    lin_grad(inp, pre_act, w1, b1)

In [None]:
forward_and_backward(x_train, y_train)

In [None]:
x_train.g

In [None]:
torch.Tensor??

In [None]:
w1g = w1.g.clone()
b1g = b1.g.clone()
w2g = w2.g.clone()
b2g = b2.g.clone()
ig = x_train.g.clone()

Use torch.autograd to check the results

In [None]:
xt2 = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)

In [None]:
def forward(inp, targ):
    """Forward pass on the autograd-enabled copies `w12`, `b12`, `w22`, `b22`.

    Returns the value of the notebook-level `mse` between the network
    output and `targ`.
    """
    hidden = relu(inp @ w12 + b12)
    prediction = hidden @ w22 + b22
    return mse(prediction, targ)

In [None]:
loss = forward(xt2, y_train)

In [None]:
loss.backward()

In [None]:
test_near(w22.grad, w2.g)
test_near(b22.grad, b2.g)
test_near(w12.grad, w1.g)
test_near(b12.grad, b1.g)
test_near(xt2.grad, x_train.g)

## Refactor model

### Layers as classes

#### No superclass

In [None]:
class Relu():
    """Shifted ReLU layer (clamp at 0, subtract 0.5) with a manual backward pass."""

    def __call__(self, inp):
        """Apply the layer, remembering input and output for `backward`."""
        self.inp, self.out = inp, inp.clamp_min(0.)-0.5
        return self.out

    def backward(self):
        """Store on `inp.g` the upstream `out.g` masked to positive inputs."""
        positive_mask = (self.inp>0).float()
        self.inp.g = positive_mask * self.out.g

In [None]:
class Lin():
    """Affine layer `inp @ w + b` with a manual backward pass."""

    def __init__(self, w, b):
        self.w, self.b = w, b

    def __call__(self, inp):
        """Apply the layer, remembering input and output for `backward`."""
        self.inp = inp
        self.out = inp @ self.w + self.b
        return self.out

    def backward(self):
        """Set `.g` on the input, weight and bias from the upstream `out.g`."""
        upstream = self.out.g
        self.inp.g = upstream @ self.w.t()
        per_sample = self.inp.unsqueeze(-1) * upstream.unsqueeze(1)
        self.w.g = per_sample.sum(0)
        self.b.g = upstream.sum(0)

In [None]:
class Mse():
    """Mean-squared-error loss with a manual backward pass."""

    def __call__(self, inp, targ):
        """Return the mean of squared `inp.squeeze() - targ`; remember both inputs."""
        self.inp, self.targ = inp, targ
        residual = inp.squeeze() - targ
        self.out = residual.pow(2).mean()
        return self.out

    def backward(self):
        """Store the loss gradient w.r.t. the prediction on `inp.g`."""
        residual = self.inp.squeeze() - self.targ
        self.inp.g = 2. * residual.unsqueeze(-1) / self.targ.shape[0]

In [None]:
class Model():
    """Lin -> Relu -> Lin network followed by an Mse loss, with manual backprop."""

    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        """Feed `x` through every layer and return the loss against `targ`."""
        for layer in self.layers:
            x = layer(x)
        # `x` is now the output of the last layer (Lin(w2, b2)). The loss keeps
        # a reference to it and sets its gradient in `self.loss.backward()`,
        # which is why `out.g` is available for the first of the reversed layers.
        return self.loss(x, targ)

    def backward(self):
        """Back-propagate: the loss first, then the layers in reverse order."""
        self.loss.backward()
        for layer in reversed(self.layers):
            layer.backward()

In [None]:
w1.g, b1.g, w2.g, b2.g = [None]*4

In [None]:
model = Model(w1, b1, w2, b2)

In [None]:
%time loss = model(x_valid, y_valid)

In [None]:
%time loss = model(x_train, y_train)

In [None]:
%time model.backward()

#### Module superclass with Module.forward() 

In [None]:
class Module():
    """Minimal layer base class.

    Subclasses implement `forward(*args)` and `bwd(out, *args)`. Calling the
    instance records the arguments and the output so that `backward()` can
    hand both to `bwd`.
    """

    def __call__(self, *args):
        self.args = args
        result = self.forward(*args)
        self.out = result
        return result

    def forward(self):
        """Placeholder; subclasses must override."""
        raise Exception("not implemented")

    def backward(self):
        """Call the subclass `bwd` with the stored output and arguments."""
        self.bwd(self.out, *self.args)

In [None]:
class Relu(Module):
    """Shifted ReLU layer: clamp at 0, then subtract 0.5."""

    def forward(self, inp):
        # Clamp at 0 (not 0.5): this matches the gradient mask `inp>0` used in
        # `bwd` below and the other shifted-ReLU definitions in this notebook.
        return inp.clamp_min(0.) - 0.5

    def bwd(self, out, inp):
        """Store on `inp.g` the upstream `out.g` masked to positive inputs."""
        inp.g = (inp>0).float() * out.g

In [None]:
class Lin(Module):
    """Affine layer whose weight gradient is computed with `torch.einsum`."""

    def __init__(self, w, b):
        self.w, self.b = w, b

    def forward(self, inp):
        projected = inp @ self.w
        return projected + self.b

    def bwd(self, out, inp):
        """Set `.g` on the input, weight and bias from the upstream `out.g`."""
        upstream = out.g
        inp.g = upstream @ self.w.t()
        # contract over the first index `b` of both operands
        self.w.g = torch.einsum("bi, bj -> ij", inp, upstream)
        self.b.g = upstream.sum(0)

In [None]:
class Mse(Module):
    """Mean-squared-error loss built on `Module`."""

    def forward(self, inp, targ):
        residual = inp.squeeze() - targ
        return residual.pow(2).mean()

    def bwd(self, out, inp, targ):
        """Store the loss gradient w.r.t. the prediction on `inp.g`."""
        residual = inp.squeeze() - targ
        inp.g = 2. * residual.unsqueeze(-1) / targ.shape[0]

In [None]:
class Model():
    """Lin -> Relu -> Lin network plus Mse loss, built from notebook-level weights.

    Reads `w1`, `b1`, `w2`, `b2` from the notebook namespace at construction.
    """

    def __init__(self):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        """Feed `x` through every layer and return the loss against `targ`."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation, targ)

    def backward(self):
        """Back-propagate: the loss first, then the layers in reverse order."""
        self.loss.backward()
        for layer in reversed(self.layers):
            layer.backward()

In [None]:
w1.g, b1.g, w2.g, b2.g = [None] * 4
model = Model()

In [None]:
%time loss = model(x_valid, y_valid)

In [None]:
%time model.backward()

In [None]:
x_valid.shape, w1.shape, w2.shape

#### class Lin() without einsum

In [None]:
class Lin(Module):
    """Affine layer whose weight gradient is a plain matrix product."""

    def __init__(self, w, b):
        self.w, self.b = w, b

    def forward(self, inp):
        projected = inp @ self.w
        return projected + self.b

    def bwd(self, out, inp):
        """Set `.g` on the input, weight and bias from the upstream `out.g`."""
        upstream = out.g
        inp.g = upstream @ self.w.t()
        self.w.g = inp.t() @ upstream
        self.b.g = upstream.sum(0)

In [None]:
model = Model()

In [None]:
%time loss = model(x_train, y_train)

In [None]:
%time model.backward()

#### nn.Linear and nn.Module

In [None]:
nn.Module??

In [None]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear built from `torch.nn` layers, returning the loss.

    The layers are kept in a plain Python list and the loss is the
    notebook-level `mse` function.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
        self.loss = mse

    def __call__(self, x, targ):
        """Feed `x` through the layers and return the loss of the squeezed output."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation.squeeze(), targ)

In [None]:
model = Model(m, nh, 1)

In [None]:
%time loss = model(x_train, y_train)

In [None]:
%time loss.backward()

# nb03

### Simple model start

In [None]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear built from `torch.nn` layers, returning activations.

    The layers are kept in a plain Python list; `self.loss` holds the
    notebook-level `mse` function but is not used by `__call__`.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
        self.loss = mse

    def __call__(self, x):
        """Feed `x` through every layer in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

In [None]:
model = Model(m, nh, 10)

In [None]:
pred = model(x_train)

### Cross entropy loss

#### log_softmax with division

In [None]:
def log_softmax(x):
    """Log of the softmax over the last dimension, written with an explicit division."""
    numerator = x.exp()
    denominator = x.exp().sum(-1, keepdim=True)
    return (numerator / denominator).log()

In [None]:
sm_pred = log_softmax(pred)

In [None]:
def nll(input, target):
    """Negative mean of `input[i, target[i]]` over all rows `i`."""
    row_index = range(target.shape[0])
    picked = input[row_index, target]
    return -picked.mean()

In [None]:
loss = nll(sm_pred, y_train)

In [None]:
loss

#### Rewrite log_softmax to get rid of division

In [None]:
def log_softmax(x):
    """Log-softmax over the last dimension as `x - log(sum(exp(x)))`."""
    log_normaliser = x.exp().sum(-1, keepdim=True).log()
    return x - log_normaliser

In [None]:
test_near(nll(log_softmax(pred), y_train), loss)

#### Rewrite to increase numerical stability by subtracting the max

In [None]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    The maximum along the last dimension is subtracted before exponentiating
    and added back afterwards. `keepdim=True` lets the subtraction broadcast
    for an input of any rank >= 1; the original `m[:, None]` only worked for
    2-D input.

    Args:
        x: tensor with at least one dimension.

    Returns:
        Tensor with the last dimension of `x` reduced away.
    """
    m = x.max(-1, keepdim=True)[0]
    return m.squeeze(-1) + (x - m).exp().sum(-1).log()

In [None]:
test_near(logsumexp(pred), pred.logsumexp(-1))

In [None]:
def log_softmax(x):
    return x - x.logsumexp(-1, keepdim=True)

In [None]:
test_near(nll(log_softmax(pred), y_train), loss)

In [None]:
test_near(F.nll_loss(F.log_softmax(pred, -1), y_train), loss)

#### Use pytorch F.cross_entropy

In [None]:
test_near(F.cross_entropy(pred, y_train), loss)

## Basic training loop

#### Definitions

In [None]:
loss_func = F.cross_entropy

In [None]:
bs = 64                    # batchsize

xb = x_train[:bs]          # mini batch from x
yb = y_train[:bs]
# `preds` is used by the loss and accuracy cells below, so it must be defined
# here for the notebook to survive Restart & Run All.
preds = model(xb)
preds[0], preds.shape

In [None]:
lr = 0.5                   # learning rate
epochs = 1                 # how many epochs to train for

#### fit() 1: Manually subtracting gradients and zeroing gradients in a loop

In [None]:
yb = y_train[:bs]

In [None]:
loss_func(preds, yb)

In [None]:
accuracy(preds, yb)

https://youtu.be/AcA8HAYh7IE?t=2813

In [None]:
# Manual SGD: walk over the training set in consecutive slices of `bs` rows.
for epoch in range(epochs):
    # (n-1)//bs + 1 is ceil(n/bs), so a final partial batch is included
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        loss = loss_func(model(xb), yb)
        
        loss.backward()
        # the parameter update itself must not be recorded by autograd
        with torch.no_grad():
            for l in model.layers:
                # only layers exposing a `weight` attribute are updated
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias -= l.bias.grad * lr
                    # reset gradients in place before the next batch
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()

In [None]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

### Refactor parameter updates

#### Initializing layers in init, using nn.Module

nn.Module does the module registering for us. That's why we call ``super().__init__()``.

https://youtu.be/AcA8HAYh7IE?t=3003

In [None]:
class Model(nn.Module):
    """Two linear layers with a ReLU in between.

    The layers are assigned as attributes `l1` and `l2` after
    `super().__init__()`.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)

    def __call__(self, x):
        hidden = F.relu(self.l1(x))
        return self.l2(hidden)

In [None]:
model = Model(m, nh, 10)

In [None]:
for name, l in model.named_children():
    print(f"{name}: {l}")

In [None]:
model

In [None]:
model.l1

### fit()

In [None]:
def fit():
    """Train the notebook-level `model` with plain SGD over `model.parameters()`.

    Reads `epochs`, `n`, `bs`, `x_train`, `y_train`, `loss_func`, `model`
    and `lr` from the notebook namespace.
    """
    for epoch in range(epochs):
        # ceil(n / bs) batches, so a final partial batch is included
        for batch_idx in range((n-1)//bs + 1):
            lo = batch_idx * bs
            hi = lo + bs
            xb, yb = x_train[lo:hi], y_train[lo:hi]
            loss = loss_func(model(xb), yb)

            loss.backward()
            with torch.no_grad():
                for param in model.parameters():
                    param -= param.grad * lr
                model.zero_grad()

##### Results of fit()

In [None]:
fit()

In [None]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

#### DummyModule to register modules and traverse model parameters with __setattr__ (this is actually before using nn.Module)

In [None]:
class DummyModule():
    """Toy module registry built on `__setattr__`.

    Every attribute whose name does not start with an underscore is recorded
    in `self._modules`, so `parameters()` can walk all registered layers.
    """

    def __init__(self, n_in, nh, n_out):
        # must exist before the first public attribute is assigned
        self._modules = {}
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)

    def __setattr__(self, k, v):
        is_private = k.startswith("_")
        if not is_private:
            self._modules[k] = v
        super().__setattr__(k, v)

    def __repr__(self):
        return f"{self._modules}"

    def parameters(self):
        """Yield the parameters of every registered module, in registration order."""
        for layer in self._modules.values():
            yield from layer.parameters()

In [None]:
mdl = DummyModule(m, nh, 10)

In [None]:
[o.shape for o in mdl.parameters()]

#### Registering modules manually with self.add_module from a list of layers

In [None]:
layers = [nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10)]

In [None]:
class Model(nn.Module):
    """Sequential model that registers each layer by hand with `add_module`."""

    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        # register every layer under the name "layer_<index>"
        for i, layer in enumerate(self.layers):
            self.add_module(f"layer_{i}", layer)

    def __call__(self, x):
        """Feed `x` through every layer in order."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

In [None]:
model = Model(layers)

In [None]:
model(x_train)

In [None]:
model

#### Using nn.ModuleList to register modules from a layers list

Without having to use ``self.add_module``

In [None]:
layers = [nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10)]

In [None]:
class SequentialModel(nn.Module):
    """Sequential model whose layers are held in an `nn.ModuleList`."""

    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def __call__(self, x):
        """Feed `x` through every layer in order."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

In [None]:
model = SequentialModel(layers)

In [None]:
fit()

In [None]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

#### nn.Sequential()

Does the same as above so we don't need to write it ourselves

In [None]:
model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))

In [None]:
fit()

In [None]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

###  optim

#### Manual optimizer

Has to do opt.step() and opt.zero_grad()

In [None]:
class Optimizer():
    """Minimal SGD optimizer.

    Args:
        params: iterable of parameters; materialised into a list so it can be
            traversed on every step.
        lr: learning rate used by `step`.
    """

    def __init__(self, params, lr=0.5):
        self.params = list(params)
        self.lr = lr

    def step(self):
        """Subtract `lr * grad` from every parameter that has a gradient."""
        with torch.no_grad():
            for p in self.params:
                # a parameter that took no part in backward has grad None
                if p.grad is not None:
                    p -= p.grad * self.lr

    def zero_grad(self):
        """Reset existing gradients to zero in place; parameters without one are skipped."""
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()

In [None]:
model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))

In [None]:
opt = Optimizer(model.parameters())

In [None]:
# Training loop driven by the hand-written optimizer `opt`.
for epoch in range(epochs):
    # ceil(n / bs) batches, so a final partial batch is included
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i + bs
        xb, yb = x_train[start_i:end_i], y_train[start_i:end_i]
        pred = model(xb)
        # reuse `pred` instead of running a second forward pass
        loss = loss_func(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [None]:
loss, acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
loss, acc

#### Optimizer with pytorch

In [None]:
# optim.SGD.step??

In [None]:
opt = optim.SGD(model.parameters(), lr=lr)

#### get_model() - combining model and pytorch optimizer

In [None]:
def get_model(data=None):
    """Build the Linear -> ReLU -> Linear model and an SGD optimizer for it.

    Later cells call `get_model(data)`; the original zero-argument signature
    made those calls raise TypeError, so `data` is accepted optionally.

    Args:
        data: optional object exposing `train_ds` and `c` (such as the
            notebook's `DataBunch`). When omitted, the notebook-level input
            width `m` and 10 outputs are used, exactly as before.

    Returns:
        Tuple `(model, optimizer)`. The hidden width `nh` and learning rate
        `lr` are read from the notebook namespace.
    """
    if data is None:
        n_in, n_out = m, 10
    else:
        # assumes dataset items are (x, y) pairs with features on the last
        # axis of x — TODO confirm against the Dataset class in use
        n_in = data.train_ds[0][0].shape[-1]
        n_out = 10 if data.c is None else int(data.c)
    model = nn.Sequential(nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out))
    return model, optim.SGD(model.parameters(), lr=lr)

##### get_model() execute

In [None]:
model, opt = get_model()

In [None]:
loss_func(model(xb), yb)

##### fit() after get_model()

In [None]:
# Training loop driven by the optimizer returned from `get_model()`.
for epoch in range(epochs):
    # ceil(n / bs) batches, so a final partial batch is included
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i + bs
        xb, yb = x_train[start_i:end_i], y_train[start_i:end_i]
        pred = model(xb)
        # reuse `pred` instead of running a second forward pass
        loss = loss_func(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [None]:
yb.shape

In [None]:
loss, acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
loss, acc

## Dataset and DataLoader

### Dataset

In [None]:
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)

#### Dataset check

In [None]:
assert len(train_ds) == len(x_train)

In [None]:
xb, yb = train_ds[0:5]

In [None]:
assert xb.shape==(5, 28*28)

In [None]:
xb, yb

#### train with dataset

In [None]:
model, opt = get_model()

In [None]:
# Training loop where inputs and targets come from one sliced dataset object.
for epoch in range(epochs):
    # (n-1)//bs + 1 is ceil(n/bs), so a final partial batch is included
    for i in range((n-1)//bs + 1):
        # slicing the dataset yields the inputs and targets of the batch together
        xb, yb = train_ds[i*bs:i*bs+bs]
        pred = model(xb)
        loss = loss_func(pred, yb)
        
        loss.backward()
        opt.step()
        opt.zero_grad()

In [None]:
loss, acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
loss, acc

### DataLoader

#### DataLoader without sampler

In [None]:
class DataLoader():
    """Iterate over a sliceable dataset in consecutive chunks of `bs` items."""

    def __init__(self, ds, bs):
        self.ds = ds
        self.bs = bs

    def __iter__(self):
        """Yield `ds[start:start+bs]` for start = 0, bs, 2*bs, ... below len(ds)."""
        total = len(self.ds)
        for start in range(0, total, self.bs):
            stop = start + self.bs
            yield self.ds[start:stop]

In [None]:
train_dl, valid_dl = DataLoader(train_ds, bs), DataLoader(valid_ds, bs)

#### DataLoader check

In [None]:
xb, yb = next(iter(valid_dl))

In [None]:
assert xb.shape==(bs, 28*28)

In [None]:
plt.imshow(xb[0].view(28,28))

#### model, new fit() for ds, dl

In [None]:
def fit():
    """Train the notebook-level `model` for `epochs` passes over `train_dl`.

    Reads `epochs`, `train_dl`, `loss_func`, `model` and `opt` from the
    notebook namespace.
    """
    for epoch in range(epochs):
        for inputs, targets in train_dl:
            predictions = model(inputs)
            batch_loss = loss_func(predictions, targets)
            batch_loss.backward()

            opt.step()
            opt.zero_grad()

##### fit() exec

In [None]:
model, opt = get_model()

In [None]:
fit()

In [None]:
loss, acc = loss_func(model(x_train), y_train), accuracy(model(x_train), y_train)
loss, acc

### Random sampling

In [None]:
class Sampler():
    """Yield batches of dataset indices, optionally in random order."""

    def __init__(self, ds, bs, shuffle=False):
        self.n = len(ds)
        self.bs = bs
        self.shuffle = shuffle

    def __iter__(self):
        """Yield index tensors of up to `bs` entries covering 0..n-1 once."""
        # a fresh ordering is drawn every time iteration starts
        if self.shuffle:
            self.idxs = torch.randperm(self.n)
        else:
            self.idxs = torch.arange(self.n)
        for start in range(0, self.n, self.bs):
            yield self.idxs[start:start+self.bs]

#### Sampler check

In [None]:
small_ds = Dataset(*train_ds[:10])

In [None]:
s = Sampler(small_ds, 3, False)

In [None]:
[o for o in s]

In [None]:
s = Sampler(small_ds, 3, True)
[o for o in s]

### DataLoader with Sampler (and collate)

#### collate()

In [None]:
def collate(b):
    """Turn a list of (x, y) samples into a pair of stacked tensors."""
    inputs, targets = zip(*b)
    stacked_x = torch.stack(inputs)
    stacked_y = torch.stack(targets)
    return stacked_x, stacked_y

#### DataLoader with sampler

In [None]:
class DataLoader():
    """Build batches by indexing the dataset with the index groups from a sampler."""

    def __init__(self, ds, sampler, collate_fn=collate):
        self.ds = ds
        self.sampler = sampler
        self.collate_fn = collate_fn

    def __iter__(self):
        """For each index group from the sampler, yield the collated samples."""
        for index_batch in self.sampler:
            samples = [self.ds[i] for i in index_batch]
            yield self.collate_fn(samples)

In [None]:
train_samp = Sampler(train_ds, bs, shuffle=True)
valid_samp = Sampler(valid_ds, bs, shuffle=False)

In [None]:
train_dl = DataLoader(train_ds, train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, valid_samp, collate_fn=collate)

#### DataLoader with Sampler check

In [None]:
xb, yb = next(iter(valid_dl))
plt.imshow(xb[0].view(28,28))
yb[0]

In [None]:
xb, yb = next(iter(train_dl))
plt.imshow(xb[0].view(28,28))
yb[0]

#### fit() exec


In [None]:
model, opt = get_model()

In [None]:
fit()

In [None]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

#### DataLoader with pytorch sampler

In [None]:
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

In [None]:
train_dl = DataLoader(train_ds, bs, sampler=RandomSampler(train_ds), collate_fn=collate)
valid_dl = DataLoader(valid_ds, bs, sampler=SequentialSampler(valid_ds), collate_fn=collate)

#### fit() exec

In [None]:
model, opt = get_model()
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)

#### DataLoader from pytorch with Sampler from pytorch

Most of the time the flexibility of using your own sampler or collation function is not needed, so just pass shuffle=True or False

In [None]:
train_dl = DataLoader(train_ds, bs, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_ds, bs, shuffle=False)

#### fit() exec

In [None]:
model, opt = get_model()
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)

## Validation

In [None]:
model.eval??

In [None]:
model.train??

In [None]:
def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    """Train for `epochs` epochs and print validation metrics after each one.

    Args:
        epochs: number of passes over `train_dl`.
        model: module exposing `train()` / `eval()` and callable on a batch.
        loss_func: callable `(pred, yb) -> loss`.
        opt: optimizer exposing `step()` and `zero_grad()`.
        train_dl: iterable of `(xb, yb)` training batches.
        valid_dl: sized iterable of `(xb, yb)` validation batches.

    Returns:
        `(mean_loss, mean_accuracy)` of the last epoch. Both are averages of
        per-batch values over `len(valid_dl)` batches.
    """
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()

        model.eval()
        with torch.no_grad():
            tot_loss, tot_acc = 0., 0.
            for xb, yb in valid_dl:
                # one forward pass per batch, shared by loss and accuracy
                pred = model(xb)
                tot_loss += loss_func(pred, yb)
                tot_acc += accuracy(pred, yb)
        nv = len(valid_dl)
        print(epoch, tot_loss/nv, tot_acc/nv)
    return tot_loss/nv, tot_acc/nv

In [None]:
train_dl, valid_dl = get_dls(train_ds, valid_ds, bs)
model, opt = get_model()
loss, acc = fit(5, model, loss_func, opt, train_dl, valid_dl)

# nb04

## DataBunch

In [None]:
x_train, y_train, x_valid, y_valid = get_data()
train_mean, train_std = x_train.mean(), x_train.std()
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

nh, bs = 50, 64
c = y_train.max().item() + 1
loss_func = F.cross_entropy

In [None]:
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)

In [None]:
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

In [None]:
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

Factor out input to fit()

In [None]:
class DataBunch():
    """Bundle the training and validation dataloaders with a value `c`.

    `c` is stored as given; in this notebook it is the number of classes.
    """

    def __init__(self, train_dl, valid_dl, c=None):
        self.train_dl = train_dl
        self.valid_dl = valid_dl
        self.c = c

    @property
    def train_ds(self):
        """The `dataset` attribute of the training dataloader."""
        return self.train_dl.dataset

    @property
    def valid_ds(self):
        """The `dataset` attribute of the validation dataloader."""
        return self.valid_dl.dataset

In [None]:
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)

The `*` in `Learner(*get_model(data), ...)` below unpacks the two values returned by `get_model()` — `model` and `opt` — into separate arguments.

## Learner

In [None]:
class Learner():
    """Plain container for the model, optimizer, loss function and data."""

    def __init__(self, model, opt, loss_func, data):
        self.model = model
        self.opt = opt
        self.loss_func = loss_func
        self.data = data

In [None]:
learn = Learner(*get_model(data), loss_func, data)

### fit() exec

In [None]:
def fit(epochs, learn):
    """Train `learn.model` for `epochs` epochs, printing validation metrics each epoch.

    Returns `(mean_loss, mean_accuracy)` of the last epoch, averaged over
    the number of validation batches.
    """
    for epoch in range(epochs):
        # training phase
        learn.model.train()
        for xb, yb in learn.data.train_dl:
            batch_loss = learn.loss_func(learn.model(xb), yb)
            batch_loss.backward()
            learn.opt.step()
            learn.opt.zero_grad()

        # validation phase: no gradients needed
        learn.model.eval()
        tot_loss, tot_acc = 0., 0.
        with torch.no_grad():
            for xb, yb in learn.data.valid_dl:
                pred = learn.model(xb)
                tot_loss += learn.loss_func(pred, yb)
                tot_acc += accuracy(pred, yb)
        nv = len(learn.data.valid_dl)
        print(epoch, tot_loss/nv, tot_acc/nv)
    return tot_loss/nv, tot_acc/nv

In [None]:
loss, acc = fit(1, learn)

## Callback

### one_batch()

In [None]:
def one_batch(xb, yb):
    """Run one optimisation step on a single batch.

    Uses the notebook-level `model`, `loss_func` and `opt`.
    """
    batch_loss = loss_func(model(xb), yb)
    batch_loss.backward()
    opt.step()
    opt.zero_grad()

    
def fit():
    """Call `one_batch` on every batch of `data.train_dl`, `epochs` times.

    Reads `epochs` and `data` from the notebook namespace.
    """
    for epoch in range(epochs):
        for batch in data.train_dl:
            one_batch(*batch)

#### model

In [None]:
model, opt = get_model(data)

In [None]:
epochs = 1

In [None]:
fit()

### one_batch() with callbacks

In [None]:
def one_batch(xb, yb, cb):
    """Process one batch, letting the callback handler `cb` veto each stage.

    A falsy return from `begin_batch` or `after_loss` abandons the batch.
    The optimizer step runs only if `after_backward` is truthy, and the
    gradients are zeroed only if `after_step` is truthy.
    """
    if cb.begin_batch(xb, yb):
        learner = cb.learn
        loss = learner.loss_func(learner.model(xb), yb)
        if cb.after_loss(loss):
            loss.backward()
            if cb.after_backward():
                cb.learn.opt.step()
            if cb.after_step():
                cb.learn.opt.zero_grad()

In [None]:
def all_batches(dl, cb):
    """Run `one_batch` over the dataloader `dl` until `cb.do_stop()` is truthy."""
    for batch in dl:
        xb, yb = batch
        one_batch(xb, yb, cb)
        if cb.do_stop():
            break

In [None]:
def fit(epochs, learn, cb):
    """Callback-driven training loop.

    Does nothing if `cb.begin_fit(learn)` is falsy. An epoch is skipped if
    `cb.begin_epoch(epoch)` is falsy. The validation pass runs only if
    `cb.begin_validate()` is truthy. The loop stops early when
    `cb.do_stop()` is truthy or `cb.after_epoch()` is falsy, and
    `cb.after_fit()` is called once the loop is over.
    """
    if cb.begin_fit(learn):
        for epoch in range(epochs):
            if cb.begin_epoch(epoch):
                all_batches(learn.data.train_dl, cb)
                if cb.begin_validate():
                    with torch.no_grad():
                        all_batches(learn.data.valid_dl, cb)
                if cb.do_stop() or not cb.after_epoch():
                    break
        cb.after_fit()

### Callback and CallbackHandler

#### Callback

In [None]:
class Callback():
    # Base callback for the callback-driven training loop. Every hook returns
    # True by default; several hooks also record their arguments on the
    # instance so subclasses can read them later.

    def begin_fit(self, learn: Learner) -> bool:
        # remember the learner for the hooks that follow
        self.learn = learn
        return True
    
    def after_fit(self) -> bool:
        return True
    
    def begin_epoch(self, epoch: int) -> bool:
        # remember the current epoch index
        self.epoch = epoch
        return True
    
    def begin_batch(self, xb: torch.Tensor, yb: torch.Tensor) -> bool:
        # remember the current batch
        self.xb, self.yb = xb, yb
        return True  
    
    def after_loss(self, loss: torch.Tensor) -> bool:
        # remember the loss of the current batch
        self.loss = loss
        return True
    
    def after_backward(self) -> bool:
        return True
        
    def after_step(self) -> bool:
        return True
    
    def begin_validate(self) -> bool:
        return True
    
    def after_epoch(self) -> bool:
        return True


#### CallbackHandler

In [None]:
class CallbackHandler():
    """Fan each training-loop hook out to a list of callbacks.

    For every hook the results are combined with `and`, starting from an
    initial value; once the combined result is falsy the remaining callbacks
    are not called for that hook.
    """

    def __init__(self, cbs=None):
        self.cbs = cbs if cbs else []

    def _chain(self, res, hook, *args):
        """Combine `res` with `hook(*args)` of each callback via short-circuit `and`."""
        for cb in self.cbs:
            res = res and getattr(cb, hook)(*args)
        return res

    def begin_fit(self, learn: Learner) -> bool:
        self.learn = learn
        self.in_train = True
        learn.stop = False
        return self._chain(True, 'begin_fit', learn)

    def after_fit(self) -> bool:
        return self._chain(True, 'after_fit')

    def begin_epoch(self, epoch: int):
        # every epoch starts in training mode
        self.learn.model.train()
        self.in_train = True
        return self._chain(True, 'begin_epoch', epoch)

    def begin_batch(self, xb: torch.Tensor, yb: torch.Tensor) -> bool:
        return self._chain(True, 'begin_batch', xb, yb)

    def after_loss(self, loss) -> bool:
        # starts from `in_train`, so the result is falsy during validation
        return self._chain(self.in_train, 'after_loss', loss)

    def after_backward(self) -> bool:
        return self._chain(True, 'after_backward')

    def after_step(self) -> bool:
        return self._chain(True, 'after_step')

    def begin_validate(self) -> bool:
        self.learn.model.eval()
        self.in_train = False
        return self._chain(True, 'begin_validate')

    def after_epoch(self) -> bool:
        return self._chain(True, 'after_epoch')

    def do_stop(self) -> bool:
        """Return the learner's `stop` flag and reset it to False."""
        try:
            return self.learn.stop
        finally:
            self.learn.stop = False
        

#### TestCallback and fit()

In [None]:
class TestCallback(Callback):
    """Count optimizer steps, print the count, and request a stop at 10."""

    def begin_fit(self, learn):
        super().begin_fit(learn)
        # step counter, reset at the start of every fit
        self.n_iters = 0
        return True

    def after_step(self):
        self.n_iters += 1
        print(self.n_iters)
        limit_reached = self.n_iters >= 10
        if limit_reached:
            self.learn.stop = True
        return True

In [None]:
fit(1, learn, cb=CallbackHandler([TestCallback()]))

### Runner and new Callback definition

In [None]:
import re

# Underscore before a capitalised word: any char followed by Upper + lowers.
_camel_re1 = re.compile('(.)([A-Z][a-z]+)')
# Underscore between a lowercase letter or digit and a following capital.
# The original pattern '(a-z0-9)' lacked the character-class brackets, so it
# only matched the literal text "a-z0-9" and this pass never fired.
_camel_re2 = re.compile('([a-z0-9])([A-Z])')

def camel2snake(name):
    """Convert a CamelCase identifier to snake_case.

    Args:
        name: the identifier to convert.

    Returns:
        The lower-cased name with underscores inserted at word boundaries.
    """
    s1 = re.sub(_camel_re1, r'\1_\2', name)
    return re.sub(_camel_re2, r'\1_\2', s1).lower()

In [None]:
class Callback():
    """Base class for runner callbacks.

    Attributes not found on the callback itself are looked up on the runner
    stored by `set_runner`.
    """
    # class-level ordering value; how it is consumed is not visible here
    _order = 0

    def set_runner(self, run):
        """Attach the runner that attribute look-ups are delegated to."""
        self.run = run

    def __getattr__(self, k):
        # `__getattr__` only runs when normal look-up fails. If `run` itself is
        # missing, `self.run` below would re-enter this method forever, so a
        # proper AttributeError is raised instead of a RecursionError.
        if k == 'run':
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute 'run'; call set_runner() first"
            )
        return getattr(self.run, k)

    @property
    def name(self):
        """Snake-case class name with a trailing 'Callback' removed."""
        name = re.sub(r'Callback$', '', self.__class__.__name__)
        return camel2snake(name or 'callback')

#### TrainEvalCallback

In [None]:
class TrainEvalCallback(Callback):
    """
    Keeps the runner's progress counters up to date and flips the model
    between train and eval mode at the start of each phase.
    """
    def begin_fit(self):
        """Zero the runner's epoch and iteration counters."""
        runner = self.run
        runner.n_epochs = 0
        runner.n_iter = 0

    def after_batch(self):
        """Advance the counters by one batch; does nothing outside training."""
        if self.in_train:
            # Each batch contributes 1/iters of an epoch.
            self.run.n_epochs += 1./self.iters
            self.run.n_iter += 1

    def begin_epoch(self):
        """Snap n_epochs to the current epoch index and enter training mode."""
        runner = self.run
        runner.n_epochs = self.epoch
        self.model.train()
        runner.in_train = True

    def begin_validate(self):
        """Put the model in eval mode and mark the runner as not training."""
        self.model.eval()
        self.run.in_train = False

In [None]:
# `name` strips the trailing "Callback" and snake-cases the rest, so this shows 'train_eval'.
TrainEvalCallback().name

#### TestCallback

In [None]:
class TestCallback(Callback):
    """Debugging callback that asks the runner to stop once ten training batches have run."""

    def after_step(self):
        """Flag a stop when the runner's iteration counter has reached 10.

        Returns:
            True once the limit is reached (the runner then skips the rest of
            the current batch), otherwise None.
        """
        # The counter is `n_iter` on the runner (maintained by TrainEvalCallback) and
        # is reached through Callback.__getattr__; the runner exposes no `train_eval`
        # attribute and there is no `n_iters`.
        if self.n_iter >= 10:
            # Returning True alone only cuts the current batch short; the batch loop
            # ends only when the runner's `stop` flag is set.
            self.run.stop = True
            return True

#### listify()

In [None]:
# from typing import *  # fastai-remake.py
from collections.abc import Iterable  # needed by listify(); nothing else in the notebook imports it

def listify(o):
    """Coerce `o` to a list.

    Args:
        o: any object.

    Returns:
        ``[]`` for None; `o` itself (not a copy) when it is already a list;
        ``[o]`` for a string; ``list(o)`` for any other iterable; ``[o]`` for
        everything else.
    """
    if o is None:
        return []
    if isinstance(o, list):
        return o
    # Strings are iterable, but should be kept whole rather than split into characters.
    if isinstance(o, str):
        return [o]
    if isinstance(o, Iterable):
        return list(o)
    return [o]

### Runner (container for model, opt, loss_func, data, one_batch, all_batches)

In [None]:
class Runner():
    """Runs the training/validation loop and notifies callbacks at every stage.

    The runner holds no model of its own: `fit` receives a learner, and the
    `opt`, `model`, `loss_func` and `data` properties read from it. Callbacks
    are invoked by event name through `__call__`; a truthy return value from
    any callback tells the caller to skip the step that would follow.
    """
    def __init__(self, cbs=None, cb_funcs=None):
        """
        Args:
            cbs: a callback instance or list of instances (or None).
            cb_funcs: a zero-argument callable, or list of them, each returning
                a callback. These callbacks are additionally stored on the
                runner under their `name`.
        """
        # Copy: listify() hands back the caller's own list object, and the appends
        # below must not leak into it.
        cbs = list(listify(cbs))
        for cbf in listify(cb_funcs):
            cb = cbf()
            setattr(self, cb.name, cb)
            cbs.append(cb)
        self.stop = False
        # Defined up front so the attribute exists before any callback has set it.
        self.in_train = False
        self.cbs = [TrainEvalCallback()] + cbs

    @property
    def opt(self):
        """The current learner's optimiser."""
        return self.learn.opt

    @property
    def model(self):
        """The current learner's model."""
        return self.learn.model

    @property
    def loss_func(self):
        """The current learner's loss function."""
        return self.learn.loss_func

    @property
    def data(self):
        """The current learner's data object."""
        return self.learn.data

    def one_batch(self, xb, yb):
        """Process one batch: forward pass, loss and, when training, backward and step.

        After each stage the matching callback event fires; a truthy result
        abandons the rest of the batch. Nothing past the loss runs when
        `in_train` is falsy.
        """
        self.xb, self.yb = xb, yb
        if self('begin_batch'): return
        self.pred = self.model(self.xb)
        if self('after_pred'): return
        self.loss = self.loss_func(self.pred, self.yb)
        if self('after_loss') or not self.in_train: return
        self.loss.backward()
        if self('after_backward'): return
        self.opt.step()
        if self('after_step'): return
        self.opt.zero_grad()

    def all_batches(self, dl):
        """Run `one_batch` over every batch of `dl`, firing 'after_batch' after each.

        The loop ends early once `self.stop` is set; the flag is cleared again
        on the way out.
        """
        self.iters = len(dl)
        for xb, yb in dl:
            if self.stop: break
            self.one_batch(xb, yb)
            self('after_batch')
        self.stop = False

    def fit(self, epochs, learn):
        """Train `learn` for `epochs` epochs, validating after each one.

        Args:
            epochs: number of epochs to run.
            learn: object providing `opt`, `model`, `loss_func` and `data`
                (the latter with `train_dl` and `valid_dl`).

        'after_fit' always fires, and `self.learn` is reset to None, even if
        the loop exits early or raises.
        """
        self.epochs, self.learn = epochs, learn

        try:
            for cb in self.cbs:
                cb.set_runner(self)
            if self('begin_fit'): return
            for epoch in range(epochs):
                self.epoch = epoch
                if not self('begin_epoch'):
                    self.all_batches(self.data.train_dl)

                # Validation needs no gradients.
                with torch.no_grad():
                    if not self('begin_validate'):
                        self.all_batches(self.data.valid_dl)
                if self('after_epoch'): break

        finally:
            self('after_fit')
            self.learn = None

    def __call__(self, cb_name):
        """Fire event `cb_name` on the callbacks, in ascending `_order`.

        Callbacks without a handler for the event are skipped.

        Returns:
            True as soon as one handler returns a truthy value (later callbacks
            are then not called), otherwise False.
        """
        for cb in sorted(self.cbs, key=lambda x: x._order):
            f = getattr(cb, cb_name, None)
            if f and f(): return True
        return False

### AvgStats Callback

In [None]:
class AvgStats():
    """Running totals of loss and metrics, weighted by batch size."""

    def __init__(self, metrics, in_train):
        """
        Args:
            metrics: a metric callable or list of them, each called as
                ``metric(pred, yb)``.
            in_train: True if these stats describe training, False for
                validation (only used to label the repr).
        """
        self.metrics, self.in_train = listify(metrics), in_train
        # Start from a clean state so repr() and the stats properties work
        # before the first explicit reset().
        self.reset()

    def reset(self):
        """Zero the loss total, the sample count and every metric total."""
        self.tot_loss, self.count = 0., 0
        self.tot_mets = [0.] * len(self.metrics)

    @property
    def all_stats(self):
        """``[total loss] + metric totals``; the loss is unwrapped with ``.item()`` when available."""
        # tot_loss is the plain float from reset() until a loss providing .item()
        # has been added to it.
        loss = self.tot_loss.item() if hasattr(self.tot_loss, 'item') else self.tot_loss
        return [loss] + self.tot_mets

    @property
    def avg_stats(self):
        """`all_stats` divided by the sample count (ZeroDivisionError if nothing was accumulated)."""
        return [o/self.count for o in self.all_stats]

    def __repr__(self):
        """Empty string while nothing has been accumulated, else 'train: [...]' or 'valid: [...]'."""
        if not self.count: return ""
        return f"{'train' if self.in_train else 'valid'}: {self.avg_stats}"

    def accumulate(self, run):
        """Add the current batch of `run` to the totals.

        Reads `run.xb` (for the batch size), `run.loss`, `run.pred` and
        `run.yb`; loss and metrics are weighted by the batch size so that
        `avg_stats` is a per-sample average.
        """
        bn = run.xb.shape[0]
        self.tot_loss += run.loss * bn
        self.count += bn
        for i, m in enumerate(self.metrics):
            self.tot_mets[i] += m(run.pred, run.yb) * bn

In [None]:
class AvgStatsCallback(Callback):
    """Collects loss and metric averages per epoch, separately for training and validation."""

    def __init__(self, metrics):
        """Create one AvgStats tracker for training and one for validation."""
        self.train_stats = AvgStats(metrics, True)
        self.valid_stats = AvgStats(metrics, False)

    def begin_epoch(self):
        """Clear both trackers at the start of every epoch."""
        for tracker in (self.train_stats, self.valid_stats):
            tracker.reset()

    def after_loss(self):
        """Add the current batch to whichever tracker matches the runner's phase."""
        with torch.no_grad():
            tracker = self.train_stats if self.in_train else self.valid_stats
            tracker.accumulate(self.run)

    def after_epoch(self):
        """Print the training stats, then the validation stats."""
        for tracker in (self.train_stats, self.valid_stats):
            print(tracker)

In [None]:
# get_model(data) is unpacked into Learner's leading arguments
# (presumably the model and its optimiser — confirm against get_model).
learn = Learner(*get_model(data), loss_func, data)

In [None]:
# Keep a handle on the callback instance so its stats can be read after training.
stats = AvgStatsCallback([accuracy])
# A single instance is accepted for `cbs`; the Runner defined above wraps it with listify().
run = Runner(cbs=stats)

In [None]:
# Two epochs; AvgStatsCallback prints the train and valid stats after each one.
run.fit(2, learn)

In [None]:
# avg_stats is [loss, *metrics]; with accuracy as the only metric it unpacks into two values.
loss, acc = stats.valid_stats.avg_stats

### Partial

In [None]:
# from functools import partial  # fastai-remake.py

In [None]:
# Zero-argument factory: the Runner above calls each `cb_funcs` entry with no arguments.
acc_cbf = partial(AvgStatsCallback, accuracy)

In [None]:
# Callbacks built from `cb_funcs` are also stored on the runner under their `name`.
run = Runner(cb_funcs = acc_cbf)

In [None]:
# One epoch using the factory-built stats callback.
run.fit(1, learn)

In [None]:
# 'avg_stats' is the name derived from AvgStatsCallback; the repr shows the validation averages.
run.avg_stats.valid_stats

# NB05

## data, learn, run 

In [4]:
# Load the tensors and wrap each split in a dataset.
x_train, y_train, x_valid, y_valid = get_data()
train_ds = TensorDataset(x_train, y_train)
valid_ds = TensorDataset(x_valid, y_valid)

# Hyper-parameters used by the cells below.
nh = 50
bs = 512

# Largest label value plus one, as a plain Python number.
c = y_train.max().item() + 1
loss_func = F.cross_entropy

In [5]:
# get_dls(...) is unpacked into DataBunch's leading arguments (presumably the train and
# valid DataLoaders — confirm); `c` is the max label + 1 computed above.
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)

### learn, run

In [None]:
# Build the learner from the dltools helpers imported at the top of the notebook.
learn = create_learner(get_model, loss_func, data)

In [None]:
# Callback instances are passed positionally here (the `cbs` parameter in the Runner defined
# in this notebook — presumably the same for the dltools Runner; confirm).
run = Runner([AvgStatsCallback([accuracy])])

In [None]:
# Baseline run: three epochs with only the stats callback attached.
run.fit(3, learn)

## LR annealing 

### Intro to decorators

See https://realpython.com/primer-on-python-decorators/

In [None]:
def my_decorator(func):
    """Return a zero-argument wrapper that prints a line before and after calling `func`."""
    before = "Something is happening before the function is called."
    after = "Something is happening after the function is called."

    def wrapper():
        print(before)
        func()
        print(after)

    return wrapper


def say_whee():
    """Print 'Whee!'."""
    print("Whee!")


# Decorate by hand: rebind the name to the wrapped version.
say_whee = my_decorator(say_whee)

In [None]:
# Calls the wrapper: prints the "before" line, then "Whee!", then the "after" line.
say_whee()

In [None]:
def my_decorator(func):
    """Wrap `func` so that a message is printed before and after each call."""
    def wrapper():
        announce = print
        announce("Something is happening before the function is called.")
        func()
        announce("Something is happening after the function is called.")
    return wrapper


# Same effect as `say_whee = my_decorator(say_whee)`, using the @ syntax.
@my_decorator
def say_whee():
    """Print 'Whee!'."""
    print("Whee!")

In [None]:
# Output is the same as with the manual decoration above.
say_whee()

### Annealer with param scheduler cb

In [None]:
# Attach an `ndim` property to every tensor: the length of its shape.
# NOTE(review): this replaces any `ndim` that torch itself defines — presumably only
# needed on old torch versions; confirm before keeping.
torch.Tensor.ndim = property(lambda x: len(x.shape))

In [None]:
# sched_lin comes from dltools.callback; presumably a linear schedule from 1 to 2 — confirm.
fun = sched_lin(1, 2)

In [None]:
# Evaluate the schedule at position 0.3 (would be 1.3 if sched_lin is linear in position — confirm).
fun(0.3)

In [None]:
# Fractions of the run given to each phase; combine_scheds asserts that they sum to 1.
pcts = [0.3, 0.7]

In [None]:
#export
def combine_scheds(pcts, scheds):
    """Chain several schedules into a single function of overall progress.

    Args:
        pcts: fraction of the whole run given to each schedule; every entry
            must be non-negative and together they must sum to 1.
        scheds: one callable per entry of `pcts`. Each is called with the
            position inside its own phase, rescaled so the phase spans 0 to 1.

    Returns:
        A function ``_inner(pos)`` that selects the schedule whose phase
        contains `pos`, rescales `pos` to that phase and returns the
        schedule's value. ``pos == 1`` is served by the last schedule.

    Raises:
        AssertionError: if `pcts` does not sum to 1 or contains a negative entry.
    """
    # Tolerant comparison: exact float equality rejects valid inputs such as
    # ten phases of 0.1, whose sum is 0.9999999999999999.
    assert abs(sum(pcts) - 1.) < 1e-6
    # `tensor` is not a name imported in this notebook; use the qualified form.
    pcts = torch.tensor([0] + listify(pcts))
    assert torch.all(pcts >= 0)
    pcts = torch.cumsum(pcts, 0)  # phase boundaries: [0, p0, p0+p1, ...]
    last_phase = len(pcts) - 2    # the boundary list has one more entry than there are phases
    def _inner(pos):
        # Index of the last boundary at or below pos. At the very end of the run that is
        # the final boundary, which has no phase after it, so clamp to the last phase.
        idx = min(int((pos >= pcts).nonzero().max()), last_phase)
        actual_pos = (pos-pcts[idx]) / (pcts[idx+1]-pcts[idx])
        return scheds[idx](actual_pos)
    return _inner

### Scheduler

In [6]:
# Phase lengths as fractions of the whole run: first phase 30%, second phase 70%.
pcts = [0.3, 0.7]

In [7]:
# One schedule per phase (presumably cosine: 0.3 -> 0.6, then 0.6 -> 0.2 — confirm against sched_cos).
scheds = [sched_cos(0.3, 0.6), sched_cos(0.6, 0.2)]

In [8]:
# Same fractions as `pcts` above, written out literally.
scheduler = combine_scheds([0.3, 0.7], scheds)

#### Test plot

In [None]:
from matplotlib import pyplot as plt

In [None]:
# x-axis: 100 integer steps; p: the matching 100 positions, ending at 1.
a = torch.arange(0, 100)
p = torch.linspace(0.01,1,100)

In [None]:
# Evaluate the combined schedule at every position; the last one (1.0) exercises its end point.
plt.plot(a, [scheduler(o) for o in p])

## Runner with annealer

In [9]:
# Callback factories: each entry is callable with no arguments (a class, or a partial with
# its arguments pre-bound). ParamScheduler is given 'lr' and the combined schedule built above.
cbfs = [Recorder, partial(AvgStatsCallback, accuracy), partial(ParamScheduler, 'lr', scheduler)]

In [10]:
# get_model_func(0.3) presumably returns a model factory using lr=0.3 — confirm in dltools.functions.
learn = create_learner(get_model_func(0.3), loss_func, data)

In [11]:
# Runner comes from dltools.runner unless the notebook's own Runner cell was executed after
# the imports; `cb_funcs` takes the callback factories built above.
run = Runner(cb_funcs=cbfs)

In [12]:
# NOTE(review): the error recorded below this cell reports a CUDA/CPU backend mismatch —
# presumably the model and the batches are on different devices; confirm where each is placed.
run.fit(3, learn)

RuntimeError: Expected object of backend CUDA but got backend CPU for argument #4 'mat1'

In [None]:
# `recorder` is the Recorder callback attached to the runner (assumes it is registered under
# its snake-case name, as in the Runner defined in this notebook — confirm).
run.recorder.plot_lr()

In [None]:
# Loss values collected by the Recorder callback during the fit above.
run.recorder.plot_loss()

## Early Stopping new

In [None]:
# New learner for the early-stopping experiment.
learn = create_learner(get_model, loss_func, data)

In [None]:
from dltools.runner import CancelTrainException

In [None]:
class TestCallback(Callback):
    """Debugging callback: prints the iteration counter and cancels training at 10."""
    _order = 1

    def after_step(self):
        """Print `n_iter`; raise CancelTrainException once it has reached 10."""
        iteration = self.n_iter
        print(iteration)
        if iteration < 10:
            return
        raise CancelTrainException()

In [None]:
# The class itself serves as the zero-argument factory.
run = Runner(cb_funcs=TestCallback)

In [None]:
# Three epochs requested; TestCallback raises CancelTrainException once n_iter reaches 10
# (presumably handled inside dltools' Runner.fit to end training early — confirm).
run.fit(3, learn)

## Other callbacks: AvgStats, Recorder, ParamScheduler

In [None]:
# New learner for the learning-rate finder run.
learn = create_learner(get_model, loss_func, data)

In [None]:
# Recorder is attached alongside the finder so its plots are available afterwards.
run = Runner(cb_funcs = [LearningrateFinder, Recorder])

In [None]:
# Up to two epochs (presumably LearningrateFinder stops the run early — confirm in dltools.callback).
run.fit(2, learn)

In [None]:
# skip_last=5 presumably omits the final five recorded points from the plot — confirm in Recorder.plot.
run.recorder.plot(skip_last=5)

In [None]:
# Learning rates recorded during the finder run.
run.recorder.plot_lr()