In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [8]:
#export
from exp.nb_02 import *
import torch.nn.functional as F
import torch.nn as nn

In [4]:
mpl.rcParams['image.cmap'] = 'gray'

In [154]:
x_train, y_train, x_valid, y_valid = get_data()

torch.Size([50000, 784]) torch.Size([50000]) torch.Size([10000, 784]) torch.Size([10000])
torch.ByteTensor torch.LongTensor torch.ByteTensor torch.LongTensor


In [155]:
x_train, x_valid = x_train.float(), x_valid.float()

In [21]:
n, m = x_train.shape
c = y_train.max() + 1
nh = 50

In [22]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]

    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x


In [23]:
model = Model(m, nh, c)

In [121]:
pred = model(x_train)

### Loss function

<strong>Cross Entropy Loss</strong>

Softmax + negative log likelihood

Model provides output for each class, e^(output_example)
We then sum up the exponents and divide each exponent by the sum.

It's essentially the average of the exponents, which provides a probability for each category.

In [34]:
def log_softmax(x): return (x.exp() / x.exp().sum(-1, keepdim=True)).log()

In [51]:
sm_pred = log_softmax(pred)

Negative Loglikelihood 

2 class scenario:
One has 1-hot encoded values 

```
isCat = 1, isDog = 0, pred_output, logPredCat, logPredDog

NLL = isCat * logPredCat + isDog * logPredDog

```

Better to get the location of the 1 and index into the value, because multiplication by 0 takes time and is wasted effort/computation

Since the xs are 1-hot encoded, the can be rewritten as `-log(p_i)` where i is the index of the desired target.

The trick is to use numpy-style integer array indexing. 

In [48]:
y_train[:3]

tensor([5, 0, 4])

In [28]:
y_train.shape

torch.Size([50000])

In [52]:
sm_pred.shape

torch.Size([50000, 10])

In [77]:
sm_pred[0]

tensor([-22.6832, -17.8532, -29.3814,   0.0000, -43.0066, -47.1221, -68.2005,
        -53.0743, -52.5669, -49.1978], grad_fn=<SelectBackward>)

In [76]:
sm_pred[0].exp().sum()

tensor(1., grad_fn=<SumBackward0>)

In [36]:
sm_pred[[0,1,2], [5,0,4]]

tensor([-4.7122e+01, -4.4081e-04, -2.9715e+01], grad_fn=<IndexBackward>)

In [82]:
y_valid[0]

tensor(3)

In [88]:
x_valid[0].shape

torch.Size([784])

You are indexing into the prediction matrix which in this case is a 50k x 10 matrix. If the target matches the input at row n, nll should return a 0, if not it returns a non-zero value.

In [44]:
# negative log likelihood
def nll(input, target): return -input[range(target.shape[0]), target].mean() 

In [40]:
range(y_train.shape[0])

range(0, 50000)

In [50]:
def log_softmax(x): return x - x.exp().sum(-1, keepdim=True).log()

In [53]:
loss = nll(sm_pred, y_train)

In [103]:
sm_pred.shape

torch.Size([50000, 10])

In [105]:
y_valid[0]

tensor(3)

In [55]:
test_near(nll(log_softmax(pred), y_train), loss)

LogSumExp numerical stability trick

We use this because taking exponents of something can create very large numbers very quickly. Large numbers in floating point are very inaccurate. 
As we depart further away from 0, the computer can confuse numbers even if they are 1000 apart. 

Especially when we're calculating gradients, we don't want big numbers.

In [61]:
# pytorch already has this implemented
def logsumexp(x):
    m = x.max(-1)[0] # in this example max is 255 
    return m + (x - m[:, None]).exp().sum(-1).log()

In [62]:
test_near(logsumexp(pred), pred.logsumexp(-1))

In [63]:
def log_softmax(x): return x - x.logsumexp(-1, keepdim=True)

In [64]:
test_near(nll(log_softmax(pred), y_train), loss)

In [65]:
test_near(F.nll_loss(F.log_softmax(pred, dim=-1), y_train), loss)

In [66]:
test_near(F.cross_entropy(pred, y_train), loss)

In [None]:
# review the loss functions!!!

In [133]:
loss_func = F.cross_entropy

In [134]:
#export
def accuracy(out, yb): return (torch.argmax(out, dim=1) == yb).float().mean()

In [135]:
bs = 64

# minibatch
xb = x_train[0:bs]
preds = model(xb)
preds[0], preds.shape

(tensor([ 0.0450,  0.0219,  0.0253,  0.0674, -0.0251, -0.0102, -0.0569,  0.0012,
         -0.0283, -0.0402], grad_fn=<SelectBackward>),
 torch.Size([64, 10]))

In [70]:
yb = y_train[0:bs]
loss_func(preds, yb)

tensor(22.8236, grad_fn=<NllLossBackward>)

In [71]:
accuracy(preds, yb)

tensor(0.1562)

In [139]:
lr = 0.5
epochs = 1

In [108]:
(50000-1) // bs + 1

782

In [110]:
# examples for batch ranges
for e in range(epochs):
    for i in range((n-1) // bs + 1): # n is 50k, i will range from 0 to n divided by bs
        start_i = i * bs
        end_i = start_i + bs
        print(start_i, end_i)
        if i == 2: break

0 64
64 128
128 192


In [143]:
for e in range(5):
    for i in range(n // bs + 1): # loop through batches
        start_i = i * bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i] # set batch intervals
        yb = y_train[start_i:end_i]
        loss = loss_func(model(xb), yb) # compare batch preds to actuals to calculate loss i.e. forward pass

        loss.backward() # get gradients
        with torch.no_grad():
            for l in model.layers: # loop through layers and adjust weights by gradients w.r.t. loss * learning rate
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr 
                    l.bias   -= l.bias.grad * lr
                    l.weight.grad.zero_() # set gradients to 0 - in-place
                    l.bias.grad.zero_()

In [144]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(2.3044, grad_fn=<NllLossBackward>), tensor(0.0625))

The first thing we want to do is re-factor the code to update the parameters instead of needing to update the weights and the biases separately.

In [145]:
class DummyModule():
    def __init__(self, n_in, nh, n_out):
        self._modules = {}
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)

    # python method is called everytime one sets self.something
    def __setattr__(self, k, v):
        if not k.startswith("_"): self._modules[k] = v # ignoring the internal python private modules
        super().__setattr__(k, v) # inherit from __setattr__
    
    def __repr__(self): return f'{self._modules}'

    def parameters(self):
        for l in self._modules.values():
            for p in l.parameters(): yield p

In [146]:
mdl = DummyModule(m, nh, 10)
mdl

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [147]:
[o.shape for o in mdl.parameters()]

[torch.Size([50, 784]),
 torch.Size([50]),
 torch.Size([10, 50]),
 torch.Size([10])]

We can use `nn.Module.__setattr__` instead now, which is why we inherit from nn.Module in Pytorch

In [148]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)

    def __call__(self, x): return self.l2(F.relu(self.l1(x)))

In [149]:
model = Model(m, nh, 10)

In [150]:
for name, l in model.named_children(): print(f'{name}: {l}')

l1: Linear(in_features=784, out_features=50, bias=True)
l2: Linear(in_features=50, out_features=10, bias=True)


In [151]:
model # __rerp__ equivalent

Model(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
)

In [152]:
model.l1

Linear(in_features=784, out_features=50, bias=True)

In [153]:
def fit():
    for e in range(epochs):
        for i in range( (n-1) // bs + 1 ):
            start_i = i*bs
            end_i = start_i + bs
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            loss = loss_func(model(xb), yb)

            loss.backward()
            with torch.no_grad():
                for p in model.parameters(): p -= p.grad * lr
                model.zero_grad()

In [158]:
fit()
loss_func(model(xb),yb), accuracy(model(xb), yb)

(tensor(2.3054, grad_fn=<NllLossBackward>), tensor(0.0625))

In [159]:
# layer list construction
layers = [nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh,10)]

In [160]:
class Model(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        for i, l in enumerate(self.layers): self.add_module(f'layer_{i}', l)

    def __call__(self, x): 
        for l in self.layers: x = l(x)
        return x

In [161]:
model = Model(layers)
model

Model(
  (layer_0): Linear(in_features=784, out_features=50, bias=True)
  (layer_1): ReLU()
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

In [162]:
# Pytorch version
class SequentialModel(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def __call__(self,x):
        for l in self.layers: x = l(x)
        return x


In [163]:
model = SequentialModel(layers)
model

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [164]:
fit()
loss_func(model(xb),yb), accuracy(model(xb), yb)

(tensor(2.3054, grad_fn=<NllLossBackward>), tensor(0.0625))

In [165]:
model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))

In [166]:
fit()
loss_func(model(xb),yb), accuracy(model(xb), yb)

(tensor(2.3054, grad_fn=<NllLossBackward>), tensor(0.0625))

In [169]:
class Optimizer():
    def __init__(self, params, lr=0.5):
        self.params, self.lr = params, lr

    def step(self):
        with torch.no_grad():
            for p in self.params: p -= p.grad * lr

    def zero_grad(self):
        for p in self.params: p.grad.data.zero_()

In [170]:
model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))

In [171]:
loss, acc = loss_func(model(xb),yb), accuracy(model(xb), yb)
loss, acc

(tensor(27.9437, grad_fn=<NllLossBackward>), tensor(0.0625))

In [172]:
#export 
from torch import optim

In [175]:
def get_model():
    model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))
    return model, optim.SGD(model.parameters(), lr=lr)

In [177]:
model, opt = get_model()

In [191]:
for e in range(epochs):
    for i in range( (n-1) // bs + 1 ):
        start_i = i*bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        loss = loss_func(model(xb), yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [184]:
loss, acc = loss_func(model(xb),yb), accuracy(model(xb), yb)
loss, acc

(tensor(2.3054, grad_fn=<NllLossBackward>), tensor(0.0625))

In [192]:
loss, acc = loss_func(model(xb),yb), accuracy(model(xb), yb)
loss, acc

(tensor(2.3054, grad_fn=<NllLossBackward>), tensor(0.0625))

In [186]:
#assert acc>0.7