In [5]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [0]:
import torch,pickle, gzip,math, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor
from fastai import datasets
from pathlib import Path

In [0]:
mnist_url = 'http://deeplearning.net/data/mnist/mnist.pkl'
def get_data():
  """Download the pickled MNIST archive and return its train/valid splits as tensors.

  Returns:
    A 4-tuple `(x_train, y_train, x_valid, y_valid)`, each element converted
    with `torch.tensor`. The third split stored in the pickle is discarded.
  """
  path = datasets.download_data(mnist_url , ext = '.gz')
  # SECURITY NOTE: pickle.load can execute arbitrary code, and the archive is
  # fetched over plain HTTP. Only run this against a source you trust.
  with gzip.open(path,'rb') as f:
    ((x_train,y_train), (x_valid,y_valid),_) = pickle.load(f, encoding= 'latin-1')
  # Materialize the result: a bare `map` object is a one-shot iterator that
  # cannot be indexed or iterated twice. A tuple still unpacks identically.
  return tuple(map(tensor, (x_train,y_train, x_valid,y_valid)))

def normalize(x, m, s):
  """Standardize `x`: subtract the mean `m`, then divide by the std `s`."""
  centered = x - m
  return centered / s

In [7]:
(x_train,y_train, x_valid,y_valid) = get_data()

Downloading http://deeplearning.net/data/mnist/mnist.pkl


In [8]:
# Unpack the two dimensions of the training matrix and display them.
n, m = x_train.shape
n, m

(50000, 784)

In [9]:
# Mean and std over every element of x_train (no axis is given, so each is a
# single scalar tensor). The last line displays both values.
train_mean = x_train.mean()
train_std = x_train.std()
train_mean, train_std

(tensor(0.1304), tensor(0.3073))

In [0]:
def test_near_zero(a):  assert a.abs()<1e-03, f"Near Zero: {a}"

In [0]:
# Standardize both splits using the statistics computed from the training set.
# NOTE: this cell overwrites x_train / x_valid with their normalized versions
# while train_mean / train_std come from an earlier cell, so re-running only
# this cell would normalize already-normalized data.
x_train = normalize(x_train ,train_mean, train_std)

# Validation set: reuse train_mean / train_std rather than recomputing them from x_valid.
x_valid = normalize(x_valid, train_mean, train_std)

In [12]:
x_train.mean(), x_train.std()

(tensor(0.0001), tensor(1.))

In [13]:
x_valid.mean(), x_valid.std()

(tensor(-0.0057), tensor(0.9924))

In [0]:
# test_near_zero(x_valid.mean())
# test_near_zero(1-x_valid.std())
# x_valid.std().abs(),1+x_valid.mean().abs()

In [15]:
# Number of classes, taken as (largest label + 1); assumes labels are 0-based
# and contiguous — TODO confirm. Note that `c` is a 0-dim tensor, not a Python int.
c = y_train.max()+1;c

tensor(10)

In [16]:
# hidden units
nh = 50
n,m,c

(50000, 784, tensor(10))

Define the model

In [0]:
# Simplified init: standard-normal weights scaled by 1/sqrt(fan_in), zero biases.
# w1 maps the m input features to nh hidden units; w2 maps nh hidden units to a
# single output column.
w1 = torch.randn(m,nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)
# This expression is not the last statement of the cell, so the notebook does
# not display it; the assertion below is the actual check.
w1.mean(), w1.std() #should be zero and 1/math.sqrt(m)
test_near_zero(w1.std()-1/math.sqrt(m))

In [0]:
# Linear Layer
def lin(x, w, b):
  """Affine layer: matrix-multiply `x` by `w`, then add the bias `b`."""
  projected = x @ w
  return projected + b

In [0]:
# Relu
def relu(x):
  """Rectified linear unit: clamp every element of `x` from below at zero."""
  return x.clamp(min=0.)

In [19]:
t1 = lin(x_valid, w1,b1)
t1.mean(), t1.std() # this is exactly what we had expected.

NameError: ignored

In [0]:
t1_relu = relu(t1)
t1_relu.mean(), t1_relu.std() # and mean and the std is different because of the non-linearity of the Relu.

(tensor(0.4469), tensor(0.6575))

**Improving the kaiming init**

In [20]:
# Re-initialize the weights with a sqrt(2/fan_in) scale instead of 1/sqrt(fan_in).
w1 = torch.randn(m,nh)*math.sqrt(2/m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)*math.sqrt(2/nh)
b2 = torch.zeros(1)

# Kaiming init takes the ReLU activation into account: the extra factor of 2
# in the variance is meant to offset the activations that ReLU zeroes out.
# def Relu(x):  return x.clamp_min(0.) - 0.5

# Statistics of the first layer's post-ReLU activations on the validation set.
t1 = relu(lin(x_valid, w1, b1))
t1.mean(), t1.std()

(tensor(0.5362), tensor(0.7973))

In [0]:
from torch.nn import init

In [0]:
# w1 = torch.zeros(m,nh)
# b1 = torch.zeros(nh)
# w1 = init.kaiming_normal_(w1, mode = 'fan_out')

In [22]:
w1.mean(), w1.std()

(tensor(-0.0001), tensor(0.0507))

In [0]:
import torch.nn

In [0]:
torch.nn.functional.linear??

In [0]:
torch.nn.Linear(m,nh).weight.shape

torch.Size([50, 784])

In [24]:
w1.shape,b1.shape,x_train.shape

(torch.Size([784, 50]), torch.Size([50]), torch.Size([50000, 784]))

In [25]:
t1 = lin(x_valid,w1,b1)
t1.shape

torch.Size([10000, 50])

In [0]:
def model(xb):
  """Forward pass: linear (w1, b1) -> ReLU -> linear (w2, b2).

  Uses the notebook-level parameters w1, b1, w2, b2.
  """
  hidden = relu(lin(xb, w1, b1))
  return lin(hidden, w2, b2)

In [0]:
preds = model(x_valid)

In [0]:
def mse(out, target):
  """Mean squared error between `out` (trailing dim squeezed) and `target`."""
  residual = out.squeeze(-1) - target
  return residual.pow(2).mean()

In [32]:
loss = mse(preds,y_valid);loss

tensor(25.6047)

In [28]:
preds.shape, y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [37]:
y_valid.unsqueeze(1).shape[0]

10000

Gradients and Backward Pass

In [0]:
def mse_grad(out,tar):
  """Store the gradient of the MSE loss w.r.t. `out` on `out.g`.

  `tar` gets a new axis at position 1 so it lines up with a column-shaped
  `out`; the result is divided by the number of rows in `out`.
  """
  n_rows = out.shape[0]
  residual = out - tar.unsqueeze(1)
  out.g = 2. * residual / n_rows

In [0]:
def relu_grad(inp,out):
  """Backward pass of ReLU: pass `out.g` through where `inp` is positive.

  The result is stored on `inp.g`; positions with `inp <= 0` get zero gradient.
  """
  positive_mask = (inp > 0).float()
  inp.g = positive_mask * out.g

In [0]:
def lin_grad(inp, out, w, b):
  """Backward pass of the linear layer `out = inp @ w + b`.

  Reads the upstream gradient from `out.g` and stores the results as
  attributes: `inp.g` (same shape as `inp`), `w.g` (same shape as `w`)
  and `b.g` (one entry per output unit).

  Args:
    inp: layer input, shape (n, n_in).
    out: layer output, shape (n, n_out), with `out.g` already set.
    w: weight matrix, shape (n_in, n_out).
    b: bias vector, shape (n_out,).
  """
  inp.g = out.g @ w.t()
  # Sum over the batch of per-sample outer products inp_i (x) out.g_i, which is
  # exactly inp^T @ out.g. An element-wise product followed by a full .sum()
  # would either fail on shape mismatch or collapse to a scalar.
  w.g = inp.t() @ out.g
  # Reduce over the batch dimension only, keeping one gradient per output unit.
  b.g = out.g.sum(0)

In [0]:
t1 = lin(x_valid[:10], w1, b1)
t2 = relu(t1)