In [None]:
%install '.package(path: "$cwd/FastaiNotebooks")' FastaiNotebooks

Installing packages:
	.package(path: "/home/ubuntu/notebooks/swift/FastaiNotebooks")
		FastaiNotebooks
Working in: /tmp/tmp6fq5ddem
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 1.19s
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'FastaiNotebooks' (3 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Installation complete!

In [None]:
import FastaiNotebooks

In [None]:
// export
import Path
import TensorFlow

## The forward and backward passes

In [None]:
// export
/// Standardizes `x` by subtracting `mean` and then dividing by `std`.
///
/// - Parameters:
///   - x: The tensor to normalize.
///   - mean: The value subtracted from `x`.
///   - std: The value the centered tensor is divided by.
/// - Returns: `(x - mean) / std`.
public func normalize(_ x:Tensor<Float>, mean:Tensor<Float>, std:Tensor<Float>) -> Tensor<Float> {
    let centered = x - mean
    return centered / std
}

In [None]:
// Build the MnistDataset from ~/.fastai/data/mnist_tst and pull out its four splits.
// x_train and x_valid are reassigned later in this notebook with their normalized values,
// which is why they are declared with `var`.
let mnist = MnistDataset(path: Path.home/".fastai"/"data"/"mnist_tst")
var x_train = mnist.xTrain
var y_train = mnist.yTrain
var x_valid = mnist.xValid
var y_valid = mnist.yValid

`Tensor` does not have a `std` method in Swift yet, so we add one as an extension:

In [None]:
//export
public extension Tensor where Scalar:FloatingPoint {
    /// Standard deviation computed over every axis of the tensor.
    ///
    /// - Returns: The square root of the variance taken along all axes.
    func std() -> Tensor<Scalar> {
        // Reduce along every axis, then pick the single remaining element.
        let reduced = variance(alongAxes: Array(shape.indices))
        let overallVariance = reduced.flattened()[0]
        return sqrt(overallVariance)
    }
}

Normalize the training and validation sets.

In [None]:
let train_mean = x_train.mean()
let train_std  = x_train.std()

In [None]:
x_train = normalize(x_train, mean: train_mean, std: train_std)
x_valid = normalize(x_valid, mean: train_mean, std: train_std)

In [None]:
//export
/// Asserts that the absolute value of `a` is below `tol`.
///
/// On failure the assertion message is "Near zero: " followed by `a`.
///
/// - Parameters:
///   - a: The tensor expected to be close to zero.
///   - tol: The tolerance the absolute value is compared against (defaults to 1e-3).
// NOTE(review): for a multi-element tensor this presumably requires every element
// to be below `tol` — confirm against the Tensor `<` operator semantics.
public func test_near_zero(_ a:Tensor<Float>, tol:Float=1e-3) {
    assert(Raw.abs(a)<tol, "Near zero: \(a)")
}

In [None]:
test_near_zero(x_train.mean())
test_near_zero(x_train.std() - 1.0)

In [None]:
x_train.shape

▿ TensorShape
  ▿ dimensions : 2 elements
    - 0 : 60000
    - 1 : 784


In [None]:
// n and m are the first two dimensions of x_train; c is the largest value in y_train plus one.
let (n,m) = (x_train.shape[0],x_train.shape[1])
let c = y_train.max()+1
print(n,m,c)

60000 784 10


## Foundations version

### Basic architecture

In [None]:
//num hidden
let nh:Int32 = 50

In [None]:
// simplified kaiming init / he init
// Weights are drawn with Tensor(randomNormal:) and divided by the square root of their
// first dimension (m for w1, nh for w2); both biases start at zero.
let w1:Tensor<Float> = Tensor(randomNormal: [m,nh]) / sqrt(Float(m))
let b1:Tensor<Float> = Tensor(repeating: 0.0, shape: [nh])
let w2:Tensor<Float> = Tensor(randomNormal: [nh,1]) / sqrt(Float(nh))
let b2:Tensor<Float> = Tensor(repeating: 0.0, shape: [1])

In [None]:
test_near_zero(w1.mean())
test_near_zero(w1.std()-1/sqrt(Float(m)))

In [None]:
// This should be ~ (0,1) (mean,std)...
(x_valid.mean(),x_valid.std())

▿ 2 elements
  - .0 : 0.006017865
  - .1 : 1.0077024


In [None]:
/// Linear layer: multiplies `x` by the weights `w` and adds the bias `b`.
///
/// - Returns: `matmul(x, w) + b`.
func lin(_ x:Tensor<Float>, _ w:Tensor<Float>, _ b:Tensor<Float>) ->Tensor<Float> {
    let product = matmul(x, w)
    return product + b
}

In [None]:
let t = lin(x_valid, w1, b1)

In [None]:
//...so should this, because we used kaiming init, which is designed to do this
(t.mean(),t.std())

▿ 2 elements
  - .0 : 0.028343419
  - .1 : 1.0052483


In [None]:
/// Hand-written ReLU: the maximum of `x` and zero.
func my_relu(_ x:Tensor<Float>) -> Tensor<Float> {
    return max(x, 0)
}

In [None]:
// Exercise the my_relu defined just above (rather than the library relu) on the first linear layer.
let t = my_relu(lin(x_valid, w1, b1))

In [None]:
//...actually it really should be this!
(t.mean(),t.std())

▿ 2 elements
  - .0 : 0.41025376
  - .1 : 0.5939523


In [None]:
// kaiming init / he init for relu
let w1:Tensor<Float> = Tensor(randomNormal: [m,nh]) * sqrt(2.0/Float(m))

In [None]:
(w1.mean(),w1.std())

▿ 2 elements
  - .0 : 0.00012205474
  - .1 : 0.05062286


In [None]:
let t = my_relu(lin(x_valid, w1, b1))
(t.mean(),t.std())

▿ 2 elements
  - .0 : 0.5840942
  - .1 : 0.83771217


In [None]:
/// Forward pass of the two-layer network: linear -> my_relu -> linear.
///
/// Uses the global parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameter xb: The input batch.
/// - Returns: The output of the second linear layer.
func model(_ xb: Tensor<Float>) -> Tensor<Float>{
    let hidden = my_relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)
}

In [None]:
time(repeating: 10) {let _ = model(x_valid)}

9.0663763 ms


### Loss function

In [None]:
let preds = model(x_train)

In [None]:
// export
/// Mean squared error between the predictions `out` and the targets `targ`.
///
/// The last axis of `out` is squeezed away before subtracting `targ`.
///
/// - Parameters:
///   - out: The model output.
///   - targ: The targets.
/// - Returns: The mean of the squared differences.
// Marked `public` like the other exported functions so it is usable from the exported module.
public func mse(_ out:Tensor<Float>, _ targ:Tensor<Float>) -> Tensor<Float> {
    let residual = out.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [None]:
/// Converts an `Int32` tensor into a `Float` tensor of the same shape.
///
/// Uses Tensor's numeric-conversion initializer rather than rebuilding the tensor
/// from its scalar array (the element conversion can never fail, so no
/// `compactMap` filtering is needed).
///
/// - Parameter x: The integer tensor to convert.
/// - Returns: A tensor holding the same values as `Float`.
func to_float(_ x:Tensor<Int32>) -> Tensor<Float>{
    return Tensor<Float>(x)
}

In [None]:
var y_trainf = to_float(y_train)
var y_validf = to_float(y_valid)

In [None]:
mse(preds, y_trainf)

28.569824


### Gradients and backward pass

In [None]:
/// Pairs a tensor with a gradient tensor of the same shape.
///
/// It is a class so that the backward-pass functions can write `grad`
/// on the instances they are handed.
class TensorWithGrad {
    /// The wrapped value.
    var inner: Tensor<Float>
    /// The gradient slot; starts out filled with zeros.
    var grad:  Tensor<Float>

    /// Wraps `x` and allocates a zero gradient with the shape of `x`.
    init(_ x: Tensor<Float>) {
        let zeros = Tensor<Float>(repeating: 0.0, shape: x.shape)
        self.inner = x
        self.grad = zeros
    }
}

In [None]:
/// Linear layer on wrapped tensors: `matmul(x, w) + b`, wrapped in a fresh `TensorWithGrad`.
func lin(_ x:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad) -> TensorWithGrad {
    let activation = matmul(x.inner, w.inner) + b.inner
    return TensorWithGrad(activation)
}
/// ReLU on a wrapped tensor: the maximum of `x.inner` and zero, wrapped in a fresh `TensorWithGrad`.
func my_relu(_ x:TensorWithGrad) -> TensorWithGrad {
    let clamped = max(x.inner, 0)
    return TensorWithGrad(clamped)
}
/// Mean squared error between the wrapped predictions `inp` and the targets `targ`.
///
/// This computes the loss value only; it does not touch `inp.grad`.
///
/// - Parameters:
///   - inp: The wrapped model output; its last axis is squeezed away before subtracting `targ`.
///   - targ: The targets.
/// - Returns: The mean of the squared differences.
func mse(_ inp: TensorWithGrad, _ targ : Tensor<Float>) -> Tensor<Float>{
    let residual = inp.inner.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [None]:
/// Backward pass of the MSE loss: stores the gradient of the loss with respect to `inp` in `inp.grad`.
///
/// - Parameters:
///   - inp: The wrapped model output; its `grad` is overwritten.
///   - targ: The targets.
func mse_grad(_ inp: TensorWithGrad, _ targ : Tensor<Float>){
    // Divide by the size of the first dimension of the predictions.
    let rowCount = Float(inp.inner.shape[0])
    // Difference to the targets, with the squeezed last axis restored.
    let diff = (inp.inner.squeezingShape(at: -1) - targ).expandingShape(at: -1)
    inp.grad = 2.0 * diff / rowCount
}

In [None]:
/// Backward pass of ReLU: stores the gradient with respect to the input activations in `inp.grad`.
///
/// Where `inp.inner` is greater than zero the gradient is taken from `out.grad`;
/// everywhere else it is zero.
func relu_grad(_ inp:TensorWithGrad, _ out:TensorWithGrad){
    let isPositive = inp.inner .> 0
    let zeros = Tensor<Float>(repeating:0.0, shape:inp.inner.shape)
    inp.grad = isPositive.selecting(out.grad, zeros)
}

In [None]:
/// Backward pass of the linear layer `out = matmul(inp, w) + b`.
///
/// Reads `out.grad` and overwrites the `grad` of `inp`, `w` and `b`.
///
/// - Parameters:
///   - inp: The layer input.
///   - out: The layer output, whose `grad` must already be populated.
///   - w: The layer weights.
///   - b: The layer bias.
func lin_grad(_ inp:TensorWithGrad, _ out:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad){
    // Gradient with respect to the input activations.
    inp.grad = matmul(out.grad, w.inner.transposed())
    // Gradient with respect to the weights.
    w.grad = matmul(inp.inner.transposed(), out.grad)
    // Gradient with respect to the bias: out.grad summed over axis 0.
    b.grad = out.grad.sum(squeezingAxes: 0)
}

In [None]:
let w1a = TensorWithGrad(w1)
let b1a = TensorWithGrad(b1)
let w2a = TensorWithGrad(w2)
let b2a = TensorWithGrad(b2)

In [None]:
/// Runs the network forward on `inp`, then back-propagates the MSE loss by hand.
///
/// Uses the global wrapped parameters `w1a`, `b1a`, `w2a` and `b2a`. After the call,
/// `inp.grad` and the `grad` of each of those parameters hold the computed gradients.
///
/// - Parameters:
///   - inp: The wrapped input batch.
///   - targ: The targets.
func forward_and_backward(_ inp:TensorWithGrad, _ targ:Tensor<Float>){
    //forward pass:
    let l1 = lin(inp, w1a, b1a)
    let l2 = my_relu(l1)
    let out = lin(l2, w2a, b2a)
    //we don't actually need the loss in backward! It is still computed so the
    //forward pass is complete, but the value is discarded explicitly.
    _ = mse(out, targ)

    //backward pass, in the reverse order of the forward pass:
    mse_grad(out, targ)
    lin_grad(l2, out, w2a, b2a)
    relu_grad(l1, l2)
    lin_grad(inp, l1, w1a, b1a)
}

In [None]:
let inp = TensorWithGrad(x_train)

In [None]:
forward_and_backward(inp, y_trainf)

Let's now compare our hand-written gradients with Swift's automatic differentiation:

In [None]:
/// Differentiable forward pass plus MSE loss, with every parameter passed in explicitly.
///
/// The layers are linear -> relu -> linear; the last axis of the output is squeezed
/// away before it is compared with `targ`.
///
/// - Returns: The mean of the squared differences between the output and `targ`.
// NOTE(review): the name is spelled `foward`; it is kept because the gradient cells call it by that name.
@differentiable
func foward(_ inp:Tensor<Float>, _ targ:Tensor<Float>, w1:Tensor<Float>, b1:Tensor<Float>, 
            w2:Tensor<Float>, b2:Tensor<Float>) -> Tensor<Float>{
    let hidden = relu(matmul(inp, w1) + b1)
    let out = matmul(hidden, w2) + b2
    let residual = out.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [None]:
// Each call differentiates `foward` with respect to a single argument: the closure
// parameter shadows the global of the same name, while the remaining arguments are
// the globals captured by the closure.
let grad_x = gradient(at: x_train) {x_train in foward(x_train, y_trainf, w1:w1, b1:b1, w2:w2, b2:b2)}
let grad_w1 = gradient(at: w1) {w1 in foward(x_train, y_trainf, w1:w1, b1:b1, w2:w2, b2:b2)}
let grad_b1 = gradient(at: b1) {b1 in foward(x_train, y_trainf, w1:w1, b1:b1, w2:w2, b2:b2)}
let grad_w2 = gradient(at: w2) {w2 in foward(x_train, y_trainf, w1:w1, b1:b1, w2:w2, b2:b2)}
let grad_b2 = gradient(at: b2) {b2 in foward(x_train, y_trainf, w1:w1, b1:b1, w2:w2, b2:b2)}

In [None]:
// Compare each autodiff gradient with the matching hand-computed gradient
// (the .grad fields filled in by forward_and_backward).
test_near_zero(grad_x - inp.grad)
test_near_zero(grad_w1 - w1a.grad)
test_near_zero(grad_b1 - b1a.grad)
test_near_zero(grad_w2 - w2a.grad)
test_near_zero(grad_b2 - b2a.grad)

In [None]:
time(repeating: 10) { forward_and_backward(inp, y_trainf) }

307.67580069999997 ms


### Export

In [None]:
export_notebooks(Path.cwd)

Converting 02_fully_connected.ipynb
Converting 00_load_data.ipynb
Converting 01_matmul.ipynb
