# Lesson 8 - Part 02

This is a reimplementation of fastai part 2 version 3 in Swift.
https://github.com/fastai/fastai_docs/blob/master/dev_course/dl2/02_fully_connected.ipynb

Note: this notebook requires my fork of [swift-jupyter](https://github.com/metachi/swift-jupyter) and a clone of my [TimeMagic](https://github.com/metachi/TimeMagic) repo for the `%%time` and `%%timeit` magic commands to work.

In [1]:
%install '.package(path: "~/gitrepos/TimeMagic")' TimeMagic
%install '.package(url: "https://github.com/mxcl/Path.swift", from: "0.16.1")' Path
%install '.package(url: "https://github.com/JustHTTP/Just", from: "0.7.1")' Just

Installing packages:
	.package(path: "~/gitrepos/TimeMagic")
		TimeMagic
	.package(url: "https://github.com/mxcl/Path.swift", from: "0.16.1")
		Path
	.package(url: "https://github.com/JustHTTP/Just", from: "0.7.1")
		Just
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 1.36s
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Compile Swift Module 'TimeMagic' (1 sources)
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Installation complete!

In [2]:
import TimeMagic
import Foundation
import Path
import Just

### Download Dataset

In [3]:
print(Path.home/".fastai"/"data"/"test.txt")

/home/jeff/.fastai/data/test.txt


In [4]:
/// Downloads the resource at `url` and writes its bytes to the file at `dest`.
///
/// Failures (unsuccessful request, missing body, or a failed write) are
/// reported on standard output; nothing is thrown to the caller.
///
/// - Parameters:
///   - url: Address of the resource to fetch.
///   - dest: File-system path the downloaded bytes are written to.
public func download(_ url: String, dest: String) {
    let response = Just.get(url, allowRedirects: false)
    // A failed request carries no usable body; report it instead of
    // force-unwrapping `content` and crashing the kernel.
    guard response.ok, let content = response.content else {
        print("error downloading \(url): \(response.reason)")
        return
    }
    do {
        try content.write(to: URL(fileURLWithPath: dest))
    } catch {
        // Include the underlying error so a write failure can be diagnosed.
        print("error downloading \(url): \(error)")
    }
}

In [5]:
// Base URL of the MNIST files (note the trailing slash) and the names of the
// four files used below: training images/labels and validation images/labels.
let base = "http://yann.lecun.com/exdb/mnist/"
let trn_imgs = "train-images-idx3-ubyte"
let trn_lbls = "train-labels-idx1-ubyte"
let val_imgs = "t10k-images-idx3-ubyte"
let val_lbls = "t10k-labels-idx1-ubyte" 

In [6]:
let dest = Path.home/".fastai"/"data"/"mnist"

In [7]:
// Create the MNIST data directory the first time the notebook is run.
if !dest.exists{
    dest.mkdir()
}

In [8]:
// Fetch every MNIST archive that is not already present locally.
for fileName in [trn_imgs, trn_lbls, val_imgs, val_lbls] {
    let extractedPath = dest/fileName
    let archivePath = dest/(fileName + ".gz")
    // Skip files that were already downloaded (.gz) or already gunzipped,
    // so re-running the cell does not fetch them again.
    if !extractedPath.exists && !archivePath.exists {
        // `base` already ends in "/", and the server hosts gzip archives,
        // so the remote name carries the same ".gz" suffix as the local file.
        download("\(base)\(fileName).gz", dest: archivePath.string)
    }
}

In [9]:
for n in dest.ls() {
    print(n.path)
}

/home/jeff/.fastai/data/mnist/t10k-labels-idx1-ubyte
/home/jeff/.fastai/data/mnist/train-labels-idx1-ubyte
/home/jeff/.fastai/data/mnist/train-images-idx3-ubyte
/home/jeff/.fastai/data/mnist/t10k-images-idx3-ubyte


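Gunzip the files (this notebook has no cell for that step, so do it manually, e.g. `gunzip ~/.fastai/data/mnist/*.gz`), then list the directory again to confirm: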

In [10]:
for n in dest.ls() {
    print(n.path)
}

/home/jeff/.fastai/data/mnist/t10k-labels-idx1-ubyte
/home/jeff/.fastai/data/mnist/train-labels-idx1-ubyte
/home/jeff/.fastai/data/mnist/train-images-idx3-ubyte
/home/jeff/.fastai/data/mnist/t10k-images-idx3-ubyte


### Load the dataset

In [11]:
import TensorFlow

See [Yann LeCun's site](http://yann.lecun.com/exdb/mnist/) for info on why we have to drop the first few bytes of each file (8 for label files, 16 for image files).  See the "TRAINING SET LABEL FILE" heading and similarly named headings.

In [12]:
/// Loads the payload of an MNIST file into a `Tensor<Float>` of the given shape.
///
/// The file's leading header bytes are skipped and every remaining byte is
/// converted to a `Float`. The process is terminated with a descriptive
/// message when the file cannot be read or when its payload size does not
/// match `shape`.
///
/// - Parameters:
///   - path: File-system path of the (already gunzipped) MNIST file.
///   - shape: Shape the flat payload is reshaped to.
///   - is_label: `true` for a label file (8 header bytes are skipped),
///     `false` for an image file (16 header bytes are skipped).
/// - Returns: The payload bytes as floats, reshaped to `shape`.
func loadData(path: String, shape: [Int32], is_label: Bool) -> Tensor<Float> {
    // Header sizes per the MNIST file format: label files have two 32-bit
    // fields (8 bytes), image files have four (16 bytes).
    let headerByteCount = is_label ? 8 : 16

    let fileContents: Data
    do {
        fileContents = try Data(contentsOf: URL(fileURLWithPath: path))
    } catch {
        fatalError("loadData: could not read \(path): \(error)")
    }
    let payload = fileContents.dropFirst(headerByteCount)

    // Only validate fully specified shapes; a negative entry cannot be
    // multiplied into an element count.
    if !shape.contains(where: { $0 < 0 }) {
        let expectedCount = shape.reduce(1) { $0 * Int($1) }
        precondition(payload.count == expectedCount,
                     "loadData: \(path) holds \(payload.count) values, but shape \(shape) needs \(expectedCount)")
    }

    return Tensor(payload.map { Float($0) }).reshaped(to: TensorShape(shape))
}

In [13]:
var xTrain: Tensor<Float> = loadData(path: (dest/trn_imgs).string,
                   shape: [60000, 784],
                   is_label: false)/255

In [14]:
let yTrain: Tensor<Float> = loadData(path: (dest/trn_lbls).string,
                   shape: [60000],
                   is_label: true)

In [15]:
var xValid: Tensor<Float> = loadData(path: (dest/val_imgs).string,
                   shape: [10000, 784],
                   is_label: false)/255

In [16]:
let yValid: Tensor<Float> = loadData(path: (dest/val_lbls).string,
                   shape: [10000],
                   is_label: true)

### The forward and backward passes

In [17]:
public extension Tensor where Scalar : FloatingPoint {
    /// Standard deviation along `axes`: the square root of the mean squared
    /// deviation from the mean taken along those same axes.
    func stddev(alongAxes axes: [Int32]) -> Tensor<Scalar> {
        let centered = self - self.mean(alongAxes: axes)
        let meanSquaredDeviation = centered.squared().mean(alongAxes: axes)
        return sqrt(meanSquaredDeviation)
    }

    /// Standard deviation over all elements: the square root of the mean
    /// squared deviation from the overall mean.
    func stddev() -> Tensor<Scalar> {
        let centered = self - self.mean()
        let meanSquaredDeviation = centered.squared().mean()
        return sqrt(meanSquaredDeviation)
    }
}

In [18]:
/// Shifts and scales `x` to `(x - mean) / stddev`.
///
/// - Parameters:
///   - x: Tensor to normalize.
///   - mean: Value to subtract; defaults to the mean of `x` itself.
///   - stddev: Value to divide by; defaults to the standard deviation of `x` itself.
/// - Returns: The normalized tensor.
func normalize<Scalar: FloatingPoint>(_ x: Tensor<Scalar>, _ mean: Tensor<Scalar>? = nil, _ stddev: Tensor<Scalar>? = nil) ->  Tensor<Scalar>{
    // Immutable, distinctly named locals: nothing is mutated here, and the
    // parameters are no longer shadowed.
    let center = mean ?? x.mean()
    let scale = stddev ?? x.stddev()
    return (x - center) / scale
}

In [19]:
// Normalize both splits with the *training* statistics, computed once with
// the same mean()/stddev() helpers so the two splits are treated identically.
let trainMean = xTrain.mean()
let trainStddev = xTrain.stddev()
var xTrainNormal: Tensor<Float> = normalize(xTrain, trainMean, trainStddev)
var xValidNormal: Tensor<Float> = normalize(xValid, trainMean, trainStddev)

In [20]:
/// Returns `true` when every element of `x` differs from the corresponding
/// element of `y` by strictly less than `tolerance`.
func almostEqual<Scalar: SignedNumeric & FloatingPoint>(_ x: Tensor<Scalar>, _ y: Tensor<Scalar>, _ tolerance: Tensor<Scalar>) -> Bool{
    let distance = abs(x - y)
    let withinTolerance = distance .< tolerance
    return withinTolerance.all()
}

In [21]:
print(xTrainNormal.mean(), xTrainNormal.variance(alongAxes: [0,1]))

-1.4449139e-08 [[1.0000017]]


In [22]:
almostEqual(xTrainNormal.mean(), Tensor(0), Tensor(1e-3))

true


In [23]:
almostEqual(sqrt(xTrainNormal.variance(alongAxes: [0,1])),
            Tensor(1),
            Tensor(1e-3))

true


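The validation set was normalized with the training statistics, so its mean and variance should be near (but not exactly) 0 and 1: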

In [24]:
print(xValidNormal.mean(), xValidNormal.variance(alongAxes: [0,1]))

0.0060177604 [[1.0154601]]


In [25]:
almostEqual(xValidNormal.mean(), Tensor(0), Tensor(1e-2))

true


In [26]:
almostEqual(sqrt(xValidNormal.variance(alongAxes: [0,1])),
            Tensor(1),
            Tensor(1e-2))

true


In [27]:
// n: number of training rows, m: number of columns per row,
// c: largest label value plus one (a Tensor, since it comes from yTrain.max()).
var shp = xTrainNormal.shape
let n = shp[0]
let m = shp[1]
let c = yTrain.max() + 1
print(n, m, c)

60000 784 10.0


## Foundations version

### Basic architecture

In [28]:
var nh: Int32 = 50

In [29]:
public extension Tensor where Scalar : BinaryFloatingPoint {
    /// Random-normal tensor of `shape`, divided by `sqrt(shape[0])`.
    init(simpleKaiming shape: TensorShape){
        let leadingDim = Scalar(shape[0])
        self.init(Tensor(randomNormal: shape) / leadingDim.squareRoot())
    }

    /// Random-normal tensor of `shape`, multiplied by `sqrt(2 / shape[0])`.
    init(kaiming shape: TensorShape){
        let gain = (2 / Scalar(shape[0])).squareRoot()
        self.init(Tensor(randomNormal: shape) * gain)
    }
}

In [30]:
// Initial parameters: weights use the simplified scheme
// (randomNormal / sqrt(shape[0])), not the kaiming / He variant
// (`Tensor(kaiming:)`); biases start at zero.
var w1 = Tensor<Float>(simpleKaiming: [m, nh])
var b1 = Tensor<Float>(zeros: [nh])
var w2 = Tensor<Float>(simpleKaiming: [nh, 1])
var b2 = Tensor<Float>(zeros: [1])

In [31]:
/// Affine layer: the matrix product of `x` and `w`, plus the bias `b`.
func linear<Scalar: TensorFlowFloatingPoint>(_ x: Tensor<Scalar>, _ w: Tensor<Scalar>, _ b: Tensor<Scalar>) -> Tensor<Scalar>{
    let projected = matmul(x, w)
    return projected + b
}

/// Custom derivative registered for `linear`.
///
/// Returns the forward value together with a pullback that maps an upstream
/// gradient `v` to the gradients with respect to `x`, `w` and `b`.
@differentiating(linear)
func linearDerivative<Scalar: TensorFlowFloatingPoint>(_ x: Tensor<Scalar>, _ w: Tensor<Scalar>, _ b: Tensor<Scalar>) -> (value: Tensor<Scalar>, pullback: (Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>, Tensor<Scalar>)){
    let forward = Tensor<Scalar>(linear(x, w, b))
    let backward: (Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>, Tensor<Scalar>) = { upstream in
        let dX = matmul(upstream, w.transposed())
        let dW = matmul(x.transposed(), upstream)
        // The bias gradient sums the upstream gradient over axis 0.
        let dB = upstream.sum(squeezingAxes: 0)
        return (dX, dW, dB)
    }
    return (value: forward, pullback: backward)
}


In [32]:
//just a quick gutcheck
pullback(at: Tensor([[3.0, 3.0]]), Tensor([[1.0, 1.0], [1.0, 1.0]]), Tensor([1.0, 1.0]), in: linear)(Tensor([[1.0, 1.0]]))

▿ 3 elements
  - .0 : [[2.0, 2.0]]
  - .1 : [[3.0, 3.0], [3.0, 3.0]]
  - .2 : [1.0, 1.0]


In [33]:
//just a quick gutcheck
pullback(at: Tensor([-3.0, 3.0]), in: relu)(Tensor([1.0, 1.0]))


[0.0, 1.0]


In [34]:
// S4TF already ships a `relu`; this hand-written version is for illustration.
/// Element-wise maximum of 0 and `x`.
func myRelu<Scalar: Numeric & Comparable>(_ x: Tensor<Scalar>) -> Tensor<Scalar> {
    let rectified = max(0, x)
    return rectified
}

In [35]:
var t = linear(xValidNormal, w1, b1)

In [36]:
print(t.mean(), t.stddev())

0.1326691 1.022629


In [37]:
var t = myRelu(linear(xValidNormal, w1, b1))

In [38]:
print(t.mean(), t.stddev())

0.471653 0.6623134


In [39]:
var w1 = Tensor<Float>(kaiming: [m, nh])

In [40]:
print(w1.mean(), w1.stddev())

-0.00034631704 0.0504034


In [41]:
var t = myRelu(linear(xValidNormal, w1, b1))

In [42]:
print(t.mean(), t.stddev())

0.52433354 0.8077385


In [43]:
// Shifted variant of the hand-written relu above.
/// Element-wise maximum of 0 and `x`, shifted down by 0.5.
func myRelu<Scalar: BinaryFloatingPoint & Comparable>(_ x: Tensor<Scalar>) -> Tensor<Scalar> {
    let rectified = max(0, x)
    return rectified - 0.5
}

In [44]:
var w1 = Tensor<Float>(kaiming: [m, nh])
var t1 = myRelu(linear(xValidNormal, w1, b1))
print(t1.mean(), t1.stddev())

0.009324098 0.7946501


In [45]:
/// Two-layer network: `linear` -> `relu` -> `linear`.
///
/// - Parameters:
///   - x: Input tensor.
///   - wt1: Weights of the first linear layer.
///   - bias1: Bias of the first linear layer.
///   - wt2: Weights of the second linear layer.
///   - bias2: Bias of the second linear layer.
/// - Returns: Output of the second linear layer.
func myModel<Scalar: TensorFlowFloatingPoint>(_ x: Tensor<Scalar>,
             _ wt1: Tensor<Scalar>,
             _ bias1: Tensor<Scalar>,
             _ wt2: Tensor<Scalar>,
             _ bias2: Tensor<Scalar>) -> Tensor<Scalar> {
    let hidden: Tensor<Scalar> = relu(linear(x, wt1, bias1))
    return linear(hidden, wt2, bias2)
}

In [46]:
%%timeit 100
var _ = myModel(xValidNormal, w1, b1, w2, b2)

Max: 404.592 µs
Min: 302.016 µs
Mean: 360.37352000000004 µs
Std Dev: 21.900115580736102 µs


### Loss function: MSE

In [47]:
/// Mean squared error between `pred` (with its last axis squeezed away) and `target`.
func mse<Scalar: TensorFlowFloatingPoint>(_ pred: Tensor<Scalar>, _ target: Tensor<Scalar>)->Tensor<Scalar>{
    let residual = pred.squeezingShape(at: -1) - target
    return residual.squared().mean()
}

/// Custom derivative registered for `mse`.
///
/// Returns the loss value and a pullback. The gradient with respect to `pred`
/// is `2 * (pred - target) / pred.shape[0]`, with the last axis restored via
/// `expandingShape(at: -1)`.
///
/// NOTE(review): the pullback ignores its upstream gradient `v` when producing
/// `dPred` (no multiplication by `v`), and it returns `v` itself as the
/// gradient with respect to `target`. The results are therefore independent of
/// the seed passed in. A chain-rule-correct pullback would presumably scale
/// `dPred` by `v` and return its negation (squeezed) for `target`; confirm the
/// callers' seed values before changing this.
@differentiating(mse)
func mseDerivative<Scalar: TensorFlowFloatingPoint>(_ pred: Tensor<Scalar>, _ target: Tensor<Scalar>) -> (value: Tensor<Scalar>, pullback: (Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>)){
    let out: Tensor<Scalar> = mse(pred, target)
    // Divides by the size of the first axis of `pred`.
    let dPred: Tensor<Scalar> = 2.0 * (pred.squeezingShape(at: -1) - target).expandingShape(at: -1) / Tensor<Scalar>(pred.shapeTensor[0])
    return (value: out, pullback: {v in (dPred, v)})
}

In [48]:
var preds = myModel(xTrainNormal, w1, b1, w2, b2)

In [49]:
mse(preds, yTrain)

25.002272


### Gradients and backward pass

In [94]:
/// Reference-type box pairing a tensor with a gradient slot of the same shape.
public class TensorWithGrad<Scalar: TensorFlowNumeric>{
    /// The wrapped value.
    var tensor: Tensor<Scalar>
    /// Gradient storage; starts out as zeros shaped like `tensor`.
    var g: Tensor<Scalar>

    /// Wraps `x` and zero-initialises its gradient.
    init (_ x: Tensor<Scalar>){
        self.tensor = x
        self.g = Tensor<Scalar>(zeros: x.shape)
    }
}

In [95]:
/// Affine layer on boxed tensors: wraps `matmul(x, w) + b` in a new `TensorWithGrad`.
func myLin<Scalar: Numeric>(_ x: TensorWithGrad<Scalar>, _ w: TensorWithGrad<Scalar>, _ b: TensorWithGrad<Scalar>) -> TensorWithGrad<Scalar>{
    let projected = matmul(x.tensor, w.tensor)
    return TensorWithGrad(projected + b.tensor)
}

In [96]:
/// ReLU on a boxed tensor: wraps the element-wise maximum of 0 and `x.tensor`
/// in a new `TensorWithGrad`.
func myRelu<Scalar: Numeric & Comparable>(_ x: TensorWithGrad<Scalar>) -> TensorWithGrad<Scalar> {
    let rectified = max(0, x.tensor)
    return TensorWithGrad(rectified)
}

In [97]:
/// Mean squared error between the boxed prediction (last axis squeezed away)
/// and `target`.
func myMse(_ pred: TensorWithGrad<Float>, _ target: Tensor<Float>)->Tensor<Float>{
    let residual = pred.tensor.squeezingShape(at: -1) - target
    return residual.squared().mean()
}

Grads

In [98]:
/// Stores the MSE gradient in `inp.g`:
/// `2 * (inp - target) / (number of elements in inp)`, with the last axis restored.
func mseGrad<Scalar: Numeric & Comparable>(_ inp: TensorWithGrad<Scalar>, _ target: TensorWithGrad<Scalar>){
    let residual = inp.tensor.squeezingShape(at: -1) - target.tensor
    let elementCount = Tensor<Scalar>(inp.tensor.scalarCountTensor)
    inp.g = 2 * residual.expandingShape(at: -1) / elementCount
}

In [99]:
/// Stores the ReLU gradient in `inp.g`: `out.g` wherever `inp.tensor` is
/// positive, zero elsewhere.
func reluGrad<Scalar: Numeric & Comparable>(_ inp: TensorWithGrad<Scalar>, _ out: TensorWithGrad<Scalar>){
    let positiveMask = Tensor<Scalar>(inp.tensor .> 0)
    inp.g = positiveMask * out.g
}

In [100]:
/// Backward pass of the affine layer: from `out.g`, fills in the gradients of
/// the input (`inp.g`), the weights (`w.g`) and the bias (`b.g`).
func linGrad<Scalar: Numeric & Comparable>(_ inp: TensorWithGrad<Scalar>,
                                           _ out: TensorWithGrad<Scalar>,
                                           _ w: TensorWithGrad<Scalar>,
                                           _ b: TensorWithGrad<Scalar>){
    let weightsTransposed = w.tensor.transposed()
    inp.g = matmul(out.g, weightsTransposed)

    let inputTransposed = inp.tensor.transposed()
    w.g = matmul(inputTransposed, out.g)

    // The bias gradient sums the output gradient over axis 0.
    b.g = out.g.sum(squeezingAxes: 0)
}

In [101]:
// Box the training input and the current parameters so the manual backward
// pass below can store a gradient next to each of them.
var inpt = TensorWithGrad(xTrainNormal)
var w1t = TensorWithGrad(w1)
var b1t = TensorWithGrad(b1)
var w2t = TensorWithGrad(w2)
var b2t = TensorWithGrad(b2)

In [102]:
/// Runs one manual forward and backward pass through the two-layer network,
/// using the boxed global parameters `w1t`, `b1t`, `w2t` and `b2t`.
///
/// Side effect: the `g` fields of `inp` and of the four parameter boxes are
/// overwritten with the freshly computed gradients.
///
/// - Returns: The MSE loss of the forward pass.
func forwardBackward(_ inp: TensorWithGrad<Float>, _ target: Tensor<Float>) -> Tensor<Float> {
    // Forward: linear -> relu -> linear -> mse.
    let hiddenPre = myLin(inp, w1t, b1t)
    let hiddenAct = myRelu(hiddenPre)
    let prediction = myLin(hiddenAct, w2t, b2t)
    let lossValue = myMse(prediction, target)

    // Backward: walk the same chain in reverse order.
    let boxedTarget = TensorWithGrad(target)
    mseGrad(prediction, boxedTarget)
    linGrad(hiddenAct, prediction, w2t, b2t)
    reluGrad(hiddenPre, hiddenAct)
    linGrad(inp, hiddenPre, w1t, b1t)

    return lossValue
}

In [103]:
var loss = forwardBackward(inpt, yTrain)

In [107]:
loss

25.002272


And forward and backward a different way

In [113]:
/// Runs one forward and backward pass through the two-layer network by
/// chaining the pullbacks returned from `valueWithPullback`.
///
/// - Parameters:
///   - x: Input tensor.
///   - y: Target tensor.
///   - wt1: Weights of the first linear layer.
///   - bias1: Bias of the first linear layer.
///   - wt2: Weights of the second linear layer.
///   - bias2: Bias of the second linear layer.
/// - Returns: The loss followed by the gradients with respect to
///   `wt1`, `bias1`, `wt2` and `bias2`, in that order.
func forwardBackward<Scalar: TensorFlowFloatingPoint>(
       _ x: Tensor<Scalar>, _ y: Tensor<Scalar>,
       _ wt1: Tensor<Scalar>, _ bias1: Tensor<Scalar>,
       _ wt2: Tensor<Scalar>, _ bias2: Tensor<Scalar>
) -> (Tensor<Scalar>, Tensor<Scalar>, Tensor<Scalar>, Tensor<Scalar>, Tensor<Scalar>) {
    // Forward pass, keeping each stage's pullback for the backward pass.
    let (l1, l1Pb) = valueWithPullback(at: x, wt1, bias1, in: linear)
    let (l2, l2Pb) = valueWithPullback(at: l1, in: relu)
    let (preds, l3Pb) = valueWithPullback(at: l2, wt2, bias2, in: linear)
    let (loss, msePb) = valueWithPullback(at: preds, y, in: mse)

    // Backward pass. Backpropagation starts from dLoss/dLoss = 1; a zero seed
    // would wipe out every gradient under a chain-rule-respecting pullback.
    // The `mse` derivative registered in this file does not use its seed, so
    // the results are the same as with the previous seed.
    let (dPred, _) = msePb(Tensor<Scalar>(1))
    let (dL2, dWt2, dBias2) = l3Pb(dPred)
    let dL1 = l2Pb(dL2)
    // The gradient with respect to the input is not needed by the caller.
    let (_, dWt1, dBias1) = l1Pb(dL1)

    return (loss, dWt1, dBias1, dWt2, dBias2)
}

In [114]:
var (loss2, dWt1, dBias1, dWt2, dBias2) = forwardBackward(xTrainNormal, yTrain, w1, b1, w2, b2)

In [115]:
loss2

25.002272


Let's make sure they agree

In [129]:
loss2 == loss

true


In [122]:
dWt1 == w1t.g

true


In [123]:
dBias1 == b1t.g

true


In [127]:
dWt2 == w2t.g

true


In [128]:
dBias2 == b2t.g

true
