In [None]:
//default_exp nn

# nn

> A few modules that can be used to build neural nets.

In [1]:
/**
Imports we need in nn.module.js
*/
import {exp,shape,transpose,dotProduct,randn,zeros,argmax,mean,round} from './src/util.module.js';
import {matrixSum1d,matrixSum2d,matrixSubtract1d,matrixSubtract2d,matrixMultiply1d,matrixMultiply2d} from './src/util.module.js';
import {head,tail,parseCsv,IRIS_CLASS_MAP,IrisRowHandler,shuffle,split,batches} from './src/data.module.js';

In [2]:
// Imports we need for testing
import {testEq} from './src/testutil.module.js'

In [3]:
/**
Returns the fraction of rows in yPred2d whose argmax equals the target class.
yTrue can be either 2d (one-hot encoded targets, compared via argmax) or 1d (array of class IDs).
Throws a string when the row counts differ or, for 2d targets, when the column counts differ.
*/
function accuracy(yPred2d,yTrue) {
    const predDims=shape(yPred2d);
    const trueDims=shape(yTrue);
    const rowCount=predDims[0];
    if (rowCount != trueDims[0]) {
        throw `Expected yPred2d.length ${rowCount} to equal yTrue.length ${trueDims[0]}`;
    }
    // 2d targets are one-hot rows, 1d targets are already class IDs
    const oneHotTargets=(trueDims.length == 2);
    if (oneHotTargets && predDims[1] != trueDims[1]) {
        throw `Expected shape(yPred2d)[1] ${predDims[1]} to equal shape(yTrue)[1] ${trueDims[1]}`;
    }
    const hits=yPred2d.reduce((count,predRow,rowIndex) => {
        const predicted=argmax(predRow);
        const expected=oneHotTargets ? argmax(yTrue[rowIndex]) : yTrue[rowIndex];
        return (predicted == expected) ? count+1 : count;
    }, 0);
    return hits/rowCount;
}

In [4]:
// Every row's argmax matches its one-hot target, so accuracy is 1.
let yPred=[[0,.1],[0,.9],[0,.33],[0,1],[0,1],[0,1],[0,1],[0,1],[1,0]];
let yTrue=[[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[1,0]];
testEq(1.0, accuracy(yPred,yTrue));
// Make class 0 win in the first row only: 8 of 9 rows are still correct.
yPred[0][0]=.8;
testEq(8/9, accuracy(yPred,yTrue));
// Make class 0 win in every row; only the last target is class 0.
yPred.forEach(a => { a[0]=1.1; }); // accuracy doesn't care if we're not within 0 and 1
testEq(1/9, accuracy(yPred,yTrue));
// test with class IDs
testEq(1/9, accuracy(yPred,[1,1,1,1,1,1,1,1,0]));

In [5]:
/**
Mean squared error loss.
forward keeps the element-wise error (yPred2d - yTrue2d) in this.error and returns mean() of the squared errors.
backward returns (and stores in this.grad) the error scaled by 2 / number of rows.
*/
class MSE {
    forward(yPred2d,yTrue2d) {
        const error=matrixSubtract2d(yPred2d,yTrue2d);
        this.error=error; // cached for backward
        const squared=error.map(errorRow => errorRow.map(e => e**2));
        return mean(squared);
    }
    backward() {
        const scale=2/this.error.length;
        this.grad=matrixMultiply2d(this.error, scale);
        return this.grad;
    }
}

In [6]:
// Regression check for MSE.forward: predictions (first 3x4 matrix) vs targets (second 3x4 matrix).
let mse=new MSE();
let mseValue=mse.forward(
    [[-0.1684, -1.0158, -1.3667,  1.4327],
    [ 0.0245, -0.6284, -2.5182,  2.2007],
    [-1.8774, -0.0352, -0.5946,  0.4272]],
    [[-0.3516,  0.5787,  0.8858,  0.9198],
    [ 0.1892, -0.6473,  2.1278,  0.1345],
    [ 2.2919, -0.9939, -0.3137, -0.4314]]);
// compare to 3 decimal places
testEq(4.409, Math.round(mseValue*1000)/1000);

## cross entropy: negative log likelihood of log softmax

The following is taken from https://github.com/fastai/course-v3/blob/master/nbs/dl2/03_minibatch_training.ipynb
```
def logsumexp(x):
    m = x.max(-1)[0]
    return m + (x-m[:,None]).exp().sum(-1).log()

def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)

def nll(input, target): return -input[range(target.shape[0]), target].mean()
```

In [7]:
/**
Takes a 2d array and returns a 1d array holding, for each row, log(sum(exp(row))).
Each row's max is subtracted before exp and added back after the log (same approach as the python above).
*/
function logsumexp(x) {
    const rowMax = x.map(row => Math.max(...row));
    const logOfShiftedSums = x.map((row,rowIndex) => {
        const shiftedExp = row.map(e => exp(e-rowMax[rowIndex])); // (x-m[:,None]).exp()
        const total = shiftedExp.reduce((acc,e) => acc+e);         // .sum(-1)
        return Math.log(total);                                    // .log()
    });
    return matrixSum1d(rowMax, logOfShiftedSums);                  // m + ...
}

In [8]:
// 3x2 fixture; later cells reuse testData for the log_softmax, nll and CrossEntropyLoss checks.
let testData=[[1.6392130817141863, 0.12928212984246149],
              [0.000843200027605633, -0.12680858189363003],
              [-0.9898354893794594, -1.5028466126461082]]
// one logsumexp value per row, compared after rounding to 4 decimals
testEq([1.8388, 0.6322, -0.5207], round(logsumexp(testData),4));

In [9]:
/**
Takes a 2d array and returns a 2d array of the same layout where each element has
its row's logsumexp subtracted from it (i.e. the log softmax of each row).
*/
function log_softmax(x) {
    const rowLogSums = logsumexp(x);
    return x.map((row,rowIndex) => {
        const shift = rowLogSums[rowIndex];
        return row.map(value => value-shift);
    });
}

In [10]:
// log_softmax of the testData fixture: each row is the input row minus that row's logsumexp.
testEq([[-0.1996089581238054, -1.7095399099955302],
        [-0.6313567803288485, -0.7590085622500842],
        [-0.4691846265662769, -0.9821957498329257]], log_softmax(testData));

In [11]:
/**
Negative log likelihood.
Takes a 2d input (log softmax predictions) and a 1d array of target class IDs; picks
input[i][target[i]] from every row and returns the negated mean() of the picked values.
*/
function nll(input, target) {
    const picked = [];
    for (let i=0; i<input.length; i++) {
        picked.push(input[i][target[i]]); // value predicted for the row's target class
    }
    return -mean(picked);
}

In [12]:
// nll only indexes and averages, so the raw fixture works here: with targets [0,0,0] it is -(mean of column 0).
testEq(-0.2167, round(nll(testData,[0,0,0]),4));

`CrossEntropyLoss` uses an approach borrowed from https://beckernick.github.io/logistic-regression-from-scratch/

In [13]:
/**
Cross entropy with softmax.
yTrue1d is an array of target class IDs - not a 2d array of 1 hot encoded targets.

forward(yPred2d,yTrue1d): yPred2d holds raw scores (one row per sample); returns the mean of
  -log(softmax(row)[target]) over the rows. The softmax of every row is cached in this.yPred2d.
backward(): returns (and stores in this.grad) softmax - one-hot(target) for every row.
  It is not divided by the number of rows.
*/
class CrossEntropyLoss {
    /** Softmax of a 1d array; the max is subtracted before exp for numerical stability. */
    softmax1d(a) {
        const maxValue=Math.max(...a);
        const temp=a.map(e => exp(e-maxValue));
        const sum=temp.reduce((acc,e) => acc+e);
        return temp.map(e => e/sum);
    }

    forward(yPred2d,yTrue1d) {
        this.yPred2d=yPred2d.map(yPred1d => this.softmax1d(yPred1d)); // probabilities, needed by backward
        this.yTrue1d=yTrue1d;
        // Work out log(softmax) straight from the scores: (score - max) - log(sum(exp(score - max))).
        // The sum always includes exp(0)=1 for the max element, so we never take log(0) - which is
        // what Math.log(probability) hits when a probability underflows to 0.
        const logProbs=yPred2d.map((yPred1d,i) => {
            const maxValue=Math.max(...yPred1d);
            const sumExp=yPred1d.reduce((acc,e) => acc+exp(e-maxValue), 0);
            return yPred1d[yTrue1d[i]]-maxValue-Math.log(sumExp);
        });
        return -logProbs.reduce((a,b) => a+b) / logProbs.length;
    }

    backward() {
        const yTrue1d=this.yTrue1d;
        this.grad=this.yPred2d.map(yPred1d => [...yPred1d]); // copy preds so the cached softmax is untouched
        this.grad.forEach((yPred1d,i) => { yPred1d[yTrue1d[i]]-=1; });
        return this.grad;
    }
}

In [15]:
// show that both ways of calculating cross entropy loss give the same values
let lossFn = new CrossEntropyLoss()
testEq(round(nll(log_softmax(testData),[0,0,0]),4), round(lossFn.forward(testData,[0,0,0]),4));

In [16]:
/**
Binary cross entropy over 2d predictions and 2d targets of the same layout.
A target counts as positive when it == 1, anything else is treated as negative.

constructor(eps=1e-12): predictions are clamped to [eps, 1-eps] before taking logs / reciprocals,
  so a prediction of exactly 0 or 1 gives a large finite loss/gradient rather than Infinity
  (which turns into NaN once multiplied by a zero gradient further down the chain).
forward(yPred2d,yTrue2d): returns the mean over rows of each row's mean loss; caches both arguments.
backward(): returns (and stores in this.grad) d(loss)/d(prediction) for every element;
  it is not divided by the number of elements.
*/
class BinaryCrossEntropyLoss {
    constructor(eps=1e-12) {
        this.eps=eps;
    }
    /** Keeps a single prediction inside [eps, 1-eps]. */
    _clamp(yPred) {
        return Math.min(Math.max(yPred,this.eps),1-this.eps);
    }
    _forward1d(yPred1d,yTrue1d) {
        const temp=yPred1d.map((yPred,i) => {
            const p=this._clamp(yPred);
            return Math.log((yTrue1d[i]==1.) ? p : 1-p);
        });
        return -temp.reduce((a,b) => a+b) / temp.length;
    }
    forward(yPred2d,yTrue2d) {
        this.yPred2d=yPred2d;
        this.yTrue2d=yTrue2d;
        const lossValue1d=yPred2d.map((yPred1d,i) => this._forward1d(yPred1d,yTrue2d[i]));
        return lossValue1d.reduce((a,b) => a+b) / lossValue1d.length;
    }
    _backward1d(yPred1d,yTrue1d) {
        return yPred1d.map((yPred,i) => {
            const p=this._clamp(yPred);
            return (yTrue1d[i]==1.) ? -1/p : 1/(1-p);
        });
    }
    backward() {
        const yTrue2d=this.yTrue2d;
        this.grad=this.yPred2d.map((yPred1d,i) => this._backward1d(yPred1d,yTrue2d[i]));
        return this.grad;
    }
}

In [17]:
/**
Element-wise sigmoid activation for 2d input.
forward caches its output in this.results; backward uses the cached output to
compute the sigmoid gradient and chains it with the incoming gradients.
*/
class Sigmoid {
    forward(x2d) {
        const squash = (x) => 1./(1.+exp(-x));
        this.results=x2d.map(row => row.map(x => squash(x)));
        return this.results;
    }
    backward(gradients) {
        const grad=[];
        this.results.forEach((resultRow,i) => {
            // s * (1.-s) is the sigmoid gradient; multiply by the gradient passed in for the same element
            grad.push(resultRow.map((s,j) => s * (1.-s) * gradients[i][j]));
        });
        this.grad=grad;
        return this.grad;
    }
}

In [18]:
/**
Rectified linear unit for 2d input.
forward returns max(0,x) per element and records, in this.gradMask, a 1 wherever x>0 (0 elsewhere).
backward passes the incoming gradient and the mask to matrixMultiply2d.
*/
class ReLU {
    forward(x2d) {
        const mask=zeros(...shape(x2d));
        this.gradMask=mask;
        const activated=[];
        for (let r=0; r<x2d.length; r++) {
            const inputRow=x2d[r];
            const outputRow=[];
            for (let c=0; c<inputRow.length; c++) {
                const value=inputRow[c];
                if (value>0) {
                    mask[r][c]=1; // gradient only flows where the input was positive
                }
                outputRow.push(Math.max(0,value));
            }
            activated.push(outputRow);
        }
        return activated;
    }
    backward(gradient) {
        return matrixMultiply2d(this.gradMask,gradient);
    }
}

In [19]:
// Feeding the same matrix to forward and backward should give equal results:
// forward keeps positive values and zeroes the rest, backward multiplies by a mask that is 1 only for positive inputs.
// NOTE(review): zeroed entries may come back as -0 from backward (0 * negative) - presumably testEq treats 0 and -0 as equal; confirm.
let relu = new ReLU();
let data = [
  [ -0.3132450550822199, 0.06746248970796562, 0.7502210053477679 ],
  [ 0.32586239499711434, 0.276573231917191, 0.4718188033994297 ],
  [ 0.3375259522729109, -1.4738907605515226, -0.11109898767917284 ],
  [ -0.6095143988686595, 1.094470501593892, -0.4982351760328258 ],
  [ 0.28664244098736347, -0.35879217465991975, -0.754257906608068 ]
];
testEq(relu.forward(data),relu.backward(data));

In [20]:
/**
Fully connected layer: forward(x) = x . weights + bias.

constructor(inputDim,numHidden=1,bias=true): weights is randn(inputDim,numHidden) scaled by
  sqrt(2/inputDim) (Kaiming init) and bias starts as zeros(numHidden). When bias is false the
  bias is still added in forward but never changed by update.
backward(gradient): stores weightsGradient, biasGradient and xGradient; returns xGradient.
update(lr): gradient descent step using the gradients stored by the last backward call.
*/
class Linear {
    constructor(inputDim,numHidden=1,bias=true) {
        this.inputDim=inputDim;
        this.numHidden=numHidden;
        // Kaiming Init
        const kaimingScale=Math.sqrt(2.0/inputDim);
        this.weights=matrixMultiply2d(randn(inputDim,numHidden), kaimingScale);
        this.bias=zeros(numHidden);
        this.updateBias=bias;
    }
    forward(x) {
        this.x=x; // shape(bs,inputDim) - kept for backward and update
        const weighted=dotProduct(x,this.weights);
        return matrixSum2d(weighted, this.bias);
    }
    backward(gradient) { // gradient shape(bs,numHidden)
        // weightsGradient/biasGradient need to be the same shape as weights/bias
        const xTransposed=transpose(this.x);
        this.weightsGradient=dotProduct(xTransposed, gradient);
        // sum each column of gradient, i.e. gradient.sum(axis=0)
        const sumColumn=(col) => col.reduce((acc,g) => acc+g);
        this.biasGradient=transpose(gradient).map(col => sumColumn(col));
        const weightsTransposed=transpose(this.weights);
        this.xGradient=dotProduct(gradient,weightsTransposed);
        return this.xGradient;
    }
    update(lr) {
        // gradient calculations in backward don't account for batch size, so we do it here
        // TODO: change gradient calc to account for batch size - all XxxLoss classes
        const perRowLr=lr/this.x.length;
        const weightsStep=matrixMultiply2d(this.weightsGradient,perRowLr);
        this.weights=matrixSubtract2d(this.weights,weightsStep);
        if (!this.updateBias) {
            return;
        }
        const biasStep=matrixMultiply1d(this.biasGradient,perRowLr);
        this.bias=matrixSubtract1d(this.bias,biasStep);
    }
}

In [21]:
/**
Training loop helper.

constructor(model, lossFn, data, metrics=[accuracy]):
  model   - array of layers; each has forward(x) and backward(gradients), and optionally update(lr).
  lossFn  - object with forward(preds,targets) and backward() that leaves its gradient in lossFn.grad.
  data    - shuffled, then split; the result is read as [[xTrain,xValid],[yTrain,yValid]].
  metrics - functions called as metric(preds,yValid) during validation.
*/
class Learner {
    constructor(model, lossFn, data, metrics=[accuracy]) {
        this.model=model;
        this.lossFn=lossFn;
        this.metrics=metrics;
        const parts=split(shuffle(data));
        const xParts=parts[0];
        const yParts=parts[1];
        this.xTrain=xParts[0];
        this.xValid=xParts[1];
        this.yTrain=yParts[0];
        this.yValid=yParts[1];
    }
    /** Runs x through every layer, first to last, and returns the final output. */
    forward(x) {
        return this.model.reduce((activations,layer) => layer.forward(activations), x);
    }
    /** Runs gradients back through every layer, last to first, and returns the final gradient. */
    backward(gradients) {
        return this.model.reduceRight((grads,layer) => layer.backward(grads), gradients);
    }
    /** Calls update(lr) on every layer that has an update function. */
    step(lr) {
        for (const layer of this.model) {
            if (typeof layer.update=='function') {
                layer.update(lr);
            }
        }
    }
    /** Logs the loss and metric values for the validation set. */
    validate(epoch) {
        const preds=this.forward(this.xValid);
        const lossValue=this.lossFn.forward(preds,this.yValid);
        const metricValues=[];
        for (const metric of this.metrics) {
            metricValues.push(metric(preds,this.yValid));
        }
        console.log('epoch',epoch,'valid loss',lossValue,'metrics',metricValues);
    }
    /**
    Validates once before training (logged as epoch -1), then for each epoch trains on
    every batch and validates again.
    NOTE(review): bs is accepted but never handed to batches(), so the batch size is whatever
    batches() uses by default - confirm against batches() in data.module.js.
    */
    fit(epochs, lr=0.1, bs=64) {
        this.validate(-1);
        let epoch=0;
        while (epoch<epochs) {
            batches([this.xTrain,this.yTrain]).forEach(([xb,yb]) => {
                // forward pass through the model and the loss (the loss caches what backward needs)
                this.lossFn.forward(this.forward(xb),yb);
                this.lossFn.backward();
                this.backward(this.lossFn.grad);
                this.step(lr);
            });
            this.validate(epoch);
            epoch++;
        }
    }
    /**
    Returns one row per prediction: [prediction, label of prediction] plus, when y is
    given, the label of the matching target. yToLabelFn defaults to the identity.
    */
    predict(x,y,yToLabelFn=(a=>a)) {
        const hasTargets=(y!=null);
        return this.forward(x).map((pred,rowIndex) => {
            const labelled=[pred,yToLabelFn(pred)];
            return hasTargets ? [...labelled,yToLabelFn(y[rowIndex])] : labelled;
        });
    }
}

## Train a linear model to classify iris flowers

Note: we use `BinaryCrossEntropyLoss` here just as an example. README.md and index.ipynb show how to train with `CrossEntropyLoss`.

In [22]:
// Read the raw iris CSV and parse it; `.result` of the parse is the data handed to the Learner.
let stringData=require('fs').readFileSync('data/iris.data').toString();
let data=parseCsv(stringData, new IrisRowHandler()).result;
let lossFn=new BinaryCrossEntropyLoss();
// One linear layer (4 inputs -> 3 outputs) followed by a sigmoid.
let model=[new Linear(4,3), new Sigmoid()];
// Learner shuffles and splits data into train/valid sets itself.
let learn=new Learner(model, lossFn, data);
// 25 epochs with the default learning rate; validation loss and metrics are logged per epoch.
learn.fit(25);

epoch -1 valid loss 0.7210415659408119 metrics [ 0.3333333333333333 ]
epoch 0 valid loss 0.681603308853917 metrics [ 0.43333333333333335 ]
epoch 1 valid loss 0.647636678668952 metrics [ 0.43333333333333335 ]
epoch 2 valid loss 0.6187155608522726 metrics [ 0.5666666666666667 ]
epoch 3 valid loss 0.5937510902428557 metrics [ 0.5666666666666667 ]
epoch 4 valid loss 0.5725834754551206 metrics [ 0.5666666666666667 ]
epoch 5 valid loss 0.5547588942571242 metrics [ 0.5666666666666667 ]
epoch 6 valid loss 0.5392094565081945 metrics [ 0.5666666666666667 ]
epoch 7 valid loss 0.5257326467000477 metrics [ 0.6333333333333333 ]
epoch 8 valid loss 0.5138858140493918 metrics [ 0.6666666666666666 ]
epoch 9 valid loss 0.5031579255606787 metrics [ 0.6666666666666666 ]
epoch 10 valid loss 0.49356252138201095 metrics [ 0.6666666666666666 ]
epoch 11 valid loss 0.4847578209911884 metrics [ 0.6666666666666666 ]
epoch 12 valid loss 0.4767924295313482 metrics [ 0.6666666666666666 ]
epoch 13 valid loss 0.4693622

## Train a neural net to classify iris flowers

In [23]:
// Two linear layers with a ReLU between them: 4 inputs -> 50 hidden -> 3 outputs, then a sigmoid.
// lossFn and data are the ones created in the linear model cell above.
let model=[new Linear(4,50), new ReLU(), new Linear(50,3), new Sigmoid()];
let learn=new Learner(model, lossFn, data);
learn.fit(25);

epoch -1 valid loss 0.9544850777153755 metrics [ 0.3333333333333333 ]
epoch 0 valid loss 0.4772857260011183 metrics [ 0.6666666666666666 ]
epoch 1 valid loss 0.3785130032647902 metrics [ 0.7 ]
epoch 2 valid loss 0.33558066956184474 metrics [ 0.7666666666666667 ]
epoch 3 valid loss 0.31139052989210747 metrics [ 0.7666666666666667 ]
epoch 4 valid loss 0.2937689742464834 metrics [ 0.8 ]
epoch 5 valid loss 0.2806396850531637 metrics [ 0.8 ]
epoch 6 valid loss 0.27192462472494827 metrics [ 0.8 ]
epoch 7 valid loss 0.26450894052831625 metrics [ 0.8 ]
epoch 8 valid loss 0.25786153151757457 metrics [ 0.8 ]
epoch 9 valid loss 0.2520059933499623 metrics [ 0.8 ]
epoch 10 valid loss 0.24484250251669143 metrics [ 0.8 ]
epoch 11 valid loss 0.24043759144420304 metrics [ 0.8 ]
epoch 12 valid loss 0.23505814502789057 metrics [ 0.8 ]
epoch 13 valid loss 0.23142922230072563 metrics [ 0.8 ]
epoch 14 valid loss 0.22798985768489013 metrics [ 0.8 ]
epoch 15 valid loss 0.224103129512089 metrics [ 0.8 ]
epoch 

### Look at some predictions 

We use the lambda ```(y=>`${argmax(y)}: ${IRIS_CLASS_MAP[argmax(y)]}`)``` to convert predictions like `[0.000, 0.183, 0.843]` to readable labels.

In [24]:
// head(learn.predict(learn.xValid, learn.yValid)); run this to see "raw" targets
// Each row returned by predict is [prediction, label of prediction, label of target].
head(learn.predict(learn.xValid, learn.yValid, (y=>`${argmax(y)}: ${IRIS_CLASS_MAP[argmax(y)]}`)));

0 [
  [ 0.000003083883493382887, 0.03980108307300796, 0.9824464784167263 ],
  '2: Iris-virginica',
  '2: Iris-virginica'
]
1 [
  [ 0.005712331123562325, 0.944045080024074, 0.09338924667776642 ],
  '1: Iris-versicolor',
  '1: Iris-versicolor'
]
2 [
  [ 0.015015240144698608, 0.25511576053535273, 0.7727902143792333 ],
  '2: Iris-virginica',
  '2: Iris-virginica'
]
3 [
  [ 0.004965843084547645, 0.4709298195635195, 0.6636975377120846 ],
  '2: Iris-virginica',
  '1: Iris-versicolor'
]
4 [
  [ 0.00011758612246877341, 0.09335210891096556, 0.9162445951186915 ],
  '2: Iris-virginica',
  '2: Iris-virginica'
]
5 [
  [ 0.0003141267416208043, 0.03246172473372288, 0.7861595658934284 ],
  '2: Iris-virginica',
  '2: Iris-virginica'
]
6 [
  [ 0.01025025257603669, 0.36465488141600955, 0.5899431992417702 ],
  '2: Iris-virginica',
  '1: Iris-versicolor'
]
7 [
  [ 0.9691683493445117, 0.04231205232544927, 0.006490749828945177 ],
  '0: Iris-setosa',
  '0: Iris-setosa'
]
8 [
  [ 0.9920779134719121, 0.015680672

Show how we could train a linear layer without `Learner` - this is not a proper training loop, we just:
- forward pass
- print training loss
- backward pass
- update

In [25]:
// Re-parse the CSV so we train on all rows (no shuffle, no train/valid split).
let data=parseCsv(stringData, new IrisRowHandler()).result;
// data[0] is used as the inputs, data[1] as the targets.
let x=data[0],y=data[1];
console.log('shape(x)',shape(x), 'shape(y)',shape(y));
let loss_fn=new BinaryCrossEntropyLoss()
let sig=new Sigmoid()
let lin=new Linear(4,3);
for (let epoch = 0; epoch < 10; epoch++) {
    // forward pass: linear -> sigmoid -> loss
    let y_pred=sig.forward(lin.forward(x));
    let loss_value=loss_fn.forward(y_pred,y);
    console.log('epoch',epoch,'loss_value',loss_value);
    // backward pass in reverse order; each step reads the grad stored by the previous one
    loss_fn.backward();
    sig.backward(loss_fn.grad);
    lin.backward(sig.grad);
    // gradient descent step with learning rate 0.1
    lin.update(.1);
}

shape(x) [ 150, 4 ] shape(y) [ 150, 3 ]
epoch 0 loss_value 0.7456237404624781
epoch 1 loss_value 0.7083050326952027
epoch 2 loss_value 0.6750960154499097
epoch 3 loss_value 0.6456738843500313
epoch 4 loss_value 0.6196645232383754
epoch 5 loss_value 0.5966752272399144
epoch 6 loss_value 0.5763212925279265
epoch 7 loss_value 0.5582443237905863
epoch 8 loss_value 0.5421225072946342
epoch 9 loss_value 0.5276744683398552


## Can we teach a linear layer to convert one hot encoded integers to their bitwise representations?

In [26]:
// Inputs: the integers 0..9, one-hot encoded (row i has its 1 in column i).
let x=[
    [1,0,0,0,0,0,0,0,0,0],
    [0,1,0,0,0,0,0,0,0,0],
    [0,0,1,0,0,0,0,0,0,0],
    [0,0,0,1,0,0,0,0,0,0],
    [0,0,0,0,1,0,0,0,0,0],
    [0,0,0,0,0,1,0,0,0,0],
    [0,0,0,0,0,0,1,0,0,0],
    [0,0,0,0,0,0,0,1,0,0],
    [0,0,0,0,0,0,0,0,1,0],
    [0,0,0,0,0,0,0,0,0,1]
];
// Targets: row i holds the 4 bits of i, least significant bit first (e.g. 1 -> [1,0,0,0], 8 -> [0,0,0,1]).
let y=[
    [0,0,0,0],
    [1,0,0,0],
    [0,1,0,0],
    [1,1,0,0],
    [0,0,1,0],
    [1,0,1,0],
    [0,1,1,0],
    [1,1,1,0],
    [0,0,0,1],
    [1,0,0,1]
];

`x` is an identity matrix, so ... `x.y` is `y`

In [27]:
// x is the 10x10 identity matrix, so multiplying it by y gives y back.
testEq(y,dotProduct(x,y))

so ... will `y` make the perfect weights (if bias is zero)?

In [28]:
let loss_fn=new BinaryCrossEntropyLoss()
let sig=new Sigmoid()
// third argument false: the bias is never updated, so only the weights are learned
let linearNoBias=new Linear(10,4,false);
let y_pred=null;
for (let epoch = 0; epoch<10; epoch++) {
    y_pred=sig.forward(linearNoBias.forward(x));
    const loss_value=loss_fn.forward(y_pred,y);
    // only log every 10th epoch - with 10 epochs that is just the last one (epoch 9)
    if (epoch%10==9) {
        console.log('epoch',epoch,'loss_value',loss_value);
    }
    loss_fn.backward();
    sig.backward(loss_fn.grad);
    linearNoBias.backward(sig.grad);
    // Linear.update divides lr by the number of rows in x (10 here), so the step size used is 5
    linearNoBias.update(50);
}
// predictions from the last forward pass (made before the final update)
console.log(y_pred)

epoch 9 loss_value 0.018167369338112204
[
  [
    0.018419955736153345,
    0.017912951871688565,
    0.018235536054458837,
    0.017601739536903506
  ],
  [
    0.9816460419899544,
    0.01851251520639118,
    0.01819493931774505,
    0.018482264267305634
  ],
  [
    0.017706703674205408,
    0.9818826400504213,
    0.01756896075146458,
    0.01802703779365143
  ],
  [
    0.9821918265400942,
    0.9815438220201382,
    0.018214569680693155,
    0.017457467681792623
  ],
  [
    0.017758723795838843,
    0.017940734670875647,
    0.982115044156826,
    0.017544494277273073
  ],
  [
    0.9822971831135902,
    0.018141939952034763,
    0.9819953931672954,
    0.018088397873372603
  ],
  [
    0.017744144856964045,
    0.9818603281264469,
    0.9817086094802139,
    0.018077385207505466
  ],
  [
    0.9825125112010706,
    0.9824859780833001,
    0.982205941346248,
    0.018514279512330027
  ],
  [
    0.01850733499318103,
    0.018464845626273582,
    0.01750969179256001,
    0.982286

dump our linear layer to output - so we can look at the learned weights.

In [29]:
// last expression of the cell: the notebook prints the layer, including its learned weights
linearNoBias

Linear {
  inputDim: 10,
  numHidden: 4,
  weights: [
    [
      -4.067828714273833,
      -4.093720688051305,
      -4.077156885603839,
      -4.1100077498905
    ],
    [
      4.071155340777657,
      -4.0631848375597395,
      -4.079223976740948,
      -4.06469982092297
    ],
    [
      -4.104480148029213,
      4.083187997957094,
      -4.111741204562182,
      -4.0878262330286645
    ],
    [
      4.099169964239447,
      4.066008386333582,
      -4.078223824418697,
      -4.117663466620966
    ],
    [
      -4.101753719297217,
      -4.092281524200758,
      4.0951733298404545,
      -4.113037343010202
    ],
    [
      4.104684204422819,
      -4.081930073664264,
      4.088981990930716,
      -4.0846725441970815
    ],
    [
      -4.102516949365545,
      4.082046069590711,
      4.074320999484547,
      -4.085237706586823
    ],
    [
      4.116064822231172,
      4.114654369073136,
      4.099906677791539,
      -4.063096562629115
    ],
    [
      -4.06344407578390

In [None]:
export {accuracy,Sigmoid,MSE,BinaryCrossEntropyLoss,CrossEntropyLoss,ReLU,Linear,Learner}