In [1]:
//default_exp nn

# nn

> A few modules that can be used to build neural nets.

In [2]:
/**
Imports we need in nn.module.js
*/
import {exp,shape,transpose,dotProduct,randn,zeros,argmax,mean} from './src/util.module.js';
import {matrixSum1d,matrixSum2d,matrixSubtract1d,matrixSubtract2d,matrixMultiply1d,matrixMultiply2d} from './src/util.module.js';
import {head,tail,parseCsv,IRIS_CLASS_MAP,IrisRowHandler,shuffle,split,batches} from './src/data.module.js';

In [3]:
// Imports we need for testing
import {testEq} from './src/testutil.module.js'

In [4]:
/**
Fraction of rows whose predicted class matches the target class.
The predicted class of a row is argmax of that row of yPred2d.
yTrue can be either 2d (one-hot encoded targets) or 1d (array of class IDs).
Throws a string when the row counts differ or, for 2d targets, when the column counts differ.
*/
function accuracy(yPred2d,yTrue) {
    const predDims=shape(yPred2d);
    const trueDims=shape(yTrue);
    const rowCount=predDims[0];
    if (rowCount != trueDims[0]) {
        throw `Expected yPred2d.length ${predDims[0]} to equal yTrue.length ${trueDims[0]}`;
    }
    const oneHotTargets=(trueDims.length == 2);
    if (oneHotTargets && predDims[1] != trueDims[1]) {
        throw `Expected shape(yPred2d)[1] ${predDims[1]} to equal shape(yTrue)[1] ${trueDims[1]}`;
    }
    // one-hot targets are reduced to a class ID with argmax, class IDs are used as they are
    const targetClass=oneHotTargets ? (i => argmax(yTrue[i])) : (i => yTrue[i]);
    const hits=yPred2d.reduce(
        (count,predRow,i) => (argmax(predRow) == targetClass(i)) ? count+1 : count,
        0);
    return hits/rowCount;
}

In [5]:
// Nine rows of two-class scores; the first eight targets are class 1, the last is class 0.
let yPred=[[0,0.1],[0,0.9],[0,0.33],[0,1],[0,1],[0,1],[0,1],[0,1],[1,0]];
let yTrue=[[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[1,0]];
// every row's argmax matches its one-hot target
testEq(1, accuracy(yPred,yTrue));
// push the first row's class 0 score above its class 1 score -> 8 of 9 correct
yPred[0][0]=0.8;
testEq(8/9, accuracy(yPred,yTrue));
// make class 0 win on every row - accuracy doesn't care if we're not within 0 and 1
for (const row of yPred) {
    row[0]=1.1;
}
testEq(1/9, accuracy(yPred,yTrue));
// test with class IDs
testEq(1/9, accuracy(yPred,[1,1,1,1,1,1,1,1,0]));

In [6]:
/**
Mean squared error loss.
forward(yPred2d,yTrue2d) stores the element-wise difference in `error` and returns the mean of its squares.
backward() stores and returns `grad`, which is `error` scaled by 2 / (number of rows).
*/
class MSE {
    forward(yPred2d,yTrue2d) {
        const difference=matrixSubtract2d(yPred2d,yTrue2d);
        this.error=difference; // kept for backward
        const squared=difference.map(row => row.map(elem => elem**2));
        return mean(squared);
    }
    backward() {
        // NOTE(review): forward looks like it averages over every element, this scales by row count only - confirm intended
        const scale=2/this.error.length;
        this.grad=matrixMultiply2d(this.error, scale);
        return this.grad;
    }
}

In [7]:
let mse=new MSE();
// first argument: 3x4 predictions, second argument: 3x4 targets
let mseValue=mse.forward(
    [
        [-0.1684, -1.0158, -1.3667,  1.4327],
        [ 0.0245, -0.6284, -2.5182,  2.2007],
        [-1.8774, -0.0352, -0.5946,  0.4272]
    ],
    [
        [-0.3516,  0.5787,  0.8858,  0.9198],
        [ 0.1892, -0.6473,  2.1278,  0.1345],
        [ 2.2919, -0.9939, -0.3137, -0.4314]
    ]
);
// compare after rounding to 3 decimal places
testEq(4.409, Math.round(mseValue*1e3)/1e3);

Cross entropy: negative log likelihood of log softmax
```
def log_softmax(x): return (x.exp()/(x.exp().sum(-1,keepdim=True))).log()
def log_softmax(x): return x - x.exp().sum(-1,keepdim=True).log()

def logsumexp(x):
    m = x.max(-1)[0]
    return m + (x-m[:,None]).exp().sum(-1).log()
def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)

def nll(input, target): return -input[range(target.shape[0]), target].mean()
```

input==log softmax predictions

```
import numpy as np
import torch
import torch.nn as nn
ce_loss_fn=nn.CrossEntropyLoss()
nll_loss_fn=nn.NLLLoss()
input=torch.randn(10,3,requires_grad=True)
input
target=torch.tensor(np.random.choice(3,10))
target
nn.NLLLoss(reduction='none')(input,target), nn.NLLLoss()(input,target)
# input=torch.randn(10,3,requires_grad=True)
input=torch.tensor(np.eye(3)[target],requires_grad=True)
loss_value=nll_loss_fn( input,target)
print(loss_value)
loss_value.backward()
input.grad
tensor(-1., dtype=torch.float64, grad_fn=<NllLossBackward>)
tensor([[ 0.0000, -0.1000,  0.0000],
        [ 0.0000, -0.1000,  0.0000],
        [ 0.0000,  0.0000, -0.1000],
        [ 0.0000, -0.1000,  0.0000],
        [-0.1000,  0.0000,  0.0000],
        [-0.1000,  0.0000,  0.0000],
        [ 0.0000,  0.0000, -0.1000],
        [ 0.0000, -0.1000,  0.0000],
        [ 0.0000,  0.0000, -0.1000],
        [ 0.0000,  0.0000, -0.1000]], dtype=torch.float64)
```

In [8]:
/**
Cross entropy with softmax.
yPred2d holds raw scores, one row per sample - softmax is applied here, not by the caller.
yTrue1d is an array of target class IDs - not a 2d array of one-hot encoded targets.
*/
class CrossEntropyLoss {
    /**
    Softmax of a single row.
    The row max is subtracted before exponentiating so exp cannot overflow.
    */
    softmax1d(a) {
        const maxValue=Math.max(...a);
        const exps=a.map(e => exp(e-maxValue));
        const sum=exps.reduce((acc,e) => acc+e);
        return exps.map(e => e/sum);
    }

    /**
    Log of the softmax of a single row, calculated with the log-sum-exp trick:
    log_softmax(x) = (x - max) - log(sum(exp(x - max))).
    The sum always contains exp(0)=1 for the max element, so its log is finite and we never
    take the log of a probability that underflowed to 0.
    */
    _logSoftmax1d(a) {
        const maxValue=Math.max(...a);
        const shifted=a.map(e => e-maxValue);
        const logSum=Math.log(shifted.map(e => exp(e)).reduce((acc,e) => acc+e));
        return shifted.map(e => e-logSum);
    }

    /**
    Returns the mean negative log likelihood of the target classes.
    Stores the softmax probabilities (`yPred2d`) and the targets (`yTrue1d`) for backward.
    */
    forward(yPred2d,yTrue1d) {
        this.yPred2d=yPred2d.map(yPred1d => this.softmax1d(yPred1d));
        this.yTrue1d=yTrue1d;
        // read the target's log probability from log softmax rather than Math.log(softmax):
        // Math.log(softmax) is -Infinity whenever the target probability underflows to 0
        const logProbs=yPred2d.map((yPred1d,i) => this._logSoftmax1d(yPred1d)[yTrue1d[i]]);
        return -logProbs.reduce((a,b) => a+b) / logProbs.length;
    }

    /**
    Gradient of the loss with respect to the raw scores passed to forward: softmax - one hot target.
    Stores the result in `grad` and returns it.
    The gradient is not divided by the batch size here - Linear.update scales lr by the batch size instead.
    TODO: mean for all loss functions, i.e.
    output_error_signal = (output_probs - training_labels) / output_probs.shape[0]
    */
    backward() {
        const yTrue1d=this.yTrue1d;
        this.grad=this.yPred2d.map(yPred1d => [...yPred1d]); // copy preds
        this.grad.forEach((yPred1d,i) => yPred1d[yTrue1d[i]]-=1);
        return this.grad;
    }
}

In [9]:
// 10 samples x 3 classes of raw scores (softmax is applied inside CrossEntropyLoss.forward)
var input=
        [[ 0.7168,  0.7141, -0.8293],
        [-1.3728,  1.9252,  0.6447],
        [-0.1829, -0.0563,  1.3041],
        [-0.7842, -0.1915,  1.0120],
        [-0.2175, -1.0389, -0.1599],
        [ 0.3811,  0.2495, -1.8275],
        [-0.8893, -0.6678,  0.7714],
        [ 0.9122,  0.0140, -1.6649],
        [-0.9544, -0.3054, -0.2324],
        [-2.4975,  0.0240,  0.8549]];
// one target class ID per sample
var target=[2, 2, 0, 2, 2, 2, 1, 2, 2, 0];
let lossFn = new CrossEntropyLoss()
// last expression of the cell: the loss value is shown as the cell output
lossFn.forward(input,target);

1.9296404163515675


In [10]:
/**
Binary cross entropy.
yPred2d holds probabilities (e.g. the output of Sigmoid), yTrue2d holds targets of the same shape.
A target equal to 1 is treated as the positive class, any other value as the negative class.
Predictions are clamped to [eps, 1-eps] before taking logs or reciprocals, so a saturated
prediction (exactly 0 or 1) gives a large finite loss/gradient instead of Infinity or NaN.
*/
class BinaryCrossEntropyLoss {
    /**
    eps: smallest distance from 0 and 1 that a prediction is allowed to have when calculating
    the loss and its gradient. Predictions inside [eps, 1-eps] are used unchanged.
    */
    constructor(eps=1e-12) {
        this.eps=eps;
    }
    /** Limit a single prediction to [eps, 1-eps]. */
    _clamp(yPred) {
        return Math.min(Math.max(yPred,this.eps),1-this.eps);
    }
    /** Mean negative log likelihood of one row. */
    _forward1d(yPred1d,yTrue1d) {
        const temp=yPred1d.map((yPred,i) => {
            const p=this._clamp(yPred);
            return Math.log((yTrue1d[i]==1.) ? p : 1-p);
        });
        return -temp.reduce((a,b) => a+b) / temp.length;
    }
    /**
    Returns the mean of the per row losses.
    Stores the (unclamped) predictions and the targets for backward.
    */
    forward(yPred2d,yTrue2d) {
        this.yPred2d=yPred2d;
        this.yTrue2d=yTrue2d;
        const lossValue1d=yPred2d.map((yPred1d,i) => this._forward1d(yPred1d,yTrue2d[i]));
        return lossValue1d.reduce((a,b) => a+b) / lossValue1d.length;
    }
    /** Per element gradient for one row: -1/p for positive targets, 1/(1-p) otherwise. */
    _backward1d(yPred1d,yTrue1d) {
        return yPred1d.map((yPred,i) => {
            const p=this._clamp(yPred);
            return (yTrue1d[i]==1.) ? -1/p : 1/(1-p);
        });
    }
    /**
    Stores the gradient with respect to the predictions in `grad` and returns it.
    The gradient is not averaged here - Linear.update scales lr by the batch size instead.
    */
    backward() {
        const yTrue2d=this.yTrue2d;
        this.grad=this.yPred2d.map((yPred1d,i) => this._backward1d(yPred1d,yTrue2d[i]));
        return this.grad;
    }
}

In [11]:
/**
Sigmoid activation: 1/(1+exp(-x)) applied to every element of a 2d array.
forward stores its output in `results`; backward uses those results to calculate the
sigmoid gradient and stores the chained gradient in `grad`.
*/
class Sigmoid {
    forward(x2d) {
        const squash=x => 1./(1.+exp(-x));
        this.results=x2d.map(row => row.map(x => squash(x)));
        return this.results;
    }
    backward(gradients) {
        // local slope of sigmoid is `s * (1.-s)`; multiply by the gradient passed in (chain rule)
        const localSlope=s => s * (1.-s);
        this.grad=this.results.map((row,r) => row.map((s,c) => localSlope(s) * gradients[r][c]));
        return this.grad;
    }
}

In [12]:
/**
ReLU activation: max(0,x) applied to every element of a 2d array.
forward records a mask (`gradMask`) holding 1 where the input was greater than 0 and
whatever zeros() produced everywhere else; backward multiplies that mask with the gradient passed in.
*/
class ReLU {
    forward(x2d) {
        const mask=zeros(...shape(x2d));
        this.gradMask=mask;
        return x2d.map((row,r) => row.map((value,c) => {
            // only strictly positive inputs let gradient through
            if (value>0) {
                mask[r][c]=1;
            }
            return Math.max(0,value);
        }));
    }
    backward(gradient) {
        const masked=matrixMultiply2d(this.gradMask,gradient);
        return masked;
    }
}

In [13]:
// forward zeroes every non-positive value; backward applies the mask recorded by forward to
// whatever gradient it is given. Feeding the same data to both is therefore expected to give
// the same result, which is what the test below checks.
let relu = new ReLU();
let data = [
  [ -0.3132450550822199, 0.06746248970796562, 0.7502210053477679 ],
  [ 0.32586239499711434, 0.276573231917191, 0.4718188033994297 ],
  [ 0.3375259522729109, -1.4738907605515226, -0.11109898767917284 ],
  [ -0.6095143988686595, 1.094470501593892, -0.4982351760328258 ],
  [ 0.28664244098736347, -0.35879217465991975, -0.754257906608068 ]
];
testEq(relu.forward(data),relu.backward(data));

In [14]:
/**
Linear layer: forward(x) is dotProduct(x,weights) plus bias.
inputDim: number of input features.
numHidden: number of outputs.
bias: when false, update() leaves the bias untouched, so it keeps the value zeros() gave it.
*/
class Linear {
    constructor(inputDim,numHidden=1,bias=true) {
        this.inputDim=inputDim;
        this.numHidden=numHidden;
        // Kaiming Init: scale the random weights by sqrt(2/inputDim)
        const kaimingScale=Math.sqrt(2.0/inputDim);
        this.weights=matrixMultiply2d(randn(inputDim,numHidden), kaimingScale);
        this.bias=zeros(numHidden);
        this.updateBias=bias;
    }
    forward(x) {
        this.x=x; // shape(bs,inputDim) - kept for backward and update
        const weighted=dotProduct(x,this.weights);
        return matrixSum2d(weighted, this.bias);
    }
    backward(gradient) { // gradient shape(bs,numHidden)
        // weightsGradient/biasGradient need to be the same shape as weights/bias
        const xTransposed=transpose(this.x);
        this.weightsGradient=dotProduct(xTransposed, gradient);
        // sum the gradient over the batch: one total per output column
        const gradientColumns=transpose(gradient);
        this.biasGradient=gradientColumns.map(column => column.reduce((total,g) => total+g));
        // gradient with respect to the input of this layer, passed on to the previous layer
        this.xGradient=dotProduct(gradient,transpose(this.weights));
        return this.xGradient;
    }
    update(lr) {
        // gradient calculations in backward don't account for batch size, so we do it here
        // TODO: change gradient calc to account for batch size - all XxxLoss classes
        const scaledLr=lr/this.x.length;
        const weightsStep=matrixMultiply2d(this.weightsGradient,scaledLr);
        this.weights=matrixSubtract2d(this.weights,weightsStep);
        if (!this.updateBias) {
            return;
        }
        const biasStep=matrixMultiply1d(this.biasGradient,scaledLr);
        this.bias=matrixSubtract1d(this.bias,biasStep);
    }
}

In [15]:
/**
Trains and evaluates a model.
model: array of layers. Every layer has forward and backward; layers that also have update are trained.
lossFn: object with forward(preds,targets) and backward(), which leaves its gradient in `grad`.
data: passed through shuffle and split to get training and validation sets.
metrics: functions called with (preds,targets) during validation.
*/
class Learner {
    constructor(model, lossFn, data, metrics=[accuracy]) {
        this.model=model;
        this.lossFn=lossFn;
        this.metrics=metrics;
        const [[xTrain,xValid],[yTrain,yValid]]=split(shuffle(data));
        this.xTrain=xTrain;
        this.xValid=xValid;
        this.yTrain=yTrain;
        this.yValid=yValid;
    }
    /** Pass x through every layer, first to last, and return the output of the last layer. */
    forward(x) {
        return this.model.reduce((activations,layer) => layer.forward(activations), x);
    }
    /** Pass gradients through every layer, last to first, and return the gradient of the first layer. */
    backward(gradients) {
        return this.model.reduceRight((grads,layer) => layer.backward(grads), gradients);
    }
    /** Call update(lr) on every layer that has an update function. */
    step(lr) {
        for (const layer of this.model) {
            if (typeof layer.update=='function') {
                layer.update(lr);
            }
        }
    }
    /** Log the loss and metric values of the validation set. */
    validate(epoch) {
        const preds=this.forward(this.xValid);
        const lossValue=this.lossFn.forward(preds,this.yValid);
        const metricValues=this.metrics.map(metric => metric(preds,this.yValid));
        console.log('epoch',epoch,'valid loss',lossValue,'metrics',metricValues);
    }
    /**
    Train for `epochs` epochs, validating before training and after every epoch.
    NOTE(review): `bs` is accepted but never used - batches() is called without it.
    */
    fit(epochs, lr=0.1, bs=64) {
        this.validate(-1); // Note: we use epoch -1 to indicate before training
        for (let epoch=0; epoch<epochs; epoch++) {
            batches([this.xTrain,this.yTrain]).forEach(([xb,yb]) => {
                const preds=this.forward(xb);
                // the loss value itself is not needed, but forward stores what backward needs
                this.lossFn.forward(preds,yb);
                this.lossFn.backward();
                this.backward(this.lossFn.grad);
                this.step(lr);
            });
            this.validate(epoch);
        }
    }
    /**
    Returns one row per sample: [prediction, label of prediction] or, when y is given,
    [prediction, label of prediction, label of target]. Labels are created with yToLabelFn.
    */
    predict(x,y,yToLabelFn=(a=>a)) {
        const hasTargets=(y!=null);
        return this.forward(x).map((pred,rowIndex) => {
            const row=[pred,yToLabelFn(pred)];
            if (hasTargets) {
                row.push(yToLabelFn(y[rowIndex]));
            }
            return row;
        });
    }
}

## Train a linear model to classify iris flowers

Note: we use `BinaryCrossEntropyLoss` here just as an example. README.md and index.ipynb show how to train with `CrossEntropyLoss`.

In [16]:
// Read the iris CSV from disk and parse it with the iris specific row handler
let stringData=require('fs').readFileSync('data/iris.data').toString();
let data=parseCsv(stringData, new IrisRowHandler()).result;
let lossFn=new BinaryCrossEntropyLoss();
// Linear layer with 4 inputs and 3 outputs, followed by a sigmoid
let model=[new Linear(4,3), new Sigmoid()];
// Learner shuffles and splits the data into training and validation sets
let learn=new Learner(model, lossFn, data);
// 25 epochs with the default learning rate
learn.fit(25);

epoch -1 valid loss 1.5204031253227748 metrics [ 0.1 ]
epoch 0 valid loss 1.3091236091890934 metrics [ 0.13333333333333333 ]
epoch 1 valid loss 1.125123067421352 metrics [ 0.23333333333333334 ]
epoch 2 valid loss 0.9699143897371635 metrics [ 0.23333333333333334 ]
epoch 3 valid loss 0.8436669648668303 metrics [ 0.23333333333333334 ]
epoch 4 valid loss 0.744136815804359 metrics [ 0.23333333333333334 ]
epoch 5 valid loss 0.6666801248639437 metrics [ 0.4666666666666667 ]
epoch 6 valid loss 0.6071409766744876 metrics [ 0.5666666666666667 ]
epoch 7 valid loss 0.5605350246403263 metrics [ 0.6 ]
epoch 8 valid loss 0.5236785153952593 metrics [ 0.6666666666666666 ]
epoch 9 valid loss 0.4947260913944664 metrics [ 0.8333333333333334 ]
epoch 10 valid loss 0.47106325376466057 metrics [ 0.8 ]
epoch 11 valid loss 0.451684116797768 metrics [ 0.8 ]
epoch 12 valid loss 0.4356360034389249 metrics [ 0.8 ]
epoch 13 valid loss 0.42192982563261666 metrics [ 0.8 ]
epoch 14 valid loss 0.4102082428217775 metrics

## Train a neural net to classify iris flowers

In [17]:
// One hidden layer with 50 units and ReLU, then 3 outputs squashed by a sigmoid.
// Note: `lossFn` and `data` come from the cell that trained the linear model - run that cell first.
let model=[new Linear(4,50), new ReLU(), new Linear(50,3), new Sigmoid()];
let learn=new Learner(model, lossFn, data);
learn.fit(25);

epoch -1 valid loss 0.9112650310020266 metrics [ 0.06666666666666667 ]
epoch 0 valid loss 0.4691261728229647 metrics [ 0.8 ]
epoch 1 valid loss 0.3580242946436428 metrics [ 0.8333333333333334 ]
epoch 2 valid loss 0.31304584006049124 metrics [ 0.8333333333333334 ]
epoch 3 valid loss 0.28940128310183194 metrics [ 0.8333333333333334 ]
epoch 4 valid loss 0.2722530307901382 metrics [ 0.8666666666666667 ]
epoch 5 valid loss 0.25807572576302173 metrics [ 0.8666666666666667 ]
epoch 6 valid loss 0.2485991630696928 metrics [ 0.8333333333333334 ]
epoch 7 valid loss 0.2413695793641617 metrics [ 0.8333333333333334 ]
epoch 8 valid loss 0.2354008805581669 metrics [ 0.8333333333333334 ]
epoch 9 valid loss 0.23168153001376693 metrics [ 0.8333333333333334 ]
epoch 10 valid loss 0.22632484777798076 metrics [ 0.8333333333333334 ]
epoch 11 valid loss 0.22261109293322623 metrics [ 0.8333333333333334 ]
epoch 12 valid loss 0.22026958594314855 metrics [ 0.8666666666666667 ]
epoch 13 valid loss 0.216103584014487

### Look at some predictions 

We use the lambda ```(y=>`${argmax(y)}: ${IRIS_CLASS_MAP[argmax(y)]}`)``` to convert predictions like `[0.000, 0.183, 0.843]` to readable labels.

In [18]:
// head(learn.predict(learn.xValid, learn.yValid)); run this to see "raw" targets
// Each row printed below is [prediction, label of prediction, label of target].
head(learn.predict(learn.xValid, learn.yValid, (y=>`${argmax(y)}: ${IRIS_CLASS_MAP[argmax(y)]}`)));

0 [
  [ 0.0011209508142939284, 0.868588948674408, 0.18839857907396682 ],
  '1: Iris-versicolor',
  '2: Iris-virginica'
]
1 [
  [ 0.03160294893152106, 0.8619210305769701, 0.0686605377396662 ],
  '1: Iris-versicolor',
  '1: Iris-versicolor'
]
2 [
  [ 0.9563865002933649, 0.05844710206620156, 0.0022523456298663234 ],
  '0: Iris-setosa',
  '0: Iris-setosa'
]
3 [
  [ 0.9514775134519834, 0.07523967446272668, 0.0011493026435126436 ],
  '0: Iris-setosa',
  '0: Iris-setosa'
]
4 [
  [ 0.4078765566556188, 0.747607683074964, 0.00045367027710275777 ],
  '1: Iris-versicolor',
  '0: Iris-setosa'
]
5 [
  [ 0.010986773110423582, 0.11516426466816512, 0.9204752847569533 ],
  '2: Iris-virginica',
  '2: Iris-virginica'
]
6 [
  [ 0.9349790667787307, 0.0895305373210309, 0.004379558107638215 ],
  '0: Iris-setosa',
  '0: Iris-setosa'
]
7 [
  [ 0.000031527650954552385, 0.10930666946114984, 0.920677373630896 ],
  '2: Iris-virginica',
  '2: Iris-virginica'
]
8 [
  [ 0.0036005866175615286, 0.042814835829574065, 0.9

Show how we could train a linear layer without `Learner` - this is not a proper training loop, we just:
- forward pass
- print training loss
- backward pass
- update

In [19]:
// Parse the iris data again; the first element holds the inputs, the second the targets.
let data=parseCsv(stringData, new IrisRowHandler()).result;
let x=data[0];
let y=data[1];
console.log('shape(x)',shape(x), 'shape(y)',shape(y));
let loss_fn=new BinaryCrossEntropyLoss();
let sig=new Sigmoid();
let lin=new Linear(4,3);
for (let epoch = 0; epoch < 10; epoch++) {
    // forward pass over the whole data set (no batches, no validation set)
    const probs=sig.forward(lin.forward(x));
    const loss=loss_fn.forward(probs,y);
    console.log('epoch',epoch,'loss_value',loss);
    // backward pass: every backward returns the gradient that the previous layer needs
    lin.backward(sig.backward(loss_fn.backward()));
    lin.update(0.1);
}

shape(x) [ 150, 4 ] shape(y) [ 150, 3 ]
epoch 0 loss_value 1.0985206688321374
epoch 1 loss_value 1.035577401241871
epoch 2 loss_value 0.9764822829178839
epoch 3 loss_value 0.9213514939129728
epoch 4 loss_value 0.8702623923928795
epoch 5 loss_value 0.823239612585144
epoch 6 loss_value 0.7802451401146258
epoch 7 loss_value 0.7411744189432343
epoch 8 loss_value 0.7058593963417175
epoch 9 loss_value 0.6740779355375611


## Can we teach a linear layer to convert one hot encoded integers to their bitwise representations?

In [20]:
// x: the integers 0 to 9, one hot encoded - row i has a 1 in column i (a 10x10 identity matrix)
let x=Array.from({length:10}, (_,row) =>
    Array.from({length:10}, (_,col) => (row===col) ? 1 : 0));
// y: the same integers as 4 bits, least significant bit first - e.g. 6 is [0,1,1,0]
let y=Array.from({length:10}, (_,value) =>
    [0,1,2,3].map(bit => (value>>bit) & 1));

`x` is an identity matrix, so ... `x.y` is `y`

In [21]:
// multiplying by the identity matrix x must give y back unchanged
testEq(y,dotProduct(x,y))

so ... will `y` make the perfect weights (if bias is zero)?

In [22]:
// Train a bias free linear layer (10 inputs, 4 outputs) followed by a sigmoid on the
// one hot -> bits data defined above.
let loss_fn=new BinaryCrossEntropyLoss()
let sig=new Sigmoid()
// third argument false: update() never changes the bias
let linearNoBias=new Linear(10,4,false);
// declared outside the loop so the last predictions can be printed afterwards
let y_pred=null;
for (let epoch = 0; epoch<10; epoch++) {
    y_pred=sig.forward(linearNoBias.forward(x));
    const loss_value=loss_fn.forward(y_pred,y);
    // only log every 10th epoch - with 10 epochs that is just the last one
    if (epoch%10==9) {
        console.log('epoch',epoch,'loss_value',loss_value);
    }
    loss_fn.backward();
    sig.backward(loss_fn.grad);
    linearNoBias.backward(sig.grad);
    // Linear.update divides lr by the batch size (10 rows here), so the effective step size is 5
    linearNoBias.update(50);
}
console.log(y_pred)

epoch 9 loss_value 0.01817239238895991
[
  [
    0.018346571965820773,
    0.01789284935833282,
    0.017857961859873598,
    0.018106322765750707
  ],
  [
    0.9823319504531975,
    0.017878673813708754,
    0.018510728075674984,
    0.018253139927648116
  ],
  [
    0.01831435416274833,
    0.9823831160511766,
    0.0177013929780844,
    0.017759857283000436
  ],
  [
    0.9824892778131794,
    0.9818210178477551,
    0.018181094307748235,
    0.01760498668248131
  ],
  [
    0.01803044907905115,
    0.018357491465425733,
    0.9815807065162295,
    0.018349313933759687
  ],
  [
    0.9815440512194518,
    0.01835549957560014,
    0.9824442569091367,
    0.017934759688817126
  ],
  [
    0.017428776777345053,
    0.9824373227261594,
    0.9815920319748943,
    0.018043250190070633
  ],
  [
    0.9821080543156853,
    0.9819669682198938,
    0.981974034046849,
    0.018109569625291297
  ],
  [
    0.017907372398934297,
    0.01828016343947588,
    0.01779438811271174,
    0.981982431

dump our linear layer to output - so we can look at the learned weights.

In [23]:
// bare expression: the notebook prints the layer, including its learned weights
linearNoBias

Linear {
  inputDim: 10,
  numHidden: 4,
  weights: [
    [
      -4.071528438085275,
      -4.094763508053605,
      -4.0965762975366085,
      -4.083753443318915
    ],
    [
      4.106511635997528,
      -4.095499624377472,
      -4.06327426375718,
      -4.076262078735197
    ],
    [
      -4.073157778804889,
      4.109208033089979,
      -4.104758971682219,
      -4.101694407727017
    ],
    [
      4.114829651753304,
      4.0800378396629,
      -4.079930069248298,
      -4.1098362186275255
    ],
    [
      -4.087650601831166,
      -4.070976909705129,
      4.0678620313224565,
      -4.071389911908574
    ],
    [
      4.066019892491755,
      -4.071077490837238,
      4.112441183593208,
      -4.092590828865962
    ],
    [
      -4.119194038917194,
      4.112073893487588,
      4.0684320003835115,
      -4.086991851218199
    ],
    [
      4.0948104158539165,
      4.087517654384536,
      4.0878814244941,
      -4.08358706509153
    ],
    [
      -4.094009997444608,

In [24]:
// Names exported by this notebook
export {accuracy,Sigmoid,MSE,BinaryCrossEntropyLoss,CrossEntropyLoss,ReLU,Linear,Learner}